In [1]:
import os
# Define the path to the directory you want to switch to
new_directory = "baselines/Cluster-Analysis"

# Change the current working directory
os.chdir(new_directory)

In [2]:
import os
import psutil

# Get the current process
process = psutil.Process(os.getpid())

# Memory usage in MB
mem = process.memory_info().rss / (1024 * 1024)

print(f"Memory usage of the notebook: {mem:.2f} MB")



Memory usage of the notebook: 61.01 MB


In [135]:

import fasttext.util
import fasttext
import os

import numpy as np

class BaseWordEmbedder:
    def __init__(self, embedding_model=None):
        self.embedding_model = embedding_model
    def embed(self, words):
        raise NotImplemented("Subclass should implement the method")
class FasttextEmbedder(BaseWordEmbedder):
    def __init__(self, embedding_model=None, path = None):
        super().__init__()
        self.embedding_model = embedding_model
        self.model_path = path
        self.model = None
        if self.model_path is not None and os.path.exists(self.model_path):
            self.model = fasttext.load_model(self.model_path)
        elif self.embedding_model is not None:
            self.model = self.embedding_model
        else:
            raise ValueError("No Model or path given")
    def embed(self, words):
        word_embeddings = []
        for word in words:
            word_embeddings.append(self.model.get_word_vector(word))
        return np.array(word_embeddings)
            
def get_word_embedding_model(name = None, model= None, path = None):
    if name == "fasttext":
        return FasttextEmbedder(embedding_model= model, path=path)
    else:
        return BaseWordEmbedder()

from sklearn.decomposition import PCA 
class DimReductionBase:
    def __init__(self, model = None, nr_dims = None):
        self.model = model
        self.nr_dims = nr_dims
    def fit(self, X):
        return self
    def transform(self, X):
        return X
class PCAReduction(DimReductionBase):
    def __init__(self, nr_dims = 5):
        super().__init__()
        self.nr_dims = nr_dims
        self.model = PCA(n_components = self.nr_dims)
    def fit(self, X):
        return self
    def transform(self, X):
        pass
    def fit_transform(self, X):
        return self.model.fit_transform(X)
    
    
from sklearn.cluster import KMeans
class ClusterBase:
    def fit(self, X):
        return self
    def transform(self, X):
        return X
    
class KMeansCluster(ClusterBase):
    def __init__(self, n_clusters = None, random_state= 42):
        super().__init__()
       
        self.n_clusters = n_clusters
        self.random_state = random_state
        self.model = KMeans(n_clusters=self.n_clusters, random_state= self.random_state)
        self.m_clusters = None
    def fit(self, data, weights = None):
        return self.model.fit(data, sample_weight = weights)
    def transform(self, data, weights = None):
        return self.model.predict(data, sample_weight=weights)
    def fit_transform(self, data, weights=None):
        self.model = self.fit(data, weights)
        self.m_clusters = self.transform(data, weights)
        centers = np.array(self.model.cluster_centers_)
        return self.m_clusters, centers
    
import pandas as pd    
class Sia:
    def __init__(self,
                 vocab = None,
                 embedding_model_name = "fasttext",
                 embedding_model = None, 
                 vectorizer = None,
                 nr_dimensions = None,
                 reduction_model = None,
                 nr_topics = 10,
                 rerank = None,
                 weighting = None
                ):
        self.embedding_model_name = embedding_model_name
        self.embedding_model = embedding_model
        self.nr_dimensions = nr_dimensions
        self.nr_topics = nr_topics
        self.vocab = vocab
        self.vocab_embeddings = None
        self.vectorizer = vectorizer
        if self.vectorizer is None:
            self.vectorizer = CountVectorizer()
            pass
        self.reduction_model = reduction_model
        if self.reduction_model is None:
            self.reduction_model = PCAReduction(nr_dimensions)
        if self.vocab is None:
            pass#raise TypeError("Vocab should not be NoneType")
        self.cluster_model = None
        self._labels = None
        self.rerank = rerank
        self.weighting = weighting
        self.feat_mat = None
            
    def fit(self, documents, vocab = None, embeddings =None,y = None):
        self.fit_transform(documents = documents, vocab=vocab, embeddings=embeddings, y = y)
        return self
    def fit_transform(self, documents, vocab=None, embeddings=None, y = None):
        if vocab is not None:
            self.vocab = vocab
        elif self.vocab is None:
            if self.vectorizer is None:
                raise ValueError("Please provide the vectorozer")
            else:
                #self.vocab = self.vectorizer.get_feature_names_out()
                #if len(self.vocab) > 0:
                #    self.vocab = self.vectorizer.get_feature_names_out()
                #else:
                self.feat_mat = self.vectorizer.fit_transform(documents)
                self.vocab = self.vectorizer.get_feature_names_out()
                
        if documents is not None:
            #check_documents_type(documents)
            #check_embeddings_shape(embeddings, self.vocab)
            pass
        if self.vocab is None:
            pass
            
        docs_ids = range(len(documents))
        docs = pd.DataFrame({"Document": documents,
                          "ID": docs_ids,
                          "Topic": None})
        vocab_embeddings = None
        if embeddings is None and self.embedding_model is None:
            self.embedding_model = get_word_embedding_model(name = "fasttext", path="embeds/fasttext/wiki.en.bin")
            
        elif self.embedding_model is not None:
            print("creating vocabulary embeddings")
            vocab_embeddings = self.embedding_model.embed(self.vocab)
        else:
            vocab_embeddings = embeddings
        self.vocab_embeddings = vocab_embeddings 
        print(f"vocab embeddings shape: {self.vocab_embeddings.shape}; documents shape : {len(documents)}")
        if self.reduction_model is not None:
            print("reducing the dimensions")
            vocab_embeddings = self.reduction_model.fit_transform(vocab_embeddings)
        elif self.nr_dimensions is not None:
            print("reducing the dimensions with PCA")
            self.reduction_model = PCAReduction(nr_dims = self.nr_dimensions)
            vocab_embeddings = self.reduction_model.fit_transform(vocab_embeddings)
        #weighting
        weights = None
        if self.weighting is not None:
            if self.weighting == "wgt":
                weights = self.feat_mat.toarray().sum(axis=0)
                print(f"weights shape(before) : {weights[0].shape}")
                print(np.squeeze(weights))
                #scale
        if weights is not None:
            scaled_weights = 1/(1+np.exp(weights))
            weights = scaled_weights.reshape(-1)
            print(f"weights shape : {weights.shape}")
            
            
        #start clustering
        self.cluster_model = KMeansCluster(n_clusters = self.nr_topics)
        clusters, centers = self.cluster_model.fit_transform(vocab_embeddings, weights=weights)
        print(f"Finished clustering: {clusters.shape}, centers: {centers.shape}")
    
        sorted_tops = self._sort_closest_centers(centers, clusters, vocab_embeddings)
        print(f"Shape of the sorted topk words {sorted_tops.shape}")
        top_k_indices = None
        if self.rerank:
            top_k_indices = self._find_top_k_words(100, sorted_tops)
        else:
            top_k_indices = self._find_top_k_words(10, sorted_tops)
        ##rerank
        self.labels_ = clusters
        self.top_k = top_k_indices
        if self.rerank is not None:
            print("reranking")
            self.top_k = self._rerank(np.array(top_k_indices), documents)
        return self.top_k
    def get_topic_words(self):
        if self.top_k is None:
            raise ValueError("Fit the model first")
        words = []
        for topic in self.top_k:
            words.append(self.vocab[topic])
        return words
        
    def _rerank(self, topic_word_indices, documents, feat_mat = None, k= 20):
        if feat_mat is None:
            feat_mat = self.vectorizer.transform(documents).toarray()
        feat_mat = feat_mat.T
        topk = []
        print(f"feat matrix: {feat_mat.shape}/ topic word indices: {topic_word_indices.shape}")
        for topic_words_idx in topic_word_indices:
            count = feat_mat[topic_words_idx].sum(axis=1)
            count = count.argsort()[-k:][::-1].astype(int)
            topk.append(topic_words_idx[count])
        #print(topk)
        return topk
    #def 
        
    def _get_document_stats(self, weighting = None):
        pass
    def _find_top_k_words(self, k, top_vals):
        topk_words = []
        for top in range(top_vals.shape[0]):
            ind, unique = [], set()
            for i in top_vals[top]:
                word = self.vocab[i]
                #print(word)
                if word not in unique:
                    ind.append(i)
                    unique.add(word)
                    if len(unique) == k:
                        break
            topk_words.append(ind)
        return topk_words
    def _sort_closest_centers(self, centers, clusters, vocab_embedding, k=20):
        top_idx = []
        for topic in range(centers.shape[0]):
            diic = np.where(clusters==topic)[0]
            dist = np.sum((vocab_embedding[diic]-centers[topic])**2, axis=1)
            topk = dist.argsort()[:k]
            #print(words[diic[topk]])
            #print(diic[topk].shape)
            top_idx = np.vstack((top_idx, diic[topk])) if topic > 0 else diic[topk]
        return top_idx
def sia_dataset_preprocess(docs):
    vocab = set()
    mapping = {}
    for i, doc in enumerate(docs):
        words = doc.split()
        for word in words:
            if word not in vocab:
                vocab.add(word)
                mapping[word] = set()
                mapping[word].add(i)
            else:
                mapping[word].add(i)
    return mapping
    

In [137]:
test_dev = sia_dataset_preprocess(test_corpus)

In [19]:
from cemtom.dataset._20newgroup import fetch_dataset
from cemtom.preprocessing import Preprocessor
dataset = fetch_dataset(remove=("headers", "quotes", "footers"))
token_dict={
    "doc_start_token": '<s>',
    "doc_end_token":'</s>',
    "unk_token":'<unk>',
    "email_token":'<email>',
    "url_token":'<url>',
    "number_token":'<number>',
    "alpha_num_token":'<alpha_num>'
}
preprocessor = Preprocessor(stopwords_list="english", remove_spacy_stopwords = False,
                            token_dict=token_dict, use_spacy_tokenizer=True,min_df=5,
                           max_df = 0.80)
#preprocessor = Preprocessor(stopwords_list="english", remove_spacy_stopwords = False)
ds = preprocessor.preprocess(None,dataset=dataset)

train_corpus, test_corpus = ds.get_partitioned()

Starting to extract the dataset



100%|██████████| 18846/18846 [00:00<00:00, 228431.80it/s]
100%|██████████| 18846/18846 [00:04<00:00, 4690.34it/s]


Filtering 18846
vocab created 20225


In [6]:
embedding_model = FasttextEmbedder(embedding_model=ft)
#embedding_model = get_word_embedding_model(name = "fasttext", path="embeds/fasttext/wiki.en.bin")

In [21]:
[doc for doc in train_corpus+test_corpus if "aaaaarrrrgh" in doc]

[]

In [138]:
model_output = {}
vocab = None#preprocessor.vectorizer.get_feature_names_out()
sia_model = Sia(vocab, embedding_model = embedding_model, vectorizer=preprocessor.vectorizer,nr_dimensions=10, 
                nr_topics = 20, weighting="wgt", rerank = "tf"
               )
model_output['model'] = sia_model.fit_transform(train_corpus)
model_output['topics'] = sia_model.get_topic_words()


creating vocabulary embeddings
vocab embeddings shape: (18381, 300); documents shape : 15507
reducing the dimensions
weights shape(before) : ()
[33 36 70 ...  9  5 11]
weights shape : (18381,)


  scaled_weights = 1/(1+np.exp(weights))


Finished clustering: (18381,), centers: (20, 10)
Shape of the sorted topk words (20, 20)
reranking
feat matrix: (18381, 15507)/ topic word indices: (20, 20)


In [29]:
from octis.evaluation_metrics.coherence_metrics import Coherence

In [115]:
test_docs_split = [doc.split() for doc in test_corpus[:]]
coherence = Coherence(texts = test_docs_split)
c_npmi = coherence.score(model_output)
c_npmi

-0.39066353673805476

In [147]:
print(len(test_corpus))
sia_npmi = average_npmi_topics(model_output['topics'], 20, sia_dataset_preprocess(test_corpus[:]), len(test_corpus[:]))
sia_npmi

2737
-0.88134 iran bosnian karabagh iranian serbian serbia cypriot rus kars poland bulgaria golan nicosia afghanistan sarajevo komsomol shah hungary salah bulgarian
-0.93956 fan pop movie movies comic episode hype hop episodes pals teapot videos cartoon mags slim rap dorm hardcore rave banger
-0.94623 davidians propaganda believers fascist commandment fundamentalist fundamentalists inquisition aryan communism caste baptize crusades crusade religiously westerners anarchist testimonies sectarian imperialism
-0.66024 period land middle cross beginning hundred remains numerous twenty collected twelve thirty fifty partly risen sixty eighty collectively ninety onwards
-0.87268 world la plays film paris prize swedish les breton italian del rawlinson dutch ulysses mater kaiser everest figaro contra germano
-0.87213 truth reality desire thoughts minds minded genuine transgression contemplate undeniable satire eloquent dilemma sincerity recollection anachronism anecdote overtly actuality earnest

-0.87297

In [12]:
model = get_word_embedding_model(name = "fasttext", path="embeds/fasttext/wiki.en.bin")



In [13]:
model

<__main__.FasttextEmbedder at 0x151fbf6c1820>

In [55]:
words = ["prayer", "model", "computer", "science","bible", "something", "ability", "hence",  "jesus", "god"]
dim_reduction = PCAReduction(nr_dims = 2)
words_red = dim_reduction.fit_transform(model.embed(words))

In [56]:
km = KMeansCluster(n_clusters = 2)
m,c = km.fit_transform(words_red)
m, c

(array([1, 0, 0, 0, 1, 0, 0, 0, 1, 1], dtype=int32),
 array([[-1.3804448 , -0.05626277],
        [ 2.0706675 ,  0.0843941 ]], dtype=float32))

In [76]:
topic = 1
diic = np.where(m==topic)[0]
dist = np.sum((words_red[diic]-c[topic])**2, axis=1)
topk = dist.argsort()
diic[topk], topk

(array([9, 8, 0, 4]), array([3, 2, 0, 1]))

In [24]:
import re
def number_tokenizer(document):
    # Regular expression pattern for numbers
    pattern = r'\b\d+(?:,\d{3})*(?:\.\d+)?\b'
    
    # Replace numbers with <number>
    doc_no_numbers = re.sub(pattern, '<number>', document)
    
    # Tokenize by splitting on whitespace
    tokens = doc_no_numbers.split()
    
    return tokens


In [42]:
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from sklearn.datasets import fetch_20newsgroups
 
documents = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))['data']


# Example documents
documents = documents
nr_topics = 20

# Initialize CountVectorizer
vectorizer = CountVectorizer(max_df=0.8, min_df=5, stop_words="english", strip_accents="ascii", lowercase=True, token_pattern=r'\b[a-zA-Z]+\b')

# Fit and transform the documents
X = vectorizer.fit_transform(documents)
words=vectorizer.get_feature_names_out()
feat_mat = X.toarray().T
dim_reduction = PCAReduction(nr_dims = 5)
words_red = dim_reduction.fit_transform(embedding_model.embed(words))
km = KMeansCluster(n_clusters = nr_topics)
m,c = km.fit_transform(words_red)
print((m.shape,c.shape))
top_idx = []
top_words = []
sia_model._sort_closest_centers(c, m, words_red)
for topic in range(nr_topics): 
    diic = np.where(m==topic)[0]
    dist = np.sum((words_red[diic]-c[topic])**2, axis=1)
    topk = dist.argsort()[:15]
    #print(words[diic[topk]])
    #print(diic[topk].shape)
    top_idx = np.vstack((top_idx, diic[topk])) if topic > 0 else diic[topk]
    top_words.append(words[diic[topk]])
#top_idx = np.array(top_idx)
top_idx.shape, top_words[2]

((21624,), (20, 5))


((20, 15),
 array(['labor', 'labour', 'general', 'kingdom', 'defence', 'recognized',
        'represented', 'states', 'members', 'formally', 'nurses',
        'students', 'defense', 'representing', 'supervision'], dtype=object))

In [85]:
freq = feat_mat.sum(axis=0)
freq.shape

(18846,)

In [32]:
def rerank(topic_word_indices, feat_mat):
    topk = []
    print(f"feat matrix: {feat_mat.shape}/ topic word indices: {topic_word_indices.shape}")
    for topic_words_idx in topic_word_indices:
        count = feat_mat[topic_words_idx].sum(axis=1)
        count = count.argsort()[-10:][::-1].astype(int)
        topk.append(topic_words_idx[count])
    return topk
        

In [33]:

tki = sia_model._sort_closest_centers(c, m, words_red)
#f_mat = vectorizer.transform(documents[:1000])
#print(f_mat.T.shape)
reranked = rerank(tki, feat_mat)
#reranked
#f_mat.T.sum(axis = 1).reshape(-1).shape
reranked

feat matrix: (21624, 18846)/ topic word indices: (20, 15)


[array([ 7082,  5043,  1575, 19361,  2941,  1257,  8583, 16283, 14203,
         1752]),
 array([ 7118, 13790, 19196, 17219, 15322,  8983,  5690,  3140, 17483,
         1461]),
 array([ 1442, 11908, 11559, 13125, 19691, 15011,   135,  3851, 18878,
         6728]),
 array([12037, 16409,  2014, 15228,  6736,  5962, 16947,  6922,  4312,
        10573]),
 array([12583, 19459,  3272, 12867, 16295, 16886, 21040, 14729, 11506,
        21039]),
 array([ 2555, 15370,  5642, 20662, 13924, 21304,  6459, 16565, 21591,
        19949]),
 array([18596, 14962,   559, 19319, 16592, 10216,   428,  1319,  3018,
        19209]),
 array([  807,  7067,  4990, 17035,  5906, 15497, 18368,  6434,  5161,
        10018]),
 array([15085, 13764, 10106, 13472,  9792,   594,  9216, 19253, 18282,
        11213]),
 array([ 9452,  6418, 13548, 20870,  5452, 20150, 13906, 15535,  8280,
         6369]),
 array([   14, 16644, 14293,   373, 10804,  6936, 12646, 10030,  6989,
        13295]),
 array([ 6298,  8214,  4470, 154

In [81]:
feat_mat.shape, top_idx.shape

((3565, 100), (10, 15))

In [91]:
count = feat_mat[top_idx[0]].sum(axis=1)
count, count.argsort()[-10:][::-1].astype(int)

(array([1, 1, 5, 2, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 2]),
 array([ 2, 14,  9,  5,  3, 13, 12, 11, 10,  8]))

In [28]:
!pwd

/scratch-old/abijuru/thesis/tms/atlas/baselines/Cluster-Analysis


In [118]:
from sklearn.cluster import DBSCAN
from sklearn.mixture import GaussianMixture
#from sklearn_extra.cluster import KMedoids
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import SpectralClustering
import pdb

from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

#from spherecluster import SphericalKMeans
#from spherecluster import VonMisesFisherMixture

from sklearn.metrics.pairwise import rbf_kernel

from sklearn.metrics import pairwise_distances
from sklearn.metrics.pairwise import cosine_similarity

import numpy as np
import networkx as nx
import scipy.stats

def PCA_dim_reduction(intersection, dim):
    intersection = intersection - np.mean(intersection, axis = 0)
    sigma = np.cov(intersection.T)
    eigVals, eigVec = np.linalg.eig(sigma)
    sorted_index = eigVals.argsort()[::-1]
    eigVals = eigVals[sorted_index]
    eigVec = eigVec[:,sorted_index]
    eigVec = eigVec[:,:dim]
    transformed = intersection.dot(eigVec)
    return transformed

def TSNE_dim_reduction(intersection, dim):
    X_embedded = TSNE(n_components=dim).fit_transform(intersection)
    return X_embedded

def Agglo_model(vocab_embeddings, topics, rand):
    agglo = AgglomerativeClustering(n_clusters=topics).fit(vocab_embeddings)
    m_clusters = agglo.labels_
    return m_clusters, find_words_for_cluster(m_clusters, topics)

def DBSCAN_model(vocab_embeddings, e=0.5):
    dbscan = DBSCAN(eps=e, min_samples=10).fit(vocab_embeddings)
    m_clusters = dbscan.labels_
    clusters = len(np.unique(m_clusters[m_clusters>= 0]))
    return m_clusters, find_words_for_cluster(m_clusters, clusters)

def SpectralClustering_Model(vocab_embeddings, topics, rand, pmi):
    precomp = rbf_kernel(vocab_embeddings)

    #print(precomp)
    #pmax, pmin = pmi.max(), pmi.min()
    #pmi = (pmi - pmin)/(pmax - pmin)

    #precomp = precomp * pmi
    #print(precomp)

    #nearest_neighbors
    #precomputed
    SC = SpectralClustering(n_clusters=topics, random_state=rand, affinity = "nearest_neighbors").fit(vocab_embeddings)
    m_clusters = SC.labels_

    return m_clusters, find_words_for_cluster(m_clusters, topics)

def KMedoids_model(vocab_embeddings, vocab, topics,  rand):
    kmedoids = KMedoids(n_clusters=topics, random_state=rand).fit(vocab_embeddings)
    m_clusters = kmedoids.predict(vocab_embeddings)
    centers = np.array(kmedoids.cluster_centers_)
    indices = []

    for i in range(20):
        topk_vals = sort_closest_center(centers[i], m_clusters, vocab_embeddings, i)
        indices.append(find_top_k_words(100, topk_vals, vocab))

    return m_clusters, indices

def KMeans_model(vocab_embeddings, vocab, topics, rerank, rand, weights):
    kmeans = KMeans(n_clusters=topics, random_state=rand).fit(vocab_embeddings, sample_weight=weights)
    m_clusters = kmeans.predict(vocab_embeddings, sample_weight=weights)
    centers = np.array(kmeans.cluster_centers_)

    indices = []

    for i in range(topics):
        topk_vals = sort_closest_center(centers[i], m_clusters, vocab_embeddings, i)
        if rerank:
            indices.append(find_top_k_words(100, topk_vals, vocab))
        else:
            indices.append(find_top_k_words(10, topk_vals, vocab))
        #print(indices)
    return m_clusters, indices


def SphericalKMeans_model(vocab_embeddings,vocab,topics, rerank, rand, weights):

    spkmeans = SphericalKMeans(n_clusters=topics, random_state=rand).fit(vocab_embeddings, sample_weight=weights)
    m_clusters = spkmeans.predict(vocab_embeddings,  sample_weight=weights)
    centers = np.array(spkmeans.cluster_centers_)

    indices = []

    for i in range(topics):
        topk_vals = sort_closest_cossine_center(centers[i], m_clusters, vocab_embeddings, i)
        if rerank:
            indices.append(find_top_k_words(100, topk_vals, vocab))
        else:
            indices.append(find_top_k_words(10, topk_vals, vocab))
        #print(indices)
    return m_clusters, indices

def GMM_model(vocab_embeddings, vocab,  topics, rerank, rand):
    GMM = GaussianMixture(n_components=topics, random_state=rand).fit(vocab_embeddings)
    indices = []
    for i in range(GMM.n_components):
        density = scipy.stats.multivariate_normal(cov=GMM.covariances_[i], mean=GMM.means_[i]).logpdf(vocab_embeddings)
        topk_vals = density.argsort()[-1*len(density):][::-1].astype(int)
        if rerank:
            indices.append(find_top_k_words(100, topk_vals, vocab))
        else:
            indices.append(find_top_k_words(10, topk_vals, vocab))

    return GMM.predict(vocab_embeddings), indices

def VonMisesFisherMixture_Model(vocab_embeddings, vocab, topics, rerank, rand):
    #vmf_soft = VonMisesFisherMixture(n_clusters=topics, posterior_type='hard', n_jobs=-1, random_state=rand).fit(vocab_embeddings)
    print("fitting vmf...")
    vmf_soft = VonMisesFisherMixture(n_clusters=topics, posterior_type='soft', n_jobs=-1, random_state=rand).fit(vocab_embeddings)

    llh = vmf_soft.log_likelihood(vocab_embeddings)
    indices = []
    for i in range(topics):

        topk_vals = llh[i, :].argsort()[::-1].astype(int)
        if rerank:
            indices.append(find_top_k_words(100, topk_vals, vocab))
        else:
            indices.append(find_top_k_words(10, topk_vals, vocab))

    return vmf_soft.predict(vocab_embeddings), indices

def sort_closest_center(center_vec, m_clusters,vocab_embeddings, c_ind):
    data_idx_within_i_cluster = np.array([ idx for idx, clu_num in enumerate(m_clusters) if clu_num == c_ind ])
    one_cluster_tf_matrix = np.zeros((len(data_idx_within_i_cluster) , center_vec.shape[0]))

    for row_num, data_idx in enumerate(data_idx_within_i_cluster):
        one_row = vocab_embeddings[data_idx]
        one_cluster_tf_matrix[row_num] = one_row

    dist_X =  np.sum((one_cluster_tf_matrix - center_vec)**2, axis = 1)
    #topk = min(10, len(data_idx_within_i_cluster))
    #topk_vals = dist_X.argsort()[:topk].astype(int)

    topk_vals = dist_X.argsort().astype(int)
    topk_vals = data_idx_within_i_cluster[topk_vals]

    return topk_vals

def sort_closest_cossine_center(center_vec, m_clusters,vocab_embeddings, c_ind):
        data_idx_within_i_cluster = np.array([ idx for idx, clu_num in enumerate(m_clusters) if clu_num == c_ind ])
        one_cluster_tf_matrix = np.zeros((len(data_idx_within_i_cluster) , center_vec.shape[0]))

        for row_num, data_idx in enumerate(data_idx_within_i_cluster):
            one_row = vocab_embeddings[data_idx]
            one_cluster_tf_matrix[row_num] = one_row

        dist_X =  (cosine_similarity(one_cluster_tf_matrix, center_vec.reshape(1, -1))).squeeze()
        dist_X = 2.0*(1.0-dist_X)
        #topk = min(10, len(data_idx_within_i_cluster))
        #topk_vals = dist_X.argsort()[:topk].astype(int)

        topk_vals = dist_X.argsort().astype(int)
        topk_vals = data_idx_within_i_cluster[topk_vals]

        return topk_vals

def find_top_k_words(k, top_vals, vocab):
    ind = []
    unique = set()
    for i in top_vals:
        word = vocab[i]
        if word not in unique:
            ind.append(i)
            unique.add(vocab[i])
            if len(unique) == k:
                break
    return ind



def rank_freq(top_k_words, train_w_to_f_mult):
    top_10_words = []
    for words in top_k_words:
        words = np.array(words)
        count = np.array([len(train_w_to_f_mult[word]) for word in words ])
        topk_vals = count.argsort()[-10:][::-1].astype(int)
        top_10_words.append(words[topk_vals])
    return top_10_words

def rank_td_idf(top_k_words, tf_idf):
    top_10_words = []
    for words in top_k_words:
        words = np.array(words)
        count = np.array([tf_idf[word] for word in words ])
        #topk_vals = count.argsort()[-10:][::-1].astype(int)
        topk_vals = count.argsort()[-10:][::-1].astype(int)
        top_10_words.append(words[topk_vals])
    return top_10_words


def rank_centrality(top_k_words, top_k, word_in_file):
    for i, cluster in enumerate(top_k):
        cluster = np.array(cluster)

        subgraph = calc_coo_matrix(top_k_words[i], word_in_file)
        G = nx.from_numpy_matrix(subgraph)
        sc = nx.subgraph_centrality(G)

        ind = np.argsort([sc[node] for node in sorted(sc)])[-10:][::-1].astype(int)


        top_k_words[i] = np.array(top_k_words[i])[ind]
    return top_k_words


def calc_coo_matrix(word_intersect, word_in_file):
    coo = np.zeros((len(word_intersect), len(word_intersect)))
    for i in range(len(word_intersect)):
        for j in range(i, len(word_intersect)):
            coo[i, j] = count_wpair(word_intersect[i], word_intersect[j], word_in_file)
            coo[j, i] = coo[i, j]
    return coo

def count_wpair(word1, word2, word_in_file):
    combined_count = 0
    if word1 != word2:
        combined_count = len(set(word_in_file[word1]) & set(word_in_file[word2]))
    return combined_count


def find_words_for_cluster(m_clusters,  clusters):
    indices = []
    for i in range(0, clusters):
        if i == -1:
            continue
        data_idx_within_i_cluster = [ idx for idx, clu_num in enumerate(m_clusters) if clu_num == i ]
        indices.append(data_idx_within_i_cluster)
    return indices

def visualize(intersection):
    intersection_red = TSNE_dim_reduction(intersection, 2)
    for i in range(0,len(n_p)):
        labels = np.where(labels==i, n_p[i], labels)
    plt.scatter(intersection_red[:, 0], intersection_red[:, 1], c=labels, vmin=-0.5, vmax=0.5,  s=5, cmap='RdBu')

    centers = np.empty(shape=(gmm.n_components, intersection_red.shape[1]))
    for i in range(gmm.n_components):
        density = scipy.stats.multivariate_normal(cov=gmm.covariances_[i], mean=gmm.means_[i]).logpdf(intersection)
        centers[i, :] = intersection_red[np.argmax(density)]

    plt.scatter(centers[:, 0], centers[:, 1], c="black", s=35, alpha=0.7)
    plt.show(block=True)


In [119]:
# -*- coding: utf-8 -*-

from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import KFold
from nltk.corpus import reuters
import string
import numpy as np

import os

DATADIR = "data"


def create_global_vocab(vocab_files):
    vocab_list = set(line.split()[0] for line in open(vocab_files[0]))
    for vocab in vocab_files:
        vocab_list = vocab_list & set(line.split()[0] for line in open(vocab))
    return vocab_list


def combine_split_children(type):
    files = []
    index = 0
    with open(os.path.join(DATADIR, 'CBTest/data/cbt_train.txt'), encoding='utf-8') as fp:
        data = fp.readlines()
    with open(os.path.join(DATADIR, 'CBTest/data/cbt_valid.txt'), encoding='utf-8') as fp:
        data2 = fp.readlines()
    with open(os.path.join(DATADIR, 'CBTest/data/cbt_test.txt'), encoding='utf-8') as fp:
        data3 = fp.readlines()
    data += "\n"
    data += data2
    data += "\n"
    data += data3

    for line in data:
        words = line.strip()
        if "BOOK_TITLE" in words:
            continue
        elif "CHAPTER" in words:
            words = words.split()[2:]
        else:
            words = words.split()

        if "-RRB-" in words:
            words.remove("-RRB-")
        if "-LRB-" in words:
            words.remove("-LRB-")

        sentence = (" ".join(words) + "\n")
        if "-RCB-" in words:
            sentence = sentence[0:sentence.find("-")] + sentence[sentence.rfind("-") + 1:]

        if index % 20 == 0:
            files.append(sentence)
        else:
            files[int(index / 20)] += sentence

        index += 1
    files = np.array(files)

    kf = KFold(n_splits=5, shuffle=True, random_state=0)
    indices = list(kf.split(files))[0]

    train_valid = files[indices[0]]
    test = files[indices[1]]

    kf = KFold(n_splits=4, shuffle=True, random_state=0)
    indices = list(kf.split(train_valid))[0]

    train = train_valid[indices[0]]
    valid = train_valid[indices[1]]
    if type == "train":
        return train
    elif type == "valid":
        return valid
    else:
        return test


def create_files_20news(type):
    if type == "valid":
        type = "test"
    data = fetch_20newsgroups(data_home='./data/', subset=type, remove=('headers', 'footers', 'quotes'))
    files = data['data'];
    return files


def create_files_reuters(type):
    t = type
    if type == "valid":
        t = "test"

    documents = reuters.fileids()
    id = [d for d in documents if d.startswith(t)]
    files = np.array([reuters.raw(doc_id) for doc_id in id])

    # if type != "test":
    #     kf = KFold(n_splits=5, shuffle=True, random_state = 0)
    #     indices = list(kf.split(files))[0]
    #     train = files[indices[0]]
    #     valid = files[indices[1]]

    #     if type == "train":
    #         return train
    #     elif type == "valid":
    #         return valid
    return files


def create_files_children(type):
    files = combine_split_children(type)
    return files


def create_vocab_preprocess(stopwords, data, vocab, preprocess, process_data=False):
    word_to_file = {}
    word_to_file_mult = {}
    strip_punct = str.maketrans("", "", string.punctuation)
    strip_digit = str.maketrans("", "", string.digits)

    process_files = []
    for file_num in range(0, len(data)):
        words = data[file_num].lower().translate(strip_punct).translate(strip_digit)
        words = words.split()
        # words = [w.strip() for w in words]
        proc_file = []

        for word in words:
            if word in stopwords or (word not in vocab and len(vocab)) or word == "dlrs" or word == "revs":
                continue
            if word in word_to_file:
                word_to_file[word].add(file_num)
                word_to_file_mult[word].append(file_num)
            else:
                word_to_file[word] = set()
                word_to_file_mult[word] = []

                word_to_file[word].add(file_num)
                word_to_file_mult[word].append(file_num)

        process_files.append(proc_file)

    for word in list(word_to_file):
        if len(word_to_file[word]) <= preprocess or len(word) <= 3:
            word_to_file.pop(word, None)
            word_to_file_mult.pop(word, None)

    print("Files:" + str(len(data)))
    print("Vocab: " + str(len(word_to_file)))

    if process_data:
        vocab = word_to_file.keys()
        files = []
        for proc_file in process_files:
            fil = []
            for w in proc_file:
                if w in vocab:
                    fil.append(w)
            files.append(" ".join(fil))

        data = files

    return word_to_file, word_to_file_mult, data


def create_vocab_and_files(stopwords, dataset, preprocess, type, vocab):
    data = None
    if dataset == "20NG":
        data = create_files_20news(type)
    elif dataset == "children":
        data = create_files_children(type)
    elif dataset == "reuters":
        data = create_files_reuters(type)

    return create_vocab_preprocess(stopwords, data, vocab, preprocess)


In [120]:
#import gensim
import fasttext.util
import fasttext
import numpy as np
from sklearn import preprocessing
from sklearn.decomposition import TruncatedSVD
import pdb
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import MinMaxScaler


def create_id_dict(id2name):
    data = {}
    for line in open(id2name):
        mapping = line.split()
        data[mapping[0]] = mapping[1]
    return data

def read_entity_file(file, id_to_word, vocab, entities, elmomix=None):
    data = []
    word_index = {}
    index = 0
    mapping = None
    if id_to_word != None:
        mapping = create_id_dict(id_to_word)

    if elmomix is None:
        for line in open(file):
            embedding = line.split()
            if id_to_word != None:
                embedding[0] = mapping[embedding[0]][1:]
            if embedding[0] in vocab:
                word_index[embedding[0]] = index
                index +=1
                if entities == "glove":
                    embedding = list(map(float, embedding[-300:]))
                else:
                    embedding = list(map(float, embedding[1:]))
                data.append(embedding)
    else:  # specify mixing coefficients for ELMo
        assert file[-1] in "012" and file[-7:-1] == ".layer"
        with open(file[:-1] + "0") as f0, open(file[:-1] + "0") as f0, open(file[:-1] + "1") as f1, open(file[:-1] + "2") as f2:
            for l0, l1, l2 in zip(f0, f1, f2):
                e0 = l0.split()
                e1 = l1.split()
                e2 = l2.split()
                assert e0[0] == e1[0] and e1[0] == e2[0]
                assert len(e0) == len(e1) and len(e1) == len(e2)
                if id_to_word != None:
                    e0[0] = mapping[e0[0]][1:]
                    e1[0] = mapping[e1[0]][1:]
                    e2[0] = mapping[e2[0]][1:]
                if e0[0] in vocab:
                    word_index[e0[0]] = index
                    index +=1
                    embedding = [elmomix[0] * float(x0) + elmomix[1] * float(x1) + elmomix[2] * float(x2) for x0, x1, x2 in zip(e0[1:], e1[1:], e2[1:])]
                    data.append(embedding)

    print("KG: " + str(len(data)))
    return data, word_index

def create_doc_to_word_emb(word_to_doc, file_num, word_list, dim):
    word_to_doc_matrix = np.zeros((len(word_list), file_num))
    for i, word in enumerate(word_list):
        for doc in word_to_doc[word]:
            word_to_doc_matrix[i][doc] += 1

    trun_ftw = TruncatedSVD(n_components=dim).fit_transform(word_to_doc_matrix)
    return trun_ftw

def find_intersect(word_index, vocab, data, files, type, add_doc):
    if add_doc == "DUP":
        return find_intersect_mult(word_index, vocab, data, type)
    elif add_doc == "SVD":
        intersection, words_index_intersect = find_intersect_unique(word_index, vocab, data, type)
        u = create_doc_to_word_emb(vocab, files, words_index_intersect, 1000)
        u = preprocessing.scale(u)
        #intersection = np.concatenate((intersection, u), axis=1)
        return u, words_index_intersect
    else:
        return find_intersect_unique(word_index, vocab, data, type)

def find_intersect_unique(word_index, vocab, data, type):
    words = []
    vocab_embeddings = []

    intersection = set(word_index.keys()) & set(vocab.keys())
    print("Intersection: " + str(len(intersection)))

    intersection = np.sort(np.array(list(intersection)))
    for word in intersection:
        if type == "word2vec":
            vocab_embeddings.append(data[word])
        else:
            vocab_embeddings.append(data[word_index[word]])
        words.append(word)

    vocab_embeddings = np.array(vocab_embeddings)

    return vocab_embeddings, words

def find_intersect_mult(word_index, vocab, data, type):
    words = []
    vocab_embeddings = []

    intersection = set(word_index.keys()) & set(vocab.keys())
    print("Intersection: " + str(len(intersection)))

    intersection = np.sort(np.array(list(intersection)))
    for word in intersection:
        for i in range(len(vocab[word])):
            if type == "word2vec":
                vocab_embeddings.append(data[word])
            else:
                vocab_embeddings.append(data[word_index[word]])
            words.append(word)
    print(len(words))
    vocab_embeddings = np.array(vocab_embeddings)
    return vocab_embeddings, words

def create_entities_ft(model, train_word_to_file, doc_info):
    #print("getting fasttext embeddings..")
    vocab_embeddings = []
    words = []
    intersection = set(train_word_to_file.keys())
    for word in intersection:
        if doc_info == "DUP":
            for i in train_word_to_file[word]:
                vocab_embeddings.append(model.get_word_vector(word))
                words.append(word)
        else:
            vocab_embeddings.append(model.get_word_vector(word))
            words.append(word)
    vocab_embeddings = np.array(vocab_embeddings)
    #print("complete..")
    return vocab_embeddings, words



def get_weights_tf(vocab_list, weights):
    return np.array([len(weights[w]) for w in vocab_list])

def get_rs_weights_tf(vocab_list, wghts):
    weights  = get_weights_tf(vocab_list, wghts)
    transformer = RobustScaler().fit(get_weights_tf(vocab_list, wghts).reshape(-1, 1))
    weight  = transformer.transform(weights.reshape(-1, 1))
    x  = MinMaxScaler().fit(weight)
    weights = (x.transform(weight)).T.squeeze()
    return weights

def get_weights_tfidf(vocab_list, weights):
    return [weights[w] for w in vocab_list]
def get_weights_tfdf(vocab_list, word_file_count, files_num):
    count = np.array(get_weights_tf(vocab_list, word_file_count))
    tf = count/np.sum(count)

    df = np.array([len(np.unique(word_file_count[w])) for w in vocab_list])
    df = df/files_num

    weights = tf * df
    print(weights.shape)

    tfdf = {}
    for i, w in enumerate(vocab_list):
        tfdf[w]=weights[i]

    return weights, tfdf
def get_tfidf_score(data, train_vocab):
    tf_idf_score = {}

    tfidf_vectorizer=TfidfVectorizer(use_idf=True)
    tfidf_vectorizer_vectors=tfidf_vectorizer.fit_transform(data)

    words = tfidf_vectorizer.get_feature_names()
    total_tf_idf = tfidf_vectorizer_vectors.toarray().sum(axis=0)

    vocab = set(words) & set(train_vocab.keys())
    for i, word in enumerate(words):
        if word in vocab:
            tf_idf_score[word] = total_tf_idf[i]

    return tf_idf_score


In [121]:

import numpy as np




def average_npmi_topics(topic_words, ntopics, word_doc_counts, nfiles):

    eps = 10**(-12)

    all_topics = []
    for k in range(ntopics):
        word_pair_counts = 0
        topic_score = []

        ntopw = len(topic_words[k])

        for i in range(ntopw-1):
            for j in range(i+1, ntopw):
                w1 = topic_words[k][i]
                w2 = topic_words[k][j]

                w1w2_dc = len(word_doc_counts.get(w1, set()) & word_doc_counts.get(w2, set()))
                w1_dc = len(word_doc_counts.get(w1, set()))
                w2_dc = len(word_doc_counts.get(w2, set()))

                # what we had previously:
                #pmi_w1w2 = np.log(((w1w2_dc * nfiles) + eps) / ((w1_dc * w2_dc) + eps))

                # Correct eps:
                pmi_w1w2 = np.log((w1w2_dc * nfiles) / ((w1_dc * w2_dc) + eps) + eps)
                npmi_w1w2 = pmi_w1w2 / (- np.log( (w1w2_dc)/nfiles + eps))

                # Sanity check Which is equivalent to this:
                #if w1w2_dc ==0:
                #    npmi_w1w2 = -1
                #else:
                    #pmi_w1w2 = np.log( (w1w2_dc * nfiles)/ (w1_dc*w2_dc))
                    #npmi_w1w2 = pmi_w1w2 / (-np.log(w1w2_dc/nfiles))

                #if npmi_w1w2>1 or npmi_w1w2<-1:
                #    print("NPMI score not bounded for:", w1, w2)
                #    print(npmi_w1w2)
                #    sys.exit(1)

                topic_score.append(npmi_w1w2)

        all_topics.append(np.mean(topic_score))

    for k in range(ntopics):
        print(np.around(all_topics[k],5), " ".join(topic_words[k]))

    avg_score = np.around(np.mean(all_topics), 5)
    #print(f"\nAverage NPMI for {ntopics} topics: {avg_score}")

    return avg_score


In [124]:

from sklearn.metrics import pairwise_distances_argmin_min
import sys
import argparse
import string
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
import gensim
import pdb
import math
import random

NSEEDS = 5

def main():
    args = parse_args()

    stopwords = set(line.strip() for line in open('stopwords_en.txt'))

    vocab = create_global_vocab(args.vocab)

    train_word_to_file, train_w_to_f_mult, files = create_vocab_and_files(stopwords, args.dataset, args.preprocess, "train", vocab)
    files_num = len(files)
    print("len vocab size:", len(train_word_to_file.keys()))

    intersection = None
    words_index_intersect = None

    tf_idf = get_tfidf_score(files, train_word_to_file)

    if args.entities == "word2vec":
        model = gensim.models.KeyedVectors.load_word2vec_format('models/GoogleNews-vectors-negative300.bin', binary=True)
        intersection, words_index_intersect  = find_intersect(model.vocab,  train_w_to_f_mult, model, files_num, args.entities, args.doc_info)
    elif args.entities == "fasttext":

        # for compatibility, but move everything to embeds later.
        #if os.path.exists('models/wiki.en.bin'):
        #    ftfn = 'models/wiki.en.bin'
        #else:
        #
        #ftfn = 'embeds/fasttext/wiki.en.bin'

        #ft = fasttext.load_model(ftfn)
        intersection, words_index_intersect = create_entities_ft(ft, train_w_to_f_mult, args.doc_info)
        print(intersection.shape)
    elif args.entities == "KG" or args.entities == "glove" :
        elmomix = [float(a) for a in args.elmomix.split(";")] if args.elmomix != "" else None
        data, word_index = read_entity_file(args.entities_file, args.id2name, train_word_to_file, args.entities, elmomix=elmomix)
        intersection, words_index_intersect = find_intersect(word_index, train_w_to_f_mult, data, files_num, args.entities, args.doc_info)

    if args.use_dims:
        intersection = PCA_dim_reduction(intersection, args.use_dims)

    #weights , tfdf = get_weights_tfdf(words_index_intersect, train_w_to_f_mult, files_num)
    weights = None
    tfdf = None


    if args.doc_info == "WGT":
        weights = get_weights_tf(words_index_intersect, train_w_to_f_mult)

    if args.doc_info == "robust":
        weights = get_rs_weights_tf(words_index_intersect, train_w_to_f_mult)

    if args.doc_info == "tfdf":
        weights , tfdf = get_weights_tfdf(words_index_intersect, train_w_to_f_mult, files_num)

    if weights is not None and args.scale == "sigmoid":
        print("scaling.. sigmoid")
        weights = 1 / (1 + np.exp(weights))


    elif weights is not None and args.scale == "log":
        print("scaling.. log")
        weights = np.log(weights)





    dev_word_to_file, dev_word_to_file_mult, dev_files = create_vocab_and_files(stopwords, args.dataset,args.preprocess, "valid", vocab)
    dev_files_num = len(dev_files)


    test_word_to_file, test_word_to_file_mult, test_files = create_vocab_and_files(stopwords, args.dataset,args.preprocess, "test", vocab)
    test_files_num = len(test_files)




    topics_npmi = []

    for topics in args.num_topics:
        npmis = []

        print("Number of Clusters:" + str(topics))
        rand = 0
        global NSEEDS
        while rand < NSEEDS:

            try:
                top_k_words, top_k = cluster(args.clustering_algo, intersection, words_index_intersect, topics, args.rerank, weights, args.topics_file, rand)
            except:
                print("Warning: failed, try diff random seed.")
                new_rand = random.randint(5,1000)
                top_k_words, top_k = cluster(args.clustering_algo, intersection, \
                        words_index_intersect, topics, args.rerank, weights, args.topics_file, new_rand)



            top_k_words = rerank(args.rerank, top_k_words, top_k, train_w_to_f_mult, train_word_to_file, tf_idf, tfdf)
            val = average_npmi_topics(top_k_words, len(top_k_words), dev_word_to_file, dev_files_num)

            if np.isnan(val):
                NSEEDS +=1
                rand += 1
                continue

            npmi_score = np.around(val, 5)
            print("NPMI:" + str(npmi_score))
            npmis.append(npmi_score)

            rand += 1

        topics_npmi.append(np.mean(npmis))
        print("NPMI Mean:" + str(np.around(topics_npmi[-1], 5)))
        print("NPMI Var:" + str(np.around(np.var(npmis), 5)))

    best_topic = args.num_topics[np.argmax(topics_npmi)]







def cluster(clustering_algo, intersection, words_index_intersect, num_topics, rerank, weights, topics_file, rand):
    if clustering_algo == "KMeans":
        labels, top_k  = KMeans_model(intersection, words_index_intersect, num_topics, rerank, rand, weights)
    elif clustering_algo == "SPKMeans":
        labels, top_k  = SphericalKMeans_model(intersection, words_index_intersect, num_topics, rerank, rand, weights)
    elif clustering_algo == "GMM":
        labels, top_k = GMM_model(intersection, words_index_intersect, num_topics, rerank, rand)
    elif clustering_algo == "KMedoids":
        labels, top_k  = KMedoids_model(intersection,  words_index_intersect,  num_topics, rand)
    elif clustering_algo == "VMFM":
        labels, top_k = VonMisesFisherMixture_Model(intersection, words_index_intersect, num_topics, rerank, rand)

    #Affinity matrix based
    elif clustering_algo == "DBSCAN":
        k=6
        labels, top_k  = DBSCAN_model(intersection,k)
    elif clustering_algo == "Agglo":
        labels, top_k  = Agglo_model(intersecticlustering_algoon, num_topics, rand)
    elif clustering_algo == "Spectral":
        labels, top_k  = SpectralClustering_Model(intersection,num_topics, rand,  weights)

    if clustering_algo == 'from_file':
        with open('bert_topics.txt', 'r') as f:
            top_k_words = f.readlines()
        top_k_words = [tw.strip().replace(',', '').split() for tw in top_k_words]

    elif clustering_algo == 'LDA':
        with open(topics_file, 'r') as f:
            top_k_words = f.readlines()
        top_k_words = [tw.strip().replace(',', '').split() for tw in top_k_words]
        for i, top_k in enumerate(top_k_words):
            top_k_words[i] = top_k_words[i][2:12]
    else:
        bins, top_k_words = sort(labels, top_k,  words_index_intersect)
    return top_k_words, np.array(top_k)


def rerank(rerank, top_k_words, top_k, train_w_to_f_mult, train_w_to_f, tf_idf, tfdf):
    if rerank=="tf":
        top_k_words =  rank_freq(top_k_words, train_w_to_f_mult)
        #top_k_words =  rank_freq(top_k_words, train_w_to_f)
    elif rerank=="tfidf":
        top_k_words = rank_td_idf(top_k_words, tf_idf)

    elif rerank=="tfdf":
        top_k_words = rank_td_idf(top_k_words, tfdf)

    elif rerank=="graph":
        #doc_matrix = npmi.calc_coo_matrix(words_index_intersect, train_word_to_file)
        top_k_words = rank_centrality(top_k_words, top_k, train_w_to_f)
    return top_k_words






def sort(labels, indices, word_index):
    bins = {}
    index = 0
    top_k_bins = []
    for label in labels:
        if label not in bins:
            bins[label] = [word_index[index]]
        else:
            bins[label].append(word_index[index])
        index += 1;
    for i in range(0, len(indices)):
        ind = indices[i]
        top_k = []
        for word_ind in ind:
            top_k.append(word_index[word_ind])
        top_k_bins.append(top_k)
    return bins, top_k_bins

def print_bins(bins, name, type):
    f = open(name + "_" + type + "_corpus_bins.txt","w+")
    for i in range(0, 20):
        f.write("Bin " + str(i) + ":\n")
        for word in bins[i]:
            f.write(word + ", ")
        f.write("\n\n")

    f.close()

def print_top_k(top_k_bins, name, type):
    f = open(name + "_" + type + "_corpus_top_k.txt","w+")
    for i in range(0, 20):
        f.write("Bin " + str(i) + ":\n")
        top_k = top_k_bins[i]
        for word in top_k:
            f.write(word + ", ")
        f.write("\n\n")
    f.close()

def parse_args(nb_dict = None) :
    parser = argparse.ArgumentParser(__doc__)
    parser.add_argument("--entities", type=str, choices=["word2vec", "fasttext", "glove", "KG"])
    parser.add_argument( "--entities_file", type=str, help="entity file")
    parser.add_argument( "--elmomix", type=str, default="", help="elmomix coefficients, separated by ';', should sum to 1")

    parser.add_argument("--clustering_algo", type=str, required=True, choices=["KMeans", "SPKMeans", "GMM", "KMedoids","Agglo","DBSCAN","Spectral","VMFM",
        'from_file', 'LDA'])

    parser.add_argument( "--topics_file", type=str, help="topics file")

    parser.add_argument('--use_dims', type=int)
    parser.add_argument('--num_topics',  nargs='+', type=int, default=[20])
    parser.add_argument("--doc_info", type=str, choices=["SVD", "DUP", "WGT", "robust", \
    "logtfdf"])
    parser.add_argument("--rerank", type=str, choices=["tf", "tfidf", "tfdf", "graph"]) \

    parser.add_argument('--id2name', type=Path, help="id2name file")

    parser.add_argument("--dataset", type=str, default ="20NG", choices=["20NG", "children", "reuters"])

    parser.add_argument("--preprocess", type=int, default=5)
    
    parser.add_argument("--vocab", required=True,  type=str, nargs='+', default=[])
    parser.add_argument("--scale", type=str, required=False)
    
    args_dict = dict()
    if nb_dict is not None:
        args_dict = nb_dict
    else:
        args_dict = {
        "entities" : "fasttext",
        "clustering_algo": "KMeans",
        "use_dims": 2,
        "dataset": "20NG",
        "vocab": "dict/english.txt"
    }
    args_list = []
    for k, v in args_dict.items():
        args_list.append(f"--{k}")
        args_list.append(str(v))


    args = parser.parse_args(args_list)
    #print(args)
    return args




In [125]:
args_dict = {
    "entities" : "fasttext",
    "clustering_algo": "GMM",
    "use_dims": 2,
    "dataset": "20NG",
    "vocab": "dict/english.txt",
    "rerank": "tf",
    "doc_info": "WGT"
    }
args = parse_args(args_dict)

In [126]:
#args = parse_args()
stopwords = set(line.strip() for line in open('stopwords_en.txt'))
vocab = create_global_vocab(args.vocab)
train_word_to_file, train_w_to_f_mult, files = create_vocab_and_files(stopwords, args.dataset, args.preprocess, "train", vocab)
files_num = len(files)
tf_idf = get_tfidf_score(files, train_word_to_file)

Files:11314
Vocab: 11176




In [11]:
ft = None
if model is not None:
    ft = model.model
else:
    ft = fasttext.load_model("embeds/fasttext/wiki.en.bin")
ft

NameError: name 'model' is not defined

In [3]:
ft = fasttext.load_model("embeds/fasttext/wiki.en.bin")



In [127]:
intersection, words_index_intersect = create_entities_ft(ft, train_w_to_f_mult, args.doc_info)
intersection = PCA_dim_reduction(intersection, args.use_dims)
weights, tfdf = None, None
if args.doc_info == "WGT":
    weights = get_weights_tf(words_index_intersect, train_w_to_f_mult)

if args.doc_info == "robust":
    weights = get_rs_weights_tf(words_index_intersect, train_w_to_f_mult)

if args.doc_info == "tfdf":
    weights , tfdf = get_weights_tfdf(words_index_intersect, train_w_to_f_mult, files_num)

if weights is not None and args.scale == "sigmoid":
    print("scaling.. sigmoid")
    weights = 1 / (1 + np.exp(weights))


elif weights is not None and args.scale == "log":
    print("scaling.. log")
    weights = np.log(weights)

In [129]:
dev_word_to_file, dev_word_to_file_mult, dev_files = create_vocab_and_files(stopwords, args.dataset,args.preprocess, "train", vocab)
dev_files_num = len(dev_files)
test_word_to_file, test_word_to_file_mult, test_files = create_vocab_and_files(stopwords, args.dataset,args.preprocess, "test", vocab)
test_files_num = len(test_files)

Files:11314
Vocab: 11176
Files:7532
Vocab: 8622


In [130]:
topics_npmi = []
#tf_idf, tfdf = None, None
octis_cohere = Coherence(texts=[doc.split() for doc in test_corpus])
for topics in args.num_topics:
    npmis = []
    rand = 0
    global NSEEDS
    while rand < NSEEDS:
        top_k_words, top_k = cluster(
        args.clustering_algo, intersection, 
        words_index_intersect, topics, args.rerank, 
        weights, args.topics_file, rand)
        print((np.array(top_k_words).shape, top_k.shape))
        #print(top_k_words)
        print(top_k_words[0])
        print(top_k)
        top_k_words = rerank(args.rerank, top_k_words, top_k, train_w_to_f_mult, train_word_to_file, tf_idf, tfdf)
        val = average_npmi_topics(top_k_words, len(top_k_words), dev_word_to_file, dev_files_num)
        octis_coh = octis_cohere.score({'topics': top_k_words})
        print(f"octis coherence: {octis_coh}")
        if np.isnan(val):
            NSEEDS +=1
            rand += 1
            continue
        npmi_score = np.around(val, 5)
        print("NPMI:" + str(npmi_score))
        npmis.append(npmi_score)

        rand += 1
    topics_npmi.append(np.mean(npmis))
    print("NPMI Mean:" + str(np.around(topics_npmi[-1], 5)))
    print("NPMI Var:" + str(np.around(np.var(npmis), 5)))

((20, 100), (20, 100))
['light', 'feature', 'attached', 'stocks', 'booted', 'nuts', 'filling', 'exec', 'chewing', 'market', 'supplied', 'operations', 'photographic', 'flowing', 'commercial', 'cutting', 'rounded', 'freeze', 'squeezed', 'industrial', 'grips', 'airports', 'accessed', 'deployed', 'fingers', 'dice', 'terminator', 'tapping', 'sink', 'seed', 'clouds', 'checkers', 'locations', 'martian', 'foil', 'dump', 'drop', 'maneuver', 'tiny', 'vanilla', 'chains', 'approx', 'consists', 'firepower', 'handler', 'distributors', 'spark', 'seeds', 'halo', 'explosions', 'pile', 'astronauts', 'developer', 'video', 'astronautics', 'fahrenheit', 'tapes', 'squeeze', 'deposited', 'grain', 'engineered', 'jumps', 'trees', 'intercept', 'freezes', 'leak', 'consortium', 'prints', 'extra', 'small', 'explode', 'smart', 'companies', 'finger', 'washing', 'rebooted', 'foreground', 'swap', 'panel', 'notebook', 'kidney', 'sales', 'specialty', 'shallow', 'hooks', 'logo', 'worm', 'dribble', 'channel', 'drain', 'po

octis coherence: -0.33038367043209754
NPMI:0.18306
((20, 100), (20, 100))
['holiday', 'fell', 'riot', 'anton', 'college', 'legion', 'goodbye', 'ronald', 'cheer', 'vera', 'crowley', 'aged', 'gang', 'broke', 'jeffrey', 'signed', 'alexandria', 'dressed', 'herbert', 'luke', 'mccarthy', 'bombed', 'served', 'janet', 'legend', 'wallach', 'retirement', 'battle', 'legends', 'returning', 'arrives', 'miss', 'darius', 'devil', 'clinton', 'shortly', 'girl', 'stole', 'truelove', 'potter', 'resident', 'residence', 'tribune', 'queen', 'bury', 'calvin', 'adam', 'grew', 'birthday', 'commander', 'sunday', 'graves', 'singer', 'friday', 'islanders', 'michael', 'senior', 'warriors', 'roommate', 'village', 'buried', 'england', 'career', 'jude', 'bout', 'angel', 'hells', 'curt', 'anniversary', 'noah', 'palace', 'dawn', 'steven', 'nurse', 'levi', 'girls', 'parish', 'nickname', 'alan', 'webster', 'arriving', 'rally', 'dinner', 'joshua', 'trinity', 'arrival', 'wills', 'guests', 'named', 'papa', 'rushed', 'saint'

In [48]:
for topic in 5:
    print()

TypeError: 'int' object is not iterable