In [3]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.metrics.cluster import normalized_mutual_info_score, adjusted_rand_score
from sentence_transformers import SentenceTransformer
import numpy as np
import prince
import pandas as pd
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

'''
Variables: 
---------

corpus : list of documents
embeddings : documents embeddings of size NxM (N : number of documents, M : embedding dimension) 
red_emd : reduced embeddings matrix using dimentionality reduction
k : number of clusters
labels : documents labels
pred : list of clustering predicted clusters 

''';

In [4]:
def dim_red(mat, p):
    '''
    Perform dimensionality reduction

    Input:
    -----
        mat : NxM list 
        p : number of dimensions to keep 
    Output:
    ------
        red_mat : NxP list such that p<<m
    ''' 
    # acp = prince.PCA(n_components=p)
    # red_mat = acp.fit(mat, supplementary_columns=['rank', 'points'])
    # red_mat.eigenvalues_summary
    # red_mat.transform(mat)
    
    mat=pd.DataFrame(mat)
    acp = prince.PCA(n_components=p)
    red_mat = acp.fit_transform(mat)
    
    return red_mat

In [5]:
def clust(mat, k):
    '''
    Perform clustering

    Input:
    -----
        mat : input list 
        k : number of cluster
    Output:
    ------
        pred : list of predicted labels
    '''
    
 
    kmeans = KMeans(n_clusters=k)
    kmeans.fit(mat)
    pred = kmeans.predict(mat)
    
    return pred

In [10]:
def cross_validation(mat, k, num_iterations):
    '''
    Execute clustering function multiple times with different initializations

    Input:
    -----
        mat : input list 
        k : number of clusters
        num_iterations : number of times to run the clustering function
    Output:
    ------
        avg_nmi : average normalized mutual info score
        avg_ari : average adjusted rand score
        std_nmi : standard deviation of NMI scores
        std_ari : standard deviation of ARI scores
    '''

    nmis = []
    aris = []

    for _ in range(num_iterations):
        pred = clust(mat, k)
        nmi_score = normalized_mutual_info_score(pred, labels)
        ari_score = adjusted_rand_score(pred, labels)

        nmis.append(nmi_score)
        aris.append(ari_score)

    avg_nmi = np.mean(nmis)
    avg_ari = np.mean(aris)
    std_nmi = np.std(nmis)
    std_ari = np.std(aris)
    print(f'NMI: {avg_nmi:.2f} \nARI: {avg_ari:.2f} \nSTD_NMI: {std_nmi} \n \nSTD_ARI: std_ari')
    


In [11]:
# import data
ng20 = fetch_20newsgroups(subset='test')
corpus = ng20.data[:2000]
labels = ng20.target[:2000]
k = len(set(labels))

# embedding
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
embeddings = model.encode(corpus)

# perform dimentionality reduction
red_emb = dim_red(embeddings, 2)


# perform clustering
pred = clust(red_emb, k)
# evaluate clustering results
nmi_score = normalized_mutual_info_score(pred,labels)
ari_score = adjusted_rand_score(pred,labels)
pred2 = cross_validation(red_emb, k, 100)

print(f'NMI: {nmi_score:.2f} \nARI: {ari_score:.2f}')



  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super().

Avg_NMI: 0.30 
Avg_ARI: 0.10 
STD_NMI: 0.00 
 
STD_ARI: 0.00
NMI: 0.30 
ARI: 0.10


  super()._check_params_vs_input(X, default_n_init=10)
