In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from time import time
import numpy as np
from sklearn import metrics
import pandas as pd

In [2]:
# Newsgroups kept for the clustering experiment.
categories = [
    "comp.graphics",
    "rec.autos",
    "rec.sport.baseball",
    "sci.space",
    "talk.politics.mideast",
]

# Load every post (train and test subsets together) for those groups, with
# headers, footers and quoted replies stripped from each document.
dataset = fetch_20newsgroups(
    subset="all",
    categories=categories,
    remove=("headers", "footers", "quotes"),
)

# Ground-truth category ids, how many documents each category holds, and the
# number of distinct categories (used later as the number of clusters).
labels = dataset.target
unique_labels, category_sizes = np.unique(labels, return_counts=True)
true_k = len(unique_labels)

In [3]:
# NOTE(review): len() on the object returned by fetch_20newsgroups counts its
# fields (5 here), not the documents; the corpus size is len(dataset.data).
len(dataset)

5

In [4]:
# Preview only the first document: displaying `dataset.data` in full dumps
# every post in the corpus into the notebook output.
dataset.data[:1]

["G'day all,\n\nCan anybody point me at a utility which will read/convert/crop/whatnot/\ndisplay HDF image files ? I've had a look at the HDF stuff under NCSA \nand it must take an award for odd directory structure, strange storage\napproaches and minimalist documentation :-)\n\nPart of the problem is that I want to look at large (5MB+) HDF files and\ncrop out a section. Ideally I would like a hdftoppm type of utility, from\nwhich I can then use the PBMplus stuff quite merrily. I can convert the cropped\npart into another format for viewing/animation.\n\nOtherwise, can someone please explain how to set up the NCSA Visualisation S/W\nfor HDF (3.2.r5 or 3.3beta) and do the above cropping/etc. This is for\nSuns with SunOS 4.1.2.\n\nAny help GREATLY appreciated. Ta muchly !\n\nCheers,\n\tMarkus\n\n-- \nMarkus Buchhorn, Parallel Computing Research Facility\nemail = markus@octavia.anu.edu.au\nAustralian National University, Canberra, 0200 , Australia.\n[International = +61 6, Australia = 06]

In [5]:
# Turn the raw posts into TF-IDF features, dropping English stop words.
vectorizer = TfidfVectorizer(stop_words = "english")
# Learn the vocabulary and build the document-term matrix in one pass.
X_tfidf = vectorizer.fit_transform(dataset.data)

In [6]:
# Rows of the TF-IDF matrix are documents, columns are vocabulary terms.
n_samples, n_features = X_tfidf.shape
print(f"n_samples: {n_samples}, n_features: {n_features}")

n_samples: 4884, n_features: 41105


In [7]:
# Check the container type of the TF-IDF matrix (a SciPy sparse CSR matrix).
type(X_tfidf)

scipy.sparse.csr.csr_matrix

In [8]:
# Number of distinct ground-truth categories, derived from dataset.target.
true_k

5

In [9]:
from collections import defaultdict
from sklearn.cluster import KMeans

# Metrics that compare predicted cluster ids against the true labels.
# Insertion order matters: the summary cells read the scores positionally.
label_metrics = {
    "Homogeneity": metrics.homogeneity_score,
    "Completeness": metrics.completeness_score,
    "V-measure": metrics.v_measure_score,
    "Adjusted Rand-Index": metrics.adjusted_rand_score,
}

scores = defaultdict(list)   # metric name -> one rounded score per seed
train_times = []             # wall-clock fit time per seed, in seconds

# Fit KMeans once per seed (a single initialisation each) and score every run.
for seed in range(5):
    started = time()
    kmeans = KMeans(n_clusters=true_k, max_iter=100, n_init=1, random_state=seed)
    kmeans.fit(X_tfidf)
    train_times.append(round(time() - started, 5))

    predicted = kmeans.labels_
    cluster_ids, cluster_sizes = np.unique(predicted, return_counts=True)
    print(f"Number of elements in each cluster: {cluster_sizes}")

    for metric_name, metric_fn in label_metrics.items():
        scores[metric_name].append(round(metric_fn(labels, predicted), 6))
    # The silhouette is computed from the features, not from the true labels.
    scores["Silhouette Coefficient"].append(
        round(metrics.silhouette_score(X_tfidf, predicted), 6)
    )

Number of elements in each cluster: [   1    1    1    1 4880]
Number of elements in each cluster: [ 555  967  381 2569  412]
Number of elements in each cluster: [ 557  514  686  575 2552]
Number of elements in each cluster: [ 524  743 2849  256  512]
Number of elements in each cluster: [2764  948  167  700  305]


In [10]:
# Mean and standard deviation of every KMeans metric across the seeded runs.
# Both lists follow the insertion order of `scores`.
average_kmeans, std_kmeans = [], []
for metric_name, run_values in scores.items():
    mean_score = np.mean(run_values)
    std_score = np.std(run_values)
    average_kmeans.append(mean_score)
    std_kmeans.append(std_score)

    print(metric_name, ':', run_values)
    print("Average: ", mean_score, "\nStandard Deviation: ", std_score)
    print()

print(f"True number of documents in each category according to the class labels: {category_sizes}")

Homogeneity : [0.000821, 0.336157, 0.462703, 0.361698, 0.323107]
Average:  0.29689719999999997 
Standard Deviation:  0.15593595397649637

Completeness : [0.169926, 0.411905, 0.551026, 0.472694, 0.430627]
Average:  0.40723560000000003 
Standard Deviation:  0.12794161825238887

V-measure : [0.001634, 0.370196, 0.503017, 0.409813, 0.369198]
Average:  0.3307716 
Standard Deviation:  0.1716266016683894

Adjusted Rand-Index : [3e-06, 0.193589, 0.265815, 0.210745, 0.173232]
Average:  0.1686768 
Standard Deviation:  0.08977503184382615

Silhouette Coefficient : [0.004769, 0.007182, 0.007531, 0.008847, 0.008795]
Average:  0.0074248000000000005 
Standard Deviation:  0.0014848302798636621

True number of documents in each category according to the class labels: [973 990 994 987 940]


In [11]:
# Empty summary tables, one column per clustering metric; rows are added later.
metric_columns = [
    'Homogeneity',
    'Completeness',
    'V-measure',
    'Adjusted Rand-Index',
    'Silhouette Coefficient',
]
average_df = pd.DataFrame(columns=metric_columns)
std_df = pd.DataFrame(columns=metric_columns)


In [12]:
# Summarise the per-seed KMeans fit times (seconds).
kmeans_times = np.asarray(train_times)
average_times = kmeans_times.mean()
std_times = kmeans_times.std()

print("Average time - Kmeans:", average_times, "\nStandard dev. time - Kmeans:", std_times)

Average time - Kmeans: 0.10494200000000001 
Standard dev. time - Kmeans: 0.04300354934188572


In [13]:
from sklearn.cluster import SpectralClustering

scores_spectral = defaultdict(list)   # metric name -> one rounded score per seed
times_spectral = []                   # wall-clock fit time per seed, in seconds

# Fit spectral clustering once per seed and score every run.
for seed in range(5):
    t0 = time()
    sc = SpectralClustering(n_clusters=true_k, n_init=1, random_state=seed)
    sc.fit(X_tfidf)
    times_spectral.append(round(time() - t0, 5))

    assignments = sc.labels_
    cluster_ids, cluster_sizes = np.unique(assignments, return_counts=True)
    print(f"Number of elements in each cluster: {cluster_sizes}")

    # Order matters: the summary cells read the scores positionally.
    run_scores = (
        ("Homogeneity", metrics.homogeneity_score(labels, assignments)),
        ("Completeness", metrics.completeness_score(labels, assignments)),
        ("V-measure", metrics.v_measure_score(labels, assignments)),
        ("Adjusted Rand-Index", metrics.adjusted_rand_score(labels, assignments)),
        ("Silhouette Coefficient", metrics.silhouette_score(X_tfidf, assignments)),
    )
    for metric_name, metric_value in run_scores:
        scores_spectral[metric_name].append(round(metric_value, 6))
    

Number of elements in each cluster: [ 602 1098 2777  131  276]
Number of elements in each cluster: [1833  279  455  602 1715]
Number of elements in each cluster: [2777 1095  276  131  605]
Number of elements in each cluster: [ 276  611  131 2780 1086]
Number of elements in each cluster: [ 276 2780  131 1086  611]


In [14]:
# Mean and standard deviation of every spectral-clustering metric across runs.
# Both lists follow the insertion order of `scores_spectral`.
average_spectral, std_spectral = [], []
for metric_name, run_values in scores_spectral.items():
    mean_score = np.mean(run_values)
    std_score = np.std(run_values)
    average_spectral.append(mean_score)
    std_spectral.append(std_score)

    print(metric_name, ':', run_values)
    print("Average: ", mean_score, "\nStandard Deviation: ", std_score)
    print()

print(f"True number of documents in each category according to the class labels: {category_sizes}")

Homogeneity : [0.220716, 0.20865, 0.220846, 0.221148, 0.221148]
Average:  0.21850160000000002 
Standard Deviation:  0.0049287050469672055

Completeness : [0.302534, 0.243667, 0.302618, 0.302995, 0.302995]
Average:  0.2909618 
Standard Deviation:  0.0236481573184889

V-measure : [0.255228, 0.224803, 0.255345, 0.255681, 0.255681]
Average:  0.24934760000000003 
Standard Deviation:  0.012273623843022073

Adjusted Rand-Index : [0.095517, 0.107714, 0.095629, 0.095958, 0.095958]
Average:  0.0981552 
Standard Deviation:  0.0047826314681355085

Silhouette Coefficient : [0.008316, -7.4e-05, 0.008315, 0.008326, 0.008326]
Average:  0.006641800000000001 
Standard Deviation:  0.0033579032981907032

True number of documents in each category according to the class labels: [973 990 994 987 940]


In [15]:
# Summarise the per-seed spectral-clustering fit times (seconds).
spectral_times = np.asarray(times_spectral)
average_times = spectral_times.mean()
std_times = spectral_times.std()

print("Average time - Spectral:", average_times, "\nStandard dev. time - Spectral:", std_times)

Average time - Spectral: 2.5657500000000004 
Standard dev. time - Spectral: 0.1264488068745609


In [16]:
from sklearn.cluster import AgglomerativeClustering

scores_agglo = defaultdict(list)   # metric name -> single-run score
times_agglo = []                   # wall-clock fit time, in seconds

t0 = time()

# The estimator is fed a dense copy of the TF-IDF matrix. Use .toarray(), which
# returns a plain ndarray; .todense() returns np.matrix, which recent
# scikit-learn releases reject as estimator input.
# NOTE: the dense copy is large (4884 x 41105; roughly 1.6 GB if float64).
ac = AgglomerativeClustering(n_clusters=true_k).fit(X_tfidf.toarray())
# As before, the measured time includes the sparse-to-dense conversion.
times_agglo.append(round(time() - t0, 5))
cluster_ids, cluster_sizes = np.unique(ac.labels_, return_counts=True)

print(f"Number of elements in each cluster: {cluster_sizes}")

# No random_state is passed, so the fit is run and scored only once.
scores_agglo["Homogeneity"].append(round(metrics.homogeneity_score(labels, ac.labels_),6))
scores_agglo["Completeness"].append(round(metrics.completeness_score(labels, ac.labels_),6))
scores_agglo["V-measure"].append(round(metrics.v_measure_score(labels, ac.labels_),6))
scores_agglo["Adjusted Rand-Index"].append(round(metrics.adjusted_rand_score(labels, ac.labels_),6))
# The silhouette is computed from the (sparse) features, not from the true labels.
scores_agglo["Silhouette Coefficient"].append(round(metrics.silhouette_score(X_tfidf, ac.labels_),6))



Number of elements in each cluster: [3358  166  607  622  131]


In [17]:
# Mean and standard deviation of every agglomerative-clustering metric.
# Each metric holds a single run, so every standard deviation is 0.
average_agglo, std_agglo = [], []
for metric_name, run_values in scores_agglo.items():
    mean_score = np.mean(run_values)
    std_score = np.std(run_values)
    average_agglo.append(mean_score)
    std_agglo.append(std_score)

    print(metric_name, ':', run_values)
    print("Average: ", mean_score, "\nStandard Deviation: ", std_score)
    print()

print(f"True number of documents in each category according to the class labels: {category_sizes}")

Homogeneity : [0.289742]
Average:  0.289742 
Standard Deviation:  0.0

Completeness : [0.470415]
Average:  0.470415 
Standard Deviation:  0.0

V-measure : [0.358607]
Average:  0.358607 
Standard Deviation:  0.0

Adjusted Rand-Index : [0.139406]
Average:  0.139406 
Standard Deviation:  0.0

Silhouette Coefficient : [0.009655]
Average:  0.009655 
Standard Deviation:  0.0

True number of documents in each category according to the class labels: [973 990 994 987 940]


In [18]:
# Build both summary tables in one step instead of clearing previously created
# frames and appending rows one at a time. The cell is idempotent and does not
# depend on frames created in an earlier cell.
# Row order: 0 = KMeans, 1 = SpectralClustering, 2 = AgglomerativeClustering.
# Column order matches the order in which the metrics were recorded per run.
metric_columns = ['Homogeneity', 'Completeness', 'V-measure', 'Adjusted Rand-Index',
                  'Silhouette Coefficient']
average_df = pd.DataFrame(
    [average_kmeans, average_spectral, average_agglo],
    columns=metric_columns,
)
std_df = pd.DataFrame(
    [std_kmeans, std_spectral, std_agglo],
    columns=metric_columns,
)
average_df

Unnamed: 0,Homogeneity,Completeness,V-measure,Adjusted Rand-Index,Silhouette Coefficient
0,0.296897,0.407236,0.330772,0.168677,0.007425
1,0.218502,0.290962,0.249348,0.098155,0.006642
2,0.289742,0.470415,0.358607,0.139406,0.009655


In [19]:
# Per-metric standard deviation across runs; rows are in the same order as
# average_df (KMeans, Spectral, Agglomerative). The agglomerative row is all
# zeros because that algorithm was run only once.
std_df

Unnamed: 0,Homogeneity,Completeness,V-measure,Adjusted Rand-Index,Silhouette Coefficient
0,0.155936,0.127942,0.171627,0.089775,0.001485
1,0.004929,0.023648,0.012274,0.004783,0.003358
2,0.0,0.0,0.0,0.0,0.0
