In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np

In [2]:
from time import time
from sklearn.datasets import load_files

print("loading documents ...")
t = time()
docs = load_files('datasets/clustering/data')
print("summary: {0} documents in {1} categories.".format(
    len(docs.data), len(docs.target_names)))
print("done in {0} seconds".format(time() - t))

loading documents ...
summary: 3949 documents in 4 categories.
done in 0.05391120910644531 seconds


In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

max_features = 20000
print("vectorizing documents ...")
t = time()
vectorizer = TfidfVectorizer(max_df=0.4, 
                             min_df=2, 
                             max_features=max_features, 
                             encoding='latin-1')
X = vectorizer.fit_transform((d for d in docs.data))
print("n_samples: %d, n_features: %d" % X.shape)
print("number of non-zero features in sample [{0}]: {1}".format(
    docs.filenames[0], X[0].getnnz()))
print("done in {0} seconds".format(time() - t))

vectorizing documents ...
n_samples: 3949, n_features: 20000
number of non-zero features in sample [datasets/clustering/data/sci.electronics/11902-54322]: 56
done in 0.40638208389282227 seconds


In [4]:
from sklearn.cluster import KMeans

print("clustering documents ...")
t = time()
n_clusters = 4
kmean = KMeans(n_clusters=n_clusters, 
               max_iter=100,
               tol=0.01,
               verbose=1,
               n_init=3)
kmean.fit(X);
print("kmean: k={}, cost={}".format(n_clusters, int(kmean.inertia_)))
print("done in {0} seconds".format(time() - t))

clustering documents ...
Initialization complete
Iteration 0, inertia 7565.864829378881
Iteration 1, inertia 3844.843426433031
Iteration 2, inertia 3833.643792836914
Iteration 3, inertia 3830.542968974707
Iteration 4, inertia 3827.881802393392
Iteration 5, inertia 3825.824980044752
Iteration 6, inertia 3823.818483879976
Iteration 7, inertia 3822.4141829397886
Iteration 8, inertia 3820.4456894787654
Iteration 9, inertia 3818.9989605774335
Iteration 10, inertia 3818.226372617699
Iteration 11, inertia 3817.468294194249
Iteration 12, inertia 3816.948196405341
Iteration 13, inertia 3816.607441895567
Iteration 14, inertia 3816.359426640883
Iteration 15, inertia 3816.33544836309
Iteration 16, inertia 3816.3140336019082
Iteration 17, inertia 3816.306392849937
Iteration 18, inertia 3816.3026140311795
Iteration 19, inertia 3816.3007895598926
Converged at iteration 19: strict convergence.
Initialization complete
Iteration 0, inertia 7470.665399825355
Iteration 1, inertia 3851.055619064295
Iterati

KMeans(max_iter=100, n_clusters=4, n_init=3, tol=0.01, verbose=1)

kmean: k=4, cost=3814
done in 0.462360143661499 seconds


In [5]:
len(kmean.labels_)

3949

In [6]:
kmean.labels_[1000:1010]

array([3, 3, 3, 1, 0, 1, 2, 3, 1, 1], dtype=int32)

In [7]:
docs.filenames[1000:1010]

array(['datasets/clustering/data/sci.crypt/10888-15289',
       'datasets/clustering/data/sci.crypt/11490-15880',
       'datasets/clustering/data/sci.crypt/11270-15346',
       'datasets/clustering/data/sci.electronics/12383-53525',
       'datasets/clustering/data/sci.space/13826-60862',
       'datasets/clustering/data/sci.electronics/11631-54106',
       'datasets/clustering/data/sci.space/14235-61437',
       'datasets/clustering/data/sci.crypt/11508-15928',
       'datasets/clustering/data/sci.space/13593-60824',
       'datasets/clustering/data/sci.electronics/12304-52801'],
      dtype='<U52')

In [8]:
from __future__ import print_function

print("Top terms per cluster:")

order_centroids = kmean.cluster_centers_.argsort()[:, ::-1]

terms = vectorizer.get_feature_names_out()
for i in range(n_clusters):
    print("Cluster %d:" % i, end='')
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind], end='')
    print()

Top terms per cluster:
Cluster 0: henry toronto zoo spencer zoology utzoo work kipling svr3 svr4
Cluster 1: my any me by your know some do so has
Cluster 2: space nasa pat moon shuttle gov we digex mission access
Cluster 3: key clipper chip encryption government keys will escrow we nsa


In [9]:
a = np.array([[20, 10, 30, 40], [100, 300, 200, 400], [1, 5, 3, 2]])
a.argsort()[:, ::-1]

array([[3, 2, 0, 1],
       [3, 1, 2, 0],
       [1, 2, 3, 0]])

In [10]:
a = np.array([10, 30, 20, 40])
a.argsort()[::-1]

array([3, 1, 2, 0])

In [11]:
from sklearn import metrics

label_true = np.random.randint(1, 4, 6)
label_pred = np.random.randint(1, 4, 6)
print("Adjusted Rand-Index for random sample: %.3f"
      % metrics.adjusted_rand_score(label_true, label_pred))
label_true = [1, 1, 3, 3, 2, 2]
label_pred = [3, 3, 2, 2, 1, 1]
print("Adjusted Rand-Index for same structure sample: %.3f"
      % metrics.adjusted_rand_score(label_true, label_pred))

Adjusted Rand-Index for random sample: 0.318
Adjusted Rand-Index for same structure sample: 1.000


In [12]:
from sklearn import metrics

label_true = [1, 1, 2, 2]
label_pred = [2, 2, 1, 1]
print("Homogeneity score for same structure sample: %.3f"
      % metrics.homogeneity_score(label_true, label_pred))
label_true = [1, 1, 2, 2]
label_pred = [0, 1, 2, 3]
print("Homogeneity score for each cluster come from only one class: %.3f"
      % metrics.homogeneity_score(label_true, label_pred))
label_true = [1, 1, 2, 2]
label_pred = [1, 2, 1, 2]
print("Homogeneity score for each cluster come from two class: %.3f"
      % metrics.homogeneity_score(label_true, label_pred))
label_true = np.random.randint(1, 4, 6)
label_pred = np.random.randint(1, 4, 6)
print("Homogeneity score for random sample: %.3f"
      % metrics.homogeneity_score(label_true, label_pred))

Homogeneity score for same structure sample: 1.000
Homogeneity score for each cluster come from only one class: 1.000
Homogeneity score for each cluster come from two class: 0.000
Homogeneity score for random sample: 0.685


In [13]:
from sklearn import metrics

label_true = [1, 1, 2, 2]
label_pred = [2, 2, 1, 1]
print("Completeness score for same structure sample: %.3f"
      % metrics.completeness_score(label_true, label_pred))
label_true = [0, 1, 2, 3]
label_pred = [1, 1, 2, 2]
print("Completeness score for each class assign to only one cluster: %.3f"
      % metrics.completeness_score(label_true, label_pred))
label_true = [1, 1, 2, 2]
label_pred = [1, 2, 1, 2]
print("Completeness score for each class assign to two class: %.3f"
      % metrics.completeness_score(label_true, label_pred))
label_true = np.random.randint(1, 4, 6)
label_pred = np.random.randint(1, 4, 6)
print("Completeness score for random sample: %.3f"
      % metrics.completeness_score(label_true, label_pred))

Completeness score for same structure sample: 1.000
Completeness score for each class assign to only one cluster: 1.000
Completeness score for each class assign to two class: 0.000
Completeness score for random sample: 0.315


In [14]:
from sklearn import metrics

label_true = [1, 1, 2, 2]
label_pred = [2, 2, 1, 1]
print("V-measure score for same structure sample: %.3f"
      % metrics.v_measure_score(label_true, label_pred))
label_true = [0, 1, 2, 3]
label_pred = [1, 1, 2, 2]
print("V-measure score for each class assign to only one cluster: %.3f"
      % metrics.v_measure_score(label_true, label_pred))
print("V-measure score for each class assign to only one cluster: %.3f"
      % metrics.v_measure_score(label_pred, label_true))
label_true = [1, 1, 2, 2]
label_pred = [1, 2, 1, 2]
print("V-measure score for each class assign to two class: %.3f"
      % metrics.v_measure_score(label_true, label_pred))

V-measure score for same structure sample: 1.000
V-measure score for each class assign to only one cluster: 0.667
V-measure score for each class assign to only one cluster: 0.667
V-measure score for each class assign to two class: 0.000


In [15]:
from sklearn import metrics

labels = docs.target
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, kmean.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(labels, kmean.labels_))
print("V-measure: %0.3f" % metrics.v_measure_score(labels, kmean.labels_))
print("Adjusted Rand-Index: %.3f"
      % metrics.adjusted_rand_score(labels, kmean.labels_))
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(X, kmean.labels_, sample_size=1000))

Homogeneity: 0.404
Completeness: 0.583
V-measure: 0.477
Adjusted Rand-Index: 0.279
Silhouette Coefficient: 0.005
