In [2]:
from sklearn.datasets import fetch_20newsgroups
# The eight newsgroups under study: four comp.* groups followed by
# four rec.* groups.
categories = [
    'comp.sys.ibm.pc.hardware',
    'comp.graphics',
    'comp.sys.mac.hardware',
    'comp.os.ms-windows.misc',
    'rec.autos',
    'rec.motorcycles',
    'rec.sport.baseball',
    'rec.sport.hockey',
]

# Fetch only the selected groups, shuffled with a fixed seed so the document
# order is the same on every run.
dataset = fetch_20newsgroups(
    subset='all',
    categories=categories,
    shuffle=True,
    random_state=42,
)

# Report corpus size and number of categories, then a blank separator line.
for template, count in (("%d documents", len(dataset.data)),
                        ("%d categories", len(dataset.target_names))):
    print(template % count)
print()

7882 documents
8 categories



In [3]:
# Build the TF-IDF document-term matrix straight from the raw posts.
# (TfidfVectorizer is equivalent to CountVectorizer followed by TfidfTransformer.)
from sklearn.feature_extraction.text import TfidfVectorizer
# English stop words are removed and terms seen in fewer than 3 documents
# are dropped from the vocabulary.
vect = TfidfVectorizer(
    stop_words='english',
    min_df=3,
)
X_vect_tfidf = vect.fit_transform(dataset.data)

In [4]:
# ----------------------------------
# Question 1
# Report the dimensions of the TF-IDF matrix: (documents, vocabulary terms)
# ----------------------------------
tfidf_shape = X_vect_tfidf.shape
print(tfidf_shape)

(7882, 27768)


In [20]:
import numpy as np
# Ground-truth class id of every document, and how many distinct classes exist.
labels = dataset.target
unique_labels = np.unique(labels)
true_k = len(unique_labels)
print("true k: %d" % true_k)
print('unique labels:')
print(unique_labels)


true k: 8
unique labels:
[0 1 2 3 4 5 6 7]


In [14]:
from sklearn.cluster import KMeans
from time import time

# Fit a 2-cluster k-means model on the TF-IDF matrix and report wall-clock time.
# With 30 restarts of up to 1000 iterations each this cell is slow to run.
t0 = time()
km = KMeans(
    n_clusters=2,
    n_init=30,
    max_iter=1000,
    random_state=0,
).fit(X_vect_tfidf)  # fit() returns the fitted estimator itself
elapsed = time() - t0
print("done in %fs" % elapsed)

done in 322.664610s


In [21]:
# Quick look at the ground truth next to the fitted model.
print('unique labels:')
for summary in (
    np.unique(labels),           # distinct ground-truth class ids
    km.cluster_centers_.shape,   # (clusters, TF-IDF features)
    np.unique(km.labels_),       # distinct cluster ids actually assigned
    km.inertia_,                 # k-means objective value of the fitted model
):
    print(summary)

unique labels:
[0 1 2 3 4 5 6 7]
(2, 27768)
[0 1]
7733.995449994944


In [16]:
from sklearn import metrics
# Score the k-means assignment against the ground-truth newsgroup labels.
print("Homogeneity:            %0.3f" % metrics.homogeneity_score(labels, km.labels_))
print("Completeness:           %0.3f" % metrics.completeness_score(labels, km.labels_))
print("V-measure:              %0.3f" % metrics.v_measure_score(labels, km.labels_))
print("Adjusted Rand-Index:    %.3f"
      % metrics.adjusted_rand_score(labels, km.labels_))
# The silhouette is estimated on a random 1000-document subsample. Without a
# fixed random_state the subsample (and therefore the printed value) changes
# on every run, so pin the seed to keep the reported figure reproducible.
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(X_vect_tfidf, km.labels_,
                                 sample_size=1000, random_state=42))

Homogeneity:            0.203
Completeness:           0.803
V-measure:              0.324
Adjusted Rand-Index:    0.113
Silhouette Coefficient: 0.003


In [106]:
# ----------------------------------
# Question 2
# Report Contingency Table, A
# ----------------------------------
# A[i][j] = number of documents whose ground-truth class is the i-th distinct
# label and whose k-means cluster is the j-th distinct cluster id.
ground_truth = np.array(labels)
cluster_results = np.array(km.labels_)

# Map every class / cluster id to its rank among the sorted distinct values,
# so the table stays correct even when the ids are not exactly 0..n-1.
class_ids, class_idx = np.unique(ground_truth, return_inverse=True)
cluster_ids, cluster_idx = np.unique(cluster_results, return_inverse=True)
ni = len(class_ids)
nj = len(cluster_ids)

A = np.zeros((ni, nj), dtype=int)
# Unbuffered add: each document contributes 1 to its (class, cluster) cell,
# including when the same cell is hit many times.
np.add.at(A, (class_idx, cluster_idx), 1)

# Sanity check: every document lands in exactly one cell of the table.
assert A.sum() == ground_truth.size
print(A)

[[  1 972]
 [  0 985]
 [  3 979]
 [  0 963]
 [  1 989]
 [  1 995]
 [794 200]
 [922  77]]


In [80]:
# Scratch check of combining boolean masks: True only at positions where the
# two arrays agree AND the first array holds the value 1.
array1 = np.array([0, 1, 1, 1, 0, 1])
array2 = np.array([0, 1, 2, 1, 3, 4])

print(np.logical_and(array1 == array2, array1 == 1))


[False  True False  True False False]
