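Document clustering of 20 Newsgroups posts using TF-IDF features and k-means, based on the scikit-learn example at http://scikit-learn.org/stable/auto_examples/text/document_clustering.html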

In [122]:
from sklearn.datasets import fetch_20newsgroups
from sklearn import metrics
from sklearn.cluster import KMeans

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

import numpy as np

In [123]:
class DocumentClustering(object):
    """Cluster a collection of documents with TF-IDF features and k-means.

    The documents are vectorized once, at construction time, with a
    ``TfidfVectorizer`` that drops English stop words.  ``train`` fits a
    ``KMeans`` model with one cluster per distinct ground-truth label, and
    ``print_metrics`` scores the resulting assignment against those labels.

    Attributes:
        dataset: the documents passed to the constructor.
        labels: the ground-truth label of each document.
        labels_count: number of distinct values in ``labels``; used as the
            number of clusters.
        vectorizer: the ``TfidfVectorizer`` fitted on ``dataset``.
        X: the matrix returned by ``vectorizer.fit_transform(dataset)``.
        kmeans: the fitted ``KMeans`` model; only set once ``train`` has run.
    """

    def __init__(self, dataset, target):
        """Store the corpus and its labels, then build the TF-IDF matrix.

        Args:
            dataset: documents to cluster, in a form accepted by
                ``TfidfVectorizer.fit_transform``.
            target: ground-truth label for each document, in the same order
                as ``dataset``.
        """
        self.dataset = dataset
        self.labels = target
        # One cluster is requested per distinct ground-truth label.
        self.labels_count = np.unique(self.labels).shape[0]

        self.vectorizer = TfidfVectorizer(stop_words='english')

        self._vectorize_data()

    def _vectorize_data(self):
        """Fit the vectorizer on the corpus and keep the result in ``self.X``."""
        self.X = self.vectorizer.fit_transform(self.dataset)

    def train(self, verbose=False, random_state=None):
        """Fit k-means on the vectorized corpus and store it in ``self.kmeans``.

        Args:
            verbose: forwarded to ``KMeans``; truthy values enable its
                progress output.
            random_state: forwarded to ``KMeans``.  Leave as ``None`` for the
                library default, or pass an int for a repeatable clustering.
        """
        self.kmeans = KMeans(n_clusters=self.labels_count, verbose=verbose,
                             random_state=random_state)
        self.kmeans.fit(self.X)

    def print_metrics(self):
        """Print clustering quality scores for the model fitted by ``train``.

        Homogeneity, completeness, V-measure and the adjusted Rand index
        compare the cluster assignment with ``self.labels``; the silhouette
        coefficient is computed from ``self.X`` on a sample of 1000 points.

        Raises:
            AttributeError: if ``train`` has not been called yet, because
                ``self.kmeans`` does not exist.
        """
        truth = self.labels
        predicted = self.kmeans.labels_

        # Single-argument print(...) produces the same output on Python 2
        # and Python 3.
        print("Homogeneity: %0.3f" % metrics.homogeneity_score(truth, predicted))
        print("Completeness: %0.3f" % metrics.completeness_score(truth, predicted))
        print("V-measure: %0.3f" % metrics.v_measure_score(truth, predicted))
        # The ground truth must come from the instance, not from a
        # notebook-level variable that happens to be called `labels`.
        print("Adjusted Rand-Index: %.3f" % metrics.adjusted_rand_score(truth, predicted))
        print("Silhouette Coefficient: %0.3f"
              % metrics.silhouette_score(self.X, predicted, sample_size=1000))


In [124]:
# Newsgroups to load from the 20 Newsgroups corpus.
input_categories = [
    'talk.politics.misc',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
]

# Training split restricted to the categories above; the shuffle is seeded so
# the document order is the same on every run.
train_dataset = fetch_20newsgroups(
    subset='train',
    categories=input_categories,
    shuffle=True,
    random_state=42
)

In [129]:
# Build the clustering helper (the constructor vectorizes the documents), then
# fit k-means with its progress output enabled.
dc = DocumentClustering(dataset=train_dataset.data, target=train_dataset.target)
dc.train(verbose=True)

Initialization complete
Iteration  0, inertia 4312.431
Iteration  1, inertia 2192.360
Iteration  2, inertia 2186.371
Iteration  3, inertia 2182.036
Iteration  4, inertia 2179.941
Iteration  5, inertia 2177.540
Iteration  6, inertia 2174.825
Iteration  7, inertia 2172.900
Iteration  8, inertia 2171.067
Iteration  9, inertia 2169.904
Iteration 10, inertia 2169.409
Iteration 11, inertia 2168.967
Iteration 12, inertia 2168.684
Iteration 13, inertia 2168.518
Iteration 14, inertia 2168.081
Iteration 15, inertia 2166.293
Iteration 16, inertia 2165.028
Iteration 17, inertia 2164.953
Converged at iteration 17: center shift 0.000000e+00 within tolerance 2.674971e-09
Initialization complete
Iteration  0, inertia 4312.468
Iteration  1, inertia 2188.359
Iteration  2, inertia 2179.758
Iteration  3, inertia 2174.284
Iteration  4, inertia 2170.210
Iteration  5, inertia 2167.740
Iteration  6, inertia 2166.061
Iteration  7, inertia 2165.569
Iteration  8, inertia 2165.423
Iteration  9, inertia 2165.382
I

In [130]:
# Report how well the fitted clusters agree with the ground-truth newsgroup labels.
dc.print_metrics()

Homogeneity: 0.316
Completeness: 0.509
V-measure: 0.390
Adjusted Rand-Index: 0.183
Silhouette Coefficient: 0.007


In [131]:
# List the highest-weighted vocabulary terms of each cluster centroid.
# The model, the vocabulary and the cluster count are all read from `dc`, the
# object trained earlier in this notebook, so the term indices refer to the
# same feature columns the centroids were fitted on.
n_top_terms = 10

# For every centroid, feature indices sorted by descending weight.
order_centroids = dc.kmeans.cluster_centers_.argsort()[:, ::-1]

# Newer scikit-learn releases expose get_feature_names_out() and no longer
# provide get_feature_names(); use whichever this installation has.
if hasattr(dc.vectorizer, 'get_feature_names_out'):
    terms = dc.vectorizer.get_feature_names_out()
else:
    terms = dc.vectorizer.get_feature_names()

for i in range(dc.labels_count):
    # Single-argument print(...) behaves identically on Python 2 and 3.
    print("Cluster %d:" % i)
    for ind in order_centroids[i, :n_top_terms]:
        print(' %s' % terms[ind])
    print('')

Cluster 0:
 edu
 com
 cramer
 optilink
 people
 government
 article
 writes
 state
 clinton

Cluster 1:
 edu
 com
 subject
 lines
 organization
 graphics
 university
 posting
 host
 nntp

Cluster 2:
 god
 jesus
 edu
 christians
 christian
 people
 christ
 church
 bible
 faith

Cluster 3:
 pitt
 geb
 banks
 gordon
 cs
 edu
 cadre
 dsl
 shameful
 n3jxp

