In [1]:
#The 20 Newsgroups data set is a collection of approximately 20,000 newsgroup documents,
#partitioned (nearly) evenly across 20 different newsgroups. To the best of our knowledge,
#it was originally collected by Ken Lang, probably for his paper 
#“Newsweeder: Learning to filter netnews,” though he does not explicitly mention this
#collection.
#The 20 newsgroups collection has become a popular data set for experiments
#in text applications of machine learning techniques, such as text classification 
#and text clustering.

In [2]:
# selecting labels 
categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']

In [3]:
import sklearn
from sklearn.datasets import fetch_20newsgroups

dataset = sklearn.datasets.load_files("../text_analytics/data/twenty_newsgroups/20news-bydate-train/", categories = categories, load_content=True, shuffle=True, encoding='UTF-8', decode_error='replace', random_state=0)

In [4]:
# You can explore the data a bit
print dataset.target_names
print("\n".join(dataset.data[0].split("\n")[:3]))
print(dataset.target_names[dataset.target[0]])


['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']
From: dpc47852@uxa.cso.uiuc.edu (Daniel Paul Checkman)
Subject: Re: Is MSG sensitivity superstition?
Article-I.D.: news.C5wI4F.Dt
sci.med


In [5]:
# Tokenisation
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer(stop_words="english")
X_train_counts = count_vect.fit_transform(dataset.data)


In [6]:
from sklearn.feature_extraction.text import TfidfTransformer
# Feature representation
# TF
tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)

#TF_IDF
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)


In [7]:
import numpy as np

dataset_test = sklearn.datasets.load_files("../text_analytics/data/twenty_newsgroups/20news-bydate-test/",  categories = categories, load_content=True, shuffle=True, encoding='UTF-8', decode_error='replace', random_state=42)

X_test_counts = count_vect.transform(dataset_test.data)
X_test = tfidf_transformer.transform(X_test_counts)

labels = dataset_test.target
true_k = np.unique(labels).shape[0]

In [8]:
from sklearn.cluster import KMeans
km = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)
km.fit(X_test)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100,
    n_clusters=4, n_init=1, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [9]:
from sklearn import metrics
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
print("Adjusted Rand-Index: %.3f"
      % metrics.adjusted_rand_score(labels, km.labels_))
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(X_test, km.labels_, sample_size=1000))


Homogeneity: 0.330
Completeness: 0.419
V-measure: 0.369
Adjusted Rand-Index: 0.213
Silhouette Coefficient: 0.004


In [10]:
print("Top terms per cluster:")

order_centroids = km.cluster_centers_.argsort()[:, ::-1]

terms = count_vect.get_feature_names()
for i in range(true_k):
    print("Cluster %d:" % i)
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind])


Top terms per cluster:
Cluster 0:
 gif
 file
 image
 format
 graphics
 bit
 ac
 files
 images
 24
Cluster 1:
 god
 sin
 mary
 love
 jesus
 com
 hate
 creps
 baptism
 parents
Cluster 2:
 homosexuality
 god
 edu
 church
 christian
 religion
 christianity
 people
 homosexual
 com
Cluster 3:
 edu
 com
 subject
 lines
 organization
 university
 posting
 nntp
 host
 writes


In [11]:
# The performance is pretty bad...
# What happens if you limit the feature space? e.g. Exclude stop words?

# What happens if you include all categories?
# Which categories are most easy to cluster?
# What happens if we don't know the number of clusters? (check out Hierarchical clustering)

In [12]:
# Can you cluster based in similarity? (Cosine distance)
# What's a nice way to visualise this data?

In [13]:
#set up colors per clusters using a dict
top_terms_matrix = []
for x in X_test:
    xc = x.tocsr()
    xs = zip(xc.indices,xc.data)
    top_terms = [ a for a,b in sorted(xs, key = lambda x: xs[1])[0:4]]
    top_terms_matrix.append( top_terms )
    #print 'next'


In [24]:
top_terms = []
features = count_vect.get_feature_names()
for doc in top_terms_matrix:
        row = []
        for word in doc:
            row.append(features[word])
        top_terms.append(row)
        
for i in range(true_k):
    print("Cluster %d words:" % i)
    for ind in order_centroids[i, :4]: #replace 6 with n words per cluster
        print terms[ind]
    print("Cluster %d documents:" % i)
    for ind,doc in enumerate(top_terms):
        if km.labels_[ind] == i:
            print doc

            


Cluster 0 words:
gif
file
image
format
Cluster 0 documents:
[u'zip', u'wustl', u'wuarchive', u'wrote']
[u'won', u'version', u'utas', u'university']
[u'zamenhof', u'wrong', u'writes', u'wpi']
[u'xv', u'world', u'various', u'used']
[u'writes', u'want', u'vitiello', u've']
[u'world', u'won', u'vms', u'version']
[u'works', u'windows', u'warfare', u'va']
[u'yuv', u'video', u'usually', u'tu']
[u'wide', u'walk', u'uk', u'thanks']
[u'want', u'v3', u'utility', u'tw']
[u'za', u'yup', u'wrong', u'writes']
[u'zip', u've', u'university', u'unfortunately']
[u'xloadimage', u'ximage', u'wrong', u'writes']
[u'xloadimage', u'wrote', u'wrong', u'writes']
[u'x11', u'viewer', u'using', u'thanks']
[u'york', u'use', u'university', u'summary']
[u'za', u'writes', u'unit', u'uiuc']
[u'xwd', u'xloadimage', u'x11', u'writes']
[u'zooming', u'zoom', u'zippy', u'zephyr']
[u'world', u'vitiello', u've', u'user']
[u'zip', u'yes', u'xv', u'world']
[u'zoom', u'zoo', u'zip', u'years']
[u'yutani', u'xv', u'xloadimage', u'w