In [1]:
#The 20 Newsgroups data set is a collection of approximately 20,000 newsgroup documents,
#partitioned (nearly) evenly across 20 different newsgroups. To the best of our knowledge,
#it was originally collected by Ken Lang, probably for his paper 
#“Newsweeder: Learning to filter netnews,” though he does not explicitly mention this
#collection.
#The 20 newsgroups collection has become a popular data set for experiments
#in text applications of machine learning techniques, such as text classification 
#and text clustering.

In [34]:
# selecting labels 
categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']

In [35]:
import sklearn
from sklearn.datasets import fetch_20newsgroups

dataset = sklearn.datasets.load_files("text_analytics/data/twenty_newsgroups/20news-bydate-train/", load_content=True, shuffle=True, encoding='UTF-8', decode_error='replace', random_state=0)

In [36]:
# You can explore the data a bit
print dataset.target_names
print("\n".join(dataset.data[0].split("\n")[:3]))
print(dataset.target_names[dataset.target[0]])


['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']
From: cubbie@garnet.berkeley.edu (                               )
Subject: Re: Cubs behind Marlins? How?
Article-I.D.: agate.1pt592$f9a
rec.sport.baseball


In [37]:
# Tokenisation
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer(stop_words="english")
X_train_counts = count_vect.fit_transform(dataset.data)


In [23]:
# How many times does GPU appear?
# How many words in total does your dictionary have?
# Which words are the most common?

In [38]:
from sklearn.feature_extraction.text import TfidfTransformer
# Feature representation
# TF
tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)

#TF_IDF
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)


In [39]:
import numpy as np

dataset_test = sklearn.datasets.load_files("text_analytics/data/twenty_newsgroups/20news-bydate-test/", load_content=True, shuffle=True, encoding='UTF-8', decode_error='replace', random_state=42)

X_test_counts = count_vect.transform(dataset_test.data)
X_test = tfidf_transformer.transform(X_test_counts)

labels = dataset_test.target
true_k = np.unique(labels).shape[0]

In [42]:
from sklearn.cluster import KMeans
km = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)
km.fit(X_test)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100,
    n_clusters=20, n_init=1, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [43]:
from sklearn import metrics
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
print("Adjusted Rand-Index: %.3f"
      % metrics.adjusted_rand_score(labels, km.labels_))
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(X_test, km.labels_, sample_size=1000))


Homogeneity: 0.317
Completeness: 0.393
V-measure: 0.351
Adjusted Rand-Index: 0.113
Silhouette Coefficient: 0.006


In [44]:
print("Top terms per cluster:")

order_centroids = km.cluster_centers_.argsort()[:, ::-1]

terms = count_vect.get_feature_names()
for i in range(true_k):
    print("Cluster %d:" % i)
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind])


Top terms per cluster:
Cluster 0:
 game
 espn
 hockey
 team
 ca
 buffalo
 bruins
 edu
 games
 sabres
Cluster 1:
 com
 ca
 lines
 subject
 organization
 writes
 sun
 edu
 article
 hp
Cluster 2:
 fbi
 koresh
 cramer
 batf
 people
 optilink
 waco
 government
 edu
 com
Cluster 3:
 cmu
 andrew
 edu
 carnegie
 mellon
 pittsburgh
 pa
 pens
 nntp
 skate
Cluster 4:
 edu
 university
 subject
 lines
 organization
 posting
 host
 nntp
 cs
 cc
Cluster 5:
 windows
 dos
 file
 edu
 microsoft
 os
 program
 pc
 ms
 com
Cluster 6:
 stratus
 sw
 cdt
 com
 rocket
 vos
 tavares
 investors
 packet
 computer
Cluster 7:
 netcom
 com
 clipper
 sternlight
 government
 chip
 david
 key
 encryption
 keys
Cluster 8:
 uk
 ac
 mathew
 demon
 xv
 writes
 mantis
 mcc
 university
 dresden
Cluster 9:
 car
 edu
 geico
 duke
 infante
 engine
 ford
 com
 mustang
 cars
Cluster 10:
 edu
 uiuc
 ohio
 cleveland
 cso
 state
 magnus
 cwru
 acs
 university
Cluster 11:
 drive
 scsi
 ide
 controller
 disk
 drives
 hard
 meg
 hd
 bu

In [29]:
# The performance is pretty bad...
# What happens if you limit the feature space? e.g. Exclude stop words?

# What happens if you include all categories?
# Which categories are most easy to cluster?
# What happens if we don't know the number of clusters? (check out Hierarchical clustering)

In [30]:
# Can you cluster based in similarity? (Cosine distance)
# What's a nice way to visualise this data?