# Clustering twenty newsgroups using scikit-learn

In [1]:
import numpy as np
import sklearn
import sklearn.datasets
import sklearn.feature_extraction.text as sk_text
import sklearn.cluster
import sklearn.metrics

## Getting the data

In [2]:
# Load the 20 newsgroups corpus using scikit-learn's default arguments.
# NOTE(review): the raw posts keep their headers (From:, Subject:, Organization:, ...),
# as the sample documents displayed below show — confirm that clustering on this
# metadata is intended.
newsgroups = sklearn.datasets.fetch_20newsgroups()

In [3]:
# List the fields available on the returned dataset object.
newsgroups.keys()

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])

In [4]:
# Check the container type, the number of documents, and the type of one document.
type(newsgroups.data), len(newsgroups.data), type(newsgroups.data[0])

(list, 11314, str)

In [5]:
# Peek at the first three raw documents.
newsgroups.data[:3]

["From: lerxst@wam.umd.edu (where's my thing)\nSubject: WHAT car is this!?\nNntp-Posting-Host: rac3.wam.umd.edu\nOrganization: University of Maryland, College Park\nLines: 15\n\n I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.\n\nThanks,\n- IL\n   ---- brought to you by your neighborhood Lerxst ----\n\n\n\n\n",
 "From: guykuo@carson.u.washington.edu (Guy Kuo)\nSubject: SI Clock Poll - Final Call\nSummary: Final call for SI clock reports\nKeywords: SI,acceleration,clock,upgrade\nArticle-I.D.: shelley.1qvfo9INNc3s\nOrganization: University of Washington\nLines: 

In [6]:
# Integer labels of the first three documents; each value is used as an index
# into newsgroups.target_names.
newsgroups.target[:3]

array([7, 4, 4])

In [7]:
# First three entries of the newsgroup-name list. These are the names for label
# ids 0-2, not the names of the first three documents' labels.
newsgroups.target_names[:3]

['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc']

In [8]:
# Hold out the tail of the corpus for validation: the first `train_frac` of the
# documents (in their loaded order) form the training set, the rest the validation set.
train_frac = 0.95
nobs = len(newsgroups.data)
cutoff = int(train_frac * nobs)

# Slice documents and labels at the same cutoff so they stay aligned.
train_data, train_labels = newsgroups.data[:cutoff], newsgroups.target[:cutoff]
valid_data, valid_labels = newsgroups.data[cutoff:], newsgroups.target[cutoff:]

len(train_data), len(valid_data)

(10748, 566)

## Filtering stopwords

In [9]:
# Size of scikit-learn's built-in English stop-word set and its first ten entries
# in alphabetical order.
len(sk_text.ENGLISH_STOP_WORDS), sorted(sk_text.ENGLISH_STOP_WORDS)[:10]

(318,
 ['a',
  'about',
  'above',
  'across',
  'after',
  'afterwards',
  'again',
  'against',
  'all',
  'almost'])

## Term-Frequency / Inverse Document-Frequency (TF-IDF) vectorizer

In [10]:
# TF-IDF vectorizer capped at a 10,000-term vocabulary, with English stop words removed.
# stop_words='english' selects scikit-learn's built-in list (sk_text.ENGLISH_STOP_WORDS).
# The string form is used because the documented values for this parameter are
# 'english', a list, or None; passing the frozenset directly is rejected by the
# parameter validation in recent scikit-learn releases.
vectorizer = sk_text.TfidfVectorizer(max_features=10_000, stop_words='english')

In [11]:
# Fit the vectorizer on the training documents only and transform them in one step;
# the result is a sparse document-term matrix (one row per training document).
train_vectors = vectorizer.fit_transform(train_data)
train_vectors, train_vectors.shape

(<10748x10000 sparse matrix of type '<class 'numpy.float64'>'
 	with 954700 stored elements in Compressed Sparse Row format>,
 (10748, 10000))

In [12]:
# Transform the validation documents with the already-fitted vectorizer
# (transform only, no refit), so both matrices share the same columns.
valid_vectors = vectorizer.transform(valid_data)
valid_vectors, valid_vectors.shape

(<566x10000 sparse matrix of type '<class 'numpy.float64'>'
 	with 49132 stored elements in Compressed Sparse Row format>,
 (566, 10000))

## K-means clustering

In [13]:
# K-means starts from randomly chosen centroids, so fix random_state to make the
# cluster assignments and the scores computed from them reproducible across runs.
# n_init is set explicitly because its default differs between scikit-learn releases.
# Note: re-running with a fixed seed will not reproduce outputs saved from an
# earlier unseeded run.
kmeans = sklearn.cluster.KMeans(n_clusters=30, n_init=10, random_state=0)

In [14]:
# Fit k-means on the training vectors and get one cluster id per training document.
train_clusters = kmeans.fit_predict(train_vectors)
train_clusters, train_clusters.shape

(array([21, 17, 17,  8, ..., 18,  2, 19,  0], dtype=int32), (10748,))

In [15]:
# Assign each validation document to a cluster of the already-fitted model
# (predict only, no refit).
valid_clusters = kmeans.predict(valid_vectors)
valid_clusters, valid_clusters.shape

(array([13,  5, 15, 24, ..., 16,  8,  8, 17], dtype=int32), (566,))

## Evaluate adjusted mutual information (AMI) score

In [16]:
# Adjusted mutual information between the true newsgroup labels and the k-means
# cluster assignments on the training set.
train_mi = sklearn.metrics.adjusted_mutual_info_score(
    labels_true=train_labels,
    labels_pred=train_clusters,
)
train_mi

0.4003120251051953

In [17]:
# Same score on the held-out validation set, using the clusters predicted by the
# model fitted on the training data.
valid_mi = sklearn.metrics.adjusted_mutual_info_score(
    labels_true=valid_labels,
    labels_pred=valid_clusters,
)
valid_mi

0.383634000751514

## Print the true newsgroup labels in each cluster

In [18]:
# Group the true newsgroup names of the validation documents by assigned cluster
# in a single pass, keeping document order within each cluster.
names_by_cluster = {}
for assigned_cluster, label in zip(valid_clusters, valid_labels):
    names_by_cluster.setdefault(assigned_cluster, []).append(newsgroups.target_names[label])

# Report clusters in ascending id order; only clusters that received at least one
# validation document appear.
for cluster_id in sorted(names_by_cluster):
    print('cluster:', cluster_id)
    print(names_by_cluster[cluster_id])
    print()

cluster: 0
['sci.space', 'sci.space', 'sci.space', 'talk.politics.misc', 'sci.space', 'sci.space', 'sci.space', 'sci.space', 'sci.space', 'sci.space', 'talk.politics.misc', 'rec.sport.hockey']

cluster: 1
['misc.forsale', 'talk.politics.mideast', 'talk.religion.misc', 'comp.sys.mac.hardware', 'rec.autos']

cluster: 2
['talk.politics.mideast', 'talk.politics.mideast', 'talk.politics.mideast', 'talk.politics.mideast', 'talk.politics.mideast', 'talk.politics.mideast', 'talk.politics.mideast', 'talk.politics.mideast', 'talk.politics.mideast', 'talk.politics.mideast', 'talk.politics.mideast', 'talk.politics.mideast', 'talk.politics.mideast', 'talk.politics.mideast', 'talk.politics.mideast']

cluster: 3
['talk.religion.misc']

cluster: 4
['talk.religion.misc', 'soc.religion.christian', 'talk.politics.misc', 'alt.atheism', 'soc.religion.christian', 'alt.atheism', 'alt.atheism', 'soc.religion.christian', 'soc.religion.christian', 'soc.religion.christian', 'soc.religion.christian', 'soc.religio