In [1]:
%matplotlib inline

import matplotlib
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_20newsgroups

In [8]:
# Newsgroup names that make up the two classes used below: the comp.* groups
# and the rec.* groups.
comp_categories = [
    'comp.graphics',
    'comp.os.ms-windows.misc',
    'comp.sys.ibm.pc.hardware',
    'comp.sys.mac.hardware',
]
rec_categories = [
    'rec.autos',
    'rec.motorcycles',
    'rec.sport.baseball',
    'rec.sport.hockey',
]

# Fetch the 'train' and 'test' subsets for each class. Every call uses the
# same shuffle seed (42), so the document order is repeatable.
comp_train, comp_test = (
    fetch_20newsgroups(subset=split, categories=comp_categories, shuffle=True, random_state=42)
    for split in ('train', 'test')
)
rec_train, rec_test = (
    fetch_20newsgroups(subset=split, categories=rec_categories, shuffle=True, random_state=42)
    for split in ('train', 'test')
)

In [13]:
# 1. Transform the documents into TF-IDF vectors.
# Use min_df = 3 and exclude English stop words (no stemming needed).
# Report the dimensions of the resulting TF-IDF matrices.
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# stop_words='english' selects scikit-learn's built-in English list (the same
# words as text.ENGLISH_STOP_WORDS). Passing the frozenset object itself is
# rejected by the parameter validation in newer scikit-learn releases, which
# only accept 'english', a list, or None.
vectorizer = CountVectorizer(min_df=3, stop_words='english')

# Learn the vocabulary from the training documents only (comp first, then rec).
vectorized_corpus = vectorizer.fit_transform(comp_train.data + rec_train.data)

# Learn IDF weights on the training counts and produce the training matrix.
tfidf_transformer = TfidfTransformer()
trainX = tfidf_transformer.fit_transform(vectorized_corpus)

# Project the test documents with the vocabulary and IDF weights fitted above;
# nothing is re-fitted on test data.
testX = tfidf_transformer.transform(vectorizer.transform(comp_test.data + rec_test.data))

# Labels follow the concatenation order used above:
# 0 for comp class, 1 for rec class
trainy = np.zeros(shape=(len(comp_train.filenames) + len(rec_train.filenames),))
trainy[len(comp_train.filenames):] = 1

testy = np.zeros(shape=(len(comp_test.filenames) + len(rec_test.filenames),))
testy[len(comp_test.filenames):] = 1

In [15]:
# Report the dimensions of the TF-IDF matrices and label vectors.
# Written with %-formatting inside print(...) so the cell emits the same text
# under Python 2 and Python 3 (the bare print statement is a SyntaxError on 3).
print("%s %s %s %s" % (trainX.shape, testX.shape, trainy.shape, testy.shape))

 (4732, 20297) (3150, 20297) (4732,) (3150,)


In [23]:
# 2. Apply K-means clustering with k = 2 using the TF-IDF data.
# Compare the clustering results with the known class labels.
# a) Inspect the contingency matrix to get a sense of the clustering result.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn import metrics

from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.metrics.cluster import contingency_matrix

# random_state pins the k-means++ initialisation. Without it every run picks
# different starting centroids, so the contingency matrix and the metrics
# reported below are not reproducible. 42 matches the seed used when loading
# the data.
km = KMeans(n_clusters=2, init='k-means++', max_iter=100, random_state=42)

# Fit the centroids on the training documents, then assign each test document
# to its nearest centroid.
km.fit(trainX)
ypred = km.predict(testX)

In [24]:
# km.labels_ holds the cluster assignment of the *training* documents, while
# testy holds the true labels of the *test* documents, so the two shapes are
# expected to differ. Formatted so the output is identical on Python 2 and 3.
print("%s %s" % (km.labels_.shape, testy.shape))

# Rows are the true classes (0 = comp, 1 = rec); columns are predicted clusters.
contingency_matrix(testy, ypred, eps=None, sparse=False)

(4732,) (3150,)


array([[1558,    2],
       [ 937,  653]])

In [25]:
# b) 5 Metrics
# Each entry pairs the report line's format string with the scikit-learn
# scorer that fills it in. Every scorer is called as scorer(true_labels,
# predicted_clusters) on the test set.
score_reports = [
    ("Homogeneity: %0.3f", metrics.homogeneity_score),
    ("Completeness: %0.3f", metrics.completeness_score),
    ("V-measure: %0.3f", metrics.v_measure_score),
    ("Adjusted Rand-Index: %.3f", metrics.adjusted_rand_score),
    ("Adjusted Mutual Info: %0.3f", metrics.adjusted_mutual_info_score),
]

for template, scorer in score_reports:
    print(template % scorer(testy, ypred))

Homogeneity: 0.237
Completeness: 0.322
V-measure: 0.273
Adjusted Rand-Index: 0.163
Adjusted Mutual Info: 0.237


In [None]:
# 3. a)
