In [27]:
# Standard imports
import numpy as np
import pandas as pd

# SKLearn related imports
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import preprocessing

from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
import sklearn.metrics as metrics
from sklearn.pipeline import make_pipeline

from sklearn.pipeline import Pipeline
from sklearn.base import TransformerMixin
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import MiniBatchKMeans

# NLTK Text Processing package
import nltk
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

In [2]:
df = pd.read_csv('./text-in-practice/data/uci-news-aggregator.csv')
df = df[['TITLE', 'CATEGORY']]
df.columns = ['title', 'category']

In [9]:
vectorizer = TfidfVectorizer(max_df=0.5, min_df=2, stop_words='english', use_idf=False)

In [35]:

X = vectorizer.fit_transform(df.title)

In [36]:
X.shape

(422419, 34794)

In [None]:
svd = TruncatedSVD(5000)
normalizer = preprocessing.Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)

X = lsa.fit_transform(X)

explained_variance = svd.explained_variance_ratio_.sum()
print("Explained variance of the SVD step: {}%".format(
    int(explained_variance * 100)))

In [23]:
for i in range(2,6):
    print('With {} clusters'.format(i))
    km = MiniBatchKMeans(n_clusters=i, init='k-means++', n_init=1,
                             init_size=1000, batch_size=1000)
    km.fit(X)
    labels = df.category
    print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
    print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
    print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
    print("Adjusted Rand-Index: %.3f"
          % metrics.adjusted_rand_score(labels, km.labels_))
    print("Silhouette Coefficient: %0.3f"
          % metrics.silhouette_score(X, km.labels_, sample_size=1000))

With 2 clusters
Homogeneity: 0.001
Completeness: 0.006
V-measure: 0.002
Adjusted Rand-Index: 0.000
Silhouette Coefficient: 0.005
With 3 clusters
Homogeneity: 0.033
Completeness: 0.171
V-measure: 0.055
Adjusted Rand-Index: 0.011
Silhouette Coefficient: 0.003
With 4 clusters
Homogeneity: 0.028
Completeness: 0.077
V-measure: 0.042
Adjusted Rand-Index: 0.002
Silhouette Coefficient: 0.004
With 5 clusters
Homogeneity: 0.044
Completeness: 0.159
V-measure: 0.069
Adjusted Rand-Index: 0.002
Silhouette Coefficient: 0.001


In [22]:
order_centroids = km.cluster_centers_.argsort()[:, ::-1]

terms = vectorizer.get_feature_names()
for i in range(4):
    print("Cluster %d:" % i, end='')
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind], end='')
    print()

Cluster 0: says video facebook microsoft china kardashian kim day review game
Cluster 1: new york video trailer apple album google sales season star
Cluster 2: google 2014 apple report glass android beats iphone awards tv
Cluster 3: samsung galaxy s5 apple vs tab guardians note gear android
