# Clustering Documents with KMeans

In [1]:
import numpy as np
import pandas as pd
import re
import os
import gensim.utils
import mpld3
import matplotlib.pyplot as plt
import matplotlib as mpl

from functions import *
from sklearn.manifold import MDS
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cross_validation import cross_val_score
from collections import defaultdict
from sklearn.cluster import KMeans
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity

## Loading Data

In [2]:
d = load_data('hard.cor')

In [3]:
for key in d.keys():
    print key, len(d[key])

HARD1 3455
HARD2 502
HARD3 376


Create a documents of all sentences, regardless of meaning

In [4]:
train_documents = [sentence for value in d.values() for sentence in value]

In [5]:
print type(train_documents[0]), len(train_documents)

<type 'str'> 4333


We also want to extract the labels of the documents into a numpy array

In [6]:
def set_labels(d):
    ys = []
    for i in d.keys():
        yi = np.repeat(str(i), len(d[i]))
        ys.append(yi)
    return np.concatenate(ys)

In [7]:
y = set_labels(d)

## Lemmatizing

In [8]:
lem_train_documents = lem(train_documents)

In [9]:
lem_train_documents[0]

'lose popular support someone have kill defeat do'

## TF-IDF Vectorizing

In [10]:
vectors, vectorizer = Tfidf(lem_train_documents)

In [11]:
vectorizer

TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 3), norm=u'l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=False,
        token_pattern=u'(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [12]:
vectors

<4333x73935 sparse matrix of type '<type 'numpy.float64'>'
	with 106933 stored elements in Compressed Sparse Row format>

## KMeans Clustering

Since KMeans Clustering is an unsupervised learning and does not require the labels. We can use the lemmatized training documents directly.

In [13]:
# Get the number of meanings for the word we are working with
n_meanings = len(d)
print n_meanings

3


In [14]:
# Fit KMeans model to our Tfidf vectors from the lemmatized training documents
km = KMeans(n_clusters = n_meanings, n_init=100)
km.fit(vectors)

KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=3, n_init=100,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=0)

In [15]:
# Call the predicted labels from KMeans model.
# Labels are assigned as 0, 1, 2,..., (n_meanings-1). 
km.labels_

array([0, 1, 0, ..., 0, 0, 0], dtype=int32)

In [16]:
# Count the number of labels in each group. 
np.bincount(km.labels_)

array([3062,  408,  863])

Note that cluster labels do not correspond to the order of meaning in the dictionary. We will use the sizes of clusters to determine the meanings. Largest cluster corresponds to the most popular meaning in our data.

## Evaluating Kmeans Model

To see how well the model performs, we will match the predicted labels (km.labels_) to the actual labels from the dictionary. We can also calculate the scores of our predictions 

In [None]:
pd.crosstab(km.labels_, y)

As we can see, our KMeans model tend to overpredict them most common meaning, the result is good in more common meaning and poor on less common meanings.

## Finding Most Common Words in Each Cluster

Despite not working well until we can optimize the model further, it is useful to see which words are most common in each cluster. We will use Latent Semantic Analysis (LSA) to do that.

In [19]:
# Fit LSA model
lsa = TruncatedSVD(n_components = n_meanings, n_iter=100)
lsa.fit(vectors)

TruncatedSVD(algorithm='randomized', n_components=3, n_iter=100,
       random_state=None, tol=0.0)

In [20]:
terms = vectorizer.get_feature_names()

for i, comp in enumerate(lsa.components_):
    comp_terms = zip(terms, comp)
    sorted_terms = sorted(comp_terms, key = lambda x:x[1], reverse = True)[:10]
    print "---Meaning: {}--- ".format(d.keys()[i])
    print
    for term in sorted_terms:
        print term[0]
    print

---Meaning: HARD1--- 

say
time
believe
work
make
thing
person
know
say say
year

---Meaning: HARD2--- 

time
believe
work
make
know
come
think
just
person
thing

---Meaning: HARD3--- 

time
know
right
play
way
new
real
say time
say
feeling



## Plotting "HARD" data

In [17]:
lim = 600

In [18]:
vectors_limit = vectors[: lim, : lim]
y_limit = y[: lim]

In [19]:
# Use cosine distance to represent the Tfidf vectors
dist = 1 - cosine_similarity(vectors_limit)

In [27]:
MDS()

# convert two components as we're plotting points in a two-dimensional plane
# "precomputed" because we provide a distance matrix
# we will also specify `random_state` so the plot is reproducible.
mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)

pos = mds.fit_transform(dist)  # shape (n_components, n_samples)

xs, ys = pos[:, 0], pos[:, 1]

In [28]:
#set up colors per clusters using a dict
cluster_colors = {0: '#1b9e77', 1: '#d95f02', 2: '#7570b3'}

#set up cluster names using a dict
cluster_names = {0: 'predict: HARD1',
                 1: 'predict: HARD2', 
                 2: 'predict: HARD3', 
                 }

In [29]:
clusters = km.labels_[: lim].tolist()

In [30]:
len(xs), len(ys), len(clusters), len(y_limit)

(600, 600, 600, 600)

In [31]:
df = pd.DataFrame(dict(x=xs, y=ys, label=clusters, title=y_limit)) 

In [25]:
df.head()

Unnamed: 0,label,title,x,y
0,0,HARD1,0.645055,0.161545
1,1,HARD1,0.769988,0.048801
2,0,HARD1,-0.606205,-0.11901
3,0,HARD1,-0.218119,0.754435
4,0,HARD1,-0.148321,0.767342


In [26]:
#group by cluster
groups = df.groupby('label')


# set up plot
fig, ax = plt.subplots(figsize=(17, 9)) # set size
ax.margins(0.05) # Optional, just adds 5% padding to the autoscaling

#iterate through groups to layer the plot
#note that I use the cluster_name and cluster_color dicts with the 'name' lookup to return the appropriate color/label
for name, group in groups:
    ax.plot(group.x, group.y, marker='o', linestyle='', ms=12, 
            label=cluster_names[name], color=cluster_colors[name], 
            mec='none')
    ax.set_aspect('auto')
    ax.tick_params(\
        axis= 'x',          # changes apply to the x-axis
        which='both',      # both major and minor ticks are affected
        bottom='off',      # ticks along the bottom edge are off
        top='off',         # ticks along the top edge are off
        labelbottom='off')
    ax.tick_params(\
        axis= 'y',         # changes apply to the y-axis
        which='both',      # both major and minor ticks are affected
        left='off',      # ticks along the bottom edge are off
        top='off',         # ticks along the top edge are off
        labelleft='off')
    
ax.legend(numpoints=1)  #show legend with only 1 point

#add label in x,y position with the label as the film title
for i in range(len(df)):
    ax.text(df.ix[i]['x'], df.ix[i]['y'], df.ix[i]['title'], size=8)  

    
    
plt.show() #show the plot

#uncomment the below to save the plot if need be
plt.savefig('HARD_KMeams_clusters_small.png', dpi=200)