### In this example, we illustrate the use of KMeans clustering to cluster documents given a term-document matrix as the data. 

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
# Print NumPy arrays on wide lines, with 2 decimals and without scientific notation
np.set_printoptions(linewidth=120, precision=2, suppress=True)
# Render pandas floats with 2 decimals and allow up to 120 characters per cell
pd.options.display.float_format='{:.2f}'.format
pd.set_option('max_colwidth', 120)

#### We will use the proprietary KMeans implementation provided in the module K_Means.py. This implementation supports a variety of distance metrics for clustering. The default metric used below is based on Cosine similarity.

In [2]:
from K_Means import kmeans, cluster_sizes, display_centroids

In [3]:
# Load the term-document matrix. header=None: the first CSV row is treated as data,
# so pandas assigns integer column labels.
Data = pd.read_csv("http://facweb.cs.depaul.edu/mobasher/classes/csc478/data/term-doc-mat.csv", header=None)

In [4]:
# Show the raw table: column 0 holds the terms, the remaining columns hold per-document counts
Data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,database,24,32,12,6,43,2,0,3,1,6,4,0,0,0,0
1,index,9,5,5,2,20,0,1,0,0,0,27,14,3,2,11
2,likelihood,0,3,0,0,3,7,12,4,27,4,0,1,0,0,0
3,linear,3,0,0,0,0,16,0,2,25,23,7,12,21,3,2
4,matrix,1,0,0,0,0,33,2,0,7,12,14,5,12,4,0
5,query,12,2,0,0,27,0,0,0,0,22,9,4,0,5,3
6,regression,0,0,0,0,0,18,32,22,34,17,0,0,0,0,0
7,retrieval,1,0,0,0,2,0,0,0,3,9,27,7,5,4,4
8,sql,21,10,16,7,31,0,0,0,0,0,0,0,0,1,0
9,vector,2,0,0,2,0,27,4,2,11,8,33,16,14,7,3


In [5]:
# Let's remove the column containing the terms
# TD will be our term x document matrix (one row per term, one column per document)
TD = Data.iloc[:,1:]
TD

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,24,32,12,6,43,2,0,3,1,6,4,0,0,0,0
1,9,5,5,2,20,0,1,0,0,0,27,14,3,2,11
2,0,3,0,0,3,7,12,4,27,4,0,1,0,0,0
3,3,0,0,0,0,16,0,2,25,23,7,12,21,3,2
4,1,0,0,0,0,33,2,0,7,12,14,5,12,4,0
5,12,2,0,0,27,0,0,0,0,22,9,4,0,5,3
6,0,0,0,0,0,18,32,22,34,17,0,0,0,0,0
7,1,0,0,0,2,0,0,0,3,9,27,7,5,4,4
8,21,10,16,7,31,0,0,0,0,0,0,0,0,1,0
9,2,0,0,2,0,27,4,2,11,8,33,16,14,7,3


In [6]:
# Reindex the columns to start from 0.
# The number of columns is taken from TD itself rather than hard-coded, so this
# cell keeps working if the number of documents in the CSV changes.
TD.columns = range(TD.shape[1])
TD

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,24,32,12,6,43,2,0,3,1,6,4,0,0,0,0
1,9,5,5,2,20,0,1,0,0,0,27,14,3,2,11
2,0,3,0,0,3,7,12,4,27,4,0,1,0,0,0
3,3,0,0,0,0,16,0,2,25,23,7,12,21,3,2
4,1,0,0,0,0,33,2,0,7,12,14,5,12,4,0
5,12,2,0,0,27,0,0,0,0,22,9,4,0,5,3
6,0,0,0,0,0,18,32,22,34,17,0,0,0,0,0
7,1,0,0,0,2,0,0,0,3,9,27,7,5,4,4
8,21,10,16,7,31,0,0,0,0,0,0,0,0,1,0
9,2,0,0,2,0,27,4,2,11,8,33,16,14,7,3


In [7]:
# The list of our index terms: the first column of the raw table,
# in the same order as the rows of TD
terms = Data.iloc[:,0]
terms

0      database
1         index
2    likelihood
3        linear
4        matrix
5         query
6    regression
7     retrieval
8           sql
9        vector
Name: 0, dtype: object

#### First, we want to do some document clustering. Since the data is in term-document format, we need to obtain the transpose of the TD matrix.

In [8]:
# Flip terms x documents into documents x terms so that each row is one document
DT = TD.transpose()

#### Now we have a document-term matrix:

In [9]:
# Each row is now a document; columns 0-9 follow the order of `terms`
DT

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,24,9,0,3,1,12,0,1,21,2
1,32,5,3,0,0,2,0,0,10,0
2,12,5,0,0,0,0,0,0,16,0
3,6,2,0,0,0,0,0,0,7,2
4,43,20,3,0,0,27,0,2,31,0
5,2,0,7,16,33,0,18,0,0,27
6,0,1,12,0,2,0,32,0,0,4
7,3,0,4,2,0,0,22,0,0,2
8,1,0,27,25,7,0,34,3,0,11
9,6,0,4,23,12,22,17,9,0,8


#### Let's add some pre-specified labels to our documents. We don't need these for clustering. But once unsupervised clustering is completed, if we know what the expected groupings are in the data, we can compare these pre-existing categories with the cluster assignments to evaluate the quality of the clusters.

In [10]:
# Known topic of each document: three topics, five consecutive documents each.
# np.repeat expands the three names into the 15-element label sequence.
cat_labels = pd.Series(
    np.repeat(["Databases", "Regression", "Information Retrieval"], 5),
    index=DT.index,
)

# Attach the labels to a copy so that DT itself stays purely numeric
DT_labeled = DT.copy()
DT_labeled["Category"] = cat_labels

DT_labeled

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,Category
0,24,9,0,3,1,12,0,1,21,2,Databases
1,32,5,3,0,0,2,0,0,10,0,Databases
2,12,5,0,0,0,0,0,0,16,0,Databases
3,6,2,0,0,0,0,0,0,7,2,Databases
4,43,20,3,0,0,27,0,2,31,0,Databases
5,2,0,7,16,33,0,18,0,0,27,Regression
6,0,1,12,0,2,0,32,0,0,4,Regression
7,3,0,4,2,0,0,22,0,0,2,Regression
8,1,0,27,25,7,0,34,3,0,11,Regression
9,6,0,4,23,12,22,17,9,0,8,Regression


In [11]:
# Pull the category labels out as a standalone NumPy array (independent copy)
categories = DT_labeled["Category"].to_numpy(copy=True)
categories

array(['Databases', 'Databases', 'Databases', 'Databases', 'Databases', 'Regression', 'Regression',
       'Regression', 'Regression', 'Regression', 'Information Retrieval', 'Information Retrieval',
       'Information Retrieval', 'Information Retrieval', 'Information Retrieval'], dtype=object)

In [12]:
# TD is terms x documents: its shape gives (number of terms, number of documents)
numTerms, NDocs = TD.shape

#### Now we are ready for clustering. 

#### The kmeans function in the imported module assumes that the data is in a 2d Numpy array.

In [13]:
# Plain 2-D NumPy copy of the document-term table, as expected by the kmeans function
DT_Mat = DT.to_numpy(copy=True)

In [14]:
# Cluster the documents into 3 groups using the K_Means module's default metric.
# NOTE(review): the third argument (20) is presumably an iteration limit; confirm
# against the kmeans signature in K_Means.py.
clusters, centroids = kmeans(DT_Mat, 3, 20)

Iteration: 1


#### Let's take a look at the cluster centroids

In [15]:
# From here on, show floats with thousands separators and two decimals
pd.options.display.float_format='{:,.2f}'.format

# Label the centroid columns with the terms.
# NOTE: this rebinds `centroids` from the raw value returned by kmeans to a DataFrame.
centroids = pd.DataFrame(centroids, columns=terms)
centroids

Unnamed: 0,database,index,likelihood,linear,matrix,query,regression,retrieval,sql,vector
0,1.0,9.5,1.33,10.17,11.33,3.5,3.0,7.83,0.17,16.67
1,23.4,8.2,1.2,0.6,0.2,8.2,0.0,0.6,17.0,0.8
2,2.5,0.25,11.75,12.5,5.25,5.5,26.25,3.0,0.0,6.25


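#### The cluster centroids reveal some general patterns in the data as well as unique characteristics of each cluster. For example, in the run shown above, Cluster 2 is dominated by documents related to regression (high weights for "regression", "linear" and "likelihood"), Cluster 1 contains documents primarily related to SQL databases, and Cluster 0 is characterized by "vector", "matrix", "linear", "index" and "retrieval". Note that the cluster numbers themselves are arbitrary and may differ if the clustering is re-run with a different initialization.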

In [16]:
# Let's look at cluster assignments for each of the instances in the data.
# Position i holds the cluster label of document i.
print(clusters)

[1 1 1 1 1 0 2 2 2 2 0 0 0 0 0]


In [17]:
# Side-by-side view of each document's assigned cluster and its known category
doc_clusters = pd.DataFrame({'Cluster': clusters, 'Category': categories})
doc_clusters

Unnamed: 0,Cluster,Category
0,1,Databases
1,1,Databases
2,1,Databases
3,1,Databases
4,1,Databases
5,0,Regression
6,2,Regression
7,2,Regression
8,2,Regression
9,2,Regression


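#### It appears that our clusters closely, though not perfectly, match the pre-existing categories: in the assignments printed above, document 5 (a Regression document) falls in Cluster 0 together with the Information Retrieval documents.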

#### We can also examine the size of the clusters.

In [18]:
def cluster_sizes(data, clusters):
    """Count how many instances of `data` fall into each cluster.

    Note: defining this function here rebinds the name `cluster_sizes`
    that was imported from the K_Means module earlier in the notebook.

    Parameters
    ----------
    data : pandas.DataFrame or numpy.ndarray
        One row per instance; must support boolean-mask row selection.
    clusters : array-like
        Cluster label of each instance, in the same order as the rows of
        `data`. Plain lists are accepted and converted to a NumPy array.

    Returns
    -------
    dict
        Maps each distinct cluster label to the number of instances carrying it.
    """
    # Coerce to an ndarray so that `labels == c` is always an element-wise mask;
    # for a plain Python list, `clusters == c` would be a single False and
    # every cluster would be reported with size 0.
    labels = np.asarray(clusters)

    size = {}
    for c in np.unique(labels):
        size[c] = len(data[labels == c])
    return size

In [19]:
# Number of documents assigned to each cluster
c_size = cluster_sizes(DT, clusters)

for label, count in c_size.items():
    print("Size of Cluster", label, "= ", count)

Size of Cluster 0 =  6
Size of Cluster 1 =  5
Size of Cluster 2 =  4


#### If pre-existing categories are available in the data, then we can use Scikit-learn's metrics completeness_score and homogeneity_score to evaluate the degree to which our clusters mimic the categories.  Homogeneity: each cluster contains only members of a single class/category. Completeness: all members of a given class/category are assigned to the same cluster. Both metrics return values between 0 and 1, with 1 corresponding to a perfect score.

In [20]:
from sklearn.metrics import completeness_score, homogeneity_score

In [21]:
# Completeness of the clustering; arguments are (true categories, predicted clusters).
# 1.0 would mean every category's documents all share one cluster.
print(completeness_score(categories,clusters))

0.8462932564414634


In [22]:
# Homogeneity of the clustering; arguments are (true categories, predicted clusters).
# 1.0 would mean every cluster contains documents of a single category only.
print(homogeneity_score(categories,clusters))

0.8359526054773928


#### Next, let's use the KMeans implementation from Scikit-learn. Note that this implementation only supports Euclidean distance by default. Also, the Scikit-learn clustering functions, including KMeans, support Pandas DataFrames as well as sparse matrices by default.

In [23]:
from sklearn.cluster import KMeans

In [24]:
# NOTE: this rebinds `kmeans`, which until now referred to the function imported
# from K_Means. Re-running the earlier `kmeans(DT_Mat, 3, 20)` cell after this one
# will fail, because the name then refers to this estimator object.
# random_state fixes the seed; n_init=3 runs three separate initializations.
kmeans = KMeans(n_clusters=3, max_iter=200, init='k-means++', n_init=3, verbose=1, random_state=33)

# Alternative: purely random initial centroids instead of k-means++
# kmeans = KMeans(n_clusters=3, max_iter=200, init='random', n_init=3, verbose=1, random_state=33)

In [25]:
# Fit on the document-term DataFrame; with verbose=1 the estimator logs the
# inertia of every iteration of each initialization
kmeans.fit(DT)

Initialization complete
start iteration
done sorting
end inner loop
Iteration 0, inertia 7350.900000000001
start iteration
done sorting
end inner loop
Iteration 1, inertia 6784.888888888889
start iteration
done sorting
end inner loop
Iteration 2, inertia 6784.888888888889
center shift 0.000000e+00 within tolerance 8.816089e-03
Initialization complete
start iteration
done sorting
end inner loop
Iteration 0, inertia 7034.7
start iteration
done sorting
end inner loop
Iteration 1, inertia 7034.7
center shift 0.000000e+00 within tolerance 8.816089e-03
Initialization complete
start iteration
done sorting
end inner loop
Iteration 0, inertia 6539.464285714285
start iteration
done sorting
end inner loop
Iteration 1, inertia 6539.464285714285
center shift 0.000000e+00 within tolerance 8.816089e-03


KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=200,
       n_clusters=3, n_init=3, n_jobs=None, precompute_distances='auto',
       random_state=33, tol=0.0001, verbose=1)

In [26]:
# Assign each document to the nearest of the fitted cluster centers
clusters_sk = kmeans.predict(DT)

In [27]:
# Cluster label of each document, in document order
clusters_sk

array([0, 0, 0, 1, 0, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1])

In [28]:
# NOTE(review): `categories` holds strings and `clusters_sk` holds integer labels,
# so this element-wise comparison is never True and the selection is empty (as the
# output shows). `categories_sk` is not used by the visible cells that follow.
# TODO: confirm what was intended here (e.g. a category-vs-cluster cross-tabulation).
categories_sk = categories[categories == clusters_sk]
categories_sk

array([], dtype=object)

In [29]:
# Side-by-side view of each document's scikit-learn cluster and its known category
doc_clusters_sk = pd.DataFrame({'Cluster': clusters_sk, 'Category': categories})
doc_clusters_sk

Unnamed: 0,Cluster,Category
0,0,Databases
1,0,Databases
2,0,Databases
3,1,Databases
4,0,Databases
5,2,Regression
6,2,Regression
7,2,Regression
8,2,Regression
9,1,Regression


In [30]:
# Fitted cluster centers as a table: one row per cluster, one column per term
centroids_sk = pd.DataFrame(kmeans.cluster_centers_, columns=terms)
centroids_sk

Unnamed: 0,database,index,likelihood,linear,matrix,query,regression,retrieval,sql,vector
0,27.75,9.75,1.5,0.75,0.25,10.25,0.0,0.75,19.5,0.5
1,2.29,8.43,0.71,9.71,6.71,6.14,2.43,8.0,1.14,11.86
2,1.5,0.25,12.5,10.75,10.5,0.0,26.5,0.75,0.0,11.0


In [31]:
# Number of documents assigned to each scikit-learn cluster
c_size = cluster_sizes(DT, clusters_sk)

for label, count in c_size.items():
    print("Size of Cluster", label, "= ", count)

Size of Cluster 0 =  4
Size of Cluster 1 =  7
Size of Cluster 2 =  4


#### Let's again measure the completeness and homogeneity score.

In [32]:
# Completeness of the scikit-learn clustering; arguments are (true categories, predicted clusters)
print(completeness_score(categories,clusters_sk))

0.6854600718886354


In [33]:
# Homogeneity of the scikit-learn clustering; arguments are (true categories, predicted clusters)
print(homogeneity_score(categories,clusters_sk))

0.6617440906306454
