In [99]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.cluster import KMeans                # we'll be using scikit-learn's KMeans for this assignment
from sklearn.metrics import pairwise_distances
from sklearn.preprocessing import normalize

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [100]:
wiki = pd.read_csv(r"D:\dasci\github\datasets\coursera_ML_UW\4_Clustering_&_Retrieval\people_wiki.csv")

In [101]:
def load_sparse_csr(filename):
    loader = np.load(filename)
    data = loader['data']
    indices = loader['indices']
    indptr = loader['indptr']
    shape = loader['shape']
    
    return csr_matrix( (data, indices, indptr), shape)

In [102]:
word_count = load_sparse_csr(r"D:\dasci\github\datasets\coursera_ML_UW\4_Clustering_&_Retrieval\people_wiki_word_count.npz")

In [103]:
import json

with open(r"D:\dasci\github\datasets\coursera_ML_UW\4_Clustering_&_Retrieval\people_wiki_map_index_to_word.json") as jsn:
    map_index_to_word = json.load(jsn)

In [104]:
def unpack_dict(matrix, map_index_to_word):
    table = sorted(map_index_to_word, key=map_index_to_word.get)
    
    data = matrix.data
    indices = matrix.indices
    indptr = matrix.indptr
    
    num_doc = matrix.shape[0]

    return [{k:v for k,v in zip([table[word_id] for word_id in indices[indptr[i]:indptr[i+1]] ],
                                 data[indptr[i]:indptr[i+1]].tolist())} \
               for i in range(num_doc) ]

wiki['word_count'] = unpack_dict(word_count, map_index_to_word)

In [105]:
tf_idf = load_sparse_csr(r"D:\dasci\github\datasets\coursera_ML_UW\4_Clustering_&_Retrieval\people_wiki_tf_idf.npz")

In [106]:
tf_idf = normalize(tf_idf)

In [107]:
def bipartition(cluster, maxiter=400, num_runs=4, seed=None):
    '''cluster: should be a dictionary containing the following keys
                * dataframe: original dataframe
                * matrix:    same data, in matrix format
                * centroid:  centroid for this particular cluster'''
    
    data_matrix = cluster['matrix']
    dataframe   = cluster['dataframe']
    
    # Run k-means on the data matrix with k=2. We use scikit-learn here to simplify workflow.
    kmeans_model = KMeans(n_clusters=2, max_iter=maxiter, n_init=num_runs, random_state=seed, n_jobs=1)    
    kmeans_model.fit(data_matrix)
    centroids, cluster_assignment = kmeans_model.cluster_centers_, kmeans_model.labels_
    
    # Divide the data matrix into two parts using the cluster assignments.
    data_matrix_left_child, data_matrix_right_child = data_matrix[cluster_assignment==0], \
                                                      data_matrix[cluster_assignment==1]
    
    # Divide the dataframe into two parts, again using the cluster assignments.
    cluster_assignment_sa = np.array(cluster_assignment) # minor format conversion
    dataframe_left_child, dataframe_right_child     = dataframe[cluster_assignment_sa==0], \
                                                      dataframe[cluster_assignment_sa==1]
        
    
    # Package relevant variables for the child clusters
    cluster_left_child  = {'matrix': data_matrix_left_child,
                           'dataframe': dataframe_left_child,
                           'centroid': centroids[0]}
    cluster_right_child = {'matrix': data_matrix_right_child,
                           'dataframe': dataframe_right_child,
                           'centroid': centroids[1]}
    
    return (cluster_left_child, cluster_right_child)

In [108]:
wiki_data = {'matrix': tf_idf, 'dataframe': wiki} # no 'centroid' for the root cluster
left_child, right_child = bipartition(wiki_data, maxiter=100, num_runs=6, seed=1)

In [88]:
print (left_child)
print (right_child)

{'matrix': <11510x547979 sparse matrix of type '<class 'numpy.float64'>'
	with 1885831 stored elements in Compressed Sparse Row format>, 'dataframe':                                                      URI  \
0            <http://dbpedia.org/resource/Digby_Morrell>   
17     <http://dbpedia.org/resource/Paddy_Dunne_(Gael...   
21           <http://dbpedia.org/resource/Ceiron_Thomas>   
22            <http://dbpedia.org/resource/Adel_Sellimi>   
25             <http://dbpedia.org/resource/Vic_Stasiuk>   
28            <http://dbpedia.org/resource/Leon_Hapgood>   
30               <http://dbpedia.org/resource/Dom_Flora>   
33               <http://dbpedia.org/resource/Bob_Reece>   
41     <http://dbpedia.org/resource/Bob_Adams_(Americ...   
48              <http://dbpedia.org/resource/Marc_Logan>   
49          <http://dbpedia.org/resource/Corey_Woolfolk>   
63              <http://dbpedia.org/resource/Alan_Roper>   
75      <http://dbpedia.org/resource/Vladimir_Yurchenko>   
78        

In [89]:
map_index_to_word_inv = {v: k for k, v in map_index_to_word.items()}

In [95]:
def display_single_tf_idf_cluster(cluster, map_index_to_word):
    '''map_index_to_word: SFrame specifying the mapping betweeen words and column indices'''
    
    wiki_subset   = cluster['dataframe']
    tf_idf_subset = cluster['matrix']
    centroid      = cluster['centroid']
    
    ## Print top 5 words with largest TF-IDF weights in the cluster
    idx = centroid.argsort()[::-1]
    for i in range(5):
        print('{0:s}:{1:.3f}'.format(map_index_to_word_inv[idx[i]], centroid[idx[i]])),
    print('')
    
    # Compute distances from the centroid to all data points in the cluster.
    distances = pairwise_distances(tf_idf_subset, [centroid], metric='euclidean').flatten()
    # compute nearest neighbors of the centroid within the cluster.
    nearest_neighbors = distances.argsort()
    # For 8 nearest neighbors, print the title as well as first 180 characters of text.
    # Wrap the text at 80-character mark.
    for i in range(8):
        print(nearest_neighbors[i])
        text = ' '.join(wiki_subset[wiki_subset.index==nearest_neighbors[i]]['text'].values[0].split(None, 25)[0:25])
        print('* {0:50s} {1:.5f}\n  {2:s}\n  {3:s}'.format(wiki_subset[nearest_neighbors[i]]['name'],
              distances[nearest_neighbors[i]], text[:90], text[90:180] if len(text) > 90 else ''))
    print('')

In [96]:
display_single_tf_idf_cluster(left_child, map_index_to_word)
display_single_tf_idf_cluster(right_child, map_index_to_word)

league:0.040
season:0.036
team:0.029
football:0.029
played:0.028

6830


IndexError: index 0 is out of bounds for axis 0 with size 0

In [37]:
athletes = left_child
non_athletes = right_child

In [38]:
# Bipartition the cluster of athletes
left_child_athletes, right_child_athletes = bipartition(athletes, maxiter=100, num_runs=6, seed=1)

In [None]:
display_single_tf_idf_cluster(left_child_athletes, map_index_to_word)

In [None]:
display_single_tf_idf_cluster(right_child_athletes, map_index_to_word)

In [39]:
athletes = left_child
non_athletes = right_child

In [40]:
# Bipartition the cluster of athletes
left_child_athletes, right_child_athletes = bipartition(athletes, maxiter=100, num_runs=6, seed=1)

In [41]:
baseball            = left_child_athletes
ice_hockey_football = right_child_athletes

In [42]:
left_child_ihs, right_child_ihs = bipartition(ice_hockey_football, maxiter=100, num_runs=6, seed=1)

In [43]:
map_index_to_word

{'biennials': 522004,
 'lb915': 116290,
 'shatzky': 127647,
 'woode': 174106,
 'damfunk': 133206,
 'nualart': 153444,
 'hatefillot': 164111,
 'missionborn': 261765,
 'yeardescribed': 161075,
 'theoryhe': 521685,
 'vinalop': 222759,
 'soestdijk': 166345,
 'boncea': 150371,
 'spiders': 519990,
 'bienniale': 429277,
 'woody': 541515,
 'trawling': 189895,
 'pampoulovawagner': 201040,
 'bentara': 202586,
 'laserbased': 25758,
 'caner': 346073,
 'canes': 478262,
 'canet': 436468,
 'iaspark': 395341,
 'categoriesborn': 12586,
 '5982': 277649,
 'caney': 459867,
 'phosphorushe': 379479,
 'yusaf': 270311,
 'hhsoffice': 141697,
 '5985': 32985,
 'fsos': 109474,
 'caned': 324502,
 'gaa': 534680,
 'iguau': 456103,
 'storiesin': 513151,
 'braziljorge': 107111,
 'iguaz': 127410,
 'kealhofer': 342134,
 'canek': 159182,
 '2116': 429919,
 'canem': 83461,
 'victorialooking': 58169,
 'martre': 111954,
 'lippert': 491344,
 'pagesolove': 33220,
 'sowell': 489254,
 'weiskopfs': 344360,
 'hedquist': 165506,
 '