In [1]:
import pandas as pd
import numpy as np 
from collections import OrderedDict
pd.set_option('display.max_colwidth', 400)
import re
rs=10

from anytree import Node, RenderTree, PreOrderIter
from anytree.exporter import DotExporter, DictExporter

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, silhouette_samples

from tqdm import tqdm
import yake

# Transformed data

In [2]:
embedding_df = pd.read_csv('../data/transformed_data_10000.csv')

In [3]:
def find_best_n_for_kmeans(X, n1, n2):  

    df = X.copy()
    silhoutte_values_dict = {}
    neg_silhoutte_values_dict = {}
    
    for n in range(n1, n2):

        # create a model with given n no of clusters
        clusterer = KMeans(n_clusters=n, random_state=rs)
        cluster_labels = clusterer.fit_predict(df)

        # find avg sihoutte score for the model
        silhouette_avg = silhouette_score(df, cluster_labels)
        if(silhouette_avg >=0.1):
            silhoutte_values_dict[n] = silhouette_avg
        
    sorted_silhoutte_dict = dict(sorted(silhoutte_values_dict.items(), key=lambda item: item[1]))

    sorted_silhoutte_dict_list = list(sorted_silhoutte_dict)

    sil_list_len = len(sorted_silhoutte_dict_list)
    if (sil_list_len >=2):
        two_useful_clusters =  [sorted_silhoutte_dict_list[-1], sorted_silhoutte_dict_list[-2]]
    elif(sil_list_len ==1):
        two_useful_clusters =  [sorted_silhoutte_dict_list[-1]]
    elif(sil_list_len == 0):
        return 0
    
    for n in two_useful_clusters:
        clusterer = KMeans(n_clusters=n, random_state=rs)
        cluster_labels = clusterer.fit_predict(df)

        sample_silhouette_values = silhouette_samples(df, cluster_labels)
        neg_values_list = []
        neg_val_count = 0
        for i in range(n):
            ith_cluster_silhouette_values = sample_silhouette_values[cluster_labels == i]
            ith_cluster_silhouette_values = np.sort(ith_cluster_silhouette_values)
            
            for j in ith_cluster_silhouette_values:
                if j < 0:
                    neg_val_count = neg_val_count +1
                    neg_values_list.append(j)
        neg_silhoutte_values_dict[n] = np.mean(neg_values_list)  

    n_clusters = min(neg_silhoutte_values_dict, key=neg_silhoutte_values_dict.get)
    return n_clusters
    

In [4]:
def clustering(data, n1, n2, min_samples_per_cluster, p):
    
    temp_data = data.copy()
    print(temp_data.shape)
    global final_df
    
    n = find_best_n_for_kmeans(temp_data, n1, n2)
    print("best cluster choosen:", n)
    
    if(n!=0):
        clusterer = KMeans(n_clusters=n, random_state=rs)
        cluster_index_list = clusterer.fit_predict(temp_data)
        
        temp_data['cluster_index'] = cluster_index_list
        cluster_grouped_df = temp_data.groupby(['cluster_index'])
    
        for c in range(n):
            cluster_group = cluster_grouped_df.get_group(c)
            
            new_parent = Node(cluster_group.index, parent=p)
            if(cluster_group.shape[0] >= min_samples_per_cluster):
                 clustering(cluster_group, n1, n2, min_samples_per_cluster, new_parent)
            else:
                print("Cheers got a cluster because of size", cluster_group.shape[0]) 
                temp = {'text_id' : cluster_group.index}
                final_df = final_df.append(temp, ignore_index=True)
    else:
        print("clustering done............")

In [5]:
final_df = pd.DataFrame(columns = ['text_id'])
root = Node(embedding_df.index)

In [6]:
clustering(embedding_df, 2, 10, 300, root)

(10000, 500)
best cluster choosen: 2
(8762, 501)
best cluster choosen: 3
(1563, 501)
best cluster choosen: 7
Cheers got a cluster because of size 42
Cheers got a cluster because of size 181
(1074, 501)
best cluster choosen: 8
Cheers got a cluster because of size 149
Cheers got a cluster because of size 145
Cheers got a cluster because of size 27
Cheers got a cluster because of size 70
Cheers got a cluster because of size 63
Cheers got a cluster because of size 269
Cheers got a cluster because of size 112
Cheers got a cluster because of size 239
Cheers got a cluster because of size 13
Cheers got a cluster because of size 126
Cheers got a cluster because of size 26
Cheers got a cluster because of size 101
(5165, 501)
best cluster choosen: 4
(1842, 501)
best cluster choosen: 9
Cheers got a cluster because of size 158
Cheers got a cluster because of size 268
Cheers got a cluster because of size 78
Cheers got a cluster because of size 213
Cheers got a cluster because of size 267
Cheers got 

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


best cluster choosen: 4
Cheers got a cluster because of size 21
Cheers got a cluster because of size 53
(453, 501)


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


best cluster choosen: 2
(409, 501)
best cluster choosen: 2
(384, 501)
best cluster choosen: 9
Cheers got a cluster because of size 59
Cheers got a cluster because of size 15
Cheers got a cluster because of size 102
Cheers got a cluster because of size 32
Cheers got a cluster because of size 61
Cheers got a cluster because of size 7
Cheers got a cluster because of size 71
Cheers got a cluster because of size 12
Cheers got a cluster because of size 25
Cheers got a cluster because of size 25
Cheers got a cluster because of size 44
Cheers got a cluster because of size 21
Cheers got a cluster because of size 70
Cheers got a cluster because of size 160
(636, 501)
best cluster choosen: 7
Cheers got a cluster because of size 151
Cheers got a cluster because of size 46
Cheers got a cluster because of size 22
Cheers got a cluster because of size 66
Cheers got a cluster because of size 193
Cheers got a cluster because of size 98
Cheers got a cluster because of size 60
(388, 501)


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


best cluster choosen: 9
Cheers got a cluster because of size 18
Cheers got a cluster because of size 93
Cheers got a cluster because of size 67
Cheers got a cluster because of size 60
Cheers got a cluster because of size 40
Cheers got a cluster because of size 22
Cheers got a cluster because of size 51
Cheers got a cluster because of size 17
Cheers got a cluster because of size 20
(2299, 501)
best cluster choosen: 9
(411, 501)
best cluster choosen: 8
Cheers got a cluster because of size 90
Cheers got a cluster because of size 83
Cheers got a cluster because of size 7
Cheers got a cluster because of size 37
Cheers got a cluster because of size 26
Cheers got a cluster because of size 128
Cheers got a cluster because of size 31
Cheers got a cluster because of size 9
Cheers got a cluster because of size 130
Cheers got a cluster because of size 123
Cheers got a cluster because of size 200
Cheers got a cluster because of size 183
Cheers got a cluster because of size 147
Cheers got a cluster 

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


best cluster choosen: 2
(2022, 501)


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


best cluster choosen: 2
(2008, 501)
best cluster choosen: 2
Cheers got a cluster because of size 223
(1785, 501)
best cluster choosen: 9
(349, 501)
best cluster choosen: 8
Cheers got a cluster because of size 43
Cheers got a cluster because of size 29
Cheers got a cluster because of size 74
Cheers got a cluster because of size 21
Cheers got a cluster because of size 80
Cheers got a cluster because of size 21
Cheers got a cluster because of size 27
Cheers got a cluster because of size 54
Cheers got a cluster because of size 90
(379, 501)
best cluster choosen: 9
Cheers got a cluster because of size 13
Cheers got a cluster because of size 59
Cheers got a cluster because of size 70
Cheers got a cluster because of size 116
Cheers got a cluster because of size 9
Cheers got a cluster because of size 8
Cheers got a cluster because of size 30
Cheers got a cluster because of size 7
Cheers got a cluster because of size 67
Cheers got a cluster because of size 134
(333, 501)
best cluster choosen: 8

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


best cluster choosen: 8
(1053, 501)


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


best cluster choosen: 6
(943, 501)
best cluster choosen: 9
Cheers got a cluster because of size 133
Cheers got a cluster because of size 172
Cheers got a cluster because of size 142
Cheers got a cluster because of size 16
Cheers got a cluster because of size 53
Cheers got a cluster because of size 282
Cheers got a cluster because of size 113
Cheers got a cluster because of size 16
Cheers got a cluster because of size 16
Cheers got a cluster because of size 32
Cheers got a cluster because of size 17
Cheers got a cluster because of size 37
Cheers got a cluster because of size 13
Cheers got a cluster because of size 11
Cheers got a cluster because of size 23
Cheers got a cluster because of size 21
Cheers got a cluster because of size 15
Cheers got a cluster because of size 53
Cheers got a cluster because of size 22
Cheers got a cluster because of size 16
Cheers got a cluster because of size 35


In [7]:
final_df.shape

(134, 1)

In [8]:
final_df.sample(5)

Unnamed: 0,text_id
31,"Int64Index([ 522, 817, 1308, 1329, 1775, 1826, 2370, 2488, 2520, 3054, 3994,  4188, 4826, 5441, 5714, 5728, 5792, 6256, 6923, 7837, 7868, 8261,  8677, 9275, 9666],  dtype='int64')"
6,"Int64Index([ 452, 660, 678, 715, 766, 894, 1048, 1708, 1806, 1926, 2321,  2393, 2677, 2755, 2949, 3139, 3147, 3357, 3446, 3496, 3559, 3661,  3895, 4569, 4881, 5086, 5302, 5360, 5455, 5532, 5704, 5715, 5743,  5924, 5929, 6132, 6143, 6196, 6241, 6269, 6425, 6479, 6586, 6799,  7239, 7289, 7402, 7406, 7808, 7929, 7959, 8186, 8238, 8568, 8780,  ..."
7,"Int64Index([ 45, 68, 88, 116, 179, 183, 211, 214, 240, 262,  ...  9638, 9654, 9701, 9702, 9704, 9727, 9739, 9768, 9787, 9820],  dtype='int64', length=269)"
85,"Int64Index([ 128, 831, 1247, 2206, 2493, 2519, 2896, 2948, 3008, 5083, 6659,  7026, 7387, 7677, 7712, 7961, 8388, 8529, 8597, 9858, 9888],  dtype='int64')"
102,"Int64Index([2309, 2834, 2876, 3021, 3812, 3947, 4368, 4738, 4819, 5829, 5855,  6388, 6543, 6719, 6970, 6984, 7069, 7418, 7453, 7980, 8254, 8727,  9272, 9301, 9328, 9377, 9877],  dtype='int64')"


In [9]:
final_df.to_csv('../data/sample_1000_clusters.csv', index=False)

In [2]:
import pickle
def save_obj(obj, name):
    with open('../data/'+ name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(name ):
    with open('../data/' + name + '.pkl', 'rb') as f:
        return pickle.load(f)

In [15]:
save_obj(root, "root")

In [5]:
def print_tree(root):
    for pre, fill, node in RenderTree(root):
        print("%s%s" % (pre, node.name))

In [7]:
# print_tree(root)

In [3]:
tree_saved = load_obj("root")

In [8]:
print_tree(tree_saved)

RangeIndex(start=0, stop=10000, step=1)
├── Int64Index([   0,    1,    2,    3,    4,    5,    6,    8,    9,   10,
            ...
            9989, 9990, 9991, 9992, 9993, 9995, 9996, 9997, 9998, 9999],
           dtype='int64', length=8762)
│   ├── Int64Index([   6,   26,   44,   45,   59,   61,   63,   68,   74,   81,
            ...
            9914, 9927, 9937, 9948, 9966, 9970, 9973, 9976, 9989, 9991],
           dtype='int64', length=1563)
│   │   ├── Int64Index([ 123,  249,  303,  389, 1225, 1682, 2445, 2745, 2775, 2786, 3105,
            3436, 3467, 3847, 4279, 4494, 4523, 4568, 4688, 4748, 4823, 4848,
            5216, 5324, 5365, 5377, 5814, 6013, 6213, 6891, 7041, 7610, 7892,
            8069, 8096, 8302, 8809, 9052, 9258, 9296, 9420, 9690],
           dtype='int64')
│   │   ├── Int64Index([  26,   59,   81,  114,  169,  227,  228,  457,  496,  574,
            ...
            9379, 9444, 9471, 9518, 9519, 9620, 9669, 9758, 9864, 9976],
           dtype='int64', length=181