In [1]:
import nltk
from nltk.corpus import reuters
import re
import numpy as np
import pandas as pd
from gensim.models import TfidfModel
from gensim.corpora import Dictionary
import string
from nltk.corpus import stopwords
nltk.download('stopwords')
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.porter import PorterStemmer
from sklearn.cluster import AgglomerativeClustering
import matplotlib.pyplot as plt
from sklearn.manifold import MDS
import sklearn
from scipy.cluster.hierarchy import dendrogram, linkage, ward, fcluster
import networkx as nx
import collections
import math
import operator
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from sklearn.decomposition import PCA
from kneed import KneeLocator
from sklearn.manifold import TSNE
import scipy.spatial.distance

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Gimli\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Hierarchical Clustering

### Tf-Idf Matrix

In [4]:
def tf_idf(df):
    """Build a TF-IDF document-term matrix from the ``text`` column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``text`` column holding one document per row.

    Returns
    -------
    m : sparse matrix
        TF-IDF weights, one row per document and one column per term.
    feature_names : list of str
        The term for each column of ``m``, in column order.
    """
    # Word-level tokens, with scikit-learn's built-in English stop words removed.
    vectorizer = TfidfVectorizer(stop_words='english', analyzer='word')
    m = vectorizer.fit_transform(df['text'])

    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only on old releases. Wrap in list() so the
    # return type stays a plain list either way.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = list(vectorizer.get_feature_names_out())
    else:
        feature_names = vectorizer.get_feature_names()

    return m, feature_names

### Distance Matrix

In [217]:
def dist_calculate(m):
    """Compute pairwise Euclidean distances between the rows of ``m``.

    Parameters
    ----------
    m : array-like or sparse matrix
        One row per document. Sparse input is densified first because
        ``scipy.spatial.distance.pdist`` only accepts dense arrays.

    Returns
    -------
    dist : numpy.ndarray
        Square, symmetric distance matrix with a zero diagonal.
    flat_dist : numpy.ndarray
        The same distances in condensed (upper-triangle) form, which is what
        ``scipy.cluster.hierarchy.linkage`` expects.
    """
    # Euclidean distance is used so that cluster centroids (mean vectors) are
    # meaningful; TF-IDF rows are length-normalised, which limits the effect of
    # differing document lengths.
    # TODO: confirm with the professor that Euclidean is acceptable here.
    dense = m.toarray() if hasattr(m, 'toarray') else np.asarray(m)

    flat_dist = scipy.spatial.distance.pdist(dense, 'euclidean')
    # Derive the square form from the condensed vector so both outputs are
    # guaranteed to hold identical values.
    dist = scipy.spatial.distance.squareform(flat_dist)
    return dist, flat_dist

### PCA Dimensionality Reduction

In [6]:
def pca_reduce(m, n_components=0.8):
    """Project the rows of ``m`` onto their leading principal components.

    Parameters
    ----------
    m : array-like or sparse matrix
        One row per document. Sparse input is densified before fitting.
    n_components : float or int, optional
        Passed straight to ``sklearn.decomposition.PCA``. The default of 0.8
        asks PCA to keep enough components to explain 80% of the variance.

    Returns
    -------
    numpy.ndarray
        The reduced representation, one row per input row.
    """
    # Accept both the sparse TF-IDF matrix and an already-dense array.
    dense = m.toarray() if hasattr(m, 'toarray') else np.asarray(m)
    pca = PCA(n_components=n_components)
    return pca.fit_transform(dense)

### Linkage Matrix

In [7]:
def linkage_calculate(dist):
    """Return the Ward-method hierarchical linkage matrix for ``dist``.

    ``dist`` is handed unchanged to ``scipy.cluster.hierarchy.linkage``; the
    caller in this notebook supplies the condensed distance vector.
    """
    return linkage(dist, method='ward')
    
# plot dendrogram
#fig, ax = plt.subplots(figsize=(15, 20))
#ax = dendrogram(linkage_matrix, orientation="right", labels = df_retail.ids.unique())

In [8]:
def frame_merge(df, f):
    """Attach flat cluster labels to the documents in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per document, in the same order as the rows that were
        clustered.
    f : array-like of int
        Cluster label per document (e.g. the output of ``fcluster``), aligned
        positionally with the rows of ``df``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with a fresh 0..n-1 index, an ``index_search`` column
        holding the original index of ``df``, and a ``cluster`` column holding
        the labels.

    Raises
    ------
    ValueError
        If ``f`` does not contain exactly one label per row of ``df``.
    """
    labels = np.asarray(f).ravel()
    if len(labels) != len(df):
        raise ValueError(
            'expected one cluster label per row: got %d labels for %d rows'
            % (len(labels), len(df))
        )

    # Assign positionally rather than routing the labels through a one-level
    # MultiIndex and recovering them with ``.str[0]``, which depends on
    # pandas-version-specific index behaviour.
    frameh = df.copy()
    frameh.insert(0, 'index_search', df.index)
    frameh['cluster'] = labels

    # Downstream code uses the row position as an index into the feature
    # matrix, so the result must carry a clean positional index.
    return frameh.reset_index(drop=True)

In [238]:
def centroid_label(frameh, m_pca, m, feature_names, search, n_top=6):
    """Compute a centroid and a list of descriptive terms for every cluster.

    Parameters
    ----------
    frameh : pandas.DataFrame
        Must have a ``cluster`` column; its index values are used as row
        positions into ``m_pca`` and ``m``.
    m_pca : array-like or sparse matrix
        Feature matrix used for the centroids (the reduced matrix when PCA is
        in use).
    m : array-like or sparse matrix
        Unreduced TF-IDF matrix whose columns correspond to ``feature_names``.
    feature_names : sequence of str
        Term for each column of ``m``.
    search : str
        Term to leave out of the labels (the query that selected the documents).
    n_top : int, optional
        Number of highest-weighted terms inspected per cluster. ``search`` is
        dropped from that set, so a cluster gets ``n_top`` or ``n_top - 1``
        labels.

    Returns
    -------
    labels : list of list of str
        One list of terms per cluster, in ``frameh.cluster.unique()`` order.
    centroid : dict
        Maps each cluster id to the mean ``m_pca`` row of its documents.
    """
    centroid = dict()
    labels = []
    for c in frameh.cluster.unique():
        # Row positions of the documents assigned to this cluster.
        rows = list(frameh[frameh.cluster == c].index.unique())

        # Centroid: mean vector in the (possibly reduced) feature space.
        centroid[c] = m_pca[rows, :].mean(axis=0)

        # Labels come from the unreduced TF-IDF weights so that columns still
        # map to words. ravel() flattens both the 1xN np.matrix produced by a
        # sparse mean and the 1-D array produced by a dense mean.
        mean_weights = np.asarray(m[rows, :].mean(axis=0)).ravel()
        top = np.argsort(mean_weights)[::-1][:n_top]

        # Ranked by mean TF-IDF weight, not raw frequency; skip the search term.
        labels.append([feature_names[i] for i in top if feature_names[i] != search])

    return labels, centroid

In [76]:
def silhouette_individ(frameh, m):
    """Per-document mean distances used by the silhouette-style score.

    For every document, computes

    * ``sil_a``: the mean Euclidean distance to the other documents in its own
      cluster, and
    * ``sil_b``: the mean Euclidean distance to all documents outside its
      cluster.

    Both are returned as ``{cluster_id: {row_position: mean_distance}}``. A
    document with nothing to compare against (a singleton cluster for
    ``sil_a``, or no other cluster for ``sil_b``) gets ``nan``.

    ``m`` may be a plain ``numpy.ndarray`` (e.g. PCA output) or an object whose
    rows expose ``.toarray()`` (the sparse TF-IDF matrix).
    """
    is_plain_array = type(m) == np.ndarray

    def _gap(p, q):
        # Euclidean distance between rows p and q of m.
        if is_plain_array:
            return np.linalg.norm(m[p] - m[q])
        return np.linalg.norm(m[p].toarray() - m[q].toarray())

    cluster_ids = list(frameh.cluster.unique())

    # Within-cluster mean distance for each document.
    sil_a = dict()
    for cid in cluster_ids:
        members = list(frameh[frameh.cluster == cid].index.unique())
        sil_a[cid] = {
            p: np.mean([_gap(p, q) for q in members if p != q])
            for p in members
        }

    # Mean distance from each document to every document in other clusters.
    sil_b = dict()
    for cid in cluster_ids:
        inside = list(frameh[frameh.cluster == cid].index.unique())
        outside = list(frameh[frameh.cluster != cid].index.unique())
        sil_b[cid] = {
            p: np.mean([_gap(p, q) for q in outside])
            for p in inside
        }

    return sil_a, sil_b

In [43]:
def silhouette_take_avg(sil):
    """Average every per-document value in a nested ``{cluster: {doc: value}}`` dict.

    ``nan`` entries are counted as 0 before averaging.
    """
    flat = []
    for per_doc in sil.values():
        for score in per_doc.values():
            flat.append(0 if math.isnan(score) else score)

    return np.mean(flat)

In [44]:
def silhouette_avg(frameh, m):
    """Return a single silhouette-style score for the clustering in ``frameh``.

    Computes ``(b - a) / max(b, a)`` where ``a`` is the overall mean
    within-cluster distance and ``b`` the overall mean out-of-cluster distance,
    both taken over all documents.

    NOTE(review): this applies the formula to two global averages rather than
    per document, so the value is not the standard silhouette coefficient.
    """
    within, between = silhouette_individ(frameh, m)
    mean_within = silhouette_take_avg(within)
    mean_between = silhouette_take_avg(between)

    separation = mean_between - mean_within
    return separation / max(mean_between, mean_within)

In [237]:
def clusters(k, linkage_matrix, m_pca, m, df, feature_names, search):
    """Cut the hierarchy into at most ``k`` flat clusters and summarise them.

    Returns ``(frameh, labels, centroid)``: the documents with their cluster
    assignment, the per-cluster label terms, and the per-cluster centroids.
    """
    # 'maxclust' asks fcluster for no more than k flat clusters.
    flat_labels = fcluster(linkage_matrix, k, criterion='maxclust')
    frameh = frame_merge(df, flat_labels)

    labels, centroid = centroid_label(frameh, m_pca, m, feature_names, search)
    return frameh, labels, centroid

In [164]:
def distortion_calculate(m, centroid, frameh):
    """Sum the Euclidean distance from every document to its cluster centroid.

    Parameters
    ----------
    m : array-like or sparse matrix
        Feature matrix, indexed by the index values of ``frameh``.
    centroid : dict
        Maps cluster id to that cluster's centroid vector.
    frameh : pandas.DataFrame
        Must have a ``cluster`` column; the index gives each document's row in
        ``m``.

    Returns
    -------
    float
        Sum of (unsquared) point-to-centroid distances; 0 for an empty frame.
    """
    sumd = 0
    # Walk index and label together: one pass, and no int() on a one-element
    # Series (deprecated in recent pandas).
    for i, c in zip(frameh.index, frameh.cluster):
        sumd += np.linalg.norm(m[i] - centroid[int(c)])

    return sumd

In [236]:
def distortion_silhouette(linkage_matrix, m_pca, m, df, feature_names, search):
    """Evaluate distortion and silhouette score over a range of cluster counts.

    Candidate values of k run from 2 up to, but not including,
    ``min(floor(len(df) / 3), 10)``.

    Returns ``(distortion, silhouette)``, two dicts keyed by k. The distortion
    stored for each k is the total returned by ``distortion_calculate`` (a sum,
    not an average).
    """
    distortion = dict()
    silhouette = dict()

    # Exclusive upper bound on k: a third of the documents, capped at 10.
    k_stop = min(math.floor(len(df) / 3), 10)

    for k in range(2, k_stop):
        frameh, _labels, centroid = clusters(k, linkage_matrix, m_pca, m, df, feature_names, search)

        # Both metrics are measured in the m_pca feature space.
        silhouette[k] = silhouette_avg(frameh, m_pca)
        distortion[k] = distortion_calculate(m_pca, centroid, frameh)

    return distortion, silhouette

In [15]:
def distortion_roc(distortion):
    """Relative rate of change of distortion between consecutive values of k.

    For every key ``k`` whose successor ``k + 1`` is also present, yields
    ``|distortion[k + 1] - distortion[k]| / distortion[k]``. Results follow the
    dict's iteration order.
    """
    return [
        abs(distortion[k + 1] - distortion[k]) / distortion[k]
        for k in distortion
        if k + 1 in distortion
    ]

In [16]:
def find_k(roc):
    """Choose the number of clusters from the knee of the rate-of-change curve.

    Parameters
    ----------
    roc : sequence of float
        Relative changes in distortion, as produced by ``distortion_roc``.

    Returns
    -------
    int
        The knee's position in ``roc`` plus one.

    Raises
    ------
    ValueError
        If ``KneeLocator`` cannot find a knee in ``roc``.
    """
    kn = KneeLocator(range(len(roc)), roc, curve='convex', direction='decreasing')

    # KneeLocator reports None when no knee exists; fail with a clear message
    # instead of a TypeError from ``None + 1``.
    if kn.knee is None:
        raise ValueError('no knee found in the distortion rate-of-change curve; cannot choose k')

    # The knee is a 0-based position in roc, hence the +1.
    # NOTE(review): the candidate k values start at 2, so roc[0] describes the
    # change from k=2 to k=3; confirm that +1 (rather than +2) is intended.
    k = kn.knee + 1

    return k

In [18]:
# plot distortion
#fig, ax = plt.subplots()

#distortion = sorted(distortion.items()) # sorted by key, return a list of tuples
#x, y = zip(*distortion) # unpack a list of pairs into two tuples
#ax.plot(x,y)
#ax.axvline(k, color = 'black')

In [234]:
def main(reduce, searches=('sugar', 'gold'), data_path='reuters_processed'):
    """Cluster the documents of each search category and summarise the result.

    Parameters
    ----------
    reduce : bool
        If true, cluster on the PCA-reduced TF-IDF matrix; cluster labels are
        still derived from the unreduced matrix. If false, the unreduced
        matrix is used throughout.
    searches : iterable of str, optional
        Category names to process; each selects the rows of the data whose
        ``categories`` contain that name.
    data_path : str, optional
        Path of the pickled DataFrame to load.

    Returns
    -------
    distortion_dict, silhouette_dict, k_dict, labels_dict : dict
        Final distortion, silhouette score, chosen k and cluster labels, each
        keyed by search term.
    df_final : pandas.DataFrame
        The clustered documents of all searches stacked together, with a
        ``search`` column identifying the source category.
    """
    # SECURITY: unpickling can execute arbitrary code; only load trusted files.
    df = pd.read_pickle(data_path)

    distortion_dict = dict()
    silhouette_dict = dict()
    k_dict = dict()
    labels_dict = dict()

    # Collected per search and concatenated once at the end
    # (DataFrame.append was removed in pandas 2.0).
    frames = []

    for search in searches:
        df_subset = df[df.categories.map(set([search]).issubset)]
        df_subset = df_subset.reset_index()

        print(search)

        # TF-IDF matrix
        tfidf_full, feature_names = tf_idf(df_subset)

        # Features used for distances, centroids and metrics: PCA-reduced when
        # requested, otherwise the TF-IDF matrix itself.
        features = pca_reduce(tfidf_full) if reduce else tfidf_full

        # distances (only the condensed form is needed for linkage)
        _, dist_flat = dist_calculate(features)

        # linkage matrix
        linkage_matrix = linkage_calculate(dist_flat)

        # find K: labels always come from the unreduced TF-IDF matrix
        distortion_lst, silhouette_lst = distortion_silhouette(
            linkage_matrix, features, tfidf_full, df_subset, feature_names, search)
        roc = distortion_roc(distortion_lst)
        k = find_k(roc)

        # final flat clusters
        frameh, labels, centroid = clusters(
            k, linkage_matrix, features, tfidf_full, df_subset, feature_names, search)
        distortion = distortion_calculate(features, centroid, frameh)
        silhouette = silhouette_avg(frameh, features)

        distortion_dict[search] = distortion
        silhouette_dict[search] = silhouette
        k_dict[search] = k
        labels_dict[search] = labels

        frameh['search'] = search
        frames.append(frameh)

    df_final = pd.concat(frames) if frames else pd.DataFrame()

    return distortion_dict, silhouette_dict, k_dict, labels_dict, df_final

In [239]:
# Run the full pipeline with PCA reduction enabled and keep the per-search results.
distortion_dict, silhouette_dict, k_dict, labels_dict, df_final = main(reduce = True)

sugar
gold


In [None]:
### TO DO: visualization, hierarchies 

In [240]:
# Show the label terms chosen for each cluster, keyed by search term.
labels_dict


{'sugar': [['production', 'beet', 'imports', 'output', 'cane'],
  ['ecus', 'traders', 'export', 'licences', 'ec', 'rebate'],
  ['cargoes', 'white', 'traders', 'tender', 'india'],
  ['intervention', 'ec', 'rebate', 'ecus', 'bd']],
 'gold': [['reserves', 'ore', 'production', 'dollars', 'company'],
  ['coins', 'warrants', 'price', 'market', 'coin']]}