In [1]:
import nltk
from nltk.corpus import reuters
import re
import numpy as np
import pandas as pd
from gensim.models import TfidfModel
from gensim.corpora import Dictionary
import string
from nltk.corpus import stopwords
nltk.download('stopwords')
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.porter import PorterStemmer
from sklearn.cluster import AgglomerativeClustering
import matplotlib.pyplot as plt
from sklearn.manifold import MDS
import sklearn
from scipy.cluster.hierarchy import dendrogram, linkage, ward, fcluster
import networkx as nx
import collections
import math
import operator
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from sklearn.decomposition import PCA
from kneed import KneeLocator
from sklearn.manifold import TSNE

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Gimli\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Hierarchial Clustering

In [2]:
df = pd.read_pickle('reuters_processed')

In [3]:
df.categories

0                                                  [trade]
1                                                  [grain]
2                                         [crude, nat-gas]
3           [corn, grain, rice, rubber, sugar, tin, trade]
4                                      [palm-oil, veg-oil]
5                                                   [ship]
6              [coffee, lumber, palm-oil, rubber, veg-oil]
7                                           [grain, wheat]
8                                                   [gold]
9                                                    [acq]
10                                                   [tin]
11                                    [interest, money-fx]
12                                           [acq, copper]
13                                                   [ipi]
14       [carcass, corn, grain, livestock, oilseed, ric...
15                                                  [earn]
16                                                  [ear

### Tf-Idf Matrix

In [4]:
def tf_idf(df):
    # tfidf. stop word removal. word tokenizer. 
    tfidf = TfidfVectorizer(stop_words = 'english', analyzer = 'word')
    m = tfidf.fit_transform(df['text'])
    
    feature_names = tfidf.get_feature_names() # words 

    return m, feature_names

### Distance Matrix

In [5]:
def dist_calculate(m):
    dist = euclidean_distances(m)  ## I think its ok to use euclidean because tf-idf normalizes
    # euclidean can be innaccurate if documents are different lengths such that vectors are different lengths 
    # I would prefer to use euclidean because then more sensicl to calculate centroids
    # ask professor?? 
    return dist

### PCA Dimensionality Reduction

In [6]:
def pca_reduce(m):
    pca = PCA(n_components = 0.8) # keep 95% of variance 
    pcam = pca.fit_transform(m.toarray())

    return pcam

### Linkage Matrix

In [7]:
def linkage_calculate(dist):
    linkage_matrix = linkage(dist, method = 'ward') 
    return linkage_matrix
    
# plot dendogram
#fig, ax = plt.subplots(figsize=(15, 20))
#ax = dendrogram(linkage_matrix, orientation="right", labels = df_retail.ids.unique())

In [8]:
def frame_merge(df, f):
    # merge in with original data via pandas
    frameh = pd.DataFrame(df.index, index = [f], columns = ['index_retail'])
    frameh = pd.merge(frameh, df, right_index = True, left_on = 'index_retail')
    frameh['cluster'] = frameh.index.str[0]
    frameh = frameh.reset_index()
    return frameh

In [135]:
def centroid_label(frameh, m_pca, m, feature_names):
    # most common words in clusters (based on tf-idf not just frequency)
    centroid = dict()
    labels = []
    for c in list(frameh.cluster.unique()):
        ## centroid ## 
        cluster1 = list(frameh[frameh.cluster == c].index.unique())
        # find documents cluster
        m1_pca = m_pca[cluster1,:]
        # take mean vector among all documents
        m1_pca = m1_pca.mean(axis = 0)
        # record mean vector: centroids of each sub cluster
        centroid[c] = m1_pca

        ## labels ##
        # redo mean vector with non-reduced tfidf matrix 
        m1 = m[cluster1,:]
        # take mean vector among all documents
        m1 = m1.mean(axis = 0)
        
        # max values in mean vector 
        lst = []

        for i in np.argsort(np.asarray(m1)[0])[::-1][:6]:
            lst.append(feature_names[i])

        labels.append(lst)
        
    return labels, centroid

In [76]:
def silhouette_individ(frameh, m):
    sil_a = dict()
    for c in list(frameh.cluster.unique()):
        sil_a[c] = dict()
        docs_i = list(frameh[frameh.cluster == c].index.unique())
        for i in docs_i:
            lst = []
            for j in docs_i: 
                if i != j:
                    if type(m) == np.ndarray: # if pca reduced, then ndarray instead of matrix
                        lst.append(np.linalg.norm(m[i]-m[j]))
                    else:
                        lst.append(np.linalg.norm(m[i].toarray()-m[j].toarray()))
            sil_a[c][i] = np.mean(lst)

    sil_b = dict()
    for c in list(frameh.cluster.unique()):
        sil_b[c] = dict()
        docs_in = list(frameh[frameh.cluster == c].index.unique())
        docs_out = list(frameh[frameh.cluster != c].index.unique())
        for i in docs_in:
            lst = []
            for j in docs_out: 
                if type(m) == np.ndarray:
                    lst.append(np.linalg.norm(m[i]-m[j]))
                else:
                    lst.append(np.linalg.norm(m[i].toarray()-m[j].toarray()))
            sil_b[c][i] = np.mean(lst)
            
    return sil_a, sil_b

In [43]:
def silhouette_take_avg(sil):
    avg = []
    for v in sil.values():
        avg.append(list(v.values()))
    avg = [item for sublist in avg for item in sublist]
    avg = [0 if math.isnan(i) else i for i in avg]
    avg = np.mean(avg)
    
    return avg

In [44]:
def silhouette_avg(frameh, m):
    sil_a, sil_b = silhouette_individ(frameh, m)
    avga = silhouette_take_avg(sil_a)
    avgb = silhouette_take_avg(sil_b)

    return (avgb - avga) / max(avgb, avga)

In [136]:
def clusters(k, linkage_matrix, m_pca, m, df, feature_names):
    f = fcluster(linkage_matrix, k, criterion = 'maxclust')
    frameh = frame_merge(df, f)
    labels, centroid = centroid_label(frameh, m_pca, m, feature_names)

    return frameh, labels, centroid

In [155]:
def distortion_calculate(linkage_matrix, m_pca, m, df, feature_names):
    # distortion - sum of squared errors between points and its centroid 
    # barely varies with different cluster numbers
    distortion = dict()
    silhouette = dict()

    for k in range(2, min(math.floor(len(df) / 3), 10)): 
        # max # clusters: 1/3 of documents as long as get on average 10 docs per. Else limit to 1/2 of documents. 
        # min # clusters: 2 
        frameh, labels, centroid = clusters(k, linkage_matrix, m_pca, m, df, feature_names)

        # calculate silhouette 
        silhouette[k] = silhouette_avg(frameh, m_pca)

        # calculate distortion
        sumd = 0
        for i in list(frameh.index.unique()):
            c = int(frameh[frameh.index == i].cluster)
            sumd += np.linalg.norm(m_pca[i]-centroid[c])

        # take average 
        distortion[k] = sumd
        
    return distortion, silhouette

In [15]:
def distortion_roc(distortion):
    # relative rate of change 
    roc = []
    for k,v in distortion.items(): 
        if k+1 in distortion:
            roc.append(abs(distortion[k+1] - distortion[k]) / distortion[k])
            
    return roc

In [16]:
def find_k(roc):
    # find k using knee method 
    from kneed import KneeLocator
    kn = KneeLocator(range(len(roc)), roc, curve='convex', direction='decreasing')
    k = kn.knee + 1 # index started at 0 
    
    return k 

In [18]:
# plot distortion
#fig, ax = plt.subplots()

#distortion = sorted(distortion.items()) # sorted by key, return a list of tuples
#x, y = zip(*distortion) # unpack a list of pairs into two tuples
#ax.plot(x,y)
#ax.axvline(k, color = 'black')

In [49]:
tfidf

array([[-0.12725811, -0.11379691, -0.03273929, ..., -0.02050605,
        -0.02364101,  0.01509296],
       [-0.12587341, -0.10832697, -0.02281295, ..., -0.02287693,
        -0.00580824,  0.01343096],
       [-0.1865698 , -0.09650942, -0.03255715, ..., -0.00093077,
        -0.03636289,  0.05258835],
       ...,
       [ 0.29685376,  0.02391125,  0.09261609, ...,  0.0069483 ,
         0.00323947, -0.00942778],
       [ 0.29685376,  0.02391125,  0.09261609, ...,  0.0069483 ,
         0.00323947, -0.00942778],
       [-0.06567059,  0.03205485, -0.12472584, ..., -0.01930781,
         0.02698276, -0.01353801]])

In [149]:
def main(reduce):
    for categories in ['ship']: #'sugar', 'interest', 'gold'
        df_subset = df[df.categories.map(set(['ship']).issubset)] 
        df_subset = df_subset.reset_index()
        
        # TF-IDF matrix
        tfidf, feature_names = tf_idf(df_subset)
        
        # PCA dimensionality reduction
        if reduce:
            tfidf_unreduced = tfidf.copy()
            tfidf = pca_reduce(tfidf)
        
        # distances 
        dist = dist_calculate(tfidf)
        
        # linkage matrix
        linkage_matrix = linkage_calculate(dist)
        
        # find K 
        # use non-reduced tfidf to find labels. reduced for everything else. 
        if reduce: 
            distortion, silhouette = distortion_calculate(linkage_matrix, tfidf, tfidf_unreduced, df_subset, feature_names)
            roc = distortion_roc(distortion)
            k = find_k(roc)

            # final flat clusters
            frameh, labels, centroid = clusters(k, linkage_matrix, tfidf, tfidf_unreduced, df_subset, feature_names)
        else:
            # pass tfidf in for both reduced and unreduced arguments
            distortion, silhouette = distortion_calculate(linkage_matrix, tfidf, tfidf, df_subset, feature_names)
            roc = distortion_roc(distortion)
            k = find_k(roc)

            # final flat clusters
            frameh, labels, centroid = clusters(k, linkage_matrix, tfidf, tfidf, df_subset, feature_names)

    return frameh, labels, centroid, k, df_subset, feature_names, tfidf, linkage_matrix

In [156]:
frameh, labels, centroid, k, df_subset, feature_names, tfidf, linkage_matrix = main(reduce = True)

  


In [None]:
## in centroid_label, feature_names is not reduced. There is no way to preserve labels with reducing
# either abandon PCA or 

### TO DO: visualization, hierarchies 

In [157]:
labels

[['strike', 'gulf', 'seamen', 'ships', 'waiting', 'shipping'],
 ['gulf', 'iranian', 'oil', 'iran', 'port', 'strike'],
 ['portland', 'loading', 'grain', 'ships', 'merchants', 'load'],
 ['zverev', 'expense', 'fights', 'figure', 'figures', 'fiji']]

In [133]:
labels

[['port', 'strike', 'tonnes', 'union', 'seamen', 'dollars'],
 ['portland', 'loading', 'grain', 'ships', 'merchants', 'load'],
 ['zverev', 'expense', 'fights', 'figure', 'figures', 'fiji'],
 ['gulf', 'iran', 'iranian', 'oil', 'attack', 'kuwait']]

In [153]:
k

4