TO DO SVD:
- Need evaluation metrics: silhouette, distortion for svd as well. 
- Try to get phrases rather than just words - perhaps beyond scope
- Eliminate overlapping labels. Simple approach: merge clusters whose labels share a word (assuming single-word labels, not phrases).

TO DO in general:
- score and rank clusters 
- metrics to compare clusters for two methods
- Scale up to multiple search terms. Figure out metrics to evaluate. (silhouette etc.)
- Visualization with tsne or umap rather than mds -- tsne works. can't get umap to install.   

In [1]:
import nltk
from nltk.corpus import reuters
import re
import numpy as np
import pandas as pd
from gensim.models import TfidfModel
from gensim.corpora import Dictionary
import string
from nltk.corpus import stopwords
nltk.download('stopwords')
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.porter import PorterStemmer
from sklearn.cluster import AgglomerativeClustering
import matplotlib.pyplot as plt
from sklearn.manifold import MDS
import sklearn
from scipy.cluster.hierarchy import dendrogram, linkage, ward, fcluster
import networkx as nx
import collections
import math
import operator
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from sklearn.decomposition import PCA
from kneed import KneeLocator
from sklearn.manifold import TSNE

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Gimli\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Tf-Idf Matrix

In [2]:
def tf_idf(df):
    """Build a TF-IDF document-term matrix from the ``text`` column of ``df``.

    English stop words are removed and the text is tokenized at word level.

    Parameters
    ----------
    df : DataFrame-like object with a ``'text'`` column of documents.

    Returns
    -------
    m : the matrix returned by ``TfidfVectorizer.fit_transform``
        (one row per document, one column per term).
    feature_names : list
        Vocabulary terms, aligned with the columns of ``m``. Always a plain
        list so that ``list.index`` / ``del`` can be used on it downstream.
    """
    tfidf = TfidfVectorizer(stop_words='english', analyzer='word')
    m = tfidf.fit_transform(df['text'])

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
    # prefer get_feature_names_out() and fall back for older installations.
    if hasattr(tfidf, 'get_feature_names_out'):
        feature_names = tfidf.get_feature_names_out().tolist()
    else:
        feature_names = list(tfidf.get_feature_names())

    return m, feature_names

### Remove Search from TF-IDF Matrix

In [91]:
def remove_search(tfidf, feature_names, search):
    """Drop the search term's column from the TF-IDF matrix and vocabulary.

    The search term should be neither a cluster label nor a clustering factor.

    Parameters
    ----------
    tfidf : 2-D matrix supporting ``tfidf[:, list_of_columns]`` indexing
        (e.g. a SciPy sparse matrix or a NumPy array).
    feature_names : sequence of terms aligned with the columns of ``tfidf``.
    search : term to remove.

    Returns
    -------
    (tfidf, feature_names) with the search column removed. If ``search`` is
    not in ``feature_names`` (e.g. it was already dropped as a stop word) the
    inputs are returned unchanged. The caller's ``feature_names`` is never
    mutated; a new list is returned when a term is removed.
    """
    names = list(feature_names)

    # Sometimes the search term is already gone (stop word): nothing to do.
    if search not in names:
        return tfidf, feature_names

    search_index = names.index(search)
    keep_cols = [col for col in range(len(names)) if col != search_index]
    del names[search_index]

    # Any unexpected error from the slicing below propagates unchanged instead
    # of being replaced by an invalid ``raise '<string>'``.
    return tfidf[:, keep_cols], names

### SVD 

In [4]:
def svd_calculate(m):
    """Compute the reduced SVD of ``m`` and return ``(U, S, V)``.

    ``m`` must provide ``.todense()``. ``V`` is the transpose of the ``Vt``
    factor returned by ``np.linalg.svd``.
    """
    dense = m.todense()
    # full_matrices=False keeps the factor dimensions compatible with each other
    left, singular_values, right_t = np.linalg.svd(dense, full_matrices=False)
    return left, singular_values, right_t.T

### Find K

In [5]:
# variance for various values of k -> rate of change 
# preference is to choose some percent of variance, but 80% retains too many topics 
def roc_var_calculate(S):
    """Return the relative rate of change of cumulative variance over k.

    The cumulative variance at position k is the running sum of the squared
    values of ``S`` up to and including k. The result has one entry fewer than
    ``S``: entry k is ``|cum[k+1] - cum[k]| / cum[k]``.
    """
    # running sum of squared singular values
    cumulative = []
    running = 0
    for value in S:
        running += value**2
        cumulative.append(running)

    # relative change between each consecutive pair of cumulative variances
    return [
        abs(following - current) / current
        for current, following in zip(cumulative, cumulative[1:])
    ]

In [6]:
# Find Elbow in Variance ROC 
def find_knee(roc):
    """Locate the knee (elbow) of the variance rate-of-change curve.

    ``roc`` is treated as a convex, decreasing curve sampled at positions
    ``0 .. len(roc) - 1``; the ``knee`` attribute of the fitted
    ``KneeLocator`` is returned.
    """
    # Kneedle algorithm: https://raghavan.usc.edu//papers/kneedle-simplex11.pdf
    positions = range(len(roc))
    locator = KneeLocator(positions, roc, curve='convex', direction='decreasing')
    return locator.knee

In [7]:
#fig, ax = plt.subplots()
#ax.plot(roc)
#ax.axvline(k, color = 'black')
#print(k)

### Reduce Dimensionality

In [8]:
def reduce_V(V, k):
    """Keep only the first ``k`` columns (concepts) of ``V``.

    V is the term-to-concept matrix. Because there are no phrases yet, there
    is no need to multiply by a term matrix (currently T = identity).
    """
    leading_concepts = slice(k)  # same as [:k]; the remaining columns are discarded
    return V[:, leading_concepts]

### Find Cluster Labels

In [74]:
def find_labels(V, feature_names, n_top=3):
    """Label each concept (column of ``V``) with its ``n_top`` strongest terms.

    Parameters
    ----------
    V : 2-D term-by-concept matrix. Both ``np.ndarray`` and ``np.matrix`` are
        accepted (the input is converted with ``np.asarray``).
    feature_names : sequence of terms aligned with the rows of ``V``.
    n_top : number of terms per label (default 3).

    Returns
    -------
    labels : dict mapping concept index -> list of the ``n_top`` terms with the
        largest values in that concept's column, strongest first.
    max_w : dict mapping rank (1 .. n_top) -> list holding, for every concept,
        the row index of its rank-th strongest term.

    Raises
    ------
    IndexError if ``V`` has columns but fewer than ``n_top`` rows.
    """
    # One row per concept; np.asarray removes the dependence on np.matrix
    # row-indexing semantics.
    concept_rows = np.asarray(V).T

    # Sort each concept once; descending order, truncated to the top n_top.
    ranked = [row.argsort()[::-1][:n_top] for row in concept_rows]

    # top terms keyed by rank (1 = strongest)
    max_w = {
        rank: [order[rank - 1] for order in ranked]
        for rank in range(1, n_top + 1)
    }

    # top terms keyed by concept, translated to words -> labels
    labels = {
        concept: [feature_names[w] for w in order]
        for concept, order in enumerate(ranked)
    }

    return labels, max_w
    
# label overlap only if use phrases 
# TO DO: what about phrases? then could just use max 

### Find Documents in Clusters Based on Labels 
A document is assigned to a cluster if it contains any of the top 3 words that describe that cluster/concept.

In [10]:
def find_docs(feature_names, max_w, m):
    """Assign documents to concepts based on the concepts' label terms.

    A document belongs to a concept when its weight in ``m`` is > 0 for at
    least one of that concept's label terms. Documents can be in multiple
    clusters or in none. Clusters containing exactly one document are dropped.

    Parameters
    ----------
    feature_names : vocabulary aligned with the columns of ``m``. Kept for
        interface compatibility; the column indices in ``max_w`` are used
        directly, so it is not read here.
    max_w : dict mapping rank -> list of term column indices, one per concept
        (as returned by ``find_labels``).
    m : document-by-term weight matrix (SciPy sparse or NumPy array).

    Returns
    -------
    dict mapping concept index -> ascending list of document row positions.
    """
    ranks = sorted(max_w)
    if not ranks:
        return {}

    n_concepts = len(max_w[ranks[0]])
    n_docs = m.shape[0]

    # member[i, j] is True when document j contains any label term of concept i.
    # The term-term matrix is the identity (no phrases), so multiplying by it
    # is just a column selection; select the columns directly rather than
    # materialising dense n_terms x n_terms identity matrices.
    member = np.zeros((n_concepts, n_docs), dtype=bool)
    for rank in ranks:
        cols = np.asarray(max_w[rank], dtype=int)
        selected = m[:, cols]
        dense = selected.toarray() if hasattr(selected, 'toarray') else np.asarray(selected)
        member |= dense.T > 0  # threshold: strength > 0, i.e. the term exists in the document

    docs = {
        concept: np.flatnonzero(row).tolist()
        for concept, row in enumerate(member)
    }

    # drop clusters with only 1 document in them
    return {concept: ids for concept, ids in docs.items() if len(ids) != 1}

## Assign Clusters in DF 

In [11]:
def cluster_df(df, docs):
    """Return a copy of ``df`` with a ``cluster`` column listing each row's clusters.

    Parameters
    ----------
    df : DataFrame whose index labels are the document ids used in ``docs``.
    docs : dict mapping cluster id -> list of document ids.

    Returns
    -------
    A new DataFrame (``df`` itself is not modified) with a ``cluster`` column.
    Each entry is a list of cluster ids as strings, in ``docs`` iteration
    order; rows belonging to no cluster get an empty list.
    """
    # Invert cluster -> documents into document -> clusters in a single pass
    # instead of rescanning the whole column for every (cluster, document) pair.
    membership = {}
    for cluster_id, doc_ids in docs.items():
        for doc_id in doc_ids:
            membership.setdefault(doc_id, []).append(str(cluster_id))

    # Work on a copy so the caller's frame does not silently gain a column.
    framesvd = df.copy()
    framesvd['cluster'] = [list(membership.get(idx, [])) for idx in framesvd.index]

    return framesvd

# Main Function

In [92]:
def main(searches=('tin', 'sugar', 'interest', 'gold'), data_path='reuters_processed'):
    """Run the SVD labelling/clustering pipeline once per search term.

    Parameters
    ----------
    searches : iterable of category names to process (defaults to the four
        terms originally hard-coded here).
    data_path : path of the pickled DataFrame to load; it must have
        ``categories`` and ``text`` columns.

    Returns
    -------
    df_final : the per-search frames concatenated, with ``cluster``, ``len``
        and ``search`` columns added.
    labels_dict : dict mapping search term -> labels dict from ``find_labels``.
    percent_zero_dict : dict mapping search term -> fraction of its documents
        that were assigned to no cluster.
    """
    # read in data
    # NOTE(review): read_pickle can execute arbitrary code; only load trusted files.
    df = pd.read_pickle(data_path)

    percent_zero_dict = dict()
    labels_dict = dict()
    frames = []

    for search in searches:
        df_subset = df[df.categories.map(set([search]).issubset)]
        df_subset = df_subset.reset_index()

        print(search)

        # TF-IDF matrix
        tfidf, feature_names = tf_idf(df_subset)

        # remove search from tf-idf matrix
        tfidf, feature_names = remove_search(tfidf, feature_names, search)

        # SVD
        U, S, V = svd_calculate(tfidf)

        # Find K and reduce dimensionality
        roc = roc_var_calculate(S)
        k = find_knee(roc)
        # NOTE(review): if find_knee yields None, V[:, :k] keeps every column - confirm intended.
        V = reduce_V(V, k)

        # Find cluster labels
        labels, max_w = find_labels(V, feature_names)

        # Assign documents to clusters based on labels
        docs = find_docs(feature_names, max_w, tfidf)

        # mark clusters in dataframe
        frame = cluster_df(df_subset, docs)

        # percent of documents with no cluster
        frame['len'] = frame['cluster'].apply(len)
        percent_zero_dict[search] = len(frame[frame['len'] == 0]) / len(frame)

        labels_dict[search] = labels

        frame['search'] = search
        frames.append(frame)

    # DataFrame.append was removed in pandas 2.0; concatenate once at the end
    # rather than growing the frame inside the loop.
    df_final = pd.concat(frames) if frames else pd.DataFrame()

    return df_final, labels_dict, percent_zero_dict

In [93]:
# Run the pipeline; main() prints each search term as it is processed.
df_final, labels_dict, percent_zero_dict = main()

tin
sugar
interest
gold


In [95]:
percent_zero_dict

{'tin': 0.1,
 'sugar': 0.13580246913580246,
 'interest': 0.027196652719665274,
 'gold': 0.056451612903225805}

In [94]:
labels_dict

{'tin': {0: ['miners', 'strike', 'government'],
  1: ['miners', 'strike', 'corporation'],
  2: ['atpc', 'exports', 'thailand'],
  3: ['extension', 'ec', 'atpc']},
 'sugar': {0: ['zimbabwe', 'gabon', 'guinea'],
  1: ['production', 'cargoes', 'beet'],
  2: ['cargoes', 'india', 'white'],
  3: ['received', 'tonnes', 'current'],
  4: ['beet', 'bd', 'white'],
  5: ['beet', 'commission', 'plantings'],
  6: ['quota', 'imports', 'import'],
  7: ['exports', 'intervention', 'imports'],
  8: ['white', 'denatured', 'tender'],
  9: ['denatured', 'raw', 'soviet']},
 'interest': {0: ['stg', 'bank', 'market'],
  1: ['stg', 'band', 'bank'],
  2: ['rate', 'prime', 'rates'],
  3: ['prime', 'bank', 'france'],
  4: ['bundesbank', 'money', 'market'],
  5: ['reserves', 'add', 'federal'],
  6: ['france', 'intervention', 'unchanged'],
  7: ['base', 'france', 'intervention'],
  8: ['bundesbank', 'leaves', 'prime'],
  9: ['bills', 'average', 'dollars'],
  10: ['bundesbank', 'bills', 'average'],
  11: ['bank', 'en

In [96]:
df_final

Unnamed: 0,index,ids,categories,text,cluster,len,search
0,3,test/14832,"[corn, grain, rice, rubber, sugar, tin, trade]",thai trade deficit widens in first thailands...,[2],1,tin
1,10,test/14844,[tin],subroto says indonesia supports tin pact exten...,[3],1,tin
2,26,test/14877,[tin],ec mainly for tin extension no uk stand taken ...,[3],1,tin
3,148,test/15112,[tin],paris group publishing daily franc tin price ...,[],0,tin
4,198,test/15219,[tin],talks continue on tin agreement extension di...,[3],1,tin
5,436,test/15624,[tin],spot tin easier on european free market spot...,[],0,tin
6,532,test/15817,[tin],consensus seen on tin pact extension the qua...,[3],1,tin
7,1345,test/17731,[tin],tin traders response muted to kl futures marke...,"[0, 2, 3]",3,tin
8,1806,test/18924,[tin],itc contests use of documents as court evidenc...,[3],1,tin
9,1863,test/19065,[tin],thai smelter faces tin concentrate supply shor...,"[0, 2, 3]",3,tin
