# Compare Lingo and Hierarchical Results 
- Searches that return 1 cluster in Lingo: compare with clusters in hierarchical
- Searches with large differences in various stats between Lingo and Hierarchical
    - Can use these for manual/qualitative examination
- Average statistic comparison 
- Average number of documents per cluster
- Percent of label overlap 
- Lingo average # docs in zero vs many clusters

In [1]:
import pickle
import numpy as np
import pandas as pd

### Load Data

In [15]:
# Load the saved Lingo results. The file holds eight objects pickled back to back,
# so they are read in exactly the order listed here.
# NOTE(review): unpickling can run arbitrary code - only load files produced by this project.
with open('lingo', "rb") as f:
    (
        df_final_l,   # dataframe; later cells read its `cluster`, `ids` and `search` columns
        labels_l,     # search term -> mapping whose values are iterables of label words
        k_l,          # search term -> number of clusters
        dist_l,       # search term -> distortion
        sil_l,        # search term -> silhouette
        zero,         # search term -> share of documents in zero clusters
        mult,         # search term -> share of documents in multiple clusters
        cluster1_l,   # search terms for which Lingo produced a single cluster
    ) = (pickle.load(f) for _ in range(8))

In [16]:
# Load the saved hierarchical-clustering results: six objects pickled back to back,
# read in exactly the order listed here.
# NOTE(review): unpickling can run arbitrary code - only load files produced by this project.
with open('hierarchial', "rb") as f:
    (
        df_final_h,   # dataframe; later cells read its `cluster`, `ids` and `search` columns
        labels_h,     # search term -> mapping whose values are iterables of label words
        k_h,          # search term -> number of clusters
        dist_h,       # search term -> distortion
        sil_h,        # search term -> silhouette
        cluster1_h,   # search terms for which hierarchical clustering produced a single cluster
    ) = (pickle.load(f) for _ in range(6))

In [17]:
# Load the pickled dataframe stored in the file 'reuters_processed'.
# NOTE(review): `df` is not referenced by any later cell visible here - confirm it is still needed.
df = pd.read_pickle('reuters_processed')

## Search Terms w/ 1 Cluster
Both methods produce some searches with only 1 cluster.   
This usually happens because the search is too small, i.e. there are not enough documents for more than 1 cluster.   
For Lingo, it can also happen because the labels overlap too much (the clusters are not semantically distinct enough), so they are combined into 1 cluster.

In [18]:
# Search terms with a single cluster in Lingo but more than one in hierarchical.
#     Author's observation: every search with 1 cluster in hierarchical also has
#     1 cluster in Lingo (not enough documents), so the set difference below is
#     exactly the Lingo-only single-cluster searches.
diff = set(cluster1_l) - set(cluster1_h)
diff

{'coconut-oil',
 'dmk',
 'income',
 'instal-debt',
 'l-cattle',
 'naphtha',
 'nickel',
 'oat',
 'potato',
 'rape-oil',
 'tea'}

In [19]:
# Distribution of the hierarchical k (number of clusters) for the search terms that
# Lingo collapsed into a single cluster.
# Author's reading of the result: these k values are higher than average -
#     to split such searches at all, hierarchical has to carve out small, specific
#     sub-topics because there are no big differences between documents,
#     which is why Lingo merges everything into 1 cluster instead.
pd.DataFrame([n_clusters for term, n_clusters in k_h.items() if term in diff]).describe()

Unnamed: 0,0
count,11.0
mean,5.909091
std,2.844452
min,2.0
25%,3.5
50%,6.0
75%,8.5
max,9.0


In [20]:
# For ease of comparison, restrict the hierarchical results to the Lingo searches
# for the rest of the stats.
# Dataframe: keep only rows whose search term also appears in the Lingo dataframe.
df_final_h = df_final_h.loc[df_final_h['search'].isin(set(df_final_l['search'].unique()))]
# Dictionaries: drop the terms in `diff` (1 cluster in Lingo, several in hierarchical)
# from every per-search hierarchical dictionary. pop(..., None) ignores absent terms.
for k in diff:
    for per_search in (labels_h, k_h, dist_h, sil_h):
        per_search.pop(k, None)

### Identify Search Terms with Large Differences in Stats between Methods
Investigate search terms to understand why

In [146]:
def stat_diff(stat1, stat2, n):
    """Print the search terms whose statistic differs by more than ``n``.

    Walks the keys of ``stat1`` in order and prints each key for which
    ``abs(stat2[key] - stat1[key])`` is strictly greater than ``n``.

    Parameters:
        stat1: mapping of search term -> numeric statistic for one method;
            its keys drive the iteration (and the print order).
        stat2: mapping of search term -> the same statistic for the other
            method; it must contain every key of ``stat1``.
        n: threshold; a difference exactly equal to ``n`` is not reported.

    Returns:
        None. The matching terms are written to standard output, one per line.

    Raises:
        KeyError: if a key of ``stat1`` is missing from ``stat2``.
    """
    for term, first_value in stat1.items():
        if abs(stat2[term] - first_value) > n:
            print(term)

__Number of Clusters: Differ by More Than 5__

In [147]:
# search terms whose number of clusters differs by more than 5 between hierarchical and Lingo
stat_diff(k_h, k_l, 5)

alum
heat
soy-oil
hog
sunseed
platinum
lei
lumber


__Distortion: Differ by More Than 0.5__

In [148]:
# search terms whose distortion differs by more than 0.5 between hierarchical and Lingo
stat_diff(dist_h, dist_l, 0.5)

heat
hog
platinum
lei


__Silhouette: Differ by More Than 0.5__

In [149]:
# search terms whose silhouette differs by more than 0.5 between Lingo and hierarchical
# (Lingo is passed first here, so terms print in the key order of sil_l; the absolute
# difference itself does not depend on the argument order)
stat_diff(sil_l, sil_h, 0.5)

groundnut
silver
meal-feed
rice
lumber
lei
sun-oil
platinum


### Compare Aggregate Stats

__Distortion__

In [150]:
# Mean distortion over all search terms, one value per method.
dist_h_avg = np.array(list(dist_h.values())).mean()
dist_l_avg = np.array(list(dist_l.values())).mean()
print('Hierarchial:', dist_h_avg)
print('Lingo:', dist_l_avg)

Hierarchial: 0.49442469524103083
Lingo: 0.7434019082897014


__Silhouette__

In [151]:
# Mean silhouette over all search terms, one value per method.
sil_h_avg = np.array(list(sil_h.values())).mean()
sil_l_avg = np.array(list(sil_l.values())).mean()
print('Hierarchial:', sil_h_avg)
print('Lingo:', sil_l_avg)

Hierarchial: 0.41817061410906964
Lingo: 0.14283906837858099


__Number of Clusters__

In [152]:
# Mean number of clusters over all search terms, one value per method.
k_l_avg = np.array(list(k_l.values())).mean()
k_h_avg = np.array(list(k_h.values())).mean()
print('Hierarchial:', k_h_avg)
print('Lingo:', k_l_avg)

Hierarchial: 4.984126984126984
Lingo: 4.126984126984127


### Avg. Number of Documents per Cluster

New dataframe: Number of documents per cluster for each search term    

In [153]:
# Hierarchical: document count for every (search term, cluster) pair,
# obtained by counting the non-null `ids` entries in each group.
grouph = df_final_h.groupby(['search', 'cluster'])['ids'].count()

In [156]:
# Lingo: document count for every (search term, cluster) pair.
#     Each row's `cluster` entry is a collection of cluster ids (a document may sit in
#     several clusters, or in none), so the collections are unrolled first.
dfl = df_final_l.copy()[['cluster', 'ids', 'search']]
# keep only documents assigned to at least one cluster
dfl = dfl[dfl.cluster.str.len() != 0]
# unroll to one (cluster, search) row per document-cluster membership
dfl = pd.DataFrame([
    (cluster_id, term)
    for memberships, term in zip(dfl['cluster'], dfl['search'])
    for cluster_id in memberships
])
dfl.columns = ['cluster', 'search']
# constant helper column so that counting rows per group gives the membership count
dfl['num'] = 1
groupl = dfl.groupby(['search', 'cluster'])['num'].agg('count')

Averages

In [155]:
# number of documents per cluster on average
# (mean of the per-(search, cluster) counts, i.e. pooled over all clusters of all
# search terms rather than averaged per search term first)
print('Hierarchial:', grouph.mean())
print('lingo:', groupl.mean())

Hierarchial: 21.812101910828027
lingo: 40.315384615384616


## Percent of Overlapping Labels
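Average, over search terms, of the share of label words that both methods have in common (size of the intersection / size of the union of the two label-word sets)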

In [157]:
# Overlap of label words between Lingo and hierarchical, per search term:
# |words used by both| / |words used by either| (intersection size / union size).
labels_overlap_dict = dict()

# loop through the search terms Lingo has labels for
# (labels_h must contain every one of these terms, otherwise a KeyError is raised)
for search in labels_l.keys():
    # for each method, flatten the per-cluster labels into one set of distinct label words
    lingo_words = {w for v in labels_l[search].values() for w in v}
    hier_words = {w for v in labels_h[search].values() for w in v}

    shared_words = lingo_words & hier_words
    all_words = lingo_words | hier_words

    # fraction of words that are the same; if neither method produced any label
    # word the ratio is undefined, so record 0.0 instead of dividing by zero
    overlap_percent = len(shared_words) / len(all_words) if all_words else 0.0

    # record in dictionary
    labels_overlap_dict[search] = overlap_percent

# take mean of all overlap fractions
np.mean(list(labels_overlap_dict.values()))

0.2630759442345624

## Lingo: % documents in 0 clusters and in multiple clusters

In [158]:
# distribution, across search terms, of the share of docs in multiple clusters
# (`mult` comes from the 'lingo' pickle; the values in the output are fractions between 0 and 1)
pd.DataFrame(list(mult.values())).describe()

Unnamed: 0,0
count,63.0
mean,0.3043
std,0.222556
min,0.0
25%,0.128538
50%,0.235294
75%,0.47926
max,0.821839


In [159]:
# distribution, across search terms, of the share of docs in zero clusters
# (`zero` comes from the 'lingo' pickle; the values in the output are fractions between 0 and 1)
pd.DataFrame(list(zero.values())).describe()

Unnamed: 0,0
count,63.0
mean,0.2396
std,0.148625
min,0.020619
25%,0.129167
50%,0.206897
75%,0.338542
max,0.666667
