In [48]:
import pandas as pd
from ast import literal_eval
from collections import Counter
import numpy as np

In [49]:
df = pd.read_csv("../results/cleaned_data.csv")
clusters = pd.read_csv("../results/clusters_both_100_30_01_27")

In [50]:
window = 20

In [51]:
note_list = df.noteTextList.tolist() 
tweet_list = df.tweetTextList.tolist()

note_list = [literal_eval(x) for x in note_list]
tweet_list = [literal_eval(x) for x in tweet_list]


word_list = note_list + tweet_list

In [52]:
def return_neighborhood_cts(word_list, i, window):
    # INPUT: a list of tokens, a pointer for the key word, window size
    # OUTPUT: a dictionary of counts for the neighborhood of the key word
    pre_pane_li = i-window
    pre_pane_ri = i
    post_pane_li = i+1
    post_pane_ri = i+1+window
    if pre_pane_li < window  and i < window:
        pre_pane_li = 0
    if i == len(word_list) - 1:
        pre_pane_li = len(word_list) - window - 1
    neighborhood = word_list[pre_pane_li:pre_pane_ri] + word_list[post_pane_li:post_pane_ri]
    inner_vecs = dict.fromkeys(set(neighborhood), 0)
    for word in neighborhood:
        inner_vecs[word] = inner_vecs[word] + 1
    return inner_vecs

master_class = {}
total_ct = 0
for l in word_list:
    word_list = l
    outer_vec = {}
    for i in range(len(word_list)): # Looping thru all words
        tmp = return_neighborhood_cts(word_list, i, window) # returning neighborhood counts for a key word
        if (word_list[i] in outer_vec.keys()): # if a key word already has a dict entry, add to it
            for k, v in tmp.items():
                total_ct += v
                try:
                    outer_vec[word_list[i]][k] = outer_vec[word_list[i]][k] + v
                except:
                    outer_vec[word_list[i]][k] = v
        else: # if a key word does not have a dict entry, create it
            outer_vec[word_list[i]] = tmp
            total_ct += sum(tmp.values())
            
    # updating the master dict over many documents
    for k,v in outer_vec.items():
        try:
            master_class[k] = Counter(master_class[k]) + Counter(outer_vec[k])
        except:
            master_class[k] = outer_vec[k]


# make every dict in the master dictionary a counter object such that when a co-occurence doesn't occur, the joint 
# probability == 0 

master_class = {key: Counter(value) for key, value in master_class.items()}

In [53]:
def npmi(w1, w2, vectors, vector_ct):
    # INPUT two words to compare, dict of vectors, total word count for denom
    # OUTPUT ppmi for those two words
    eps = 10**(-12)
    # numerator
    w1w2_dc = vectors[w1][w2] / vector_ct
    w1_dc = sum(vectors[w1].values()) / vector_ct
    w2_dc = sum(vectors[w2].values()) / vector_ct
    
    pmi_w1w2 = np.log((w1w2_dc) / ((w1_dc * w2_dc) + eps) + eps)
    npmi_w1w2 = pmi_w1w2 / (- np.log( (w1w2_dc) + eps))
    
    return npmi_w1w2

In [54]:
total_ct

7869759

In [55]:
clusters = clusters.groupby(['Cluster']).agg({"Word_Type": list})

In [56]:
test

['back',
 'close',
 'run',
 'break',
 'leave',
 'show',
 'see',
 'become',
 'move',
 'come']

In [57]:
cluster_ids = len(clusters)


avg_npmis = []
for i in range(cluster_ids):
    test = clusters.iloc[i]['Word_Type']

    npmi_scores = {}
    for w1 in test:
        npmi_sum = 0
        ct = 0
        for w2 in test:
            if w1 != w2:
                res = npmi(w1, w2, master_class, total_ct)
    #             print(w1, w2, res)
                npmi_sum += res
                ct += 1
            else:
                pass
        npmi_scores[w1] = npmi_sum/ct
    
    res = 0
    for val in npmi_scores.values():
        res += val
  
    # using len() to get total keys for mean computation
    res = res / len(npmi_scores)
    
    final = (i,res)
    avg_npmis.append(np.around((final),5))
            
    

In [58]:
clusters.iloc[5]

Word_Type    [putins, trumps, president, vladimir, obamas, ...
Name: 5, dtype: object

In [59]:
avg_npmis

[array([ 0.     , -0.67456]),
 array([ 1.     , -0.95011]),
 array([ 2.     , -0.28948]),
 array([ 3.     , -0.43293]),
 array([ 4.     , -0.08439]),
 array([ 5.     , -0.65637]),
 array([ 6.     , -0.48618]),
 array([ 7.     , -0.30972]),
 array([ 8.    , -0.7374]),
 array([ 9.     , -0.36714]),
 array([10.     , -0.54193]),
 array([11.     , -0.92604]),
 array([12.     , -0.45119]),
 array([13.     , -0.88198]),
 array([14.     , -0.66214]),
 array([15.     , -0.74918]),
 array([16.    , -0.8277]),
 array([17.     , -0.55288]),
 array([18., -1.]),
 array([19.     , -0.94647]),
 array([20.    , -0.7639]),
 array([21.     , -0.94433]),
 array([22.     , -0.87479]),
 array([23.     , -0.80176]),
 array([24.     , -0.89107]),
 array([25.     , -0.53321]),
 array([26.     , -0.46753]),
 array([27.     , -0.51146]),
 array([28.     , -0.37662]),
 array([29.     , -0.27219])]

In [60]:
avg_npmis

[array([ 0.     , -0.67456]),
 array([ 1.     , -0.95011]),
 array([ 2.     , -0.28948]),
 array([ 3.     , -0.43293]),
 array([ 4.     , -0.08439]),
 array([ 5.     , -0.65637]),
 array([ 6.     , -0.48618]),
 array([ 7.     , -0.30972]),
 array([ 8.    , -0.7374]),
 array([ 9.     , -0.36714]),
 array([10.     , -0.54193]),
 array([11.     , -0.92604]),
 array([12.     , -0.45119]),
 array([13.     , -0.88198]),
 array([14.     , -0.66214]),
 array([15.     , -0.74918]),
 array([16.    , -0.8277]),
 array([17.     , -0.55288]),
 array([18., -1.]),
 array([19.     , -0.94647]),
 array([20.    , -0.7639]),
 array([21.     , -0.94433]),
 array([22.     , -0.87479]),
 array([23.     , -0.80176]),
 array([24.     , -0.89107]),
 array([25.     , -0.53321]),
 array([26.     , -0.46753]),
 array([27.     , -0.51146]),
 array([28.     , -0.37662]),
 array([29.     , -0.27219])]