In [1]:
import pandas as pd
from ast import literal_eval
from collections import Counter
import numpy as np

In [2]:
df = pd.read_csv("../results/cleaned_data.csv")
clusters = pd.read_csv("../results/clusters_both_100_30_01_27")

In [3]:
window = 10

In [4]:
note_list = df.noteTextList.tolist() 
tweet_list = df.tweetTextList.tolist()

note_list = [literal_eval(x) for x in note_list]
tweet_list = [literal_eval(x) for x in tweet_list]


word_list = note_list + tweet_list

In [5]:
def return_neighborhood_cts(word_list, i, window):
    # INPUT: a list of tokens, a pointer for the key word, window size
    # OUTPUT: a dictionary of counts for the neighborhood of the key word
    pre_pane_li = i-window
    pre_pane_ri = i
    post_pane_li = i+1
    post_pane_ri = i+1+window
    if pre_pane_li < window  and i < window:
        pre_pane_li = 0
    if i == len(word_list) - 1:
        pre_pane_li = len(word_list) - window - 1
    neighborhood = word_list[pre_pane_li:pre_pane_ri] + word_list[post_pane_li:post_pane_ri]
    inner_vecs = dict.fromkeys(set(neighborhood), 0)
    for word in neighborhood:
        inner_vecs[word] = inner_vecs[word] + 1
    return inner_vecs

master_class = {}
total_ct = 0
for l in word_list:
    word_list = l
    outer_vec = {}
    for i in range(len(word_list)): # Looping thru all words
        tmp = return_neighborhood_cts(word_list, i, window) # returning neighborhood counts for a key word
        if (word_list[i] in outer_vec.keys()): # if a key word already has a dict entry, add to it
            for k, v in tmp.items():
                total_ct += v
                try:
                    outer_vec[word_list[i]][k] = outer_vec[word_list[i]][k] + v
                except:
                    outer_vec[word_list[i]][k] = v
        else: # if a key word does not have a dict entry, create it
            outer_vec[word_list[i]] = tmp
            total_ct += sum(tmp.values())
            
    # updating the master dict over many documents
    for k,v in outer_vec.items():
        try:
            master_class[k] = Counter(master_class[k]) + Counter(outer_vec[k])
        except:
            master_class[k] = outer_vec[k]


# make every dict in the master dictionary a counter object such that when a co-occurence doesn't occur, the joint 
# probability == 0 

master_class = {key: Counter(value) for key, value in master_class.items()}

In [6]:
def npmi(w1, w2, vectors, vector_ct):
    # INPUT two words to compare, dict of vectors, total word count for denom
    # OUTPUT ppmi for those two words
    eps = 10**(-12)
    # numerator
    w1w2_dc = vectors[w1][w2] / vector_ct
    w1_dc = sum(vectors[w1].values()) / vector_ct
    w2_dc = sum(vectors[w2].values()) / vector_ct
    
    pmi_w1w2 = np.log((w1w2_dc) / ((w1_dc * w2_dc) + eps) + eps)
    npmi_w1w2 = pmi_w1w2 / (- np.log( (w1w2_dc) + eps))
    
    return npmi_w1w2

In [7]:
total_ct

6043395

In [8]:
clusters = clusters.groupby(['Cluster']).agg({"Word_Type": list})

In [9]:
test

NameError: name 'test' is not defined

In [45]:
cluster_ids = len(clusters)


avg_npmis = []
for i in range(cluster_ids):
    test = clusters.iloc[i]['Word_Type']

    npmi_scores = {}
    for w1 in test:
        npmi_sum = 0
        ct = 0
        for w2 in test:
            if w1 != w2:
                res = npmi(w1, w2, master_class, total_ct)
    #             print(w1, w2, res)
                npmi_sum += res
                ct += 1
            else:
                pass
        npmi_scores[w1] = npmi_sum/ct
    
    res = 0
    for val in npmi_scores.values():
        res += val
  
    # using len() to get total keys for mean computation
    res = res / len(npmi_scores)
    
    final = (i,res)
    avg_npmis.append(np.around((final),5))
            
    

In [46]:
clusters.iloc[5]

Word_Type    [putins, trumps, president, vladimir, obamas, ...
Name: 5, dtype: object

In [47]:
avg_npmis

[array([ 0.     , -0.73852]),
 array([ 1.     , -0.94877]),
 array([ 2.     , -0.35078]),
 array([ 3. , -0.5]),
 array([ 4.     , -0.10452]),
 array([ 5.     , -0.65276]),
 array([ 6.     , -0.57188]),
 array([ 7.     , -0.43032]),
 array([ 8.     , -0.80552]),
 array([ 9.     , -0.43341]),
 array([10.     , -0.57092]),
 array([11.     , -0.97453]),
 array([12.     , -0.51724]),
 array([13.     , -0.92967]),
 array([14.     , -0.70535]),
 array([15.     , -0.78769]),
 array([16.     , -0.90966]),
 array([17.     , -0.59134]),
 array([18., -1.]),
 array([19.     , -0.97282]),
 array([20.     , -0.78646]),
 array([21.     , -0.94342]),
 array([22.     , -0.87268]),
 array([23.     , -0.77553]),
 array([24.     , -0.91903]),
 array([25.     , -0.58053]),
 array([26.    , -0.5171]),
 array([27.     , -0.56065]),
 array([28.     , -0.39145]),
 array([29.     , -0.33827])]

In [42]:
avg_npmis

[(0, -0.7385150042359989),
 (1, -0.9487718674723971),
 (2, -0.35077506603635433),
 (3, -0.5000042280502419),
 (4, -0.10451733598865762),
 (5, -0.6527563809317293),
 (6, -0.5718801579821878),
 (7, -0.43031819889632034),
 (8, -0.8055181263825594),
 (9, -0.43340793357870144),
 (10, -0.5709188743264241),
 (11, -0.9745342376829866),
 (12, -0.5172387797914599),
 (13, -0.9296743026580033),
 (14, -0.7053537562822005),
 (15, -0.7876921353192519),
 (16, -0.9096572926519337),
 (17, -0.5913430852915638),
 (18, -1.0),
 (19, -0.9728153669669759),
 (20, -0.7864610390665889),
 (21, -0.9434224785973466),
 (22, -0.8726777904460292),
 (23, -0.7755253079831277),
 (24, -0.9190288360473906),
 (25, -0.5805277855677268),
 (26, -0.5170993632431421),
 (27, -0.5606471900258804),
 (28, -0.3914496345441007),
 (29, -0.3382671576512696)]