In [3]:
import pandas as pd
from ast import literal_eval
from collections import Counter
import numpy as np


In [4]:
# Test data: its note/tweet text is the corpus for the word-word frequency matrix.
df = pd.read_csv("../results/test.csv")
# Top words for each cluster across hyperparameter variations
# (columns include Cluster, Word_Type, SqDist, numCluster, dimensions, seed).
clusters = pd.read_csv("../results/all_runs.csv")

In [7]:
clusters.groupby(['numCluster', 'dimensions'])['Word_Type'].nunique()

numCluster  dimensions
20          100            335
            300            361
            500            341
            768            331
50          100            870
            300            833
            500            845
            768            846
100         100           1735
            300           1677
            500           1656
            768           1667
Name: Word_Type, dtype: int64

In [8]:
clusters.head()

Unnamed: 0.1,Unnamed: 0,Cluster,Word_Type,Weights,SqDist,numCluster,dimensions,seed,silhouette_score
0,0,0,anyones,1.8e-05,1031.16,20,100,372,0.035141
1,1,0,someones,2.7e-05,1095.53,20,100,372,0.035141
2,2,0,pres,4.4e-05,1099.91,20,100,372,0.035141
3,3,0,qs,1.2e-05,1129.63,20,100,372,0.035141
4,4,0,arent,0.000186,1133.64,20,100,372,0.035141


In [None]:
# Cluster average distance: for a given cluster (e.g. cluster 0) under a given hyperparameter setting, it is the mean SqDist of that cluster's words.

In [15]:
# Mean SqDist of the words in each cluster, one row per (numCluster, dimensions, seed, Cluster).
temp = (
    clusters
    .groupby(['numCluster', 'dimensions', 'seed', 'Cluster'])['SqDist']
    .agg('mean')
    .reset_index()
)

In [23]:
temp[(temp['dimensions'] == 100) & (temp['seed'] == 372) & (temp['numCluster'] == 100)]

Unnamed: 0,numCluster,dimensions,seed,Cluster,SqDist
840,100,100,372,0,6237.336
841,100,100,372,1,7725.038
842,100,100,372,2,4963.834
843,100,100,372,3,5704.909
844,100,100,372,4,7106.032
...,...,...,...,...,...
935,100,100,372,95,7415.378
936,100,100,372,96,6040.071
937,100,100,372,97,5295.390
938,100,100,372,98,7408.894


In [4]:
# Length of each note and each tweet, in whitespace-separated tokens.
df['noteLength'] = df['noteText'].str.split().str.len()
df['tweetLength'] = df['tweetText'].str.split().str.len()

In [28]:
# TODO: make this into a log 
df['tweetLength'].quantile(0.25)

14.0

In [9]:
# Co-occurrence window: number of tokens taken on each side of a key word.
# A longer window is preferred because the statistics are meant to capture semantics.
# 14 is the first quartile of tweetLength (see the quantile cell); the original note states
# the first quartile is 14 words for both tweets and notes.
window = 14

In [13]:

# Parse the stringified token lists of the notes and the tweets, then pool them into one corpus.
note_list = [literal_eval(raw) for raw in df.noteTextList.tolist()]
tweet_list = [literal_eval(raw) for raw in df.tweetTextList.tolist()]

# one token list per document: all notes first, then all tweets
word_list = note_list + tweet_list

# every token of every document, in corpus order
flat_word_list = [token for tokens in word_list for token in tokens]

In [14]:
# Preprocessing flag: 1 when a cluster word occurs somewhere in the test corpus, 0 otherwise.
# A word missing from the corpus has no frequency statistics and is dropped further down.
clusters['Test_Train_Match'] = clusters['Word_Type'].isin(flat_word_list).astype(int)



In [15]:
# TODO: log this!! omitting about 1081 words by my count 
clusters['Test_Train_Match'].value_counts()

1    18782
0     1081
Name: Test_Train_Match, dtype: int64

In [16]:
# Keep only the cluster words found in the test corpus; the others have no frequency statistics.
clusters = clusters.loc[clusters['Test_Train_Match'].eq(1)]

In [None]:
def return_neighborhood_cts(word_list, i, window):
    """
    Helper function that counts the tokens surrounding a key word, for the NPMI statistics.

    INPUT:
        word_list: list of tokens (one document)
        i: index of the key word in word_list
        window: maximum number of tokens taken on each side of the key word
    OUTPUT: a dictionary mapping each neighbouring token to the number of times it occurs
        within `window` positions of index i; the token at index i itself is not counted
    """
    # Clamp the left edge at 0. A negative slice start counts from the END of the list, which
    # used to cut off the left context of the last token in documents shorter than the window.
    left_start = max(i - window, 0)
    neighborhood = word_list[left_start:i] + word_list[i + 1:i + 1 + window]
    return dict(Counter(neighborhood))


def mainPMIStats(master_class, word_list, window_size=None):
    """
    Build the word-word co-occurrence table used for the NPMI statistics.

    INPUT:
        master_class: existing table {key word: {neighbour: count}} to extend, or None / {}
            to start from scratch (it is copied, not modified)
        word_list: list of documents, each a list of tokens
        window_size: tokens counted on each side of a key word; defaults to the
            notebook-level `window`
    OUTPUT: (table, total) where table maps every key word to a Counter of its neighbours,
        so a pair that never co-occurs reads as 0, and total is the sum of all counts in table
    """
    if window_size is None:
        window_size = window
    table = {key: Counter(cts) for key, cts in (master_class or {}).items()}
    for tokens in word_list:
        for i, key_word in enumerate(tokens):
            # neighbourhood counts for this occurrence of the key word
            neighbours = return_neighborhood_cts(tokens, i, window_size)
            # setdefault registers the key word even when its neighbourhood is empty
            table.setdefault(key_word, Counter()).update(neighbours)
    total = sum(sum(cts.values()) for cts in table.values())
    return table, total


# co-occurrence table over all notes and tweets, and its total count; both are read by later cells
master_class, total_ct = mainPMIStats({}, word_list)

In [None]:
master_class

In [None]:
def npmi(w1, w2, vectors, vector_ct):
    """
    Normalised pointwise mutual information (NPMI) of two words.

    INPUT:
        w1, w2: the two words to compare; both must be keys of `vectors`
        vectors: dict mapping a key word to a dict/Counter of its neighbour counts
        vector_ct: total count used as the denominator of every probability
    OUTPUT: PMI(w1, w2) / -log p(w1, w2); evaluates to -1 when the pair never co-occurs
    """
    eps = 10**(-12)  # keeps both logarithms finite when a probability is 0
    # .get: a row without w2 counts as 0 even when the row is a plain dict rather than a Counter
    w1w2_dc = vectors[w1].get(w2, 0) / vector_ct
    w1_dc = sum(vectors[w1].values()) / vector_ct
    w2_dc = sum(vectors[w2].values()) / vector_ct

    pmi_w1w2 = np.log(w1w2_dc / ((w1_dc * w2_dc) + eps) + eps)
    # dividing by -log p(w1, w2) is the normalisation step
    return pmi_w1w2 / (-np.log(w1w2_dc + eps))

In [None]:
def avgClusterNPMI(cluster_words, stats, vector_ct=None):
    """
    Average NPMI of a cluster, averaged twice.

    For every word w1 the NPMI against every other word w2 of the cluster is averaged; the
    cluster score is then the mean of those per-word averages. Lau et al. appear to sum the
    NPMIs instead, but a sum would not stay within the interpretable [-1, 1] range.

    Input:
        cluster_words: list of words in a cluster
        stats: co-occurrence table handed to npmi()
        vector_ct: total count handed to npmi(); defaults to the notebook-level `total_ct`
    Output: (cluster average, {word: its average NPMI against the other words}).
        A word with no distinct partner is left out; when no word can be scored
        (empty, one-word or all-duplicate cluster) the average is NaN and the dict is empty.
    """
    if vector_ct is None:
        vector_ct = total_ct

    npmi_scores = {}
    for w1 in cluster_words:
        pair_scores = [npmi(w1, w2, stats, vector_ct) for w2 in cluster_words if w1 != w2]
        # a word without any distinct partner has nothing to average
        if pair_scores:
            npmi_scores[w1] = sum(pair_scores) / len(pair_scores)

    if not npmi_scores:
        return (float('nan'), npmi_scores)

    # averaging all words in a cluster
    res = sum(npmi_scores.values()) / len(npmi_scores)
    return (res, npmi_scores)

In [None]:
avgClusterNPMI(["biden", "putin", "russia", "usa"], master_class)

In [None]:
# Rows of cluster 0 from the run with numCluster=20, dimensions=100 and seed=932.
temp = clusters.loc[
    clusters['Cluster'].eq(0)
    & clusters['numCluster'].eq(20)
    & clusters['dimensions'].eq(100)
    & clusters['seed'].eq(932)
]


In [None]:
lst = temp.Word_Type.tolist()

In [None]:
avgClusterNPMI(lst, master_class)

In [None]:
# One group per clustering run, keyed by (numCluster, dimensions, seed).
# NOTE: this rebinds `df`, which until here held the test-set DataFrame read from test.csv.
df = clusters.groupby(['numCluster', 'dimensions', 'seed'])

In [None]:
# Score every clustering run: the run score is the mean cluster-average NPMI of its clusters.
run_data = []
omitted_clusters = []
for key, item in df:
    print(key)
    # `item` holds the rows of one (numCluster, dimensions, seed) run
    cluster_scores = []
    for cluster_id in item.Cluster.unique().tolist():
        cluster_words = item.loc[item['Cluster'] == cluster_id, 'Word_Type'].tolist()
        if len(cluster_words) < 2:
            # some clusters might only have one word. They're omitted from the analysis
            omitted_clusters.append((key, cluster_id))
            continue
        # avgClusterNPMI returns (cluster average, per-word averages); only the average is summed
        cluster_scores.append(avgClusterNPMI(cluster_words, master_class)[0])
    # average over the clusters that were actually scored, so omitted ones do not dilute the mean
    run_score = sum(cluster_scores) / len(cluster_scores) if cluster_scores else float('nan')
    run_data.append([key[0], key[1], key[2], run_score])
    

In [None]:
run_data

In [None]:
results = pd.DataFrame(run_data, columns=['NClusters', 'NDims', 'RandomSeed', "Score"])

In [None]:
# Mean run score per (NClusters, NDims), averaged over the random seeds.
results.groupby(['NClusters', 'NDims'])['Score'].mean().reset_index()

In [None]:
total_ct

In [None]:
# Collapse to one row per Cluster id holding the list of its Word_Type values.
# NOTE(review): grouping on 'Cluster' alone pools the words of every (numCluster, dimensions,
# seed) run that shares the id — confirm this is intended. It also rebinds `clusters`, so cells
# above that expect the long per-word frame cannot be re-run after this one.
clusters = clusters.groupby(['Cluster']).agg({"Word_Type": list})

In [None]:
# NOTE(review): `test` is not assigned anywhere in the visible notebook, so this cell raises
# NameError on a fresh kernel — presumably a leftover; confirm, then remove or replace it.
test

In [None]:
cluster_ids = len(clusters)

# Average-of-averages NPMI for every row of `clusters` (the same scheme as avgClusterNPMI).
avg_npmis = []
for i in range(cluster_ids):
    cluster_words = clusters.iloc[i]['Word_Type']

    npmi_scores = {}
    for w1 in cluster_words:
        pair_scores = [npmi(w1, w2, master_class, total_ct) for w2 in cluster_words if w1 != w2]
        # a word with no distinct partner has nothing to average and is skipped
        if pair_scores:
            # taking the average of every w1 against every other w2
            npmi_scores[w1] = sum(pair_scores) / len(pair_scores)

    # averaging all words in a cluster; NaN when no word could be scored
    res = sum(npmi_scores.values()) / len(npmi_scores) if npmi_scores else float('nan')

    # (row position of the cluster, its score), rounded to 5 decimals
    avg_npmis.append(np.around((i, res), 5))
            
    

In [None]:
clusters.iloc[5]

In [None]:
avg_npmis

In [None]:
avg_npmis