In [None]:
import sys
import os 
import csv

import numpy as np
import pandas as pd
from collections import Counter

import spacy
from spacy.lang.en import English
from spacy.attrs import POS

# if missing, run `python -m spacy download en_core_web_lg`
nlp = spacy.load('en_core_web_lg')

from utils.nlp_utils import makeMyPMI, memoize, lemmatize_doc
from gensim.models.tfidfmodel import TfidfModel
from gensim.corpora import Dictionary
from gensim.matutils import sparse2full

sys.path.append('../')
%load_ext autoreload
%autoreload 2

# Import & pre-process dataframe

In [None]:
# Read the raw utterances, then attach two derived columns:
#   'text'   -- the result of calling `nlp` on each entry of 'contents'
#   'lemmas' -- the result of `lemmatize_doc` on each parsed entry
version_to_use = 'tangramsSequential_collapsed'
d_raw = pd.read_csv('../../data/{}.csv'.format(version_to_use))
d_raw['text'] = list(map(nlp, d_raw['contents']))
d_raw['lemmas'] = list(map(lemmatize_doc, d_raw['text']))

In [None]:
# Expand to the full gameid x intendedName x repetitionNum grid: combinations
# absent from the raw data become rows whose remaining columns are NaN.
d = d_raw.set_index(['gameid', 'intendedName', 'repetitionNum'])
mux = pd.MultiIndex.from_product(d.index.levels, names=d.index.names)
d = d.reindex(mux).reset_index()

## Compute tf-idf values

In [None]:
def _is_missing(doc):
    """Return True when a 'lemmas' cell is a NaN placeholder rather than a token collection."""
    return bool(np.any(pd.isna(doc)))


# One record per (game, tangram, vocabulary word):
#   [gameid, intendedName, word, tf-idf of the word in the round-1 description,
#    whether the word also appears in the round-6 description]
rows = []
tangrams_list = ['A','B','C','D','E','F','G','H','I','J','K','L']
for name, initial_df in d.query('repetitionNum == 1').groupby('gameid'):
    initial_df = initial_df.sort_values('intendedName').reset_index()
    # Boolean mask instead of a string-formatted query: works for any gameid
    # dtype and for ids that contain quote characters.
    final_df = (
        d[(d['repetitionNum'] == 6) & (d['gameid'] == name)]
        .sort_values('intendedName')
        .reset_index()
    )

    # Pair initial and final descriptions by tangram name, not by row position.
    final_by_tangram = dict(zip(final_df['intendedName'], final_df['lemmas']))
    initial_docs = list(initial_df['lemmas'])
    final_docs = [final_by_tangram.get(tangram, np.nan)
                  for tangram in initial_df['intendedName']]

    # Missing rows are detected on the same column the corpus is built from,
    # so the NaN padding below always lines up with the skipped documents.
    has_initial = np.array([not _is_missing(doc) for doc in initial_docs], dtype=bool)

    # Vocabulary = every lemma seen in either round. Each tangram contributes
    # one combined document; a missing side is simply left out instead of
    # discarding the whole pair.
    vocab_docs = []
    for pair in zip(initial_docs, final_docs):
        present = [doc for doc in pair if not _is_missing(doc)]
        if present:
            vocab_docs.append([token for doc in present for token in doc])
    docs_dict = Dictionary(vocab_docs)
    words = [docs_dict[word_id] for word_id in range(len(docs_dict))]

    # tf-idf is fitted on the round-1 descriptions only.
    docs_corpus = [docs_dict.doc2bow(doc)
                   for doc, present in zip(initial_docs, has_initial) if present]
    if docs_corpus:
        model_tfidf = TfidfModel(docs_corpus, id2word=docs_dict, smartirs='bfn') # note: 'bfn' uses boolean for tf term
        valid_vecs = np.vstack([sparse2full(bow, len(docs_dict))
                                for bow in model_tfidf[docs_corpus]])
        # Rows with no round-1 description stay NaN.
        docs_vecs = np.full((len(initial_docs), len(docs_dict)), np.nan,
                            dtype=valid_vecs.dtype)
        docs_vecs[has_initial] = valid_vecs
    else:
        # No usable round-1 description in this game: every tf-idf is NaN.
        docs_vecs = np.full((len(initial_docs), len(docs_dict)), np.nan)

    # map back to df
    for i, row in initial_df.iterrows():
        final_doc = final_docs[i]
        final_lemmas = None if _is_missing(final_doc) else set(final_doc)
        for j, word in enumerate(words):
            # A missing round-6 description gives an undefined (NaN) match,
            # mirroring how a missing round-1 description gives a NaN tf-idf.
            match = np.nan if final_lemmas is None else (word in final_lemmas)
            rows.append([row['gameid'], row['intendedName'], word, docs_vecs[i, j], match])

In [None]:
# Turn the accumulated per-word records into a single frame.
PMI_df = pd.DataFrame(
    data=rows,
    columns=['gameid', 'intendedName', 'word', 'tf-idf', 'finalRoundMatch'],
)

In [None]:
# Persist the frame; the row index is written as the first (unnamed) column.
PMI_df.to_csv('../outputs/PMI.csv', index=True)

## Write out PMIs & matching rates for all words


In [None]:
# For every word, aggregate over all (game, tangram) pairs whose round-"1"
# word list contains it:
#   total -- number of such pairs
#   match -- fraction of those pairs where the word is also in the round-"6" list
#   pmi   -- mean of the word's 'logPMI' from makeMyPMI over those pairs
# NOTE(review): the inner helper calls do not take `word` as an argument, so
# they are repeated for every word; if the helpers are side-effect free their
# results could be cached per (gameid, tangram) -- confirm before changing.
with open('../outputs/sequential_matchAndPMI.csv', 'w', newline='') as outfile:
    writer = csv.writer(outfile)
    writer.writerow(['word', 'POS', 'match', 'pmi', 'total'])
    for word in wordList:
        pmi = 0
        match = 0
        total = 0
        for gameid in gameidList:
            memoizedCounts = {}
            for tangram in tangramList:
                memoizedCounts = memoize(d, gameid, memoizedCounts)
                round1WordList = selectTangramRoundWords(d, tangram, "1", gameid)
                if word not in round1WordList:
                    continue
                total += 1
                # Local name on purpose: assigning to `PMI_df` here would
                # overwrite the tf-idf frame built earlier in the notebook.
                pmi_table = makeMyPMI(d, tangram, "1", gameid, memoizedCounts)
                pmi += pmi_table.loc[pmi_table['word'] == word, 'logPMI'].tolist()[0]
                round6WordList = selectTangramRoundWords(d, tangram, "6", gameid)
                if word in round6WordList:
                    match += 1
        if total > 0:
            match_rate = float(match) / float(total)
            mean_pmi = pmi / total
        else:
            # Word never occurs in any round-1 list: the rates are undefined,
            # so write NaN instead of dividing by zero.
            match_rate = mean_pmi = float('nan')
        writer.writerow([word, POSdict[word], match_rate, mean_pmi, total])


In [None]:
# Number of null samples drawn per (game, tangram) by the bootstrap loop.
numSamples = 1000

def highestPMIWords(d, tangram, roundNum, gameid):
    """Return the word(s) with the highest logPMI for one tangram in one game.

    Parameters
    ----------
    d : data object forwarded unchanged to `getWordCounts` and `makeMyPMI`.
    tangram, roundNum, gameid : forwarded to `makeMyPMI` to select the PMI table.

    Returns
    -------
    list
        Every word whose 'logPMI' equals the table maximum (ties are all
        returned). An empty list when the PMI table has no rows, so callers
        can always iterate the result and take its `len`.

    Raises
    ------
    KeyError
        If a word in the PMI table has no entry in `POSdict`.
    """
    # NOTE(review): the baseline counts always come from round "1", even when
    # `roundNum` differs -- confirm this is intended.
    allTangramCounts = {'counts': getWordCounts(d, gameid, "1")}
    allTangramCounts['numWords'] = float(sum(allTangramCounts['counts'].values()))

    PMIdf = makeMyPMI(d, tangram, roundNum, gameid, allTangramCounts)

    # Tag each word with its part of speech. NOTE(review): the column is not
    # used for filtering here although the original comment said
    # "Remove numbers" -- confirm whether a POS-based filter is missing.
    PMIdf['POS'] = [POSdict[word] for word in PMIdf['word']]

    # Nothing to rank: return an empty list rather than the empty frame, so
    # the return type does not depend on the data.
    if len(PMIdf.index) == 0:
        return []

    maxPMI = PMIdf.logPMI.unique().max()
    return PMIdf.loc[PMIdf['logPMI'] == maxPMI, 'word'].tolist()

# For every (game, tangram) with a non-empty round-"1" word list, write:
#   * one "highest" row: share of the highest-PMI round-1 words that also
#     appear in the round-"6" word list;
#   * `numSamples` "null" rows: the same check for one randomly drawn
#     round-1 word each.
with open('../outputs/PMIbootstrap.csv', 'w', newline='') as outfile:
    writer = csv.writer(outfile)
    writer.writerow(['sampleNum', 'tangram', 'gameid', 'numCandidates', 'match', 'highest'])
    # Dedicated, seeded generator so re-running the cell reproduces the same
    # null samples without touching NumPy's global random state.
    bootstrapSeed = 0
    rng = np.random.RandomState(bootstrapSeed)
    for gameid in gameidList:
        for tangram in tangramList:
            round1Words = selectTangramRoundWords(d, tangram, "1", gameid)
            if len(round1Words) == 0:
                continue

            # First, write highest PMI match
            highPMIWords = highestPMIWords(d, tangram, "1", gameid)
            round6Words = selectTangramRoundWords(d, tangram, "6", gameid)
            if len(highPMIWords) > 0:
                match = np.mean([1 if word in round6Words else 0 for word in highPMIWords])
            else:
                # No candidate words: the match rate is undefined.
                match = np.nan
            writer.writerow([0, tangram, gameid, len(highPMIWords), match, "highest"])

            # Next, take a bunch of null samples
            for i in range(numSamples):
                randomWord = rng.choice(round1Words)
                match = float(randomWord in round6Words)
                writer.writerow([i + 1, tangram, gameid, 1, match, "null"])