# Import lots of stuff

In [1]:
import sys
sys.path.append('../')

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import math
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import textacy
import textacy.io
from zss import simple_distance
from nltk import Tree
#from utils import nlp_utils as utils
#from nltk import bigrams
from collections import Counter
#from pycorenlp import StanfordCoreNLP
%matplotlib inline
pd.set_option('display.max_rows', 500)

Use spaCy's large English model (`en_core_web_lg`).

In [175]:
import spacy
from spacy.lang.en import English
nlp = spacy.load('en_core_web_lg')

# Preprocess data for analyses

## Import annotated file

In [176]:
# Read the annotated message log; d_raw stays untouched and d is the working copy
d_raw = pd.read_csv('../../data/tangrams.csv')
d = d_raw.copy()

## Get list of all game ids

In [177]:
# Unique game ids, in order of first appearance in the data.
# Series.unique() replaces pd.unique(d.gameid.ravel()): Series.ravel() is deprecated in
# recent pandas releases, and the resulting list is the same.
gameidList = d['gameid'].unique().tolist()
print(gameidList[0:5])
print(len(gameidList))

['7026-d36b3d5a-1ba3-403c-bcbd-31d0d1a0aa6b', '1567-e80221b4-f349-4b3b-9921-224ee47eea1f', '1670-7a1fc24b-6599-4efb-8a98-22ce7368261a', '5108-ab2d0b72-034a-4f00-991a-416faf93c98c', '3419-7061f84e-30ba-48dc-9bc3-133c5fdf8fa3']
137


## Get list of all tangram names

In [178]:
# The twelve tangram labels are the single letters A through L
tangramList = list('ABCDEFGHIJKL')
print(tangramList)

['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L']


# What are the most common words & phrases to reduce?

Strangely, spaCy has no n-gram function, so we use textacy, a convenience wrapper around spaCy.

In [179]:
# One textacy Doc per message, built from the lower-cased, preprocessed message text.
# NOTE(review): the saved output of this cell is a KeyboardInterrupt, so this step may be
# slow on the full data set — confirm before relying on Run All.
d['doc'] = [textacy.Doc(textacy.preprocess_text(row, lowercase = True), lang='en') for row in d['contents']]

KeyboardInterrupt: 

In [84]:
# Build one n-gram column per size (1 -> 'unigrams', 2 -> 'bigrams', 3 -> 'trigrams').
# Terms are lower-cased strings; stop words are kept and named entities are not added.
for ngram_size, ngram_col in enumerate(['unigrams', 'bigrams', 'trigrams'], start=1):
    d[ngram_col] = [
        list(doc.to_terms_list(ngrams=ngram_size, as_strings=True, normalize='lower',
                               filter_stops=False, named_entities=False))
        for doc in d['doc']
    ]

In [88]:
def getCounts(countType, df, gameid, repetitionNum, tangram = None) :
    """Count the items of one n-gram column for a single game and repetition.

    Parameters
    ----------
    countType : str
        Name of a column of `df` whose cells are lists (e.g. 'unigrams').
    df : pandas.DataFrame
        Must have 'gameid' and 'repetitionNum' columns, plus 'tangramRef'
        when `tangram` is given.
    gameid : str
        Game to select.
    repetitionNum : int or str
        Repetition to select; it is interpolated into the query as a number.
    tangram : str, optional
        If given, additionally restrict to rows whose 'tangramRef' equals it.

    Returns
    -------
    collections.Counter
        Item -> number of occurrences across all selected rows (empty if no
        row matches).
    """
    # str() lets callers pass the repetition either as 6 or as '6'; the original
    # string concatenation raised TypeError for an int.
    conditions = ['repetitionNum == ' + str(repetitionNum),
                  'gameid == "' + gameid + '"']
    if tangram is not None :
        conditions.append('tangramRef == "' + tangram + '"')
    relevantRows = df.query(" and ".join(conditions))
    # flatten the per-row lists into a single stream of items
    return Counter(item for sublist in relevantRows[countType] for item in sublist)

# For each n-gram size, write how often every frequent n-gram (more than 20 uses overall)
# occurs in each game on each of the six repetitions.
for countType in ['unigrams', 'bigrams', 'trigrams'] :
    countDict = Counter(item for sublist in d[countType] for item in sublist)
    wordList = [v for (v, count) in countDict.items() if count > 20]

    # newline='' is what the csv module expects for files handed to csv.writer; without it
    # extra blank rows appear on platforms that translate line endings. The other csv
    # writers in this notebook already open their files this way.
    with open(countType + 'Counts.csv', 'w', newline='') as outfile:
        writer = csv.writer(outfile)
        writer.writerow(['gameid', 'repetitionNum', 'word', 'count'])
        for gameid in gameidList:
            for repetitionNum in ['1', '2', '3', '4', '5', '6'] :
                counts = getCounts(countType, d, gameid, repetitionNum)
                # Counter returns 0 for n-grams that were not used in this game/repetition
                for word in wordList :
                    writer.writerow([gameid, repetitionNum, word, counts[word]])

# Extract parts of speech

In [180]:
from spacy.attrs import POS

In [181]:
# Parse every message with the spaCy pipeline; this overwrites the 'doc' column.
# nlp.pipe streams the texts through the pipeline in batches, which is faster than
# calling nlp() once per row and yields the same Doc objects in the same order.
d['doc'] = list(nlp.pipe(d['contents']))

## Get counts for each POS label

In [182]:
# Map each coarse POS name found in the data (e.g. 'NOUN') to spaCy's integer id for it
tag_dict = {}
for doc in d['doc'] :
    for w in doc :
        # Keys are the string names, so membership has to be tested with w.pos_.
        # The original tested the integer w.pos, which never matched a key, so the
        # guard was always true and every token re-wrote its entry.
        if w.pos_ not in tag_dict :
            tag_dict[w.pos_] = w.pos
# Per-message counts keyed by integer POS id
d['posCounts'] = [doc.count_by(POS) for doc in d['doc']]
print(tag_dict)
# One '<TAG>count' column per tag of interest; a tag absent from a message counts as 0
for posStr in ["NOUN", "DET", "PRON", "VERB", "ADJ", "CCONJ", "ADP"] :
    key_id = tag_dict[posStr]
    d[posStr + 'count'] = [counts.get(key_id, 0) for counts in d['posCounts']]

{'INTJ': 90, 'PUNCT': 96, 'PRON': 94, 'VERB': 99, 'PART': 93, 'ADP': 84, 'NOUN': 91, 'ADV': 85, 'DET': 89, 'ADJ': 83, 'CCONJ': 88, 'NUM': 92, 'SYM': 98, 'SPACE': 102, 'PROPN': 95, 'X': 100}


In [183]:
# Per-message token strings, coarse POS names, noun chunks and token counts,
# all read off the spaCy docs stored in d['doc']
d['tokens'] = [[tok.text for tok in parsed] for parsed in d['doc']]
d['pos'] = [[tok.pos_ for tok in parsed] for parsed in d['doc']]
d['noun_chunks'] = [list(parsed.noun_chunks) for parsed in d['doc']]
d['numWords'] = [len(parsed) for parsed in d['doc']]

In [184]:
# Spot-check one message: both lists are derived from the same doc, so tag i belongs to token i
print(d['pos'][3])
print(d['tokens'][3])

['DET', 'NOUN', 'ADV', 'VERB', 'ADP', 'PRON', 'VERB', 'VERB', 'DET', 'ADJ', 'ADV', 'CCONJ', 'VERB', 'ADP', 'DET', 'NOUN', 'PART']
['This', 'one', 'kinda', 'looks', 'like', 'they', 'are', 'looking', 'a', 'little', 'down', 'and', 'kneeling', 'with', 'both', 'arms', 'out']


In [186]:
# List the columns currently present on d
d.columns

Index(['gameid', 'msgTime', 'trialNum', 'repetitionNum', 'role',
       'intendedName', 'timeElapsed', 'contents', 'totalLength', 'thinksHuman',
       'comments', 'ratePartner', 'nativeEnglish', 'time', 'intendedObj',
       'clickedObj', 'objBox', 'correct', 'numRawWords', 'repetitionScore',
       'taskVersion', 'doc', 'posCounts', 'NOUNcount', 'DETcount', 'PRONcount',
       'VERBcount', 'ADJcount', 'CCONJcount', 'ADPcount', 'tokens', 'pos',
       'noun_chunks', 'numWords', 'nouns'],
      dtype='object')

## Export to csv for plotting in R

In [188]:
# Drop the two columns that should not be exported and write the rest for plotting in R.
# `columns=` replaces the positional axis argument, which pandas 2.0 no longer accepts.
# errors="ignore" tolerates a missing 'nouns' column: no visible cell of this notebook
# creates it, so a fresh Run All would otherwise stop here with a KeyError.
(d.drop(columns=["tokens", "nouns"], errors="ignore")
 .to_csv("posTagged.csv", index = False))

# Extract constituency parses

In [189]:
from allennlp.models.archival import load_archive
from allennlp.predictors import Predictor
# Location of the model archive handed to load_archive
parser_archive_url = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo-constituency-parser-2018.03.14.tar.gz"
archive = load_archive(parser_archive_url)
# Wrap the loaded archive in the 'constituency-parser' predictor used by the cells below
predictor = Predictor.from_archive(archive, 'constituency-parser')

<class 'allennlp.modules.text_field_embedders.text_field_embedder.TextFieldEmbedder'>
<class 'allennlp.modules.span_extractors.span_extractor.SpanExtractor'>
<class 'allennlp.modules.seq2seq_encoders.seq2seq_encoder.Seq2SeqEncoder'>
<class 'allennlp.modules.feedforward.FeedForward'>
<class 'allennlp.modules.token_embedders.embedding.Embedding'>
<class 'allennlp.nn.initializers.InitializerApplicator'>


## Compute edit distances on successive rounds

In [190]:
def label_distance (label1, label2) :
    """Unit relabel cost: 0 when the two labels compare equal, 1 otherwise."""
    return 0 if label1 == label2 else 1

def get_root(doc) :
    """Return a shallow copy of `doc` with the key 'nodeType' renamed to 'label'.

    All other keys and all values are carried over unchanged.
    """
    return {('label' if key == 'nodeType' else key): value
            for key, value in doc.items()}

def get_children(subtree) :
    """Return the children of `subtree`, each with 'nodeType' renamed to 'label'.

    A node without a 'children' key is a leaf and yields an empty list. The
    child dicts are shallow copies; the input is not modified.
    """
    if 'children' not in subtree.keys() :
        return []
    renamed = []
    for child in subtree['children'] :
        renamed.append({('label' if key == 'nodeType' else key): value
                        for key, value in child.items()})
    return renamed

def get_label(node) :
    """Return the value stored under the node's 'label' key."""
    label = node['label']
    return label

def edit_distance(tree1, tree2, return_operations = False) :
    """Tree edit distance between two parse-tree dicts, computed by zss.simple_distance.

    Both roots are first passed through get_root; children and labels are read
    with get_children / get_label, and relabelling is costed by label_distance.
    `return_operations` is forwarded unchanged to simple_distance.
    """
    root1 = get_root(tree1)
    root2 = get_root(tree2)
    return simple_distance(root1, root2,
                           get_children=get_children,
                           get_label=get_label,
                           label_dist=label_distance,
                           return_operations=return_operations)

def example() :
    """Parse two toy sentences and print their trees, the edit operations and the cost."""
    s1 = predictor.predict_json({"sentence": "you are a dog with a big bone"})
    s2 = predictor.predict_json({"sentence": "I am a cat"})
    print('tree1:', Tree.fromstring(s1['trees']))
    print('tree2:', Tree.fromstring(s2['trees']))
    # edit_distance walks dicts that carry 'nodeType' / 'children', i.e. the hierplane
    # root that the parsing loop in this notebook stores, not the whole prediction dict
    # that was passed here originally.
    root1 = s1['hierplane_tree']['root']
    root2 = s2['hierplane_tree']['root']
    # One call yields both values; the original ran the comparison twice.
    cost, operations = edit_distance(root1, root2, return_operations=True)
    print('operations:', operations)
    print('cost:', cost)

In [211]:
# msgTime values that are shared by more than one row
vc = d['msgTime'].value_counts()
vc.loc[vc.gt(1)].index.tolist()
#set([x for x in d['msgTime'] if d['msgTime'].count(x) > 1])

[1459990132712.0]

In [214]:
from IPython.display import clear_output

# Director messages only, restricted to the columns needed downstream.
# .copy() makes tiny_d an independent frame, so the columns that later cells assign to it
# do not go through a slice of d (which triggers pandas' SettingWithCopyWarning).
tiny_d = d.query('role == "director"')[['gameid', 'msgTime', 'repetitionNum', 'contents']].copy()
parses = []
n_messages = len(tiny_d['contents'])
for i, s in enumerate(tiny_d['contents']) :
    # replace the previous progress line instead of piling up one line per message
    clear_output(wait=True)
    print(i, '/', n_messages)
    # keep only the root of the hierplane tree of each constituency parse
    parses.append(predictor.predict_json({'sentence' : s})['hierplane_tree']['root'])

13314 / 13315


In [215]:
# Attach the parses; `parses` was filled by iterating tiny_d['contents'], so the order matches
tiny_d['tree_parse'] = parses

In [217]:
# Preview the first rows, including the new tree_parse column
tiny_d.head()

Unnamed: 0,gameid,msgTime,repetitionNum,contents,tree_parse
0,7026-d36b3d5a-1ba3-403c-bcbd-31d0d1a0aa6b,1490375000000.0,1,"Hello, I am going to describe them as people a...","{'word': 'Hello , I am going to describe them ..."
1,7026-d36b3d5a-1ba3-403c-bcbd-31d0d1a0aa6b,1490375000000.0,1,Is that okay?,"{'word': 'Is that okay ?', 'nodeType': 'SQ', '..."
3,7026-d36b3d5a-1ba3-403c-bcbd-31d0d1a0aa6b,1490375000000.0,1,This one kinda looks like they are looking a l...,{'word': 'This one kinda looks like they are l...
4,7026-d36b3d5a-1ba3-403c-bcbd-31d0d1a0aa6b,1490375000000.0,1,wearing a dress maybe.,"{'word': 'wearing a dress maybe .', 'nodeType'..."
5,7026-d36b3d5a-1ba3-403c-bcbd-31d0d1a0aa6b,1490375000000.0,1,Good job1 This one is sitting down,"{'word': 'Good job1 This one is sitting down',..."


In [284]:
def finditem(obj, value, initLevel = True):
    """Report whether a node of type `value` occurs in the tree `obj`.

    `obj` is a dict with a 'nodeType' key and, optionally, a list of child
    dicts under 'children'. While `initLevel` is true the node itself is not
    allowed to match, so by default only nodes below the root are considered.

    Returns True when a match is found and None (not False) otherwise.
    """
    is_match = obj['nodeType'] == value
    if is_match and not initLevel:
        return True
    if 'children' in obj :
        # depth-first search; stop at the first subtree that contains a match
        for child in obj['children'] :
            if finditem(child, value, initLevel = False) is not None:
                return True
    return None
# One column per constituent type: True when the parse contains that node type below
# its root, None otherwise
for node_type in ['SBAR', 'PP', 'CC'] :
    tiny_d[node_type] = [finditem(s, node_type) for s in tiny_d['tree_parse']]

In [289]:
# Full frame (including the nested parse dicts) goes to JSON; the flat tag columns go to CSV.
tiny_d.to_json('constituency_parses.json')
# `columns=` replaces the positional axis argument, which pandas 2.0 no longer accepts
tiny_d.drop(columns='tree_parse').to_csv('constituency_tags.csv')

In [53]:
# Reload the saved parses so this cell does not depend on re-running the parser.
# NOTE(review): this cell reads 'intendedName' and 'occurrenceNum', while the tiny_d built
# earlier in this notebook only selects gameid, msgTime, repetitionNum and contents —
# confirm that the JSON on disk actually has these columns.
tiny_d = pd.read_json('constituency_parses.json')

gameids = np.unique(tiny_d['gameid'])
tangramids = np.unique(tiny_d['intendedName'])

# transitions[i, j, k] = largest edit distance between any parse of tangram j at
# occurrence k+1 and any parse at occurrence k+2 within game i
# (NaN when either occurrence has no parses)
transitions = np.zeros([len(gameids), len(tangramids) ,5])
occurrence_query = 'intendedName == "{0}" and occurrenceNum == {1}'
for i, gameid in enumerate(gameids) :
    print(gameid)
    game_d = tiny_d.query('gameid == "{0}"'.format(gameid))
    print(i, '/', len(gameids))
    for j, intendedName in enumerate(tangramids) :
        for k, init_occurrenceNum in enumerate(range(1,6)) :
            sub1 = game_d.query(occurrence_query.format(intendedName, init_occurrenceNum))['tree_parse']
            sub2 = game_d.query(occurrence_query.format(intendedName, init_occurrenceNum+1))['tree_parse']
            # all pairwise distances between the two occurrences
            dist = [edit_distance(tree1, tree2) for tree1 in sub1 for tree2 in sub2]
            if not dist :
                # nothing to compare: show both sides so the gap can be traced
                print(sub1)
                print(sub2)
                print(gameid, intendedName, init_occurrenceNum)
            transitions[i, j, k] = np.max(dist) if dist else np.nan

0057-414228f8-c268-40d6-9349-b35df4f080d9
0 / 91
0349-951c1418-40e9-48b3-8290-7ed4461f4d54
1 / 91
0413-e4a76b36-4367-4e30-abf9-93e823913630
2 / 91
Series([], Name: tree_parse, dtype: object)
6843    {'word': 'looks like a man with both arms rais...
6844    {'word': 'leaning left', 'nodeType': 'S', 'att...
Name: tree_parse, dtype: object
0413-e4a76b36-4367-4e30-abf9-93e823913630 C 1
0461-f522f8f4-37dc-4bb0-89bf-9f6bcf43274a
3 / 91
0711-b03679d3-9904-4263-bd2f-8ec8e7a45af7
4 / 91
0723-9b842133-a121-4b53-9c37-784e6023e022
5 / 91
0785-68fedef2-3b2f-466f-a7c3-ab464e5811ca
6 / 91
1202-a64916b2-49d2-4ca4-bd76-cfd3e1ec3954
7 / 91
1242-e6fcc1c9-ab33-44c7-b31c-4a391beb2b8c
8 / 91
1526-db8d7e61-c668-4fe2-9119-cbf018ccb1c1
9 / 91
1567-e80221b4-f349-4b3b-9921-224ee47eea1f
10 / 91
1598-17655913-31a4-4ca6-b86c-383db06bfde9
11 / 91
Series([], Name: tree_parse, dtype: object)
9100    {'word': 'very plain figure', 'nodeType': 'NP'...
Name: tree_parse, dtype: object
1598-17655913-31a4-4ca6-b86c-383db06bf

In [None]:
# One list per message, holding whatever utils.stanford_parsetree yields for its text.
# NOTE(review): the `utils` import at the top of this notebook is commented out, so `utils`
# is undefined here unless it is imported somewhere not shown — confirm before running.
d['parse'] = [[w for w in utils.stanford_parsetree(text)] for text in d['contents']]

In [None]:
# Build an nltk Tree from the first stored parse string of entry 10
Tree.fromstring(d['parse'][10][0])

In [55]:
# Average over games (axis 0), ignoring NaNs: one row per tangram, one column per transition
np.nanmean(transitions, axis=0)

array([[24.04395604, 14.35164835,  8.04395604,  4.52222222,  4.76666667],
       [25.03333333, 16.68131868, 11.64444444,  7.91111111,  6.18888889],
       [15.49438202,  7.97777778,  5.17582418,  2.78021978,  3.74444444],
       [19.7       , 12.07692308,  7.52747253,  4.3       ,  3.62222222],
       [17.43956044,  7.94505495,  4.86666667,  2.54444444,  2.37777778],
       [18.54444444, 14.20879121,  7.93406593,  5.24175824,  3.63333333],
       [25.16853933, 16.83333333,  9.44444444,  6.24444444,  6.2       ],
       [21.65555556, 11.71111111,  6.42222222,  4.56043956,  3.52747253],
       [21.47777778, 13.18681319,  7.74725275,  5.36666667,  3.55555556],
       [22.21348315, 13.64835165,  7.92222222,  5.65555556,  3.96666667],
       [24.76666667, 17.88764045, 11.9       ,  7.58888889,  6.22222222],
       [21.03333333, 13.69230769,  9.34065934,  6.6043956 ,  5.67032967]])

# Result 2: Calculate indicator words for tangrams/rounds

## First, get list of words in first round

In [None]:
# Keep only the first-occurrence rows.
# NOTE(review): 'occurrenceNum' and 'lemmas' are not among the columns that d.columns
# lists earlier in this notebook — confirm they exist before running this cell.
d_round1 = d[d['occurrenceNum'] == 1]

# Flatten every token list into one count dictionary
tokenDict = Counter(token for tokens in d_round1['tokens'].tolist() for token in tokens)
# Keep words seen more than once that are not made up of digits only
wordList = [word for (word, count) in tokenDict.items() if count > 1 and not word.isdigit()]
print(wordList[0:10])
print(len(wordList))

# word -> tag map built from the (word, tag) pairs stored in 'lemmas'. It can be larger than
# wordList because it has no count > 1 filter; later pairs overwrite earlier ones.
POSdict = {word: tag for lemma in d_round1['lemmas'] for (word, tag) in lemma}
print(len(POSdict.keys()))

## Helper functions to select words & counts

In [None]:
def getWordCounts(df, gameid, occurrenceNum, tangram = None) :
    """Count the tokens used in one game on one occurrence (round).

    Parameters
    ----------
    df : pandas.DataFrame
        Must have 'gameid', 'occurrenceNum' and a list-valued 'tokens' column,
        plus 'intendedObj' when `tangram` is given.
    gameid : str
        Game to select.
    occurrenceNum : int or str
        Occurrence to select; it is interpolated into the query as a number.
    tangram : str, optional
        If given, additionally restrict to rows whose 'intendedObj' equals it.

    Returns
    -------
    collections.Counter
        Token -> number of occurrences across the selected rows (empty if no
        row matches).
    """
    # str() lets callers pass the round either as 1 or as "1"; the original string
    # concatenation raised TypeError for an int.
    conditions = ['occurrenceNum == ' + str(occurrenceNum),
                  'gameid == "' + gameid + '"']
    if tangram is not None :
        conditions.append('intendedObj == "' + tangram + '"')
    relevantRows = df.query(" and ".join(conditions))
    # flatten the per-row token lists into a single stream
    return Counter(item for sublist in relevantRows['tokens'].tolist()
                   for item in sublist)

# Lists the distinct words used in round n for a given tangram and gameid
def selectTangramRoundWords(df, tangram, roundNum, gameid):
    """Return the distinct tokens counted by getWordCounts for this game, round and tangram."""
    return list(getWordCounts(df, gameid, roundNum, tangram))

## Helper function to compute PMIs

Note that merging is really costly -- if we need to speed it up, this might be the first target. Can also vectorize the log operation.

In [None]:
# Returns a table of the words used for one tangram with their counts and log10 PMI,
# where PMI compares p(word | tangram) against p(word) taken from `totals`.
def makeMyPMI(df, tangram, roundNum, gameid, totals):
    """Build the per-word PMI table for one game, round and tangram.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame accepted by getWordCounts.
    tangram, roundNum, gameid
        Selection passed on to getWordCounts.
    totals : dict
        'counts' maps word -> overall count and 'numWords' is the overall
        number of words that the ratio is normalised by.

    Returns
    -------
    pandas.DataFrame
        Columns 'word', 'count', 'totalCount', 'logPMI'; one row per word that
        occurs both for this tangram and in totals['counts'].
    """
    # count words within this tangram
    tangramCounts = getWordCounts(df, gameid, roundNum, tangram)

    # total number of words; fall back to 1 so an empty selection cannot divide by zero
    tangramNumWords = sum(tangramCounts.values()) or 1

    # inner join: only words present on both sides get a PMI
    indicatorDF = pd.merge(
        pd.DataFrame(list(tangramCounts.items()), columns=['word', 'count']),
        pd.DataFrame(list(totals["counts"].items()), columns=['word', 'totalCount']),
        on='word', how='inner')

    # ratio first, log afterwards; the log is taken on the whole column at once instead of
    # row by row, and the temporary ratio never becomes a column that has to be dropped
    # again (the old positional-axis drop() call is rejected by pandas 2.0)
    ratio = ((indicatorDF['count'] / tangramNumWords)
             / (indicatorDF['totalCount'] / totals["numWords"]))
    indicatorDF['logPMI'] = np.log10(ratio.astype(float))

    return indicatorDF

## Write out PMIs & matching rates for all words

Note that we do a sloppy optimization by only computing total counts once and only when necessary 

In [None]:
def memoize(d, gameid, counts) :
    """Fill `counts` with the round-"1" totals of `gameid` unless it already has them.

    When the key "counts" is missing, stores the word counts under "counts" and
    their sum (as a float) under "numWords". The same dict is returned either way.
    """
    if "counts" in counts :
        return counts
    counts["counts"] = getWordCounts(d, gameid, "1")
    counts["numWords"] = float(sum(counts["counts"].values()))
    return counts

# For every candidate word: across all (game, tangram) pairs whose round-1 description uses
# the word, write the share that still use it in round 6, the mean log PMI, and the pair count.
with open('sequential_matchAndPMI.csv', 'a', newline='') as outfile:
    writer = csv.writer(outfile)
    # The file is opened in append mode, so write the header only while it is still empty;
    # otherwise every re-run inserts another header row in the middle of the data.
    if outfile.tell() == 0 :
        writer.writerow(['word', 'POS', 'match', 'pmi', 'total'])
    for word in wordList :
        print(word + ":" + POSdict[word])
        pmi = 0
        match = 0
        total = 0
        for gameid in gameidList:
            # per-game totals are computed once, on the first tangram, then reused
            memoizedCounts = {}
            for tangram in tangramList:
                memoizedCounts = memoize(d, gameid, memoizedCounts)
                round1WordList = selectTangramRoundWords(d, tangram, "1", gameid)
                if word not in round1WordList :
                    continue
                total = total + 1
                PMI_df = makeMyPMI(d, tangram, "1", gameid, memoizedCounts)
                pmi = pmi + PMI_df[PMI_df['word'] == word]['logPMI'].tolist()[0]
                round6WordList = selectTangramRoundWords(d, tangram, "6", gameid)
                if word in round6WordList :
                    match = match + 1
        if total == 0 :
            # The word was not found under any (game, tangram) pair scanned above, so
            # there is no rate to report; the original divided by zero here.
            writer.writerow([word, POSdict[word], float('nan'), float('nan'), total])
        else :
            writer.writerow([word, POSdict[word], float(match) / float(total), pmi/total, total])


Also write out raw word counts on each round (so we can see which words are most likely to be dropped).

In [None]:
# print is a function in Python 3 (the rest of this notebook already uses it that way);
# the bare `print row` statement was a SyntaxError.
# NOTE: iterating a DataFrame directly yields its column labels, not its rows.
for row in d :
    print(row)

TODO: exclude numbers earlier in the pipeline.

TODO: don't average over matches and PMIs.

In [None]:
# Number of null (random-word) samples drawn for each game/tangram pair
numSamples = 1000

# Grab the word(s) with the highest PMI for a given tangram/gameid
def highestPMIWords(d, tangram, roundNum, gameid):
    """Return the list of words that tie for the highest log PMI.

    The overall counts used as the PMI baseline always come from round "1" of
    `gameid`, whatever `roundNum` is; `roundNum` only selects the tangram-level
    counts. Words whose POSdict entry is "CD" are excluded.

    Returns an empty list when no word is left. The original returned the
    empty DataFrame in that case, and iterating a DataFrame yields its column
    names, so callers that loop over the result scored those names as words.
    """
    allTangramCounts = {}
    allTangramCounts['counts'] = getWordCounts(d, gameid, "1")
    allTangramCounts['numWords'] = float(sum(allTangramCounts["counts"].values()))

    PMIdf = makeMyPMI(d, tangram, roundNum, gameid, allTangramCounts)

    # Remove numbers
    # NOTE(review): "CD" is the Penn Treebank number tag; confirm POSdict uses that tag set
    PMIdf['POS'] = [POSdict[word] for word in PMIdf['word']]
    PMIdf = PMIdf.query('POS != "CD"')

    if len(PMIdf.index) == 0:
        return []

    # keep every word that ties for the maximum
    maxPMI = PMIdf['logPMI'].values.max()
    return PMIdf.loc[PMIdf['logPMI'] == maxPMI, 'word'].tolist()

# Fixed seed so the null samples, and therefore PMIbootstrap.csv, are identical across runs;
# the original drew from the unseeded global generator.
bootstrapRng = np.random.RandomState(0)

# For each game/tangram pair with any round-1 words: one "highest" row (do the top-PMI
# round-1 words survive to round 6?) followed by numSamples "null" rows that ask the
# same question of a randomly chosen round-1 word.
with open('PMIbootstrap.csv', 'w', newline='') as outfile:
    writer = csv.writer(outfile)
    writer.writerow(['sampleNum', 'tangram', 'gameid', 'numCandidates', 'match', 'highest'])
    for gameid in gameidList :
        for tangram in tangramList :
            round1Words = selectTangramRoundWords(d, tangram, "1", gameid)
            if len(round1Words) > 0:
                # First, write highest PMI match (sampleNum 0)
                highPMIWords = highestPMIWords(d, tangram, "1", gameid)
                round6Words = selectTangramRoundWords(d, tangram, "6", gameid)
                match = np.mean([1 if word in round6Words else 0 for word in highPMIWords ])
                writer.writerow([0, tangram, gameid, len(highPMIWords), match, "highest"])

                # Next, take a bunch of null samples (sampleNum 1..numSamples)
                for i in range(numSamples) :
                    randomWord = bootstrapRng.choice(round1Words)
                    match = np.mean([1 if randomWord in round6Words else 0])
                    writer.writerow([i + 1, tangram, gameid, 1, match, "null"])