<div id="toc"> </div>

# Import lots of stuff

In [2]:
import sys
sys.path.append('../')

In [3]:
%load_ext autoreload
%autoreload 2

In [15]:
import math
import csv
import os
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import scipy.spatial.distance as distance
import seaborn as sns
import spacy

import textacy
import textacy.io
from utils.nlp_utils import lemmatize_doc
from sklearn import manifold
from itertools import combinations
from collections import Counter
from spacy.lang.en import English
from spacy.attrs import POS

pd.set_option('display.max_rows', 500)

initialize spacy

In [5]:
nlp = spacy.load('en_core_web_lg')

# Preprocess data for analyses

In [6]:
version_to_use = 'tangramsSequential_collapsed'

## Import annotated file & tag

In [7]:
d_raw = pd.read_csv('../../data/{}.csv'.format(version_to_use))#.rename(index=str, columns={"contents": "text"})
d_raw['text'] = [nlp(text) for text in d_raw['contents']]

In [8]:
for s in (d_raw.iloc[-1].text.sents) :
    print(s)

man on one leg arm left and one leg right


## Run spellchecker (using conservative vectors)

In [None]:
from utils import nlp_utils as utils
conservative_vectors = spacy.load('en_core_web_sm')

In [None]:
print(len(conservative_vectors.vocab))

In [None]:
utils.building_spell_correction_dictionary(
    d_raw.query('taskVersion == "cued"'), 
    conservative_vectors.vocab, 
    []
)

## Get list of all game ids

In [9]:
d = d_raw.copy()

In [10]:
d_raw.columns

Index(['gameid', 'trialNum', 'repetitionNum', 'intendedName', 'contents',
       'numRawWords', 'correct', 'text'],
      dtype='object')

In [11]:
gameidList = pd.unique(d.gameid.ravel()).tolist()
print(gameidList[0:5])
print(len(gameidList))

['0057-414228f8-c268-40d6-9349-b35df4f080d9', '0349-951c1418-40e9-48b3-8290-7ed4461f4d54', '0413-e4a76b36-4367-4e30-abf9-93e823913630', '0461-f522f8f4-37dc-4bb0-89bf-9f6bcf43274a', '0711-b03679d3-9904-4263-bd2f-8ec8e7a45af7']
83


## Get list of all tangram names

In [12]:
tangramList = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L']
print(tangramList)

['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L']


# What are most common words & phrases to reduce? 

Strangely, spacy has no n-gram function, so we use textacy, a convenience wrapper around spacy

In [17]:
d['doc'] = [textacy.make_spacy_doc(row, lang='en_core_web_lg') for row in d['contents']]

In [38]:
d['tokens'] = [[token.text for token in l if not token.is_punct] for l in d['text']]
d['lemmas'] = [lemmatize_doc(text) for text in d['text']]
d['pos'] = [[token.pos_ for token in l if not token.is_punct] for l in d['text']]
d['noun_chunks'] = [list(l.noun_chunks) for l in d['text']]
d['numWords'] = [len([token for token in l if not token.is_punct]) for l in d['text']]

In [43]:
d['unigrams'] = [list(textacy.extract.ngrams(nlp(' '.join(doc)), 1, filter_stops = False)) for doc in d['lemmas']]

d['bigrams'] = [list(textacy.extract.ngrams(nlp(' '.join(doc)), 2, filter_stops = False)) for doc in d['lemmas']]

d['trigrams'] = [list(textacy.extract.ngrams(nlp(' '.join(doc)), 3, filter_stops = False)) for doc in d['lemmas']]

In [44]:
d.head()

Unnamed: 0,gameid,trialNum,repetitionNum,intendedName,contents,numRawWords,correct,text,doc,unigrams,bigrams,trigrams,tokens,lemmas,pos,noun_chunks,numWords
0,0057-414228f8-c268-40d6-9349-b35df4f080d9,1,1,B,"looking for a diamond at the top, triangle poi...",47,1,"(looking, for, a, diamond, at, the, top, ,, tr...","(looking, for, a, diamond, at, the, top, ,, tr...","[(look), (for), (a), (diamond), (at), (the), (...","[(look, for), (for, a), (a, diamond), (diamond...","[(look, for, a), (for, a, diamond), (a, diamon...","[looking, for, a, diamond, at, the, top, trian...","[look, for, a, diamond, at, the, top, triangle...","[VERB, ADP, DET, NOUN, ADP, DET, NOUN, NOUN, V...","[(a, diamond), (the, left), (a, person), (thei...",48
1,0057-414228f8-c268-40d6-9349-b35df4f080d9,2,1,G,this one looks like a seal,6,0,"(this, one, looks, like, a, seal)","(this, one, looks, like, a, seal)","[(this), (one), (look), (like), (a), (seal)]","[(this, one), (one, look), (look, like), (like...","[(this, one, look), (one, look, like), (look, ...","[this, one, looks, like, a, seal]","[this, one, look, like, a, seal]","[DET, NUM, VERB, SCONJ, DET, NOUN]","[(a, seal)]",6
2,0057-414228f8-c268-40d6-9349-b35df4f080d9,3,1,K,this one looks like a small dog balancing a ba...,13,0,"(this, one, looks, like, a, small, dog, balanc...","(this, one, looks, like, a, small, dog, balanc...","[(this), (one), (look), (like), (a), (small), ...","[(this, one), (one, look), (look, like), (like...","[(this, one, look), (one, look, like), (look, ...","[this, one, looks, like, a, small, dog, balanc...","[this, one, look, like, a, small, dog, balance...","[DET, NUM, VERB, SCONJ, DET, ADJ, NOUN, VERB, ...","[(a, small, dog), (a, ball), (its, nose)]",13
3,0057-414228f8-c268-40d6-9349-b35df4f080d9,4,1,A,this looks like one of the spy vs spy guys loo...,21,1,"(this, looks, like, one, of, the, spy, vs, spy...","(this, looks, like, one, of, the, spy, vs, spy...","[(this), (look), (like), (one), (of), (the), (...","[(this, look), (look, like), (like, one), (one...","[(this, look, like), (look, like, one), (like,...","[this, looks, like, one, of, the, spy, vs, spy...","[this, look, like, one, of, the, spy, vs, spy,...","[DET, VERB, SCONJ, NUM, ADP, DET, NOUN, ADP, N...","[(the, spy), (spy, guys), (a, flag), (his, rig...",21
4,0057-414228f8-c268-40d6-9349-b35df4f080d9,5,1,J,this is a diamond on top of what looks like a ...,20,1,"(this, is, a, diamond, on, top, of, what, look...","(this, is, a, diamond, on, top, of, what, look...","[(this), (be), (a), (diamond), (on), (top), (o...","[(this, be), (be, a), (a, diamond), (diamond, ...","[(this, be, a), (be, a, diamond), (a, diamond,...","[this, is, a, diamond, on, top, of, what, look...","[this, be, a, diamond, on, top, of, what, look...","[DET, AUX, DET, NOUN, ADP, NOUN, ADP, PRON, VE...","[(a, diamond), (top), (what), (a, state, -, to...",19


In [49]:
def getCounts(countType, df, gameid, repetitionNum, tangram = None) :
    roundCond = 'repetitionNum == ' + repetitionNum
    gameidCond = 'gameid == "' + gameid + '"'
    if(tangram is not None) :
        tangramCond = 'tangramRef == "' + tangram + '"'
        cond = " and ".join((roundCond, gameidCond, tangramCond))
    else :
        cond = " and ".join((roundCond, gameidCond))
    relevantRow = df.query(cond)
    return Counter([str(item) for sublist in relevantRow[countType]
                    for item in sublist])

for countType in ['unigrams', 'bigrams', 'trigrams'] :
    flattenedContents = [str(item) for sublist in d[countType]
                         for item in sublist]
    countDict = Counter(flattenedContents)
    wordList = [v for (v,count) in countDict.items() if count > 20]
    with open('../outputs/' + countType + 'Counts.csv', 'w') as outfile:
        writer = csv.writer(outfile)
        writer.writerow(['gameid', 'repetitionNum', 'word', 'count'])
        for gameid in gameidList:  
            for repetitionNum in ['1', '2', '3', '4', '5', '6'] :
                counts = getCounts(countType, d, gameid, repetitionNum)
                for word in wordList :
                    writer.writerow([gameid, repetitionNum, word, counts[word]])

# Extract parts of speech

## Get counts for each POS label

In [126]:
tag_dict = {}
tag_counts = {}
for doc in d['text'] :
    for w in doc :
        if w.pos_ not in tag_dict :
            tag_dict[w.pos_] = w.pos
            tag_counts[w.pos_] = 0
        tag_counts[w.pos_] = tag_counts[w.pos_] + 1
print(tag_counts)
d['posCounts'] = [doc.count_by(POS) for doc in d['text']]
for posStr in ["NOUN", "PROPN", "DET", "PRON", "VERB", "ADJ", "CCONJ", "ADP", 'ADV', 'AUX', 'SCONJ', 'NUM'] :
    key_id = tag_dict[posStr]
    d[posStr + 'count'] = [counts[key_id] if key_id in counts else 0 for counts in d['posCounts']]
d['NOUNcount'] = d['NOUNcount'] + d['PROPNcount']

{'VERB': 6656, 'ADP': 5657, 'DET': 4390, 'NOUN': 12834, 'PUNCT': 3773, 'ADV': 2346, 'PRON': 1599, 'AUX': 1331, 'CCONJ': 790, 'SCONJ': 1038, 'NUM': 1045, 'ADJ': 2936, 'SYM': 74, 'PROPN': 1559, 'INTJ': 231, 'PART': 288, 'SPACE': 116, 'X': 48}


In [128]:
print(d.iloc[-10].pos)
print(d.iloc[-10].lemmas)

['DET', 'NOUN', 'VERB', 'ADP', 'PRON', 'NOUN', 'PART', 'DET', 'ADJ', 'NUM']
['the', 'man', 'pray', 'on', '-PRON-', 'knee', 'not', 'the', 's', 'one']


## Export to csv for plotting in R

In [99]:
(d.drop(["tokens", 'posCounts', 'text', 'lemmas', 'noun_chunks', 'numRawWords', 'PROPNcount'], 1)
 .to_csv("../outputs/posTagged_{}.csv".format(version_to_use), index = False))

# syntactic analyses

look at which words are dropped on each round and whether they are more closely related to one another than expected under null model

In [129]:
def make_dep_graph(text) :
    # Load spacy's dependency tree into a networkx graph
    edges = []
    for token in text:
        for child in token.children:
            edges.append(('{0}-{1}'.format(token.lower_,token.i),
                          '{0}-{1}'.format(child.lower_,child.i)))

    return nx.Graph(edges)

def get_shortest_dependency_path (graph, word1, word2) :
    # https://networkx.github.io/documentation/networkx-1.10/reference/algorithms.shortest_paths.html
    return nx.shortest_path_length(graph, source=word1, target=word2)

def flatten(list) :
    return [x for y in list for x in y]

In [159]:
pos_function = ['CCONJ', 'SCONJ', "PRON", "DET", "ADP", "AUX", "PART"]
def get_mean_dependency_lengths(null = None) :
    """
    null can be: 
        'random' to make baseline where random words are dropped, or 
        'functionOnly' where function words are dropped
    """
    dependency_lengths = {1: [], 2 : [], 3: [], 4: [], 5: []}
    print('running')
    for name, df in d.groupby(['gameid', 'intendedName']) :
        df['next_lemmas'] = df['lemmas'].shift(-1)
        for _, row in df.iloc[0:-1].iterrows() :
            # handle case where we've auto-parsed single message into multiple 'sentences'
            for i, sent in enumerate(row['text'].sents):
                graph = make_dep_graph(sent)
                dropped_words = ['{}-{}'.format(token.lower_,token.i) for token in sent
                                 if token.lemma_ not in row['next_lemmas'] and not token.is_punct]
                # for null model, we randomly sample words instead of using the real dropped ones
                if null == 'random' : 
                    num_words_dropped = len(dropped_words)
                    random_words = np.random.choice([token.lemma_ for token in sent], 
                                                    num_words_dropped, replace=False)
                    dropped_words = ['{}-{}'.format(token.lower_,token.i) for token in sent
                                     if token.lemma_ in random_words]
                elif null == 'functionOnly' : 
                    num_words_dropped = len(dropped_words)
                    function_words = [token.lemma_ for token in sent if token.pos_ in pos_function]
                    non_function_words = [token.lemma_ for token in sent if token.pos_ not in pos_function]
                    random_function_words = np.random.choice(function_words, min(num_words_dropped, len(function_words)), replace=False)
                    if len(random_function_words) < num_words_dropped : 
                        random_function_words = np.append(random_function_words,
                                  np.random.choice(non_function_words, num_words_dropped - len(random_function_words), replace=False))
                    assert(len(random_function_words) == num_words_dropped)
                    dropped_words = ['{}-{}'.format(token.lower_,token.i) for token in sent
                                     if token.lemma_ in random_function_words]
                for word1, word2 in combinations(dropped_words, 2) :
                    try:
                        dep_length = get_shortest_dependency_path(graph, word1,word2) 
                        dependency_lengths[row.repetitionNum].append(dep_length)
                    except :
                        nx.draw(graph)
                        plt.show()
    return {k: np.array(v).mean() for k,v in dependency_lengths.items()}
true = get_mean_dependency_lengths(null = False)

running


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()


In [1]:
true

NameError: name 'true' is not defined

In [132]:
random_null = [get_mean_dependency_lengths(null = 'random') for i in range(100)]

running


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()


4.8473721656819375
running
4.834049211511321
running
4.829410779345869
running
4.862759643916914
running
4.858828103418287
running
4.834392728477369
running
4.837729981860585
running
4.82808591033187
running
4.881210298585326
running
4.846790633181221
running
4.822295851596419
running
4.835956657514723
running
4.863932898415657
running
4.851657831826214
running
4.8454417283489875
running
4.817258458026617
running
4.840328296959522
running
4.826668596476718
running
4.8490438858470215
running
4.856761825380287
running
4.834087090015144
running
4.850790074718643
running
4.864974787823453
running
4.858877856351532
running
4.833404479057128
running
4.8570421220731275
running
4.850550646098265
running
4.838457250221956
running
4.836284702387752
running
4.8448385185493175
running
4.8441807874601155
running
4.862364256903506
running
4.861945695747407
running
4.846691057673265
running
4.808583673129733
running
4.843788463544383
running
4.835678961467134
running
4.822816660307509
running
4.85076

In [160]:
function_null = [get_mean_dependency_lengths(null = 'functionOnly') for i in range(100)]


running


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()


running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running
running


In [161]:
true_rows = np.array([[0, k, v, 'true'] for k,v in true.items()])
random_rows =  np.array([[i, k, v, 'random']  for i,rs in enumerate(random_null) for k,v in rs.items()])
function_rows = np.array([[i, k, v, 'function']  for i,rs in enumerate(function_null) for k,v in rs.items()])
rows = np.vstack([true_rows, random_rows, function_rows])
print(rows)

[['0' '1' '4.688764546646945' 'true']
 ['0' '2' '3.9803159173754556' 'true']
 ['0' '3' '3.8969906355123194' 'true']
 ...
 ['99' '3' '4.14813695109969' 'function']
 ['99' '4' '3.3442191203581775' 'function']
 ['99' '5' '3.143424711958681' 'function']]


In [162]:
(pd.DataFrame(rows, columns = ['sampleNum', 'repetitionNum', 'value', 'baselineName'])
   .to_csv('../outputs/permuted_dependency_distribution.csv'))

# Supplemental analyses

In [None]:
from allennlp.models.archival import load_archive
from allennlp.predictors import Predictor
from nltk import Tree
archive = load_archive(
            "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo-constituency-parser-2018.03.14.tar.gz"
        )
predictor = Predictor.from_archive(archive, 'constituency-parser')

## Compute edit distances on successive rounds

TODO: it would be nice to use an algorithm like Selkow that uses operations for deleting entire subtrees at once (rather than attaching children to parent, which means that deleting a whole subtree requires as many operations as there are *nodes* in that subtree)

see http://www.aclweb.org/anthology/R13-1002 for a way of altering ZS output

In [None]:
def label_distance (label1, label2) :
    if label1 == label2 :
        return 0
    else :
        return 1

def get_root(doc) :
    asdf = dict(('label' if key == 'nodeType' else key, value) for (key, value) in doc.items())
    return asdf

def get_children(subtree) :
    if 'children' in subtree.keys() :
        return [dict(('label' if key == 'nodeType' else key, value) for (key, value) in d.items()) for d in subtree['children']]
    else :
        return []

def get_label(node) :
    return node['label']

def edit_distance(tree1, tree2, return_operations = False) :
    return simple_distance(get_root(tree1), get_root(tree2), 
                           get_children, get_label, label_distance, return_operations=return_operations)

def example() :
    s1 = predictor.predict_json({"sentence": "I am a cat with a big bone"})
    s2 = predictor.predict_json({"sentence": "I am a cat"})
    ed = edit_distance(s1['hierplane_tree']['root'], s2['hierplane_tree']['root'], return_operations=True)
    print('tree1:', Tree.fromstring(s1['trees']))
    print('tree2:', Tree.fromstring(s2['trees']))
    print('operations:', ed[1])
    print('cost:', ed[0])

Ideally, we could use an algorithm that would give a cost of 1 for this example

In [None]:
Tree.fromstring(predictor.predict_json({"sentence" : "a guy who looks like one of those wavy tube guys leaning towards the left"})['trees'])

In [None]:
from IPython.display import clear_output

tiny_d = d[['gameid', 'repetitionNum', 'contents', 'intendedName']]
parses = []
for i, s in enumerate(tiny_d['contents']) :
    clear_output(wait=True)
    print(i, '/', len(tiny_d['contents']))
    parses.append(predictor.predict_json({'sentence' : s})['hierplane_tree']['root'])
tiny_d['tree_parse'] = parses

In [None]:
def finditem(obj, value, initLevel = True):
    if obj['nodeType'] == value and not initLevel:
        return True
    elif 'children' in obj :
        for child in obj['children'] :
            item = finditem(child, value, initLevel = False)
            if item is not None:
                return item
tiny_d['SBAR'] = [finditem(s, 'SBAR') for s in tiny_d['tree_parse']]
tiny_d['PP'] = [finditem(s, 'PP') for s in tiny_d['tree_parse']]
tiny_d['CC'] = [finditem(s, 'CC') for s in tiny_d['tree_parse']]
tiny_d['NP'] = [finditem(s, 'NP') for s in tiny_d['tree_parse']]

In [None]:
tiny_d.to_json('./outputs/constituency_parses.json')
tiny_d.drop('tree_parse', 1).to_csv('./outputs/constituency_tags.csv')

In [None]:
tiny_d = pd.read_json('./outputs/constituency_parses.json')

In [None]:
gameids = np.unique(tiny_d['gameid'])
tangramids = np.unique(tiny_d['intendedName'])

transitions = np.zeros([len(gameids), len(tangramids) ,5])
for i, gameid in enumerate(gameids) :
    clear_output(wait=True)
    print(i, '/', len(gameids))
    game_d = tiny_d.query('gameid == "{0}"'.format(gameid))
    for j, intendedName in enumerate(tangramids) :
        for k, init_occurrenceNum in enumerate(range(1,6)) :
            dist = []
            sub1 = game_d.query('intendedName == "{0}" and repetitionNum == {1}'.format(intendedName, init_occurrenceNum))['tree_parse']
            sub2 = game_d.query('intendedName == "{0}" and repetitionNum == {1}'.format(
                intendedName, init_occurrenceNum+1
            ))['tree_parse']
            for tree1 in sub1 :
                for tree2 in sub2 :
                    dist.append(edit_distance(tree1, tree2))
            transitions[i, j, k] = np.max(dist) if dist else np.nan

TODO: normalized by tree size
TODO: maybe can show this more straightforward by doing permutation test thing on POS tags instead of words

In [None]:
np.nanmean(transitions, axis=0)

### Across-game version
Instead of looking at edit distance from round $i$ to $i + 1$ for pair $j$, we look at average edit distances between pairs $j$ to $j+1$ on round $i$.

In [None]:
import random
print(random.sample([1,2], len([1,2])))

In [None]:
acrossgame = np.zeros([len(tangramids),6, len(gameids)])
shuffled_gameids = random.sample(list(gameids), len(gameids))
for i, intendedName in enumerate(tangramids) :
    clear_output(wait=True)
    print(i, '/', len(tangramids))
    for repetitionNum in range(1,7) :
        mini_d = tiny_d.query('intendedName == "{0}" and repetitionNum == {1}'.format(intendedName, repetitionNum))      
        for k in range(len(gameids) -1) :
            dist = []
#            print(gameids[k], 'to', gameids[k+1])
            sub1 = mini_d.query('gameid == "{0}"'.format(gameids[k]))['tree_parse']
            sub2 = mini_d.query('gameid == "{0}"'.format(gameids[k+1]))['tree_parse']
            for tree1 in sub1 :
                for tree2 in sub2 :
                    dist.append(edit_distance(tree1, tree2))
#             print(sub1)
#             print(sub2)
            acrossgame[i, repetitionNum-1, k] = np.max(dist) if dist else np.nan

In [None]:
np.nanmean(acrossgame, axis=2)

# Calculate indicator words for tangrams/rounds

## First, get list of words in first round

In [None]:
# Filter down to first round
d_round1 = d[d['repetitionNum'] == 1]

# Pull out all tokens and collapse into count dict
tokenDict = Counter([item for sublist in d_round1['tokens'].tolist()
                     for item in sublist])
# Pull out all words that occur more than once
wordList = [word for (word,count) in tokenDict.items() if count > 1 and not word.isdigit()]
print(wordList[0:10])
print(len(wordList))

# Get POS map; will be longer because it doesn't require count > 1, but it doesn't matter
POSdict = {word.text: word.pos_ for text in d_round1['text'] for word in text}
print(len(POSdict.keys()))

## Helper functions to select words & counts

In [None]:
def getWordCounts(df, gameid, occurrenceNum, tangram = None) :
    roundCond = 'repetitionNum == ' + occurrenceNum
    gameidCond = 'gameid == "' + gameid + '"'
    if(tangram is not None) :
        tangramCond = 'intendedName == "' + tangram + '"'
        cond = " and ".join((roundCond, gameidCond, tangramCond))
    else :
        cond = " and ".join((roundCond, gameidCond))
    relevantRow = df.query(cond)
    return Counter([item for sublist in relevantRow['tokens'].tolist() 
                    for item in sublist])

#creates mini dataframe that grabs the words used in round n for a given tangram and gameid
def selectTangramRoundWords(df, tangram, roundNum, gameid):
    wordCounts = getWordCounts(df, gameid, roundNum, tangram)
    return list(wordCounts.keys())