<div id="toc"> </div>

# Import lots of stuff

In [1]:
import sys
sys.path.append('../')

In [2]:
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from utils import nlp_utils as utils
from nltk import bigrams
from collections import Counter
from pycorenlp import StanfordCoreNLP
%matplotlib inline
pd.set_option('display.max_rows', 500)

# Preprocess data for analyses

## Import annotated file

In [65]:
# Hand-annotated messages, kept untouched so `d` can always be rebuilt from it.
d_raw = pd.read_csv('handTagged.csv')

# Older tangrams data: drop rows whose tangram is "*", remove the `sender`
# column, and rename `tangram` to `tangramRef` to match the annotated file.
# `axis` is passed by keyword: positional `axis` was removed in pandas 2.0.
d_nicki = (pd.read_csv('../../data/tangrams_unconstrained/old/oldTangrams.csv')
    .query('tangram != "*"')
    .drop('sender', axis=1)
    .rename(columns = {'tangram' : 'tangramRef'}))

# Drop time column and the rows tagged with tangramRef "0".
# The trailing .copy() makes `d` an independent frame, so the column
# assignments in later cells never write into a view of `d_raw`.
d = (d_raw
    .drop('time', axis=1)
    .query('tangramRef != "0"')
    .copy())

# Result 1: Generate file for POS analysis

## Start the Stanford CoreNLP server

Before running this notebook, [get CoreNLP](http://nlp.stanford.edu/software/stanford-corenlp-full-2015-12-09.zip), go into its directory, and run

`java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer 9000`

If you're using port 9000 for something else, change that value and then change `PORT` in the next cell.

In [66]:
# Port the local Stanford CoreNLP server listens on (see the command above).
PORT = 9000

# Client object pointed at the CoreNLP server on localhost.
server_url = 'http://localhost:' + str(PORT)
nlp = StanfordCoreNLP(server_url)

## Get part of speech

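**TODO:** follow Will's advice on handling Unicode input. Messages containing non-ASCII characters (e.g. `¯_(ツ)_/¯` or `´`) appear to be the ones that fail with `cannot parse` in the output below.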

In [107]:
# Each entry of d['lemmas'] is the result of POS-tagging one message:
# a sequence of "lemmas", where a lemma is a (word, pos) tag pair.
tagged_messages = []
for message in d['contents']:
    tagged_messages.append(utils.stanford_pos(message))
d['lemmas'] = tagged_messages

picture 12 is like a guy with both hands in the air, kind of like  ¯_(ツ)_/¯ : cannot parse
11 is  ¯_(ツ)_/¯ : cannot parse
12 is  ¯_(ツ)_/¯ : cannot parse
7 is  ¯_(ツ)_/¯ : cannot parse
9 is  ¯_(ツ)_/¯ : cannot parse
The crouching guy´s feet are a triangle?: cannot parse
2 is  ¯_(ツ)_/¯ : cannot parse


In [108]:
# Split every (word, pos) pair into two parallel per-message lists:
# position 0 of each pair goes to 'tokens', position 1 goes to 'pos'.
token_lists = []
pos_lists = []
for tagged in d['lemmas']:
    token_lists.append([pair[0] for pair in tagged])
    pos_lists.append([pair[1] for pair in tagged])
d['tokens'] = token_lists
d['pos'] = pos_lists

In [109]:
# Number of tokens in each message. The previous
# pd.value_counts(words).sum() built a throw-away Series per row only to add
# its counts back up, and pd.value_counts is deprecated in recent pandas;
# len() gives the same number for a list of (non-null) tokens.
d['numWords'] = [len(words) for words in d['tokens']]

In [110]:
# For every message, keep the 'dep' field of each item returned by
# utils.stanford_constituency.
dep_tags = []
for message in d['contents']:
    parsed_items = utils.stanford_constituency(message)
    dep_tags.append([item['dep'] for item in parsed_items])
d['tags'] = dep_tags

Here's what I have in order: bunny ears (1), arms in air (2), standing on one leg, other leg to the left (3), robe pointing left (4), tilted square, rectangle stuff cut out of it (5), standing on one leg, other leg to the right (6), sitting with knees (7), sitting without knees (8), spike coming out of stomach, straight back (9), tilted square, rectangle with no stuff cut out (10), spike coming out of both back and stomach (11) and funky chicken (12). : cannot parse
picture 12 is like a guy with both hands in the air, kind of like  ¯_(ツ)_/¯ : cannot parse
11 is  ¯_(ツ)_/¯ : cannot parse
12 is  ¯_(ツ)_/¯ : cannot parse
7 is  ¯_(ツ)_/¯ : cannot parse
9 is  ¯_(ツ)_/¯ : cannot parse
The crouching guy´s feet are a triangle?: cannot parse
2 is  ¯_(ツ)_/¯ : cannot parse


In [111]:
# NOTE(review): no cell visible in this notebook creates d['parse']; it was
# presumably added by a cell that has since been removed or lives elsewhere.
# Confirm where it comes from, otherwise this fails on a fresh kernel.

# Only the first element of each row's parse is passed to the counters.
first_parses = [parse[0] for parse in d['parse']]

d['num_sbar'] = [utils.sbar_count(tree) for tree in first_parses]
d['num_pp'] = [utils.pp_count(tree) for tree in first_parses]
d['num_cc'] = [utils.cc_count(tree) for tree in first_parses]

## Get counts for each POS label

In [112]:
# Output column -> predicate applied to every (word, pos) pair of a message.
# A tuple of pairs (not a dict) keeps the column order fixed.
POS_PREDICATES = (
    ('nouns', utils.is_noun),
    ('prepositions', utils.is_prep),
    ('verbs', utils.is_verb),
    ('determiners', utils.is_det),
    ('pronouns', utils.is_pronoun),
    ('adjectives', utils.is_adjective),
    ('adverbs', utils.is_adverb),
    ('numbers', utils.is_num),
    ('others', utils.is_other),
)

# For each category, count how many pairs in each message satisfy the predicate.
for column, predicate in POS_PREDICATES:
    d[column] = [sum(1 for lem in lemmas if predicate(*lem))
                 for lemmas in d['lemmas']]

## Export to csv for plotting in R

In [116]:
# Write the tagged data without the raw text / token columns.
# `axis` is passed by keyword: positional `axis` was removed in pandas 2.0.
(d.drop(["lemmas", "contents", "tokens"], axis=1)
 .to_csv("posTagged.csv", index = False))

## Compute unigrams and bigrams

In [44]:
# Adjacent-token pairs for every message.
message_bigrams = []
for token_list in d['tokens']:
    message_bigrams.append(list(bigrams(token_list)))
d['bigrams'] = message_bigrams

# Corpus-wide frequency of every bigram.
bigramDict = Counter()
for pairs in d['bigrams'].tolist():
    bigramDict.update(pairs)

# Keep only the bigrams seen more than 5 times overall.
bigramList = [pair for pair, freq in bigramDict.items() if freq > 5]

In [45]:
def getBigramCounts(df, gameid, roundNum) :
    """Count the bigrams in the rows of one round of one game.

    Parameters
    ----------
    df : DataFrame
        Must have `gameid`, `roundNum` and `bigrams` columns; every
        `bigrams` cell is an iterable of bigrams.
    gameid : str
        Game identifier, compared for equality with the `gameid` column.
    roundNum : int or str
        Round number. It is interpolated into the query as a literal, so
        both 3 and '3' select rows where `roundNum == 3`.

    Returns
    -------
    collections.Counter
        Bigram -> number of occurrences over all matching rows (empty when
        no row matches).
    """
    # .format() accepts ints as well as strings (string concatenation raised
    # TypeError on ints). `@gameid` passes the id as a variable, so ids
    # containing quote characters cannot break the query expression.
    cond = 'roundNum == {} and gameid == @gameid'.format(roundNum)
    relevantRows = df.query(cond)
    return Counter(item for sublist in relevantRows['bigrams'].tolist()
                   for item in sublist)

# NOTE(review): `gameidList` is not defined in any cell visible in this
# notebook; presumably it holds the game ids to export. Confirm where it is
# created, otherwise this cell fails on a fresh kernel.

# Open with 'w', not 'a': the header row is written unconditionally, so
# appending duplicated the header (and all data rows) on every re-run.
with open('bigramCounts.csv', 'w') as outfile:
    writer = csv.writer(outfile)
    writer.writerow(['gameid', 'roundNum', 'word', 'count'])
    for gameid in gameidList:
        for roundNum in ['1', '2', '3', '4', '5', '6'] :
            counts = getBigramCounts(d, gameid, roundNum)
            # One row per frequent bigram; a Counter returns 0 for bigrams
            # that never occurred in this game/round.
            for bigram in bigramList :
                writer.writerow([gameid, roundNum, ' '.join(bigram), counts[bigram]])