<div id="toc"> </div>

# Import lots of stuff

In [None]:
import itertools
import math
import re
import csv
import re;
import json
import os
import pandas as pd
import pylab as pyl
import nltk as nltk
from collections import Counter
import numpy as np
import matplotlib.pyplot as plt
from nltk.tokenize import word_tokenize
from pycorenlp import StanfordCoreNLP
%matplotlib inline
# Let pandas display up to 500 rows of a DataFrame before truncating the output
pd.set_option('display.max_rows', 500)

# Annotate messages with tangram

## Pull in raw data

In [None]:
# Chat messages: every row starts out untagged, i.e. tangramRef == 'None'.
messages_path = '../../data/tangrams/message/tangramsMessages.csv'
d_msgs_raw = pd.read_csv(messages_path, escapechar='\\').assign(tangramRef = 'None')

# Board data, queried below by gameid / roundNum / trueLoc to get a tangramName.
d_boards = pd.read_csv('reformattedBoards.csv')

## Tag with super simple, conservative heuristic

The most obvious strategy is to (on a first pass) assume that the tangram the matcher moves in response to a message is the one the message is referring to. The second pass is to skip the ones where we know they got it wrong. We'll probably end up hand-tagging those or using some other strategy depending on how many there are.

There are a few obvious problems here:

1. The director will sometimes send several messages before the matcher moves anything. So we can't just use the closest move in time... 
2. Instead, we could use the *first* move action after the message and then rule it out, so that we won't use it again even if it is also the first move after a later message.
3. **that**, though, also has a problem. Multiple messages are sent per tangram, and some messages are meta-chatter (e.g. "hello", "thanks", "good job", "this HIT is terrible"). If we assign the drop actions to the first $N$ messages, we'll have a bunch of actual messages about tangrams that aren't tagged and a bunch of messages **not** about tangrams incorrectly tagged.

So... we'll do a simpler thing. Check for numbers occurring in the text and look them up in the board data...

In [None]:
# Tag a director message with a tangram when it mentions exactly one board
# position (1-12): that position is looked up in the board data for the same
# game and round, and the tangram found there becomes the message's tangramRef.
pattern = re.compile(r'[\W_]+')  # raw string: '\W' is not a valid str escape
for index, row in d_msgs_raw.iterrows():
    if row.sender != 'director':
        continue
    # str() keeps missing / non-text contents from crashing the regex
    stripedStr = pattern.sub(' ', str(row.contents))
    numbers = [int(s) for s in stripedStr.split() if s.isdigit()]
    if len(numbers) != 1 or not (0 < numbers[0] <= 12):
        continue
    boardRow = d_boards[(d_boards['gameid'] == row.gameid)
                        & (d_boards['roundNum'] == row.roundNum)
                        & (d_boards['trueLoc'] == numbers[0])]
    if boardRow.empty:
        # No board entry for this game/round/location: leave the message untagged
        continue
    # .at replaces DataFrame.set_value, which was removed in pandas 1.0
    d_msgs_raw.at[index, 'tangramRef'] = boardRow['tangramName'].iloc[0]

Check to see how many we tagged...

In [None]:
# Proportion of messages the heuristic managed to tag
numUntagged = int((d_msgs_raw['tangramRef'] == 'None').sum())
1 - numUntagged / float(d_msgs_raw.shape[0])

not bad for a conservative heuristic! Now we're going to use the tagged data to train a classifier that will make predictions for the other 40%.

## Train classifier

###  Set up training set

Used `d_msgs_raw` in `d_combined` the first time and subsequently used the updated hand-tagged version

In [None]:
try:
    from sklearn.model_selection import train_test_split
except ImportError:  # very old scikit-learn only ships the legacy module
    from sklearn.cross_validation import train_test_split

d_handtagged = pd.read_csv('handTagged.csv')

# Previously collected, already-labelled messages; rows tagged "*" are dropped
# and the label column is renamed to match ours.
d_nicki = (pd.read_csv('../../data/tangrams/old/oldTangrams.csv')
    .query('tangram != "*"')
    .drop('sender', axis=1)
    .rename(columns = {'tangram' : 'tangramRef'}))

# Labelled rows from both sources, stacked into one training pool.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
d_combined = pd.concat(
    [(d_handtagged # d_msgs_raw
      .query('tangramRef != "None"')
      .query('tangramRef != "*"')
      .drop('sender', axis=1)),
     d_nicki],
    ignore_index=True)

# Hold out 20% of the labelled messages for evaluation
train_msg, test_msg = train_test_split(d_combined, test_size = 0.2)

### Build pipeline

Largely drawn from [here](http://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html).

Import necessary sklearn modules and grid search params

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
try:
    from sklearn.model_selection import GridSearchCV
except ImportError:  # very old scikit-learn only ships the legacy module
    from sklearn.grid_search import GridSearchCV

# Hyper-parameter grid; keys are '<pipeline step name>__<parameter>'.
parameters = {'vect__ngram_range': [(1, 1), (1, 2), (1, 3), (2, 3)],
              'vect__stop_words': (None, 'english'),
              'tfidf__use_idf': (True, False),
              'clf__alpha': (1e-2, 1e-3, 1e-4, 1e-5)
}

Train bag-of-words LR classifier 

In [None]:
# Token counts -> tf-idf weights -> linear model trained with SGD on log loss.
# NOTE(review): `n_iter` and `loss='log'` are spellings from older scikit-learn
# releases; newer releases use `max_iter` / `loss='log_loss'` -- confirm against
# the installed version.
pipeline_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(loss='log', penalty='l2',n_iter=5)),
]
text_clf = Pipeline(pipeline_steps)

# Search the `parameters` grid using all available cores, then fit on the training split
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)
_ = gs_clf.fit(train_msg.contents, train_msg.tangramRef)

### Look at performance on held-out test set

Look at success!

In [None]:
# Predict a tangram for every held-out message and report the share that match
predicted = gs_clf.predict(test_msg.contents)
correct = test_msg.tangramRef == predicted
print("test-split accuracy is...")
print(correct.sum() / float(correct.size))

Plot ROC curve

In [None]:
# Class probabilities for each held-out message, computed once and sorted
# ascending within each row so the last column is the winning class.
classProbs = np.sort(gs_clf.predict_proba(test_msg['contents']), axis=1)

# .assign builds a new frame, so we never write into a slice of d_combined.
test_msg = test_msg.assign(
    predicted = predicted,
    correct = predicted == test_msg['tangramRef'],
    maxProb = classProbs[:, -1],
    # We could also measure confidence using the distance between the top two
    # categories, but this turns out not to be quite as good a metric
    probDiff = classProbs[:, -1] - classProbs[:, -2])

# "Positive" = the classifier's prediction was right; "negative" = it was wrong
actualNumPos = float(test_msg['correct'].sum())
actualNumNeg = len(test_msg['correct']) - actualNumPos

TPRs, FPRs, thresholds = [], [], []
for threshold in np.arange(0,1,.05) :
    thresholds.append(threshold)
    # Get the ones that our policy tags as "correct" (confidence above threshold)
    predYes = test_msg.loc[test_msg['maxProb'] > threshold, 'correct']
    numTruePos = float(predYes.sum())
    # TPR: number *correct* positive results relative to overall number positive samples
    TPRs.append(numTruePos/actualNumPos)
    # FPR: number *incorrect* positive results relative to overall number negative samples
    FPRs.append((len(predYes)-numTruePos)/actualNumNeg)

In [None]:
fig = plt.figure()
ax = fig.add_subplot(111, aspect = 'equal')
# Diagonal reference line
ax.plot([0,1], [0,1])
ax.set_xlabel("False positive rate")
ax.set_ylabel("True positive rate")
ax.plot(FPRs, TPRs, label = 'maxProb')
ax.legend()  # the curve is labelled, so actually show the label

# Lowest confidence threshold whose false positive rate stays under 5%
cautiousThreshold = next(
    (threshold for threshold, FPR in zip(thresholds, FPRs) if FPR < 0.05), None)
if cautiousThreshold is None:
    raise ValueError("no confidence threshold reaches a false positive rate below 0.05")
print(cautiousThreshold)

What are best params?

In [None]:
# best_params_ / best_score_ describe the top-scoring grid point and, unlike
# grid_scores_, are available in both old and current scikit-learn releases.
best_parameters, score = gs_clf.best_params_, gs_clf.best_score_
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, best_parameters[param_name]))

In [None]:
from sklearn import metrics
def plot_confusion_matrix(cm, target_names, title='Confusion matrix', cmap=plt.cm.Blues):
    """Draw a confusion matrix as a heat map on the current matplotlib figure.

    Parameters
    ----------
    cm : 2-D array-like
        Matrix to display; rows are labelled "True label", columns "Predicted label".
    target_names : sequence of str
        Tick labels, used for both axes.
    title : str
        Plot title.
    cmap : matplotlib colormap
        Colour map for the cells.
    """
    plt.imshow(cm, cmap=cmap, interpolation='nearest')
    plt.title(title)
    plt.colorbar()

    # One tick per class on both axes; x labels are tilted to avoid overlap
    positions = np.arange(len(target_names))
    plt.xticks(positions, target_names, rotation=45)
    plt.yticks(positions, target_names)
    plt.tight_layout()

    for set_label, text in ((plt.ylabel, 'True label'), (plt.xlabel, 'Predicted label')):
        set_label(text)

plt.figure()
# Use one explicit label list for both the matrix and the tick labels. A label
# that only appears among the predictions would otherwise add a row/column to
# the matrix that the tick labels (built from true labels only) do not cover.
tangramLabels = sorted(set(test_msg.tangramRef) | set(predicted))
cm = metrics.confusion_matrix(test_msg.tangramRef, predicted, labels=tangramLabels)
plot_confusion_matrix(cm, tangramLabels)

## Tag full dataset using ROC threshold 

In [None]:
# Classifier output for every message in the hand-tagged file
predicted_myData = gs_clf.predict(d_handtagged.contents)
maxProbs = list(map(max, gs_clf.predict_proba(d_handtagged.contents)))
existingTags = d_handtagged.tangramRef

# Keep an existing tag; fill an untagged ('None') message with the model's
# prediction only when the model's confidence clears cautiousThreshold.
autoTags = []
for existing, maxProb, prediction in zip(existingTags, maxProbs, predicted_myData):
    if maxProb > cautiousThreshold and existing == 'None':
        autoTags.append(prediction)
    else:
        autoTags.append(existing)

# Number of messages whose tag was changed by the classifier
print(sum(autoTags != existingTags))
d_handtagged.loc[:, 'autoTags'] = autoTags

In [None]:
# Write everything except the original tangramRef column (autoTags is kept)
autoTagged = d_handtagged.drop('tangramRef', axis = 1)
autoTagged.to_csv("autoTagged.csv", index = False)

# Preprocess data for analyses

## Import annotated file

In [None]:
d_raw = pd.read_csv('handTagged.csv')
d_nicki = (pd.read_csv('../../data/tangrams/old/oldTangrams.csv')
    .query('tangram != "*"')
    .drop('sender', axis=1)
    .rename(columns = {'tangram' : 'tangramRef'}))

# Drop time column and every message without a usable tangram tag.
# The trailing .copy() makes `d` an independent frame, so the columns added to
# it in later cells never write into a view of d_raw.
d = (d_raw
    .drop('time', axis=1)
    .query('tangramRef != "0"')
    .query('tangramRef != "None"')
    .copy())

# Result 1: Generate file for POS analysis

## Start the Stanford CoreNLP server

Before running this notebook, [get CoreNLP](http://nlp.stanford.edu/software/stanford-corenlp-full-2015-12-09.zip), go into its directory, and run

`java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer 9000`

If you're using port 9000 for something else, change that value and then change `PORT` in the next cell.

In [None]:
# Port of the locally running CoreNLP server; must match the port given on the
# server's command line.
PORT = 9000

# Client handle through which stanford_pos() sends text to the server
nlp = StanfordCoreNLP('http://localhost:{}'.format(PORT))

## Get part of speech

Todo: follow Will's advice to parse unicode...

In [None]:
def stanford_pos(text):
    """
    Tag `text` with parts of speech via the CoreNLP server behind `nlp`.

    Parameters
    ----------
    text : str
       CoreNLP handles all tokenizing, at the sentence and word level.

    Returns
    -------
    list of tuples (str, str)
       The first member of each pair is the word, the second its POS tag.
       Empty when the text could not be annotated; a message is printed
       in that case instead of raising.
    """
    lemmas = []
    try:
        ann = nlp.annotate(
            text,
            properties={'annotators': 'pos',
                        'outputFormat': 'json'})
        for sentence in ann['sentences']:
            for token in sentence['tokens']:
                lemmas.append((token['word'], token['pos']))
    except Exception as e:
        # Best effort by design: one unparseable message must not abort the run.
        # format() rather than `text + ...` so a non-string input (e.g. NaN from
        # an empty CSV cell) cannot raise again inside this handler.
        print("{}: cannot parse ({})".format(text, e))
        lemmas = []  # discard any partial result
    return lemmas

In [None]:
from nltk.stem.wordnet import WordNetLemmatizer

# Shared WordNet lemmatizer; consulted by is_comp_sup when check_lemmatizer=True
LEMMATIZER = WordNetLemmatizer()

def is_comp_sup(word, pos, tags, check_lemmatizer=False):
    """
    Decide whether a (word, pos) pair carries one of the given POS tags.

    Parameters
    ----------
    word, pos : str, str
        The token and its POS tag.

    tags : iterable of str
        The tags that count as a match.

    check_lemmatizer : bool
        If True, a tag match is additionally required to have `word`
        differ from its WordNet lemma when lemmatized as an adjective.
        This could be used to achieve greater precision, perhaps at
        the expense of recall.

    Returns
    -------
    bool
    """
    tag_matches = pos in tags
    if tag_matches and check_lemmatizer:
        # Reject words the lemmatizer leaves unchanged
        return LEMMATIZER.lemmatize(word, 'a') != word
    return tag_matches

def is_noun(word, pos, check_lemmatizer=False):
    """Return True when `pos` is one of the noun tags NN, NNS, NNP, NNPS."""
    noun_tags = {'NN', 'NNS', 'NNP', 'NNPS'}
    return is_comp_sup(word, pos, noun_tags, check_lemmatizer=check_lemmatizer)

def is_prep(word, pos, check_lemmatizer=False):
    """Return True when `pos` is the preposition tag IN."""
    prep_tags = {'IN'}
    return is_comp_sup(word, pos, prep_tags, check_lemmatizer=check_lemmatizer)

def is_verb(word, pos, check_lemmatizer=False):
    """Return True when `pos` is one of the verb tags MD, VB, VBZ, VBP, VBD, VBN, VBG."""
    verb_tags = {'MD', 'VB', 'VBZ', 'VBP', 'VBD', 'VBN', 'VBG'}
    return is_comp_sup(word, pos, verb_tags, check_lemmatizer=check_lemmatizer)

def is_det(word, pos, check_lemmatizer=False):
    """Return True when `pos` is one of the determiner tags DT, WDT."""
    det_tags = {'DT', 'WDT'}
    return is_comp_sup(word, pos, det_tags, check_lemmatizer=check_lemmatizer)

def is_pronoun(word, pos, check_lemmatizer=False):
    """Return True when `pos` is one of the pronoun tags PRP, PRP$, WP, WP$."""
    pronoun_tags = {'PRP', 'PRP$', 'WP', 'WP$'}
    return is_comp_sup(word, pos, pronoun_tags, check_lemmatizer=check_lemmatizer)

def is_adjective(word, pos, check_lemmatizer=False):
    """Return True when `pos` is one of the adjective tags JJ, JJR, JJS."""
    adjective_tags = {'JJ', 'JJR', 'JJS'}
    return is_comp_sup(word, pos, adjective_tags, check_lemmatizer=check_lemmatizer)

def is_adverb(word, pos, check_lemmatizer=False):
    """Return True when `pos` is one of the tags RB, RBR, RBS, RP, WRB."""
    adverb_tags = {'RB', 'RBR', 'RBS', 'RP', 'WRB'}
    return is_comp_sup(word, pos, adverb_tags, check_lemmatizer=check_lemmatizer)

def is_num(word, pos, check_lemmatizer=False):
    """Return True when `pos` is the number tag CD."""
    number_tags = {'CD'}
    return is_comp_sup(word, pos, number_tags, check_lemmatizer=check_lemmatizer)

def is_other(word, pos, check_lemmatizer=False):
    """Return True when `pos` is one of EX, FW, LS, PDT, POS, SYM, TO, UH."""
    other_tags = {'EX', 'FW', 'LS', 'PDT', 'POS', 'SYM', 'TO', 'UH'}
    return is_comp_sup(word, pos, other_tags, check_lemmatizer=check_lemmatizer)


In [None]:
# Each entry of `lemmas` is the list of (word, pos) pairs for one message.
d['lemmas'] = list(map(stanford_pos, d['contents']))

In [None]:
# Split each message's (word, pos) pairs into parallel word and tag lists
d['tokens'] = [[word for word, _tag in pairs] for pairs in d['lemmas']]
d['pos'] = [[tag for _word, tag in pairs] for pairs in d['lemmas']]

In [None]:
# Number of tokens per message. len() gives the same total as summing a
# value_counts table, without the deprecated top-level pd.value_counts.
d['numWords'] = [len(words) for words in d['tokens']]

## Get counts for each POS label

In [None]:
# Output column -> predicate deciding whether a (word, pos) pair belongs to it
posCounters = [
    ('nouns', is_noun),
    ('prepositions', is_prep),
    ('verbs', is_verb),
    ('determiners', is_det),
    ('pronouns', is_pronoun),
    ('adjectives', is_adjective),
    ('adverbs', is_adverb),
    ('numbers', is_num),
    ('others', is_other),
]

# For every message, count the pairs that satisfy each predicate
for columnName, belongs in posCounters:
    d[columnName] = [sum(1 for lem in lemmas if belongs(*lem))
                     for lemmas in d['lemmas']]

In [None]:
# Preview the first rows of the annotated frame
d.head()

## Export to csv for plotting in R

In [None]:
# Leave out the text / list-valued columns; what remains is written for R.
# `axis` is passed by keyword: pandas 2.0 no longer accepts it positionally.
(d.drop(["lemmas", "contents", "tokens"], axis=1)
 .to_csv("posTagged.csv", index = False))

# Result 2: Calculate indicator words for tangrams/rounds

## First, get list of words in first round

In [None]:
# Keep only messages from the first round
d_round1 = d[d['roundNum'] == 1]

# Count every token across all first-round messages
tokenDict = Counter(itertools.chain.from_iterable(d_round1['tokens']))

# Words seen at least twice, excluding pure digit strings
wordList = [word for word, count in tokenDict.items()
            if not (count <= 1 or word.isdigit())]
print(wordList[0:10])
print(len(wordList))

# word -> POS tag (a later occurrence overwrites an earlier one); will be longer
# than wordList because it doesn't require count > 1, but it doesn't matter
POSdict = {}
for lemma in d_round1['lemmas']:
    POSdict.update(lemma)
print(len(POSdict.keys()))

## Get list of all game ids

In [None]:
# Distinct game ids in order of first appearance (Series.unique replaces the
# deprecated Series.ravel round trip)
gameidList = d['gameid'].unique().tolist()
print(gameidList[0:5])

## Get list of all tangram names

In [None]:
# Tangram names are the single letters A through L
tangramList = list('ABCDEFGHIJKL')
print(tangramList)

## Helper functions to select words & counts

In [None]:
def getWordCounts(df, gameid, roundNum, tangram = None) :
    """Count the tokens used in one round of one game.

    Parameters
    ----------
    df : DataFrame
        Needs the columns gameid, roundNum, tokens (a list per row) and,
        when `tangram` is given, tangramRef.
    gameid : str
        Game to select.
    roundNum : str or int
        Round to select; "1" and 1 are treated the same.
    tangram : str, optional
        When given, only rows whose tangramRef equals it are counted.

    Returns
    -------
    collections.Counter
        token -> number of occurrences over all selected rows.
    """
    # The round number is formatted in as a numeric literal, so a str or an int
    # both work. The ids are bound with '@', so quotes inside them cannot break
    # the query expression.
    cond = 'roundNum == {0} and gameid == @gameid'.format(roundNum)
    if tangram is not None :
        cond += ' and tangramRef == @tangram'
    relevantRow = df.query(cond)
    return Counter([item for sublist in relevantRow['tokens'].tolist()
                    for item in sublist])

# Distinct words used in round `roundNum` of game `gameid` for one tangram
def selectTangramRoundWords(df, tangram, roundNum, gameid):
    """Return the list of distinct tokens counted by getWordCounts for this tangram, round and game."""
    return [word for word in getWordCounts(df, gameid, roundNum, tangram)]

## Helper function to compute PMIs

Note that merging is really costly -- if we need to speed it up, this might be the first target. Can also vectorize the log operation.

In [None]:
# Returns a table with every word used for a given tangram, its count, its
# count in `totals`, and log10 of p(word | tangram) / p(word).
def makeMyPMI(df, tangram, roundNum, gameid, totals):
    """Compute a log10 PMI score for each word used for one tangram.

    Parameters
    ----------
    df : DataFrame
        Message table, passed through to getWordCounts.
    tangram, roundNum, gameid
        Select the messages whose words are scored.
    totals : dict
        "counts" maps word -> overall count; "numWords" is the overall
        number of words those counts sum to.

    Returns
    -------
    DataFrame
        Columns word, count, totalCount, logPMI; one row per word that
        appears in both the tangram counts and `totals["counts"]`.
    """
    # count words w/in tangram
    tangramCounts = getWordCounts(df, gameid, roundNum, tangram)

    # total number of words (1 when there are none, to keep the division defined)
    tangramNumWords = sum(tangramCounts.values()) or 1

    # dataframe to compare
    indicatorDF = pd.merge(pd.DataFrame(list(tangramCounts.items()), columns=['word', 'count']),
                           pd.DataFrame(list(totals["counts"].items()), columns=['word', 'totalCount']),
                           on='word', how = 'inner')

    # Ratio first, log second. The ratio is kept in a local instead of a
    # temporary column, so no positional-axis drop() is needed afterwards.
    roughPMI = ((indicatorDF['count']/tangramNumWords)
                / (indicatorDF['totalCount']/totals["numWords"]))
    indicatorDF['logPMI'] = [math.log10(num) for num in roughPMI]

    return indicatorDF

## Write out PMIs & matching rates for all words

Note that we do a sloppy optimization by only computing total counts once and only when necessary 

In [None]:
def memoize(d, gameid, counts) :
    """Fill `counts` with the round-1 word totals of `gameid`, once.

    If `counts` already has a "counts" entry it is returned untouched.
    Otherwise "counts" (word -> count from getWordCounts) and "numWords"
    (their sum, as a float) are stored in it. The same dict is returned.
    """
    if "counts" in counts :
        return counts
    wordCounts = getWordCounts(d, gameid, "1")
    counts["counts"] = wordCounts
    counts["numWords"] = float(sum(wordCounts.values()))
    return counts

# Per-word accumulators over all (game, tangram) pairs in which the word was
# used in round 1:
#   wordTotals  - number of such pairs
#   wordMatches - how many of those pairs also use the word in round 6
#   wordPMIs    - sum of the word's round-1 logPMI over those pairs
# Each (game, tangram) pair is processed once rather than once per word; pairs
# are visited in the same order as before, so the sums are unchanged.
wordSet = set(wordList)
wordTotals, wordMatches, wordPMIs = Counter(), Counter(), {}
for gameid in gameidList:
    memoizedCounts = {}
    for tangram in tangramList:
        relevantWords = wordSet.intersection(
            selectTangramRoundWords(d, tangram, "1", gameid))
        if not relevantWords:
            continue
        # Game-level totals are only computed once a tangram actually needs them
        memoizedCounts = memoize(d, gameid, memoizedCounts)
        PMI_df = makeMyPMI(d, tangram, "1", gameid, memoizedCounts)
        logPMIs = dict(zip(PMI_df['word'], PMI_df['logPMI'].tolist()))
        round6Words = set(selectTangramRoundWords(d, tangram, "6", gameid))
        for word in relevantWords:
            wordTotals[word] += 1
            wordPMIs[word] = wordPMIs.get(word, 0) + logPMIs[word]
            if word in round6Words:
                wordMatches[word] += 1

# 'w' rather than 'a': re-running the cell must not append a second header and
# a duplicate set of rows to an existing file.
with open('matchAndPMI.csv', 'w', newline='') as outfile:
    writer = csv.writer(outfile)
    writer.writerow(['word', 'POS', 'match', 'pmi', 'total'])
    for word in wordList :
        total = wordTotals[word]
        if total == 0:
            # Never used for any tangram in tangramList: rates are undefined
            continue
        writer.writerow([word, POSdict[word],
                         float(wordMatches[word]) / float(total),
                         wordPMIs[word] / total,
                         total])


## bootstrap analysis (might want to move to R)

TODO: exclude numbers earlier in the pipeline, 

TODO: don't average over matches and pmis

In [None]:
#grab words with highestPMI for a given tangram/gameid
def highestPMIWords(d, tangram, roundNum, gameid):
    """Return the non-number word(s) with the highest logPMI for one tangram.

    Parameters
    ----------
    d : DataFrame
        Message table, passed through to getWordCounts / makeMyPMI.
    tangram, roundNum, gameid
        Select the messages whose words are scored. The reference totals
        always come from round "1" of `gameid`.

    Returns
    -------
    list of str
        Every word tied for the maximum logPMI; empty when no candidate
        is left after words tagged "CD" are removed.
    """
    counts = getWordCounts(d, gameid, "1")
    allTangramCounts = {'counts': counts,
                        'numWords': float(sum(counts.values()))}

    PMIdf = makeMyPMI(d, tangram, roundNum, gameid, allTangramCounts)

    # Remove numbers
    PMIdf['POS'] = [POSdict[word] for word in PMIdf['word']]
    PMIdf = PMIdf.query('POS != "CD"')

    # Always return a list. An empty DataFrame used to be returned here, and
    # iterating over it yields column names instead of words.
    if PMIdf.empty:
        return []
    maxPMI = PMIdf['logPMI'].max()
    return PMIdf.loc[PMIdf['logPMI'] == maxPMI, 'word'].tolist()

# Number of random "null" draws per (game, tangram) pair.
# NOTE(review): `numSamples` was never defined anywhere in this notebook, so
# this cell failed with a NameError on a fresh kernel. 100 is a placeholder --
# confirm the intended value.
numSamples = 100

with open('PMIbootstrap.csv', 'w', newline='') as outfile:
    writer = csv.writer(outfile)
    writer.writerow(['sampleNum', 'tangram', 'gameid', 'numCandidates', 'match', 'highest'])
    for gameid in gameidList :
        for tangram in tangramList :
            round1Words = selectTangramRoundWords(d, tangram, "1", gameid)
            if len(round1Words) == 0:
                continue
            round6Words = set(selectTangramRoundWords(d, tangram, "6", gameid))

            # First, write highest PMI match: the share of top-PMI round-1
            # words that are still used in round 6.
            highPMIWords = highestPMIWords(d, tangram, "1", gameid)
            if not isinstance(highPMIWords, list):
                # An empty DataFrame stands for "no candidates"
                highPMIWords = []
            if highPMIWords:
                match = np.mean([1 if word in round6Words else 0 for word in highPMIWords])
            else:
                match = float('nan')  # undefined without candidates
            writer.writerow([0, tangram, gameid, len(highPMIWords), match, "highest"])

            # Next, take a bunch of null samples: one uniformly drawn
            # round-1 word per sample
            for i in range(numSamples) :
                randomWord = np.random.choice(round1Words)
                match = float(randomWord in round6Words)
                writer.writerow([i + 1, tangram, gameid, 1, match, "null"])