# Import lots of stuff

In [None]:
import itertools
import math
import re
import csv
import re;
import pandas as pd
import pylab as pyl
import nltk as nltk
from collections import Counter
import numpy as np
import matplotlib.pyplot as plt
from nltk.tokenize import word_tokenize
%matplotlib inline
# Raise pandas' row-display cap to 500 so long frames are not truncated when shown.
pd.set_option('display.max_rows', 500)

# Annotate messages with tangram

Get one round from one game to test...

In [None]:
# Keep only the messages sent by the director, then drop the now-constant sender column.
# `axis=1` is spelled out because pandas 2.0 no longer accepts the axis positionally.
d_msgs = (pd.read_csv('../../data/tangrams/message/tangramsMessages.csv', escapechar='\\')
          .query('sender == "director"')
          .drop('sender', axis=1))
# Placeholder is the *string* "None" (not the None object): later cells select
# untagged rows with `tangramRef != "None"`.
d_msgs['tangramRef'] = "None"
d_drops = pd.read_csv('../../data/tangrams/dropObj/tangramsDropObj.csv', escapechar='\\')
d_boards = pd.read_csv('reformattedBoards.csv')

The most obvious strategy is to (on a first pass) assume that the tangram the matcher moves in response to a message is the one the message is referring to. The second pass is to skip the ones where we know they got it wrong. We'll probably end up hand-tagging those or using some other strategy depending on how many there are.

There are a few obvious problems here:

1. The director will sometimes send several messages before the matcher moves anything. So we can't just use the closest move in time... 
2. Instead, we could use the *first* move action after the message and then rule it out, so that it isn't used again even if it is also the first move after a later message.
3. **that**, though, also has a problem. Multiple messages are sent per tangram, and some messages are meta-chatter (e.g. "hello", "thanks", "good job", "this HIT is terrible"). If we assign the drop actions to the first $N$ messages, we'll have a bunch of actual messages about tangrams that aren't tagged and a bunch of messages **not** about tangrams incorrectly tagged.

In [None]:
# e.g. this code matches the first N messages on each trial with the first N drop actions... 
# It really doesn't work at all
# d_drops['notUsed'] = True
# d_msgs['tangramRef'] = None
# for index, row in d_msgs.iterrows():
#     msgTime = row['time']    
#     postMsgDrops = d_drops.query('notUsed and dropTime > {0}'.format(msgTime))
#     firstPostMsgDrop = postMsgDrops[:1]
#     d_msgs[index-1:index]['tangramRef'] = firstPostMsgDrop['name']
#     firstPostMsgDrop['notUsed'] = False

So... we'll do a simpler thing. Check for numbers occurring in the text and look them up in the board data...

In [None]:
# Collapse every run of non-alphanumeric characters (and underscores) into a single
# space so that digits glued to punctuation, e.g. "#4," or "(12)", become standalone
# tokens. Raw string: "\W" in a plain literal is an invalid escape sequence.
pattern = re.compile(r'[\W_]+')
for index, row in d_msgs.iterrows():
    strippedStr = pattern.sub(' ', row.contents)
    numbers = [int(s) for s in strippedStr.split() if s.isdigit()]
    # Only trust messages that mention exactly one number in the range 1-12;
    # anything else keeps the "None" placeholder.
    if len(numbers) == 1 and 0 < numbers[0] <= 12:
        boardRow = d_boards.query('gameid == "{0}" and roundNum == {1} and trueLoc == {2}'
                                  .format(row.gameid, row.roundNum, numbers[0]))
        # Leave the message untagged when the board file has no row for this
        # game/round/location instead of aborting the loop with an IndexError.
        if not boardRow.empty:
            # `.at` replaces DataFrame.set_value, which was removed in pandas 1.0.
            d_msgs.at[index, 'tangramRef'] = boardRow.tangramName.tolist()[0]
d_msgs.to_csv("taggedTangrams.csv", index = False)

Check to see how many we tagged...

In [None]:
# Share of director messages that received a tag, i.e. whose tangramRef is no
# longer the "None" placeholder.
numUntagged = Counter(d_msgs['tangramRef'])['None']
1 - numUntagged / float(d_msgs.shape[0])

60% isn't bad for a conservative heuristic! Now we're going to use the tagged data to train a classifier that will make predictions for the other 40%.

## Set up training set

In [None]:
# `sklearn.cross_validation` was removed in scikit-learn 0.20; use its replacement
# and fall back to the old module only on installs that predate it.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Older data set that already carries a `tangram` label column; rows labelled "*"
# are dropped and the column is renamed to match d_msgs.
d_nicki = (pd.read_csv('../../data/tangrams/old/oldTangrams.csv')
    .query('tangram != "*"')
    .drop('sender', axis=1)
    .rename(columns = {'tangram' : 'tangramRef'}))
# Tagged messages from this data set + the older labelled messages.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
d_combined = pd.concat([d_msgs.query('tangramRef != "None"'), d_nicki],
                       ignore_index=True)
# Fixed seed so the 80/20 split, and every number derived from it, is reproducible.
train_msg, test_msg = train_test_split(d_combined, test_size = 0.2, random_state = 0)

## Build pipeline that tokenizes and trains a linear classifier (logistic regression fit with SGD)...

Largely drawn from [here](http://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
# `sklearn.grid_search` was removed in scikit-learn 0.20; use its replacement and
# fall back to the old module only on installs that predate it.
try:
    from sklearn.model_selection import GridSearchCV
except ImportError:
    from sklearn.grid_search import GridSearchCV

# Hyper-parameter grid. Keys follow the "<pipeline step>__<parameter>" convention,
# so 'vect__...' goes to the CountVectorizer, 'clf__...' to the classifier, etc.
parameters = {'vect__ngram_range': [(1, 1), (1, 2), (1, 3)],
              'vect__stop_words': (None, 'english'),
              'tfidf__use_idf': (True, False),
              'clf__alpha': (1e-2, 1e-3, 1e-4, 1e-5)
}
# Bag-of-words counts -> tf-idf weighting -> linear model trained with SGD.
# loss='log' makes it a probabilistic classifier, which the later cells rely on
# when they call predict_proba. The seed keeps the fit reproducible.
# NOTE(review): `n_iter` and loss='log' are the spellings of the scikit-learn
# release this notebook was written against; newer releases renamed them
# (`max_iter`, 'log_loss'). Update them together with the pinned version.
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', SGDClassifier(loss='log', penalty='l2', n_iter=5,
                                           random_state=0)),
                    ])
# Exhaustive search over the grid, using every available core.
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)
# Trailing semicolon suppresses the estimator repr without overwriting IPython's `_`.
gs_clf.fit(train_msg.contents, train_msg.tangramRef);

Look at success!

In [None]:
# Predict the held-out messages here: `predicted` is needed on the next line, and
# the only other place that defines it is a cell that runs after this one, so a
# fresh "Restart & Run All" would otherwise stop here with a NameError.
predicted = gs_clf.predict(test_msg.contents)
correct = predicted == test_msg.tangramRef
print("test-split accuracy is...")
print(correct.sum() / float(len(correct)))

Plot ROC curve

In [None]:
predicted = gs_clf.predict(test_msg.contents)
# Confidence of each prediction = probability assigned to the most likely class.
maxProb = gs_clf.predict_proba(test_msg['contents']).max(axis=1)
# Build a fresh frame instead of writing columns into the slice returned by
# train_test_split (that raises SettingWithCopyWarning). Re-running the cell
# simply overwrites the three columns.
test_msg = test_msg.assign(predicted = predicted,
                           correct = predicted == test_msg['tangramRef'],
                           maxProb = maxProb)

# "Positive" = the classifier's label was right, "negative" = it was wrong.
actualNumPos = float(test_msg['correct'].sum())
actualNumNeg = len(test_msg) - actualNumPos

TPRs, FPRs = [], []
for threshold in np.arange(0, 1, .05) :
    # Predictions our policy would accept: confidence strictly above the threshold
    predYes = test_msg.loc[test_msg['maxProb'] > threshold, 'correct']
    numAcceptedCorrect = predYes.sum()
    # TPR: accepted *correct* predictions relative to all correct predictions
    TPRs.append(numAcceptedCorrect / actualNumPos)
    # FPR: accepted *incorrect* predictions relative to all incorrect predictions
    FPRs.append((len(predYes) - numAcceptedCorrect) / actualNumNeg)

fig = plt.figure()
ax = fig.add_subplot(111, aspect = 'equal')
# Diagonal = chance level, for reference
ax.plot([0,1], [0,1])
ax.plot(FPRs, TPRs)
ax.set_xlabel("False positive rate")
ax.set_ylabel("True positive rate");

What are best params?

In [None]:
# `grid_scores_` was removed in scikit-learn 0.20. `best_params_` / `best_score_`
# expose the winning grid point directly and exist on old and new releases alike.
best_parameters = gs_clf.best_params_
score = gs_clf.best_score_
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, best_parameters[param_name]))

In [None]:
from sklearn import metrics
def plot_confusion_matrix(cm, target_names, title='Confusion matrix', cmap=plt.cm.Blues):
    """Render a confusion matrix as a heat map on the current pyplot figure.

    cm           -- 2-D array of counts, passed straight to plt.imshow
    target_names -- class labels used as tick labels on both axes
    title        -- figure title
    cmap         -- matplotlib colormap for the cells
    """
    plt.imshow(cm, cmap=cmap, interpolation='nearest')
    plt.title(title)
    plt.colorbar()
    # One tick per class on each axis; x labels are slanted by 45 degrees.
    positions = np.arange(len(target_names))
    plt.yticks(positions, target_names)
    plt.xticks(positions, target_names, rotation=45)
    plt.tight_layout()
    plt.xlabel('Predicted label')
    plt.ylabel('True label')

plt.figure()
# Use one label list for both the matrix and the ticks. It must include labels
# that only show up in the predictions: confusion_matrix would otherwise add
# rows/columns for them and the tick labels would no longer line up.
tangramLabels = sorted(set(test_msg.tangramRef) | set(predicted))
cm = metrics.confusion_matrix(test_msg.tangramRef, predicted, labels=tangramLabels)
plot_confusion_matrix(cm, tangramLabels)

# Preprocess data for analyses

## Import annotated file

In [None]:
d_raw = pd.read_csv('../../data/tangrams/old/oldTangrams.csv')

# Drop the time column and every row whose tangram label is "0" or "*".
# `axis=1` is spelled out because pandas 2.0 no longer accepts it positionally.
# The copy comes last so that `d` is an independent frame: later cells add columns
# to it, which would warn (SettingWithCopy) on a filtered slice. d_raw is untouched
# either way because drop/query return new frames.
d = (d_raw
    .drop('time', axis=1)
    .query('tangram != "0"')
    .query('tangram != "*"')
    .copy())


## Tokenize

In [None]:
# Lower-case each message, split it with NLTK's word tokenizer and keep only the
# purely alphabetic tokens.
tokenLists = []
for message in d['contents']:
    alphabetic = [tok for tok in nltk.word_tokenize(message.lower()) if tok.isalpha()]
    tokenLists.append(alphabetic)
d['tokens'] = tokenLists

## Get number of words used

In [None]:
# Number of (alphabetic) tokens per message. The length of the token list is the
# same number the value-count sum produced, without building a throw-away Series
# per row and without the deprecated top-level pd.value_counts.
d['numWords'] = [len(words) for words in d['tokens']]

# Result 1: Generate file for POS analysis

## Get part of speech

In [None]:
# Tag every message's tokens with NLTK's universal tagset and keep only the tag
# of each (token, tag) pair, so d['pos'] lines up one-to-one with d['tokens'].
posColumn = []
for tokens in d['tokens']:
    tagged = nltk.pos_tag(tokens, tagset = 'universal')
    posColumn.append([tag for (token, tag) in tagged])
d['pos'] = posColumn

## Get list of all unique POS labels

In [None]:
# Every distinct POS label that appears anywhere in d['pos'].
posSet = {pos for row in d['pos'] for pos in row}

## Get counts for each POS label

In [None]:
# One "<LABEL>num" column per POS label, holding how often the label occurs in
# each message. Iterate in sorted order: a set's iteration order can differ from
# run to run, which would shuffle the column order of the exported CSV.
for pos in sorted(posSet) :
    colName = pos + "num"
    d[colName] = [posList.count(pos) for posList in d['pos']]

## Export to csv for plotting in R

In [None]:
# The list-valued / free-text columns are not needed for plotting, so leave them
# out of the export. `axis=1` is spelled out because pandas 2.0 no longer accepts
# the axis positionally.
(d.drop(["pos", "contents", "tokens"], axis=1)
 .to_csv("posTagged.csv", index = False))

# Result 2: Calculate indicator words for tangrams/rounds

## First, get list of words in first round

In [None]:
# Restrict to messages from the first round
d_round1 = d.loc[d['roundNum'] == 1]

# Tally every token that occurs in those messages
tokenDict = Counter()
for tokens in d_round1['tokens'].tolist():
    tokenDict.update(tokens)

# Keep the words seen at least twice
wordList = [w for (w, n) in tokenDict.items() if n > 1]
print(wordList[:10])

## Get list of all game ids

In [None]:
# Distinct game ids in order of first appearance. Series.unique() gives the same
# result as pd.unique(series.ravel()) without the deprecated Series.ravel call.
gameidList = d['gameid'].unique().tolist()
print(gameidList[0:5])

## Get list of all tangram names

In [None]:
# Tangram labels: the twelve single letters A through L.
tangramList = list('ABCDEFGHIJKL')
print(tangramList)

## Helper functions to select words & counts

In [None]:
def getWordCounts(df, gameid, roundNum, tangram = None) :
    """Count the tokens used in one round of one game, optionally for one tangram.

    df       -- frame with columns gameid, roundNum, tokens (a list of words per
                row) and, when `tangram` is given, tangram
    gameid   -- game to select
    roundNum -- round to select; may be an int or a numeric string ("1"), it is
                interpolated into the query as a numeric literal either way
    tangram  -- optional tangram label; None means all tangrams of that round

    Returns a collections.Counter mapping word -> number of occurrences (empty
    when no row matches).
    """
    # gameid / tangram are passed by reference (@name) rather than pasted into the
    # query text, so values containing quotes cannot break the expression.
    conditions = ['roundNum == {0}'.format(roundNum), 'gameid == @gameid']
    if tangram is not None :
        conditions.append('tangram == @tangram')
    relevantRows = df.query(" and ".join(conditions))
    return Counter(token
                   for tokens in relevantRows['tokens'].tolist()
                   for token in tokens)

def selectTangramRoundWords(df, tangram, roundNum, gameid):
    """Return the distinct words used for `tangram` in round `roundNum` of `gameid`.

    The result is a real list (possibly empty), not a dict view: callers take its
    length, test membership and sample from it with np.random.choice, and the
    latter does not work on a view.
    """
    wordCounts = getWordCounts(df, gameid, roundNum, tangram)
    return list(wordCounts)

## Helper function to compute PMIs

Note that merging is really costly -- if we need to speed it up, this might be the first target. Can also vectorize the log operation.

In [None]:
def makeMyPMI(df, tangram, roundNum, gameid, totals):
    """Score each word used for one tangram by its log10 pointwise mutual information.

    For every word w used for `tangram` in round `roundNum` of game `gameid`:

        logPMI = log10( (count / tangramNumWords) / (totalCount / totals["numWords"]) )

    i.e. the word's relative frequency within this tangram divided by its relative
    frequency in the reference counts.

    df      -- frame accepted by getWordCounts
    totals  -- dict with "counts" (mapping word -> reference count) and
               "numWords" (total number of reference words)

    Returns a DataFrame with columns word, count, totalCount, logPMI. Words that
    are missing from totals["counts"] are dropped by the inner merge; the frame is
    empty when nothing was said about the tangram. Values may be negative.
    """
    # count words w/in tangram
    tangramCounts = getWordCounts(df, gameid, roundNum, tangram)

    # total number of words; fall back to 1 so the division below is always defined
    tangramNumWords = float(sum(tangramCounts.values())) or 1.0

    # Pair each tangram-level count with the reference count of the same word.
    # list(...) because dict views are not accepted as DataFrame data everywhere.
    indicatorDF = pd.merge(
        pd.DataFrame(list(tangramCounts.items()), columns=['word', 'count']),
        pd.DataFrame(list(totals["counts"].items()), columns=['word', 'totalCount']),
        on='word', how = 'inner')

    # Ratio of the two relative frequencies, then one vectorised log10.
    # astype(float) keeps this working on an empty (object-dtype) frame.
    ratio = ((indicatorDF['count'] / tangramNumWords)
             / (indicatorDF['totalCount'] / totals["numWords"]))
    indicatorDF['logPMI'] = np.log10(ratio.astype(float))

    return indicatorDF

## Write out PMIs & matching rates for all words

Note that we do a sloppy optimization by only computing total counts once and only when necessary 

In [None]:
def memoize(d, gameid, counts) :
    """Fill `counts` with the round-1 word totals of `gameid`, at most once.

    d      -- frame accepted by getWordCounts
    gameid -- game whose round "1" counts are wanted
    counts -- cache dict; when it has no "counts" key yet, "counts" (word ->
              count over all tangrams) and "numWords" (their sum, as a float)
              are added to it in place

    Returns the same `counts` dict. A dict that already holds "counts" is returned
    untouched, so the (costly) query runs only for an empty cache.
    """
    if "counts" not in counts :
        counts["counts"] = getWordCounts(d, gameid, "1")
        counts["numWords"] = float(sum(counts["counts"].values()))
    return counts

import sys

# The round-1 PMI table and the round-6 vocabulary depend only on (game, tangram),
# not on the word being scored, so build them once here rather than once per word.
round1PMIs = {}    # (gameid, tangram) -> {word: logPMI} for round 1
round6Vocab = {}   # (gameid, tangram) -> set of words used in round 6
for gameid in gameidList:
    # Reference counts are shared by all tangrams of one game
    memoizedCounts = memoize(d, gameid, {})
    for tangram in tangramList:
        PMI_df = makeMyPMI(d, tangram, "1", gameid, memoizedCounts)
        round1PMIs[(gameid, tangram)] = dict(zip(PMI_df['word'],
                                                 PMI_df['logPMI'].tolist()))
        round6Vocab[(gameid, tangram)] = set(
            selectTangramRoundWords(d, tangram, "6", gameid))

# csv.writer needs a binary file on Python 2 but a text file opened with
# newline='' on Python 3. The file is overwritten (not appended to) so that
# re-running the cell cannot stack a second header and duplicate rows.
csvOpenArgs = ({'mode': 'wb'} if sys.version_info[0] < 3
               else {'mode': 'w', 'newline': ''})
with open('matchAndPMI.csv', **csvOpenArgs) as outfile:
    writer = csv.writer(outfile)
    writer.writerow(['word', 'match', 'pmi', 'total'])
    for word in wordList :
        print(word)
        pmi = 0      # summed round-1 logPMI over every (game, tangram) using the word
        match = 0    # ... of those, how many use the word again in round 6
        total = 0    # number of (game, tangram) pairs using the word in round 1
        for gameid in gameidList:
            for tangram in tangramList:
                key = (gameid, tangram)
                if word in round1PMIs[key] :
                    total += 1
                    pmi += round1PMIs[key][word]
                    if word in round6Vocab[key] :
                        match += 1
        # A word that never occurs under any listed tangram has no averages to report
        if total == 0 :
            continue
        writer.writerow([word, float(match) / float(total), pmi/total, total])


## bootstrap analysis (might want to move to R)

In [None]:
def highestPMIWords(d, tangram, roundNum, gameid):
    """Return the word(s) with the highest logPMI for one tangram of one game.

    d        -- frame accepted by getWordCounts / makeMyPMI
    tangram  -- tangram label
    roundNum -- round whose words are scored
    gameid   -- game to select

    Returns a list of words: every word tied for the maximum logPMI, or an empty
    list when no word was scored. (Always a list, so callers can take len() of it
    and iterate over it.)
    """
    # NOTE(review): the reference counts always come from round "1", even when
    # `roundNum` names a different round -- confirm that this is intended.
    allTangramCounts = {}
    allTangramCounts['counts'] = getWordCounts(d, gameid, "1")
    allTangramCounts['numWords'] = float(sum(allTangramCounts["counts"].values()))

    PMIdf = makeMyPMI(d, tangram, roundNum, gameid, allTangramCounts)
    if PMIdf.empty:
        return []
    maxPMI = PMIdf['logPMI'].max()
    return PMIdf.loc[PMIdf['logPMI'] == maxPMI, 'word'].tolist()

import sys

numSamples = 1000
# Dedicated, seeded generator so the null samples are the same on every run
rng = np.random.RandomState(0)
# csv.writer needs a binary file on Python 2 but a text file opened with
# newline='' on Python 3.
csvOpenArgs = ({'mode': 'wb'} if sys.version_info[0] < 3
               else {'mode': 'w', 'newline': ''})
with open('PMIbootstrap.csv', **csvOpenArgs) as outfile:
    writer = csv.writer(outfile)
    writer.writerow(['sampleNum', 'tangram', 'gameid', 'numCandidates', 'match', 'highest'])
    for gameid in gameidList :
        for tangram in tangramList :
            # A real list is required: np.random's choice cannot sample a dict view
            round1Words = list(selectTangramRoundWords(d, tangram, "1", gameid))
            if len(round1Words) > 0:
                # First, write highest PMI match: the share of the top-PMI round-1
                # words that are used again in round 6 (sampleNum 0)
                highPMIWords = highestPMIWords(d, tangram, "1", gameid)
                round6Words = set(selectTangramRoundWords(d, tangram, "6", gameid))
                match = np.mean([1 if word in round6Words else 0 for word in highPMIWords ])
                writer.writerow([0, tangram, gameid, len(highPMIWords), match, "highest"])

                # Next, take a bunch of null samples: one uniformly drawn round-1
                # word each, scored 1.0 if it also occurs in round 6, else 0.0
                for i in range(numSamples) :
                    randomWord = rng.choice(round1Words)
                    match = float(randomWord in round6Words)
                    writer.writerow([i + 1, tangram, gameid, 1, match, "null"])