# Import lots of stuff

In [None]:
import itertools
import math
import re
import csv
import pandas as pd
import pylab as pyl
import nltk as nltk
from collections import Counter
import numpy as np
import matplotlib.pyplot as plt
from nltk.tokenize import word_tokenize
%matplotlib inline
# Show up to 500 rows when a DataFrame is displayed in the notebook
pd.set_option('display.max_rows', 500)

# Preprocess data

## Import annotated file

In [None]:
# Load the annotated tangram messages
d_raw = pd.read_csv('../../data/tangrams/old/oldTangrams.csv')

# Drop the time column and the rows whose tangram label is "0" or "*"
# (presumably placeholders for messages without a tangram -- confirm against
# the annotation scheme). axis is passed by keyword because the positional
# form is no longer accepted by current pandas.
d = (d_raw
    .copy()
    .drop('time', axis=1)
    .query('tangram != "0"')
    .query('tangram != "*"'))


## Tokenize

In [None]:
# Lowercase every message, tokenize it, and keep only the purely alphabetic
# tokens.
tokenized = []
for sentence in d['contents']:
    words = nltk.word_tokenize(sentence.lower())
    tokenized.append([w for w in words if w.isalpha()])
d['tokens'] = tokenized

## Get number of words used

In [None]:
# Number of tokens kept for each message (the length of its token list)
d['numWords'] = [len(words) for words in d['tokens']]

# Result 1: Generate file for POS analysis

## Get part of speech

In [None]:
# Tag each message's tokens with the universal tagset and keep only the tags
posTags = []
for rowTokens in d['tokens']:
    tagged = nltk.pos_tag(rowTokens, tagset = 'universal')
    posTags.append([tag for (_, tag) in tagged])
d['pos'] = posTags

## Get list of all unique POS labels

In [None]:
# Every distinct POS label that occurs in any message
posSet = {pos for row in d['pos'] for pos in row}

## Get counts for each POS label

In [None]:
# Tally the tags of each message once, then add one "<label>num" column per
# POS label holding how often that label occurs in the message.
tagCounts = [Counter(posList) for posList in d['pos']]
for pos in posSet :
    d[pos + "num"] = [counts[pos] for counts in tagCounts]

## Export to csv for plotting in R

In [None]:
# Write everything except the text/list columns to posTagged.csv.
# axis is passed by keyword; the positional form is rejected by current pandas.
(d.drop(["pos", "contents", "tokens"], axis=1)
 .to_csv("posTagged.csv", index = False))

# Result 2: Calculate indicator words for tangrams/rounds

## First, get list of words in first round

In [None]:
# Keep only the messages from round 1
d_round1 = d[d['roundNum'] == 1]

# Count every token used in those messages
tokenDict = Counter(itertools.chain.from_iterable(d_round1['tokens']))

# Vocabulary of interest: words seen more than once
wordList = [word for word in tokenDict if tokenDict[word] > 1]
print(wordList[0:10])

## Get list of all game ids

In [None]:
# Distinct game ids, in order of first appearance
# (Series.unique replaces pd.unique(...ravel()); Series.ravel is deprecated)
gameidList = d['gameid'].unique().tolist()
print(gameidList[0:5])

## Get list of all tangram names

In [None]:
# The twelve tangram labels, 'A' through 'L'
tangramList = list('ABCDEFGHIJKL')
print(tangramList)

## Helper functions to select words & counts

In [None]:
def getWordCounts(df, gameid, roundNum, tangram = None) :
    """Count the tokens used in one game and round, optionally for one tangram.

    df       -- DataFrame with 'gameid', 'roundNum', 'tangram' and 'tokens'
                columns, where each 'tokens' entry is a list of words
    gameid   -- id of the game to select
    roundNum -- round to select; a string such as "1" or an int
    tangram  -- if given, also restrict to rows for this tangram

    Returns a Counter mapping each word to its number of occurrences in the
    selected rows (empty when nothing matches).
    """
    # roundNum is spliced into the expression, so "1" and 1 behave alike.
    # gameid and tangram are referenced as local variables (@name) rather than
    # pasted between quotes, so values containing quote characters cannot
    # break the query.
    cond = 'roundNum == ' + str(roundNum) + ' and gameid == @gameid'
    if tangram is not None :
        cond += ' and tangram == @tangram'
    relevantRows = df.query(cond)
    return Counter(itertools.chain.from_iterable(relevantRows['tokens']))

# Distinct words used in round `roundNum` for a given tangram and gameid
def selectTangramRoundWords(df, tangram, roundNum, gameid):
    """Return the keys (the distinct words) of the matching word counts."""
    return getWordCounts(df, gameid, roundNum, tangram).keys()

## Helper function to compute PMIs

Note that merging is really costly -- if we need to speed it up, this might be the first target. Can also vectorize the log operation.

In [None]:
# Table of the words used for one tangram together with their pointwise
# mutual information (PMI) with that tangram.
def makeMyPMI(df, tangram, roundNum, gameid, totals):
    """Return per-word counts and log10 PMI for one tangram/round/game.

    PMI is log10( p(word | tangram) / p(word) ), where
    p(word | tangram) = count / (number of words used for the tangram) and
    p(word) = totalCount / totals["numWords"].

    df       -- DataFrame accepted by getWordCounts
    tangram, roundNum, gameid -- select the rows to count
    totals   -- dict with "counts" (word -> baseline count) and "numWords"
                (baseline number of words)

    Returns a DataFrame with columns 'word', 'count', 'totalCount', 'logPMI'
    holding every word present in both the tangram counts and
    totals["counts"], whatever the sign of its PMI.
    """
    # count words w/in tangram
    tangramCounts = getWordCounts(df, gameid, roundNum, tangram)

    # Total number of words, falling back to 1 so an empty selection cannot
    # divide by zero. Floats keep the ratios below true divisions even under
    # Python 2.
    tangramNumWords = float(sum(tangramCounts.values()) or 1)
    totalNumWords = float(totals["numWords"])

    # list(...) because older pandas releases reject dict views here
    indicatorDF = pd.merge(pd.DataFrame(list(tangramCounts.items()), columns=['word', 'count']),
                           pd.DataFrame(list(totals["counts"].items()), columns=['word', 'totalCount']),
                           on='word', how = 'inner')

    # Ratio first, log afterwards; the ratio is never stored as a column
    pmiRatio = ((indicatorDF['count'] / tangramNumWords)
                / (indicatorDF['totalCount'] / totalNumWords))
    indicatorDF['logPMI'] = [math.log10(num) for num in pmiRatio]

    return indicatorDF

## Write out PMIs & matching rates for all words

Note that we apply a simple optimization: the total counts for each game are computed only once, and only when they are actually needed.

In [None]:
def memoize(d, gameid, counts) :
    """Fill `counts` with the round-1 word totals of `gameid` on first use.

    d      -- DataFrame passed to getWordCounts
    gameid -- game whose round-1 words are counted
    counts -- cache dict; filled in place when it has no "counts" key

    Returns the same dict, holding "counts" (word -> count for round "1" of
    the game, over all tangrams) and "numWords" (their sum, as a float).
    """
    # Only query the DataFrame while the cache is still empty
    if "counts" not in counts :
        counts["counts"] = getWordCounts(d, gameid, "1")
        counts["numWords"] = float(sum(counts["counts"].values()))
    return counts

# Query every (game, tangram) cell once up front instead of once per word;
# the per-word loop below then only does dictionary/set lookups. Cells are
# stored in game-then-tangram order, so sums accumulate in the same order as
# a nested loop over gameidList and tangramList.
cellStats = []   # one (word -> logPMI, set of round-6 words) pair per non-empty cell
for gameid in gameidList:
    memoizedCounts = memoize(d, gameid, {})
    for tangram in tangramList:
        PMI_df = makeMyPMI(d, tangram, "1", gameid, memoizedCounts)
        if len(PMI_df.index) == 0:
            # nothing was said about this tangram in round 1 of this game
            continue
        pmiByWord = dict(zip(PMI_df['word'].tolist(), PMI_df['logPMI'].tolist()))
        round6WordList = set(selectTangramRoundWords(d, tangram, "6", gameid))
        cellStats.append((pmiByWord, round6WordList))

# csv.writer needs a text-mode file opened with newline='' on Python 3.
# NOTE: the file is opened for appending, so re-running this cell adds another
# header row and another copy of the rows.
with open('matchAndPMI.csv', 'a', newline='') as outfile:
    writer = csv.writer(outfile)
    writer.writerow(['word', 'match', 'pmi', 'total'])
    for word in wordList :
        print(word)
        pmi = 0
        match = 0
        total = 0
        for (pmiByWord, round6WordList) in cellStats:
            if word not in pmiByWord:
                continue
            total += 1
            pmi += pmiByWord[word]
            if word in round6WordList:
                match += 1
        if total == 0:
            # The word never occurs for any listed tangram, so its match rate
            # and mean PMI are undefined; skip it rather than divide by zero.
            continue
        writer.writerow([word, float(match) / float(total), pmi/total, total])


## bootstrap analysis (might want to move to R)

In [None]:
# Words with the highest PMI for a given tangram/gameid
def highestPMIWords(d, tangram, roundNum, gameid):
    """Return the list of words tied for the highest log PMI.

    d -- DataFrame passed to getWordCounts and makeMyPMI
    tangram, roundNum, gameid -- select the cell whose words are scored

    The baseline counts always come from round "1" of the game.
    NOTE(review): the baseline ignores `roundNum`; confirm that is intended
    before calling this with another round.

    Returns a list of words; the list is empty when the cell has no words.
    """
    allTangramCounts = {}
    allTangramCounts['counts'] = getWordCounts(d, gameid, "1")
    allTangramCounts['numWords'] = float(sum(allTangramCounts["counts"].values()))

    PMIdf = makeMyPMI(d, tangram, roundNum, gameid, allTangramCounts)
    # Always hand back a list: iterating an empty DataFrame would yield its
    # column names, which callers would mistake for words.
    if len(PMIdf.index) == 0:
        return []
    maxPMI = PMIdf['logPMI'].max()
    return PMIdf.loc[PMIdf['logPMI'] == maxPMI, 'word'].tolist()

# Number of random "null" words drawn per (game, tangram) cell
numSamples = 100
# csv.writer needs a text-mode file opened with newline='' on Python 3
with open('PMIbootstrap.csv', 'w', newline='') as outfile:
    writer = csv.writer(outfile)
    writer.writerow(['tangram', 'gameid', 'numCandidates', 'match', 'highest'])
    for gameid in gameidList :
        for tangram in tangramList :
            # A real list is required: np.random.choice cannot sample from a
            # dict keys view.
            round1Words = list(selectTangramRoundWords(d, tangram, "1", gameid))
            if len(round1Words) == 0:
                continue

            # First, write highest PMI match: the share of the top-PMI words
            # that are still used in round 6
            highPMIWords = highestPMIWords(d, tangram, "1", gameid)
            round6Words = set(selectTangramRoundWords(d, tangram, "6", gameid))
            match = np.mean([1 if word in round6Words else 0 for word in highPMIWords ])
            writer.writerow([tangram, gameid, len(highPMIWords), match, "highest"])

            # Next, take a bunch of null samples: one random round-1 word each
            for i in range(numSamples) :
                randomWord = np.random.choice(round1Words)
                match = float(randomWord in round6Words)
                writer.writerow([tangram, gameid, 1, match, "null"])

In [None]:
# NOTE(review): test2 is not assigned in any cell shown here; it presumably
# holds the tokenized message DataFrame -- confirm before running.
dfForDict = test2.copy()
dfForDict = dfForDict[20:500]

#http://stackoverflow.com/questions/16333296/how-do-you-create-nested-dict-in-python
#set up the nested dictionaries (a template showing the intended nesting)
tangramDict = {}
tangramDict['tangram'] = {}
tangramDict['tangram']['roundNum'] = {}
tangramDict['tangram']['roundNum']['word'] = 'count'
tangramDict

#populate the dictionary 

#http://stackoverflow.com/questions/635483/what-is-the-best-way-to-implement-nested-dictionaries-in-python


dfForDict = dfForDict[['roundNum', 'tangram', 'tokens']]
# DataFrame.sort no longer exists in pandas; sort_values is its replacement
dfForDict = dfForDict.sort_values(['roundNum', 'tangram'], ascending = [True, True])
#rearrange columns
cols = ['tangram', 'roundNum', 'tokens']
dfForDict = dfForDict[cols]
#dfForDict = dfForDict.groupby('roundNum')
# dfForDict = dfForDict[2:500]

import pprint

class Vividict(dict):
    """dict that creates nested Vividicts on demand when a key is missing."""

    def __missing__(self, key):
        # Store a fresh instance of the same class under the missing key and
        # return it, so chained lookups like v['a']['b'] work.
        child = type(self)()
        self[key] = child
        return child
    
# NOTE(review): this rebinds the notebook-level name `d`, which the earlier
# cells use for the main DataFrame; re-run the preprocessing cells (or rename
# this variable) before using those cells again.
d = Vividict()

# d['foo']['bar']
# d['foo']['baz']
# d['fizz']['buzz']
# d['primary']['secondary']['tertiary']['quaternary']



# Pretty-print the (still empty) nested dictionary
pprint.pprint(d)

#http://stackoverflow.com/questions/18695605/python-pandas-dataframe-to-dictionary

#{g: f['contents'].tolist() for f,g in dfForDict.groupby("tangram") for k,g in dfForDict.groupby("roundNum")}

#{k: g["value"].tolist() for k,g in ptest.groupby("id")}


def retro_dictify(frame):
    """Turn the rows of `frame` into a nested dict.

    For each row, every column except the last two becomes one level of
    nesting, the second-to-last column is the innermost key and the last
    column is its value. A later row with the same keys overwrites the
    earlier value.
    """
    nested = {}
    for row in frame.values:
        node = nested
        # walk (and create as needed) one level per leading column
        for key in row[:-2]:
            node = node.setdefault(key, {})
        node[row[-2]] = row[-1]
    return nested

# Build and display the nested dict for dfForDict; with its columns ordered
# tangram, roundNum, tokens this gives {tangram: {roundNum: tokens}}
retro_dictify(dfForDict)



# Unfinished sketch: count each word per tangram and round, then write the
# counts to a csv file
# d = defaultdict(int)
# for word in bigWordList :
#   for tangram in tangrams :
#     for roundNum in roundNums :
#       d[tangram + ',' + str(roundNum) + ',' + word] += 
#countOccurences(word, tangram, roundNum)

# writer = csv.writer(open('tangramWordCounts.csv', 'wb'))
# for key, value in d.items():
#   writer.writerow([key, value])

#dfForDict.groupby()

#[f(x) for x in list]

# Display the frame the dictionary was built from
dfForDict

In [None]:
#use highPMI list to filter words for tangrams over the next rounds
#need list of words and their frequency for each round
#need 6 dictionaries/frequencies merged by the highPMI words

#the dataframe with all tangrams and all rounds
# NOTE(review): test2 is not assigned in any cell shown here; it presumably
# holds the tokenized message DataFrame -- confirm before running.
bigFrame = test2.copy()
#bigFrameTangram = bigFrame[bigFrame.tangram == 'A']

#cumWordsAndCounts = highPMI

def createFrequencyTable(roundNum, tangram, frame = None):
    """Count the words used for one tangram in one round.

    roundNum -- round to select (compared with == against 'roundNum')
    tangram  -- tangram label to select
    frame    -- DataFrame with 'tangram', 'roundNum' and 'tokens' columns;
                defaults to the notebook-level bigFrame

    Returns a DataFrame with one row per distinct word and the columns
    'word' and 'count' (no rows when nothing matches).
    """
    if frame is None:
        frame = bigFrame
    # Both conditions must hold at once: filtering by round must not restart
    # from the full frame, or the tangram filter is lost.
    selected = frame[(frame['tangram'] == tangram) & (frame['roundNum'] == roundNum)]

    # Flatten the per-message token lists and count each word
    wordCounts = Counter(itertools.chain.from_iterable(selected['tokens']))

    # list(...) because older pandas releases reject dict views here
    return pd.DataFrame(list(wordCounts.items()), columns=['word', 'count'])
    

# Example: word counts for round 2 of tangram 'A'
# (this version of the function takes roundNum first, then tangram)
wordFrequency_A = createFrequencyTable(2, 'A')

wordFrequency_A

# d = defaultdict(int)
# for word in bigWordList :
#   for tangram in tangrams :
#     for roundNum in roundNums :
#       d[tangram + ‘,’ + str(roundNum) + ‘,’ + word] += countOccurences(word, tangram, roundNum)


 
    
    
# allTangramsFreqs = pd.DataFrame()
# for tangram in ['A','B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L']:
#     #creates the words and frequencies over rounds df for one tangram
#     oneTangramFreq = createFrequencyTable(tangram)
#     #joins all words and their frequencies over rounds for each tangram
#     allTangramsFreqs = pd.concat([allTangramsFreqs, oneTangramFreq], axis=1)

# #change the column names to tangram letter and correct round numbers
# allTangramsFreqs.columns = ['A', '1', '2', '3', '4', '5', '6', 'B', '1', '2', '3', '4', '5', '6', 
#                            'C', '1', '2', '3', '4', '5', '6', 'D', '1', '2', '3', '4', '5', '6', 
#                            'E', '1', '2', '3', '4', '5', '6', 'F', '1', '2', '3', '4', '5', '6', 
#                            'G', '1', '2', '3', '4', '5', '6', 'H', '1', '2', '3', '4', '5', '6', 
#                            'I', '1', '2', '3', '4', '5', '6', 'J', '1', '2', '3', '4', '5', '6',
#                            'K', '1', '2', '3', '4', '5', '6', 'L', '1', '2', '3', '4', '5', '6']
# allTangramsFreqs.to_csv("allTangramsFreqs.csv")   

    

# Number of distinct game ids in test1 (Series.unique replaces the deprecated
# pd.unique(...ravel()) spelling).
# NOTE(review): test1 is not assigned in any cell shown here.
len(test1['gameid'].unique())


In [None]:
#use highPMI list to filter words for tangrams over the next rounds
#need list of words and their frequency for each round
#need 6 dictionaries/frequencies merged by the highPMI words

#the dataframe with all tangrams and all rounds
# NOTE(review): test2 is not assigned in any cell shown here -- confirm where
# it comes from before running.
bigFrame = test2.copy()

def createFrequencyTable(tangram, roundNum, frame = None):
    """Count the words used for one tangram in one round.

    tangram  -- tangram label to select
    roundNum -- round to select (compared with == against 'roundNum')
    frame    -- DataFrame with 'tangram', 'roundNum' and 'tokens' columns;
                defaults to the notebook-level bigFrame

    Returns a DataFrame with one row per distinct word and the columns
    'word' and 'count' (no rows when nothing matches).
    """
    if frame is None:
        frame = bigFrame
    #filter for the tangram and round desired
    myDictDf = frame[(frame['tangram'] == tangram) & (frame['roundNum'] == roundNum)]

    #flatten the token lists and count each word
    smallDictionary = Counter(itertools.chain.from_iterable(myDictDf['tokens']))

    #turn the counts into a dataframe; list(...) because older pandas
    #releases reject dict views here
    return pd.DataFrame(list(smallDictionary.items()), columns=['word', 'count'])
    

# Example: word counts for tangram 'A' in round 3
createFrequencyTable('A', 3)

# Tally how often each word occurs for every tangram and round, keyed by the
# string "tangram,roundNum,word".
# NOTE(review): bigWordList, tangrams, roundNums and countOccurences are not
# defined in any cell shown here and must exist before this runs. This also
# rebinds the notebook-level name `d`, replacing the main DataFrame.
from collections import defaultdict

d = defaultdict(int)
for word in bigWordList :
    for tangram in tangrams :
        for roundNum in roundNums :
            key = tangram + ',' + str(roundNum) + ',' + word
            d[key] += countOccurences(word, tangram, roundNum)


 






#Type and token probabilities over rounds

In [None]:
# Column sums of test2 grouped by tangram
tanDiffs = test2.copy().groupby(['tangram']).sum()

# numWords totals for the tangrams labelled 'A' through 'K'
tans = tanDiffs.loc['A':'K', 'numWords':'numWords']
tans2 = tans['numWords'].tolist()

# Bar-chart inputs: heights, labels and bar width
y = tans2
x = list('ABCDEFGHIJK')
width = 1/1.5
#plt.bar(x, y, width, color="blue")

#fig = plt.gcf()
#plot_url = py.plot_mpl(fig, filename='mpl-basic-bar')



#NN vs. VB over each round for a tangram

#Testing word count function

In [None]:
# NOTE(review): absolute path under /Users/nickimaslan; adjust it to wherever
# merged.csv lives on the machine running this notebook.
merged = pd.read_csv('/Users/nickimaslan/Desktop/convention_project/tangrams_replication/data/annotated_names/merged.csv')
# NOTE(review): drop() returns a new frame and the result is not assigned, so
# `merged` still has its 'gameid' and 'time' columns after this line.
merged.drop(['gameid','time'], axis=1)
from itertools import chain
from nltk.corpus import stopwords
# Rebinds `stopwords` from the imported corpus object to the English word list
stopwords = stopwords.words('english')

def tokenize(listOfStrings):
    """Tokenize every string and return all tokens as one flat list.

    Tokens appear in the order of the input strings.
    """
    perString = [nltk.word_tokenize(wordStr) for wordStr in listOfStrings]
    return list(itertools.chain.from_iterable(perString))

def tangram(merged, tangram, roundNum ):   ##select tangram and round you want to look at
    """Return the rows of `merged` for one tangram and one round.

    merged   -- DataFrame with 'tangram' and 'roundNum' columns
    tangram  -- tangram label to keep
    roundNum -- round to keep; compared with == against the column, so it
                must use the column's type (this notebook passes strings
                such as '1')

    Returns the matching rows with every column intact. No columns are
    dropped here, so frames without 'gameid'/'time' columns are accepted.
    """
    keep = (merged['tangram'] == tangram) & (merged['roundNum'] == roundNum)
    return merged[keep]

def tokTan(dataframe):       ##will tokenize the contents grouped by roundNum
    """Return, for each roundNum group, the flat token list of its 'contents'."""
    byRound = dataframe.groupby(['roundNum'])
    return byRound['contents'].aggregate(tokenize)

def cleanup(listofStrings, stopWords = None):  ##only will do one list at a time
    """Lowercase a token list and strip out uninformative tokens.

    listofStrings -- iterable of tokens (one list at a time)
    stopWords     -- collection of lowercase stop words; defaults to the
                     notebook-level `stopwords` list

    Drops tokens that are not purely alphabetic, have fewer than three
    characters, are stop words, or are one of the filler words below.
    Returns the surviving tokens, lowercased, in their original order.
    """
    if stopWords is None:
        stopWords = stopwords
    # Every filler word is its own entry; adjacent string literals without a
    # comma would be fused into a single word that never matches.
    fillers = {'the', 'like', 'either', 'ready', 'yeah', 'really', 'ok',
               'looks', 'okay', 'one', 'got', 'go'}
    blocked = fillers.union(stopWords)
    # Lowercase before comparing so capitalised stop words ("The") are caught
    lowered = [w.lower() for w in listofStrings]
    return [w for w in lowered
            if w.isalpha() and len(w) > 2 and w not in blocked]

##example with tangram C on round1
tangramCr1 = tangram(merged, 'C', '1')
tokCr1 = tokTan(tangramCr1)
# .iloc selects by position; plain [0] is a label lookup that only reaches the
# first group through pandas' deprecated positional fallback
cleanCr1 = cleanup(tokCr1.iloc[0])
cleanCr1

##plot the frequency distribution

wordsCounted = merged.groupby(['roundNum'])['contents'].aggregate(tokenize)    #list of words in each round
cleaned1 = cleanup(wordsCounted.iloc[0])  # cleaned up list of words in the first round group
cleaned2 = cleanup(wordsCounted.iloc[1])
cleaned3 = cleanup(wordsCounted.iloc[2])
cleaned4 = cleanup(wordsCounted.iloc[3])
cleaned5 = cleanup(wordsCounted.iloc[4])
cleaned6 = cleanup(wordsCounted.iloc[5])

# cleanWords holds the cleaned words of the sixth group and c the number of
# groups processed; both names are kept for any later cell that reads them.
cleanWords = cleaned6
c = 6

#cleanWords
#cleaned2

#wordsCounted
#fdist1 = nltk.FreqDist(cleaned)
#fdist1.plot(30, cumulative=False)





#  Tangram C vs G graph

In [None]:
# NOTE(review): absolute path under /Users/nickimaslan; adjust it to wherever
# merged.csv lives on the machine running this notebook.
merged = pd.read_csv('/Users/nickimaslan/Desktop/convention_project/tangrams_replication/data/annotated_names/merged.csv')
from itertools import chain
from nltk.corpus import stopwords
# Rebinds `stopwords` from the imported corpus object to the English word list
stopwords = stopwords.words('english')

def countWords(listOfStrings):
    """Return the total number of tokens across all strings.

    Each string is tokenized with nltk.word_tokenize and every token it
    returns is counted; nothing is filtered out.
    NOTE(review): if the count was meant to exclude stop words and filler
    words, count len(cleanup(tokens)) instead -- confirm the intent.
    """
    return sum(len(nltk.word_tokenize(wordStr)) for wordStr in listOfStrings)

def cleanup(aStr, stopWords = None) :
    """Lowercase a token list and strip out uninformative tokens.

    aStr      -- iterable of tokens
    stopWords -- collection of lowercase stop words; defaults to the
                 notebook-level `stopwords` list

    Drops tokens that are not purely alphabetic, have fewer than three
    characters, are stop words, or are one of the filler words below.
    Returns the surviving tokens, lowercased, in their original order.
    """
    if stopWords is None:
        stopWords = stopwords
    # Every filler word is its own entry; adjacent string literals without a
    # comma would be fused into a single word that never matches.
    fillers = {'like', 'either', 'ready', 'yeah', 'really', 'ok',
               'looks', 'okay', 'one', 'got', 'go'}
    blocked = fillers.union(stopWords)
    # Lowercase before comparing so capitalised stop words ("The") are caught
    lowered = [w.lower() for w in aStr]
    return [w for w in lowered
            if w.isalpha() and len(w) > 2 and w not in blocked]

# Rows for the two tangrams being compared
tanC = merged.loc[merged['tangram'] == 'C']
tanG = merged.loc[merged['tangram'] == 'G']

def getMean(dataframe, numGames = 24):
    """Return the average number of words per game for each of rounds 1-6.

    dataframe -- DataFrame with 'roundNum' (compared against the strings
                 '1'..'6') and 'contents' columns
    numGames  -- number of games each round's word total is divided by

    Returns a list of six values, one per round, in round order.
    """
    meanList = []
    for roundNumber in range(1, 7) :
        roundRows = dataframe[dataframe['roundNum'] == str(roundNumber)]
        wordsCounted = roundRows.groupby(['roundNum'])['contents'].aggregate(countWords)
        # float divisor: an int divisor would floor the average under Python 2
        wordsPerRound = wordsCounted / float(numGames)
        meanList.append(wordsPerRound.mean())
    return meanList

# Per-round average word counts for tangram C, then for tangram G
print(getMean(tanC))
print(getMean(tanG))
# print(tanC)
# print(tanG)



def getStDev(dataframe):
    """Return, for each of rounds 1-6, the standard deviation of per-game word counts.

    dataframe -- DataFrame with 'roundNum' (compared against the strings
                 '1'..'6'), 'gameid' and 'contents' columns

    Returns a list of six values, one per round, in round order.
    """
    stdList = []
    for roundNumber in range(1, 7) :
        roundRows = dataframe[dataframe['roundNum'] == str(roundNumber)]
        # one word total per (round, game) pair
        perGame = roundRows.groupby(['roundNum','gameid'])['contents'].aggregate(countWords)
        # dividing by 1 leaves every count's value unchanged
        scaled = perGame.apply(lambda count: count/(1))
        stdList.append(scaled.std())
    return stdList



##df for tangram C
tanC = merged[merged['tangram'] == 'C']
wordsCountedC = tanC.groupby(['roundNum','gameid'])['contents'].aggregate(countWords)
meanC = getMean(tanC)
stErrorC = getStDev(tanC)/np.sqrt(24)
print(meanC)

%matplotlib inline


# ##df for tangram G
tanG = merged[merged['tangram'] == 'G']
wordsCountedG = tanG.groupby(['roundNum', 'gameid'])['contents'].aggregate(countWords)
meanG = getMean(tanG)
stErrorG = getStDev(tanG)/np.sqrt(24)
stErrorG
print(meanG)

##plot it
##data to be plotted
# wordsPerRound = wordsCounted.apply(lambda x: x/(12*24))
rounds = [1,2,3,4,5,6]

##error data

fig = plt.figure(figsize=(10, 10), dpi=100)

##plotting
# fig = plt.figure(figsize=(6, 6))
# axes = fig.add_subplot(111)
plt.plot(rounds, meanG, color='red')
plt.plot(rounds, meanC, color='blue')
# 
##plot error bars
plt.errorbar(rounds,meanG,yerr=stErrorG, linestyle="None", color="red")
plt.errorbar(rounds,meanC,yerr=stErrorC, linestyle="None", color="blue")




#configure x and y axes
plt.ylim([0,40])
plt.xlim([0,7])
plt.title('Tangram C vs. G words per round', size=15)
plt.grid(True)
plt.xlabel('Trials', size=14)
plt.ylabel('Average amount of words players used', size=14)

##save plot
plt.savefig("/Users/nickimaslan/Desktop/GvsCTangrams.png", bbox_inches='tight')

##show plot
plt.show()


# Avg Director word count for each tangram


In [None]:
# NOTE(review): absolute path under /Users/nickimaslan; adjust it to wherever
# merged.csv lives on the machine running this notebook.
merged = pd.read_csv('/Users/nickimaslan/Desktop/convention_project/tangrams_replication/data/annotated_names/merged.csv')
from itertools import chain
def countWords(listOfStrings):
    """Return the total number of tokens nltk.word_tokenize finds across all strings."""
    return sum(len(nltk.word_tokenize(wordStr)) for wordStr in listOfStrings)
        
# merged.groupby(['tangram'])['roundNum'].apply(plus1)
# Keep only the director's messages, and drop the rows whose tangram label is
# '0', '10' or ':'
isDirector = merged['sender'] == 'director'
badLabel = merged['tangram'].isin(['0', '10', ':'])
merged = merged[isDirector & ~badLabel]

def getStDev(dataframe):
    """Return, for each of rounds 1-6, the standard deviation across games of words per figure.

    dataframe -- DataFrame with 'roundNum' (compared against the strings
                 '1'..'6'), 'gameid' and 'contents' columns

    Each game's word total for a round is divided by 12 (presumably the
    number of tangrams per round -- confirm) before the standard deviation
    is taken. Returns a list of six values, one per round, in round order.
    """
    stdList = []
    for roundNumber in range(1, 7) :
        # Filter the frame that was passed in, not the notebook-level `merged`
        roundRows = dataframe[dataframe['roundNum'] == str(roundNumber)]
        perGame = roundRows.groupby(['roundNum','gameid'])['contents'].aggregate(countWords)
        # float divisor: an int divisor would floor the result under Python 2
        wordsPerFigure = perGame / 12.0
        stdList.append(wordsPerFigure.std())
    return stdList

error = np.sqrt(24)

%matplotlib inline

wordsCounted = merged.groupby(['roundNum'])['contents'].aggregate(countWords)

##data to be plotted
wordsPerRound = wordsCounted.apply(lambda x: x/(12*24))
rounds = [1,2,3,4,5,6]

##error data
stdErrorList = getStDev(merged)/error

fig = plt.figure(figsize=(10, 10), dpi=100)

##plotting
# fig = plt.figure(figsize=(6, 6))
# axes = fig.add_subplot(111)
plt.plot(rounds, wordsPerRound)

##plot error bars
plt.errorbar(rounds,wordsPerRound,yerr=stdList, linestyle="None", color="green")

#configure x and y axes
plt.ylim([0,20])
plt.xlim([0,7])
plt.title('Avg director word count for each tangram', size=15)
plt.grid(True)
plt.xlabel('trials', size=14)
plt.ylabel('mean number words (by director) per figure', size=14)

##save plot
plt.savefig("/Users/nickimaslan/Desktop/avgWordsTallFont.png", bbox_inches='tight')

##show plot
plt.show()


In [None]:
# NOTE(review): absolute path under /Users/nickimaslan; adjust it to wherever
# merged.csv lives on the machine running this notebook.
merged = pd.read_csv('/Users/nickimaslan/Desktop/convention_project/tangrams_replication/data/annotated_names/merged.csv')
# merged.ix[0:, ['contents', 'tangram']]
# Rows with a tangram label other than '0' (not used again in this cell)
tangramMerge = merged[merged['tangram'] != '0']
# NOTE(review): this rebinds `tangram`, hiding the tangram() helper function
# defined in an earlier cell.
tangram = merged[merged['tangram'] == 'G']
tangramRound = tangram[tangram['roundNum'] == '6']
# Messages about tangram G in round 6 (.loc replaces .ix, which no longer
# exists in pandas)
contents = tangramRound.loc[:, ['contents']]
contents

