In [196]:
import pandas as pd
import numpy as np
import itertools
from collections import Counter
from scipy.stats import pearsonr as pearsonr
from scipy.stats import skew as skew
import math
from nltk.tokenize import regexp_tokenize, wordpunct_tokenize
from scipy.stats.mstats import kruskalwallis as kruskalwallis
from nltk.corpus import stopwords


In [197]:
def y_Ao(annotations):
    """Return the observed pairwise agreement among ``annotations``.

    Every unordered pair of annotations is compared with ``==`` and the
    fraction of agreeing pairs is returned as a float.

    Raises:
        ZeroDivisionError: if fewer than two annotations are given, because
            there is no pair to compare.
    """
    pair_count = 0
    agreeing = 0.0
    for first, second in itertools.combinations(annotations, 2):
        pair_count += 1
        agreeing += int(first == second)
    return agreeing / pair_count


def cleanstring(s):
    """Normalise a raw table cell into a clean string.

    Args:
        s: the raw cell value (string, number, NaN or None).

    Returns:
        ``""`` when ``s`` is a float NaN or any falsy value (``None``, the
        empty string, ``0``); otherwise ``str(s)`` with surrounding
        whitespace stripped and every ``"&#44"`` replaced by ``", "``.
    """
    # isinstance (not an exact type() comparison) so that float subclasses
    # such as numpy.float64 are recognised as NaN too; with the exact check
    # such a NaN slipped through and was returned as the string "nan".
    if isinstance(s, float) and math.isnan(s):
        return ""
    if not s:
        return ""
    # Strip first, then substitute: a trailing "&#44" therefore still yields
    # a trailing ", ", as before.
    return str(s).strip().replace("&#44", ", ")

class AnnotatedInstance:
    """One annotated item: a headword with its context and its annotations.

    Instances are created empty; the attributes are filled in afterwards by
    the code that reads the annotation tables.
    """

    # Raw annotation label -> normalised label (LIT / MET / DOT).
    # NOTE(review): 'Categpory2' looks like a misspelling of 'Category2'; it
    # is kept because it may mirror a label present in the raw data - confirm.
    _NORMALISED_LABEL = {
        'ANIM': 'LIT',
        'Category1': 'LIT',
        'MEAT': 'MET',
        'Categpory2': 'MET',
        'Category2': 'MET',
        'DOT': 'DOT',
        'Category4': 'DOT',
        'Category3': 'DOT',
    }

    def __init__(self):
        self.leftcontext = ''
        self.rightcontext = ''
        self.headword = ''
        self.labels = []      # raw labels, one per annotation
        self.times = []       # working times, one per annotation
        # Initialised here so the attribute always exists; previously it only
        # appeared once CreateInstanceDict assigned it.
        self.lemma = ''
        self.lemmafreq = 0

    def _context_tokens(self):
        """Whitespace-split tokens of left and right context joined by a space."""
        return (self.leftcontext + " " + self.rightcontext).split()

    def length(self):
        """Return the number of whitespace-separated tokens in the context."""
        return len(self._context_tokens())

    def contentlength(self):
        """Return the number of lower-cased context tokens that are not English stopwords."""
        tokens = (self.leftcontext + " " + self.rightcontext).lower().split()
        if not tokens:
            # Nothing to filter, so the stopword corpus need not be loaded.
            return 0
        # Load the stopword list once per call (it used to be reloaded for
        # every token) and use a set for constant-time membership tests.
        english_stopwords = set(stopwords.words('english'))
        return sum(1 for token in tokens if token not in english_stopwords)

    def Ao(self):
        """Return the observed pairwise agreement of this instance's labels."""
        return y_Ao(self.labels)

    def avg_time(self, threshold=100):
        """Return the times strictly below ``threshold``.

        Despite the name this returns the filtered list, not its mean; the
        caller is expected to average it.
        """
        return [x for x in self.times if x < threshold]

    def max_time(self, threshold=100):
        """Return the largest time strictly below ``threshold``.

        Raises:
            ValueError: if no time is below ``threshold``.
        """
        return max([x for x in self.times if x < threshold])

    def _normlabels(self):
        """Return the labels mapped to LIT / MET / DOT.

        Raises:
            KeyError: if a label is not in the mapping.
        """
        return [self._NORMALISED_LABEL[x] for x in self.labels]

    def label(self):
        """Return the majority normalised label: 'LIT', 'MET' or 'DOT'.

        LIT (resp. MET) wins only when it strictly outnumbers the other of
        the two and DOT does not outnumber it; every other case is 'DOT'.
        """
        normlabels = self._normlabels()
        lit = normlabels.count('LIT')
        met = normlabels.count('MET')
        dot = normlabels.count('DOT')
        if lit > met and dot <= lit:
            return 'LIT'
        if met > lit and dot <= met:
            return 'MET'
        return 'DOT'

    def __str__(self):
        """Join context, headword, labels and times with '-'."""
        parts = [self.leftcontext, self.headword, self.rightcontext]
        parts.extend(self.labels)
        parts.extend(self.times)
        # str() on every part: labels read from a table may be non-strings
        # (e.g. NaN), which made the plain join raise TypeError.
        return '-'.join(str(part) for part in parts)
    
    
def CreateInstanceDict(dataframe,freqdict):
    """Build one AnnotatedInstance per distinct ``Input_globalindex`` value.

    Args:
        dataframe: annotation table with the columns ``Input_globalindex``,
            ``WorkTimeInSeconds``, ``Answer_Category``, ``Input_lleftcontext``,
            ``Input_lrightcontext``, ``Input_lheadword`` and ``Input_lemma``.
        freqdict: frequency table with the columns ``Lemma`` and ``Freq``.

    Returns:
        dict mapping each ``Input_globalindex`` value to its AnnotatedInstance.
        ``times`` and ``labels`` hold one entry per row of that id; context,
        headword and lemma are taken from the first such row; ``lemmafreq``
        is the natural log of the summed ``Freq`` of the lower-cased lemma.

    Raises:
        ValueError: if the summed frequency of a lemma is not positive
            (for example when the lemma is missing from ``freqdict``).
    """
    instancedict = {}
    logfreq_by_lemma = {}  # each lemma is looked up in freqdict only once
    for hitid in set(dataframe.Input_globalindex):
        # Select the rows of this id once instead of once per column.
        rows = dataframe[dataframe.Input_globalindex==hitid]
        inst = AnnotatedInstance()
        inst.times = list(rows.WorkTimeInSeconds)
        inst.labels = list(rows.Answer_Category)
        inst.leftcontext = cleanstring(list(rows.Input_lleftcontext)[0])
        inst.rightcontext = cleanstring(list(rows.Input_lrightcontext)[0])
        inst.headword = cleanstring(list(rows.Input_lheadword)[0])
        inst.lemma = cleanstring(list(rows.Input_lemma)[0]).lower()
        if inst.lemma not in logfreq_by_lemma:
            total = sum(list(freqdict[freqdict.Lemma==inst.lemma].Freq))
            if total <= 0:
                # math.log would raise a bare "math domain error" here; keep
                # the exception type but say which lemma is at fault.
                raise ValueError(
                    "lemma {!r} has no positive frequency in freqdict "
                    "(sum of Freq = {!r})".format(inst.lemma, total))
            logfreq_by_lemma[inst.lemma] = math.log(total)
        inst.lemmafreq = logfreq_by_lemma[inst.lemma]
        instancedict[hitid]=inst
    return instancedict
    

def Datasettime(instancedict):
    """Return the ``times`` of every instance in ``instancedict`` as one flat list.

    Instances are visited in the mapping's own iteration order.
    """
    return [duration
            for key in instancedict
            for duration in instancedict[key].times]
    
def ReportDataset(instancedict):
    """Correlate agreement, annotation time and context length over a dataset.

    Times at or above the 95th percentile of all times in the dataset are
    treated as right-hand outliers and ignored. Each instance contributes its
    agreement (``Ao()``), the mean of its remaining times, its context length
    and its content length. An instance with no remaining time is skipped:
    its mean would be NaN and would turn every correlation into NaN.

    Args:
        instancedict: mapping from instance id to AnnotatedInstance.

    Returns:
        Tuple of four ``pearsonr`` results, in this order:
        (agreement vs. mean time, length vs. mean time,
        agreement vs. content length, mean time vs. content length).
    """
    percentile95 = np.percentile(Datasettime(instancedict), 95)

    Ao_indiv_list = []
    time_indiv_list = []
    lengths = []
    contentlengths = []
    # One pass in sorted-key order keeps the four lists aligned.
    for key in sorted(instancedict.keys()):
        inst = instancedict[key]
        kept_times = inst.avg_time(percentile95)  # times strictly below the cutoff
        if not kept_times:
            continue
        Ao_indiv_list.append(inst.Ao())
        time_indiv_list.append(np.mean(kept_times))
        lengths.append(inst.length())
        contentlengths.append(inst.contentlength())

    return (pearsonr(Ao_indiv_list,time_indiv_list),
            pearsonr(lengths,time_indiv_list),
            pearsonr(Ao_indiv_list,contentlengths),
            pearsonr(time_indiv_list,contentlengths))



def getLiteral(inputlist):
    """Return a 0/1 list marking which labels in ``inputlist`` are literal.

    A label counts as literal when it is 'Category1' or 'ANIM'.
    """
    literal_labels = ('Category1', 'ANIM')
    flags = []
    for label in inputlist:
        flags.append(1 if label in literal_labels else 0)
    return flags






In [198]:
# Annotation tables: the first file is comma-separated, the other four are
# tab-separated and are read in the order listed.
animeat = pd.read_csv('data/raw/animeatbatchresults.csv')
contcont, artinfo, procres, locorg = [
    pd.read_csv(tab_path, sep='\t')
    for tab_path in (
        'data/raw/container_5turks.tab.csv',
        'data/raw/artinfo_mturk_500.tab.csv',
        'data/raw/procres_mturk.tab.csv',
        'data/raw/locorg_5turks.tab.csv',
    )
]
# Tab-separated lemma frequency table used to look up lemma frequencies.
freqdict = pd.read_csv('data/ANC-all-lemma.txt', sep='\t')



In [199]:
# Pair every loaded annotation table with its dataset name and build the
# per-dataset instance dictionaries.
datataframes = [animeat, artinfo, contcont, locorg, procres]
names = ['animeat', 'artinfo', 'contcont', 'locorg','procres']
instancedicts = {}
for name, dataframe in zip(names, datataframes):
    instancedicts[name] = CreateInstanceDict(dataframe, freqdict)

# One report line per dataset, in alphabetical order of dataset name.
for k in sorted(instancedicts):
    print(k, '&', ReportDataset(instancedicts[k]))


animeat & ((-0.37962472276825493, 6.2812478301620924e-19), (0.30349345562040358, 2.507678312321528e-12), (-0.090502627882570269, 0.041050833789383491), (0.31124205750215744, 6.4336911842716168e-13))
artinfo & ((-0.077477910735849531, 0.080458386793397715), (0.39617819840821344, 1.2949504682619278e-20), (-0.068731545884461051, 0.12109308343671212), (0.3707587474279373, 4.5938874944703922e-18))
contcont & ((-0.074817980159857772, 0.091121674199407587), (0.052722076290081819, 0.23416128805212155), (-0.032213689079616983, 0.46746755368196491), (0.066052680406299399, 0.13593172510299137))
locorg & ((-0.060481137096010759, 0.17264513106924814), (0.063840031407057574, 0.14996867701964953), (-0.024121064635798068, 0.58680432688642281), (0.058807605728609214, 0.18485568626889542))
procres & ((-0.07320787067266385, 0.09865156203155169), (0.10057235289900357, 0.023120827193312352), (0.061421324872990436, 0.1660545633348231), (0.11471706578419408, 0.009517481007490617))


In [200]:
"From here we can see that all datasets have a different ... AND very different mean annotation time."


'From here we can see that all datasets have a different ... AND very different mean annotation time.'

In [201]:

def TimeStochasticDominanceReport(instancedict):
    """Kruskal-Wallis p-values comparing ``length()`` across majority labels.

    Instances are grouped by ``label()`` (LIT / MET / DOT) in sorted-key
    order, and the ``length()`` values of the groups are compared.

    NOTE(review): despite the function name, the quantity compared is each
    instance's ``length()``, not its annotation time - confirm this is
    intended.

    Returns:
        Tuple of four p-values, in this order: (DOT vs. LIT vs. MET,
        DOT vs. LIT, DOT vs. MET, LIT vs. MET).
    """
    lengths_by_label = {'LIT': [], 'MET': [], 'DOT': []}
    for key in sorted(instancedict.keys()):
        inst = instancedict[key]
        # setdefault: a label outside the three groups is collected but never
        # used, so it is ignored exactly as before.
        lengths_by_label.setdefault(inst.label(), []).append(inst.length())

    lit_lengths = lengths_by_label['LIT']
    met_lengths = lengths_by_label['MET']
    dot_lengths = lengths_by_label['DOT']

    p_all = kruskalwallis(dot_lengths, lit_lengths, met_lengths)[1]
    p_dot_lit = kruskalwallis(dot_lengths, lit_lengths)[1]
    p_dot_met = kruskalwallis(dot_lengths, met_lengths)[1]
    p_lit_met = kruskalwallis(lit_lengths, met_lengths)[1]
    return (p_all, p_dot_lit, p_dot_met, p_lit_met)
    


In [202]:
# Print the four Kruskal-Wallis p-values of every dataset, in alphabetical
# order of dataset name.
for k in sorted(instancedicts):
    print(k, TimeStochasticDominanceReport(instancedicts[k]))


animeat (0.094369166647093439, 0.37558382851465788, 0.1855649967594365, 0.054435061456270514)
artinfo (0.035283678067990855, 0.2834837963286807, 0.58772734602222521, 0.0093734488583511709)
contcont (0.10580686686326722, 0.2815205483439136, 0.079521362729517875, 0.091201095701691798)
locorg (0.98199869027376896, 0.89127426077502103, 0.86604148780064705, 0.90698976456014957)
procres (0.70951161264087814, 0.79879402487028772, 0.55721729259553077, 0.49240651278298209)
