In [109]:
import pandas as pd
import numpy as np
import itertools
from collections import Counter
from scipy.stats import pearsonr as pearsonr
from scipy.stats import skew as skew



In [115]:
def y_Ao(annotations):
    total = 0.0
    pairings = [x for x in itertools.combinations(annotations,2)]
    for a1,a2 in pairings:
        total+=int(a1==a2)
    return total / len(pairings)


class AnnotatedInstance:
    def __init__(self):
        self.leftcontext = ''
        self.rightcontext = ''
        self.headword = ''
        self.labels = []
        self.times = []
        
    def length(self):
        return len((str(self.leftcontext)+" "+str(self.rightcontext)).split())
    
    def Ao(self):
        return y_Ao(self.labels)
    
    def avg_time(self,threshold=100):
        return [x for x in self.times if x < threshold]
    
    def _normlabels(self):
        d = {}
        d['ANIM']='LIT'
        d['Category1']='LIT'
        
        d['MEAT']='MET'
        d['Categpory2']='MET'
        d['Category2']='MET'
        
        d['DOT']='DOT'
        d['Category4']='DOT'
        d['Category3']='DOT'
        return [d[x] for x in self.labels]
    
    def label(self):
        #LIT, MET, DOT
        normlabels = self._normlabels()
        if normlabels.count('LIT') > normlabels.count('MET') and normlabels.count('DOT') <= normlabels.count('LIT'):
            return 'LIT'
        if normlabels.count('MET') > normlabels.count('LIT') and normlabels.count('DOT') <= normlabels.count('MET'):
            return 'MET'
        else:
            return 'DOT'
        
    def __str__(self):
        return '-'.join([self.leftcontext,self.headword,self.rightcontext]+self.labels+[str(x) for x in self.times])
    
    
def CreateInstanceDict(dataframe):
    instancedict = {}
    for hitid in set(dataframe.HITId):
        inst = AnnotatedInstance()
        inst.times = list(dataframe[dataframe.HITId==hitid].WorkTimeInSeconds)
        inst.labels = list(dataframe[dataframe.HITId==hitid].Answer_Category)
        inst.leftcontext = list(dataframe[dataframe.HITId==hitid].Input_lleftcontext)[0]
        inst.rightcontext = list(dataframe[dataframe.HITId==hitid].Input_lrightcontext)[0]
        inst.headword = list(dataframe[dataframe.HITId==hitid].Input_lheadword)[0]
        instancedict[hitid]=inst
    return instancedict
        

def Datasettime(instancedict):
    t =[]
    for k in instancedict:
        t.extend(instancedict[k].times)
    return t
    
def ReportDataset(instancedict):
    alltimes = Datasettime(instancedict)
    percentile95 =  np.percentile(alltimes, 95) #right-hand outliers filtered, remove anything above the 95th percentile
    alltimes= [x for x in alltimes if x < percentile95]
    
    mean = np.mean(alltimes)
    median = np.median(alltimes)
    std = np.std(alltimes)
    Ao_indiv_list = [instancedict[k].Ao() for k in instancedict.keys()]
    Ao_avg = np.mean(Ao_indiv_list)
    time_indiv_list =  [np.mean(instancedict[k].avg_time(percentile95)) for k in instancedict.keys()]
    lenghts = [instancedict[k].length() for k in instancedict.keys()]
    labels = [instancedict[k].label() for k in instancedict.keys()]
    
    return (mean, median, std,percentile95,Ao_avg,pearsonr(Ao_indiv_list,time_indiv_list),pearsonr(lenghts,time_indiv_list)[0],Counter(labels)['LIT'])
    
def getLiteral(inputlist):
    litlabels = ['Category1', 'ANIM']
    return [int(x in litlabels) for x in inputlist]



In [116]:
animeat = pd.read_csv('data/raw/animeatbatchresults.csv')
contcont = pd.read_csv('data/raw/container_5turks.tab.csv',sep='\t')
artinfo = pd.read_csv('data/raw/artinfo_mturk_500.tab.csv',sep='\t')
procres = pd.read_csv('data/raw/procres_mturk.tab.csv',sep='\t')
locorg = pd.read_csv('data/raw/locorg_5turks.tab.csv',sep='\t')

In [117]:
datataframes = [animeat, artinfo, contcont, locorg, procres]
names = ['animeat', 'artinfo', 'contcont', 'locorg','procres']
instancedicts = {}
for dataframe,name in zip(datataframes,names):
    instancedicts[name] = CreateInstanceDict(dataframe)
    
for k in sorted(instancedicts.keys()):
    print(k,ReportDataset(instancedicts[k]))
    

animeat (7.1381878361605295, 6.0, 3.8495627455207746, 24.0, 0.85705882352941165, (-0.37962472276825499, 6.2812478301620924e-19), 0.30029217888812848, 362)
artinfo (11.786038826931021, 10.0, 9.0720455678843717, 59.0, 0.47784313725490196, (-0.077477910735849531, 0.080458386793397715), 0.39151208220063244, 147)
contcont (30.672577319587628, 8.0, 48.608183208187519, 179.0, 0.65088062622309195, (-0.0748179801598578, 0.091121674199407587), 0.048698401295471434, 361)
locorg (21.514037985136252, 11.0, 25.91720462840906, 125.54999999999973, 0.7290849673202614, (nan, 1.0), nan, 558)
procres (25.189669421487604, 12.0, 27.989652321541982, 99.0, 0.50078431372549026, (-0.07320787067266385, 0.09865156203155169), 0.090982170121922029, 158)




In [113]:
"From here we can see that all datasets have a different ... AND very different mean annotation time."


'From here we can see that all datasets have a different ... AND very different mean annotation time.'

In [114]:
for x in [animeat, artinfo, contcont, locorg,procres]:
    lit = getLiteral()
    print(pearsonr(lit,np.array(x.WorkTimeInSeconds)))

TypeError: getLiteral() missing 1 required positional argument: 'inputlist'