In [258]:
import pandas as pd
import numpy as np
import itertools
from collections import Counter, defaultdict
from scipy.stats import pearsonr as pearsonr
from scipy.stats import skew as skew
import xml.etree.ElementTree as ET
import math


In [259]:
def y_Ao(annotations):
    """Return the observed pairwise agreement among ``annotations``.

    Every unordered pair of annotations is compared with ``==`` and the
    fraction of matching pairs is returned as a float.

    Raises:
        ZeroDivisionError: if fewer than two annotations are supplied, since
            there is then no pair to compare.
    """
    pairs = list(itertools.combinations(annotations, 2))
    agreeing = sum(int(left == right) for left, right in pairs)
    return float(agreeing) / len(pairs)


def Datasettime(instancedict):
    """Flatten the ``times`` of every instance into a single list.

    ``instancedict`` maps keys to objects that expose a ``times`` iterable.
    The result concatenates those iterables in the mapping's iteration order.
    """
    return [elapsed for key in instancedict for elapsed in instancedict[key].times]
    

class AnnotatedInstance:
    """One annotated item: its sentence, head word, labels and timings.

    ``labels`` and ``times`` start empty and are appended to by the code that
    loads the annotation files.
    """

    def __init__(self):
        self.sentence = ''
        self.headword = ''
        # __str__ reads this attribute; without a default here str(instance)
        # raised AttributeError because nothing else ever assigns it.
        self.rightcontext = ''
        self.labels = []
        self.times = []

    def avg_time(self,threshold=999999999999):
        """Return the arithmetic mean of ``self.times`` (via ``np.mean``).

        NOTE(review): ``threshold`` is accepted for interface compatibility
        but is not used by the computation -- confirm whether times were
        meant to be capped or filtered by it.
        """
        return np.mean(self.times)

    def length(self):
        """Return the number of whitespace-separated tokens in the sentence."""
        return len(self.sentence.split())

    def Ao(self):
        """Return ``y_Ao`` applied to this instance's labels."""
        return y_Ao(self.labels)

    def __str__(self):
        """Join sentence, headword, right context, labels and times with '-'."""
        parts = [self.sentence, self.headword, self.rightcontext]
        parts.extend(self.labels)
        parts.extend(self.times)
        # str() on every part so a missing (None) label or sentence cannot
        # make the join raise TypeError.
        return '-'.join(str(part) for part in parts)
    

def ReportDataset(instancedict):
    """Correlate per-instance mean annotation time with agreement and length.

    Args:
        instancedict: mapping from key to an object that provides ``times``,
            ``Ao()``, ``avg_time()`` and ``length()``.

    Returns:
        A 2-tuple ``(pearsonr(agreement, mean_time),
        pearsonr(length, mean_time))``; each element is whatever
        ``scipy.stats.pearsonr`` returns.

    Side effects:
        Prints the number of pooled times left after outlier filtering
        followed by the sizes of the three per-instance lists, then the first
        50 per-instance mean times and the first 50 sentence lengths.
    """
    alltimes = Datasettime(instancedict)
    # Right-hand outliers filtered: keep only pooled times strictly below the
    # 95th percentile. Only the size of this pool is reported; the
    # per-instance means below are computed from the unfiltered times.
    percentile95 = np.percentile(alltimes, 95)
    alltimes = [x for x in alltimes if x < percentile95]

    items = list(instancedict.values())
    Ao_indiv_list = [item.Ao() for item in items]
    time_indiv_list = [item.avg_time() for item in items]
    lengths = [item.length() for item in items]

    print(len(alltimes),len(time_indiv_list),len(lengths),len(Ao_indiv_list))
    print(time_indiv_list[:50])
    print(lengths[:50])
    return (pearsonr(Ao_indiv_list,time_indiv_list),pearsonr(lengths,time_indiv_list))

    

In [260]:
# Instance ID -> AnnotatedInstance. defaultdict builds a fresh, empty
# AnnotatedInstance the first time an ID is looked up.
instances = defaultdict(AnnotatedInstance)

In [261]:
# Read the five annotation sets and collect, per instance ID, the time spent
# and the sense label of each annotation.
for idx in '1 2 3 4 5'.split():
    tree = ET.parse('/Users/hmartine/data/TakeLab-Cro36WSD/annotation_data/annotation_set_'+idx+'.xml')
    root = tree.getroot()
    print(root)
    for instance in root:
        ID =  instance.findall(".//ID")[0].text

        try:
            # Parse every field before touching `instances`, so a failure
            # never leaves a half-filled record behind.
            timespent = float(instance.findall(".//TIMESPENT")[0].text)
            if math.isnan(timespent):
                print('nan',idx,ID)
            sentence =  instance.findall(".//SENTENCE")[0].text
            senseid = instance.findall(".//SENSEID")[0].text
        except (IndexError, TypeError, ValueError):
            # IndexError: an expected element is missing; TypeError: TIMESPENT
            # has no text (float(None)); ValueError: TIMESPENT is not numeric.
            # If there is no annotation it has been discarded, which is also
            # an interesting task. Maybe store under discarded.
            continue

        instances[ID].times.append(timespent)
        instances[ID].sentence = sentence
        instances[ID].labels.append(senseid)


# Keep only the instances that received at least two labels.
keptinstances = defaultdict(AnnotatedInstance)
for k in instances.keys():
    if len(instances[k].labels) >= 2:
        keptinstances[k] = instances[k]

            
    

<Element 'SET' at 0x115b13228>
<Element 'SET' at 0x1179061d8>
<Element 'SET' at 0x116e6d818>
<Element 'SET' at 0x1149a62c8>
<Element 'SET' at 0x114ce0048>


In [262]:
# Report on the instances with at least two labels; the printed tuple holds
# the two pearsonr results returned by ReportDataset.
print(ReportDataset(keptinstances))

19342 10180 10180 10180
[11692.220499999999, 4446.0078000000003, 1996.8035, 5701.8100000000004, 5779.8101500000002, 5070.0089500000004, 6396.0111999999999, 5015.4088000000002, 4032.6071000000002, 4797.0084000000006, 8151.0143000000007, 12285.0216, 13548.623800000001, 3588.0062500000004, 3205.8056500000002, 8439.6147500000006, 16855.829600000001, 9718.8171000000002, 5779.8101500000002, 5592.6098000000002, 7254.0127000000011, 107023.98795000001, 3057.6053499999998, 9609.6168500000003, 3416.4059999999999, 14757.625950000001, 8626.8151500000004, 7862.4138000000003, 3385.20595, 10319.418150000001, 4664.4081500000002, 5545.8096999999998, 6294.6110500000004, 6942.012200000001, 10974.6193, 6505.2114000000001, 3580.2062500000002, 17721.631100000002, 6091.8107, 4110.6072500000009, 6981.0122499999998, 2386.8042000000005, 6021.6105500000003, 16364.428749999999, 1747.2030500000001, 5709.6100000000006, 3907.8069, 3198.0056000000004, 1419.6025500000001, 3439.8060500000001]
[69, 21, 63, 30, 10, 18, 71

**Ignore the previous stuff; we are going to try a classifier.**

In [263]:
# Parse the training split for "aktivan-a" and walk its instances.
train = ET.parse('/Users/hmartine/data/TakeLab-Cro36WSD/datasets/Cro36WSD-M/train/aktivan-a.xml')
# Take the root of the file parsed on the line above; `tree` would be the
# leftover object from the annotation-set loop.
root = train.getroot()
print(root)
for instance in root:
    ID =  instance.findall(".//ID")[0].text

    try:
        # Parse every field before touching `instances`, so a failure never
        # leaves a half-filled record behind.
        timespent = float(instance.findall(".//TIMESPENT")[0].text)
        if math.isnan(timespent):
            # Tagged 'train' rather than the loop variable `idx`, which is
            # only defined by the annotation-set loop.
            print('nan','train',ID)
        sentence =  instance.findall(".//SENTENCE")[0].text
        senseid = instance.findall(".//SENSEID")[0].text
    except (IndexError, TypeError, ValueError):
        # IndexError: an expected element is missing; TypeError: TIMESPENT has
        # no text (float(None)); ValueError: TIMESPENT is not numeric.
        # Maybe store under discarded.
        continue

    # NOTE(review): this still writes into the shared `instances` dict used
    # for the annotation sets -- confirm whether the classifier data should
    # live in its own container.
    instances[ID].times.append(timespent)
    instances[ID].sentence = sentence
    instances[ID].labels.append(senseid)

test = ET.parse('/Users/hmartine/data/TakeLab-Cro36WSD/datasets/Cro36WSD-M/test/aktivan-a.xml')
