In [126]:
import pandas as pd
import numpy as np
import itertools
from collections import Counter, defaultdict
from scipy.stats import pearsonr as pearsonr
from scipy.stats import skew as skew
import xml.etree.ElementTree as ET
import math


In [127]:
def y_Ao(annotations):
    total = 0.0
    pairings = [x for x in itertools.combinations(annotations,2)]
    for a1,a2 in pairings:
        total+=int(a1==a2)
    return total / len(pairings)


def Datasettime(instancedict):
    t =[]
    for k in instancedict:
        t.extend(instancedict[k].times)
    return t
    

class AnnotatedInstance:
    def __init__(self):
        self.sentence = ''
        self.headword = ''
        self.labels = []
        self.times = []
        self.lemma = ''
        self.pos = ''
        self.lemmafreq = 0
        
    def avg_time(self,threshold=999999999999):
        return np.mean(self.times)
        
    def length(self):
        return len(self.sentence.split())
    
    def Ao(self):
        return y_Ao(self.labels)
    
    def __str__(self):
        return '-'.join([self.sentence,self.headword,self.rightcontext]+self.labels+[str(x) for x in self.times])
    

def ReportDataset(instancedict):
    alltimes = Datasettime(instancedict)
    wordpos=[instancedict[k].lemma+"_"+instancedict[k].pos for k in instancedict.keys()][0]
    percentile95 =  np.percentile(alltimes, 100) #right-hand outliers filtered, remove anything above the 95th percentile
    alltimes= [x for x in alltimes if x < percentile95*4]
    mean = np.mean(alltimes)
    median = np.median(alltimes)
    std = np.std(alltimes)
    Ao_indiv_list = [instancedict[k].Ao() for k in instancedict.keys()]
    Ao_avg = np.mean(Ao_indiv_list)
    time_indiv_list =  [instancedict[k].avg_time() for k in instancedict.keys()]
    lengths = [instancedict[k].length() for k in instancedict.keys()]
    mean_length = np.mean(lengths)
    mean_time = np.mean(time_indiv_list)
    
    corr_len = [mean_length if math.isnan(x)  else x for x in lengths]
    return (wordpos,mean_time,pearsonr(Ao_indiv_list,time_indiv_list)[1],pearsonr(lengths,time_indiv_list)[1],pearsonr(lengths,Ao_indiv_list)[1])

    

In [128]:
instances = defaultdict(AnnotatedInstance)

In [129]:

freqdict =  pd.read_csv('data/hr_unigrams',sep='\t')

for idx in '1 2 3 4 5'.split():
    tree = ET.parse('/Users/hmartine/data/TakeLab-Cro36WSD/annotation_data/annotation_set_'+idx+'.xml')
    root = tree.getroot()
    print(root)
    for instance in root:
        ID =  instance.findall(".//ID")[0].text
        
        try:
        
            for timespent in instance.findall(".//TIMESPENT"):
                timeval = float(timespent.text)
                instances[ID].times.append(timeval)

            # = 0.5* (float(instance.findall(".//TIMESPENT")[0].text)+float(instance.findall(".//TIMESPENT")[1].text))
            sentence =  instance.findall(".//SENTENCE")[0].text
            senseid = instance.findall(".//SENSEID")[0].text
            
            instances[ID].sentence = sentence


            #print(sentence,[x.text for x in instance.findall(".//TIMESPENT")])
            instances[ID].sentence = sentence
            instances[ID].headword = instance.findall(".//TARGETWORD")[0].text
            instances[ID].lemma = instance.findall(".//TARGETLEMMA")[0].text
            instances[ID].pos = instance.findall(".//TARGETPOS")[0].text
            instances[ID].labels.append(senseid) #If there is no annotation it has been discarded, which is also an interesting task
            instances[ID].lemmafreq = math.log(sum(list(freqdict[freqdict.Lemma==instances[ID].lemma].Freq)))
        except:
            pass# Maybe store under discarded
            

keptinstances = defaultdict(AnnotatedInstance)
for k in instances.keys():
    if len(instances[k].labels) >= 2:
           keptinstances[k]=instances[k]


            


<Element 'SET' at 0x113efcd68>
<Element 'SET' at 0x1144fdf48>
<Element 'SET' at 0x116a44f48>
<Element 'SET' at 0x116f51f98>
<Element 'SET' at 0x113580958>


In [130]:
bywords = defaultdict(dict)
for k in keptinstances.keys():
    bywords[keptinstances[k].lemma][k]=keptinstances[k]

for word in bywords:
    print(word,[bywords[word][x].lemmafreq for x in bywords[word].keys()][0],ReportDataset(bywords[word]))

vanjski 5.147494476813453 ('vanjski_a', 6050.1213572555553, 0.023607980368144051, 0.33509732913078383, 0.48384333534958146)
osuditi 3.1354942159291497 ('osuditi_v', 6732.2844105013901, 1.0, 0.018935737740882714, 1.0)
gorjeti 0 ('gorjeti_v', 5744.7345689981212, 0.00020141491016474624, 0.019201912707892739, 0.77416470146513605)
nastaviti 4.2626798770413155 ('nastaviti_v', 6384.1458513066673, 0.99824157939656943, 0.62120028687397144, 0.60100704938566718)
normalan 2.302585092994046 ('normalan_a', 9899.9899197528084, 0.042640056862549208, 9.4118834817653063e-07, 0.10860312978448615)
oprati 0.0 ('oprati_v', 5229.0666522449619, 1.0, 0.0063480497979314186, 1.0)
brusiti 0 ('brusiti_v', 12205.391575212587, 0.0024871567554018152, 0.016367295958378818, 0.67985815013955331)
rezerva 0 ('rezerva_n', 6181.4131265307324, 0.73792614663993739, 0.47149798896528683, 0.98506479847448314)
lak 0.0 ('lak_a', 10024.99045699212, 0.019545676099322996, 0.59773190358886497, 0.0342493165161767)
mrtav 1.0986122886681

In [131]:
print(ReportDataset(keptinstances))

('težina_n', 8387.1509423716925, 3.6193713156126732e-57, 1.6787405803093101e-16, 0.060741471701289969)


** Ignore the previous stuff, we are going to try a classifier **

In [132]:
train = ET.parse('/Users/hmartine/data/TakeLab-Cro36WSD/datasets/Cro36WSD-M/train/aktivan-a.xml')
root = tree.getroot()
    print(root)
    for instance in root:
        ID =  instance.findall(".//ID")[0].text
        
        try:
        
            timespent = float(instance.findall(".//TIMESPENT")[0].text)
            if math.isnan(timespent):
                print('nan',idx,ID)
            sentence =  instance.findall(".//SENTENCE")[0].text
            senseid = instance.findall(".//SENSEID")[0].text
            #print(sentence,[x.text for x in instance.findall(".//TIMESPENT")])
            instances[ID].times.append(timespent)
            instances[ID].sentence = sentence
            instances[ID].labels.append(senseid) #If there is no annotation it has been discarded, which is also an interesting task
        except:
            pass# Maybe store under discarded

test = ET.parse('/Users/hmartine/data/TakeLab-Cro36WSD/datasets/Cro36WSD-M/test/aktivan-a.xml')


IndentationError: unexpected indent (<ipython-input-132-d35638bcbdbb>, line 3)