In [1]:
import pandas as pd
import os
from collections import defaultdict

# High

In [2]:
high = pd.read_table("tsv/CoNLL-SIGMORPHON Shared Task 1 Results - high.tsv")
high.tail()

Unnamed: 0.1,Unnamed: 0,waseda-01,uzh-02,uzh-01,ua-08,ua-07,ua-06,ua-05,ua-04,ua-03,...,iitbhu-iiith-02,iitbhu-iiith-01,iit-varanasi-01,hamburg-01,bme-03,bme-02,bme-01,baseline,axsemantics-02,axsemantics-01
82,north-frisian,92.0,95.0,94.0,0,0,0,0,0,0,...,96.0,96.0,96.0,83.0,94.0,95.0,96.0,37.0,33.0,80.0
83,old-english,83.7,87.9,88.7,0,0,0,0,0,0,...,86.0,86.0,83.4,50.9,87.1,88.2,87.1,40.9,34.3,69.3
84,polish,89.3,93.4,93.0,0,0,0,0,0,0,...,88.7,88.7,82.8,76.4,89.3,90.8,89.3,87.1,82.9,87.6
85,russian,88.4,94.4,94.0,0,0,0,0,0,0,...,91.0,91.0,85.4,85.2,90.4,92.0,90.4,86.5,76.1,88.0
86,average,91.115116,95.965116,96.004651,0,0,0,0,0,0,...,94.42907,94.42907,91.733721,77.531395,93.973256,94.662791,93.884884,75.298837,74.774419,84.188372


In [3]:
high.iloc[:-1].mean().sort_values(ascending=False).astype(float).to_frame().reset_index()

  return arr.astype(dtype, copy=True)


Unnamed: 0,index,0
0,uzh-01,96.00465
1,uzh-02,95.96512
2,bme-02,94.66279
3,iitbhu-iiith-02,94.42907
4,iitbhu-iiith-01,94.42907
5,bme-03,93.97326
6,bme-01,93.88488
7,msu-04,91.86512
8,iit-varanasi-01,91.73372
9,waseda-01,91.11512


# Medium

In [4]:
medium = pd.read_table("tsv/CoNLL-SIGMORPHON Shared Task 1 Results - medium.tsv")
medium.iloc[:-1].mean().sort_values(ascending=False).to_frame().reset_index()

Unnamed: 0,index,0
0,iitbhu-iiith-02,84.185294
1,iitbhu-iiith-01,82.895098
2,msu-04,76.398039
3,msu-03,75.736275
4,hamburg-01,74.028431
5,iit-varanasi-01,70.173529
6,msu-02,69.44902
7,bme-01,67.430392
8,bme-03,67.357843
9,bme-02,67.258824


# Low

In [5]:
low = pd.read_table("tsv/CoNLL-SIGMORPHON Shared Task 1 Results - low.tsv")
low.iloc[:-1].mean().sort_values(ascending=False).to_frame().reset_index()

Unnamed: 0,index,0
0,uzh-02,57.213592
1,uzh-01,57.180583
2,ua-08,53.215534
3,iitbhu-iiith-02,52.596117
4,ua-05,50.533981
5,iitbhu-iiith-01,49.791262
6,ua-06,49.733981
7,ua-03,44.820388
8,waseda-01,44.087379
9,msu-02,41.612621


In [6]:
r = high.iloc[:-1].mean()  
r[r.index.str.startswith('bme')].sort_index()

bme-01    93.884884
bme-02    94.662791
bme-03    93.973256
dtype: float64

In [7]:
r = medium.iloc[:-1].mean()  
r[r.index.str.startswith('bme')].sort_index()

bme-01    67.430392
bme-02    67.258824
bme-03    67.357843
dtype: float64

In [8]:
r = low.iloc[:-1].mean()  
r[r.index.str.startswith('bme')].sort_index()

bme-01    3.742718
bme-02    2.429126
bme-03    3.634951
dtype: float64

# Training data stats

In [19]:
DATA_DIR = "/home/judit/repo/external/conll2018/task1/all"
SURPRISE_DIR = "/home/judit/repo/external/conll2018/task1/surprise"

def load_stats(fn):
    maxlen_lemma = 0
    maxlen_infl = 0
    maxlen_tags = 0
    types_lemma = set()
    types_infl = set()
    types_tags = set()
    alphabet = set()
    cnt = 0
    sumlen_lemma = 0
    sumlen_infl = 0
    sumlen_tags = 0
    upper_cnt = 0
    with open(fn) as f:
        for l in f:
            cnt += 1
            lemma, infl, tags = l.rstrip('\n').split('\t')
            tags = tags.split(';')
            maxlen_lemma = max(maxlen_lemma, len(lemma))
            maxlen_infl = max(maxlen_infl, len(infl))
            maxlen_tags = max(maxlen_tags, len(tags))
            sumlen_lemma += len(lemma)
            sumlen_infl += len(infl)
            sumlen_tags += len(tags)
            alphabet |= set(lemma)
            alphabet |= set(infl)
            types_lemma.add(lemma)
            types_infl.add(infl)
            types_tags |= set(tags)
            upper_cnt += sum(c.isupper() for c in lemma)
    return {
        'lemma maxlen': maxlen_lemma,
        'inflected maxlen': maxlen_infl,
        'tags maxlen': maxlen_tags,
        'alphabet size': len(alphabet),
        'lemma types': len(types_lemma),
        'inflected types': len(types_infl),
        'tag types': len(types_tags),
        'lemma avglen' : sumlen_lemma / cnt,
        'infl avglen' : sumlen_infl / cnt,
        'tags avglen' : sumlen_tags / cnt,
        'upper ratio': upper_cnt / sumlen_lemma,
    }
        
stats = []
for fn in os.listdir(DATA_DIR):
    if fn.endswith('train-high'):
        stat = load_stats(os.path.join(DATA_DIR, fn))
        stat['language'] = '-'.join(fn.split('-')[:-2])
        stats.append(stat)
for fn in os.listdir(SURPRISE_DIR):
    if fn.endswith('train-high'):
        stat = load_stats(os.path.join(SURPRISE_DIR, fn))
        stat['language'] = '-'.join(fn.split('-')[:-2])
        stats.append(stat)
    
stats = pd.DataFrame(stats).set_index('language').sort_index()

# Analysis

In [20]:
df = high.rename(columns={'Unnamed: 0': 'language'}).set_index('language')
print(df['bme-02'].quantile(.8))
good = df[df['bme-02'] >= df['bme-02'].quantile(.8)]
good

99.0


Unnamed: 0_level_0,waseda-01,uzh-02,uzh-01,ua-08,ua-07,ua-06,ua-05,ua-04,ua-03,ua-02,...,iitbhu-iiith-02,iitbhu-iiith-01,iit-varanasi-01,hamburg-01,bme-03,bme-02,bme-01,baseline,axsemantics-02,axsemantics-01
language,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
bashkir,93.2,99.9,99.9,0,0,0,0,0,0,0,...,99.8,99.8,99.7,99.1,99.8,99.8,99.8,90.7,99.8,99.8
bengali,98.0,99.0,99.0,0,0,0,0,0,0,0,...,99.0,99.0,99.0,93.0,99.0,99.0,99.0,81.0,78.0,96.0
classical-syriac,98.0,100.0,100.0,0,0,0,0,0,0,0,...,99.0,99.0,100.0,100.0,98.0,99.0,97.0,97.0,99.0,100.0
crimean-tatar,97.0,98.0,98.0,0,0,0,0,0,0,0,...,99.0,99.0,100.0,98.0,99.0,99.0,99.0,95.0,98.0,98.0
friulian,99.0,99.0,99.0,0,0,0,0,0,0,0,...,99.0,99.0,99.0,85.0,99.0,99.0,99.0,96.0,99.0,97.0
galician,98.6,99.3,99.5,0,0,0,0,0,0,0,...,98.9,98.9,98.7,93.9,99.2,99.3,99.2,95.1,95.0,96.7
haida,66.0,99.0,99.0,0,0,0,0,0,0,0,...,93.0,93.0,100.0,15.0,100.0,99.0,100.0,66.0,100.0,93.0
hebrew,98.4,99.3,99.5,0,0,0,0,0,0,0,...,98.8,98.8,97.3,83.7,98.5,99.3,98.5,53.7,54.5,84.3
hindi,73.6,100.0,100.0,0,0,0,0,0,0,0,...,99.7,99.7,99.4,98.7,99.9,100.0,100.0,93.0,80.0,100.0
kabardian,95.0,96.0,99.0,0,0,0,0,0,0,0,...,99.0,99.0,99.0,100.0,98.0,99.0,98.0,86.0,99.0,99.0


## Overall stats for all languages

In [21]:
stats.describe()

Unnamed: 0,alphabet size,infl avglen,inflected maxlen,inflected types,lemma avglen,lemma maxlen,lemma types,tag types,tags avglen,tags maxlen,upper ratio
count,86.0,86.0,86.0,86.0,86.0,86.0,86.0,86.0,86.0,86.0,86.0
mean,42.755814,9.159997,24.255814,7650.534884,6.883528,18.860465,2308.348837,25.325581,4.415738,6.081395,0.001359
std,15.179658,1.89408,10.857566,2887.857511,1.352391,10.412846,2480.044092,8.282429,1.124486,1.85482,0.006304
min,19.0,5.5176,10.0,631.0,3.8286,6.0,15.0,7.0,2.049266,3.0,0.0
25%,31.0,8.119794,18.25,6648.25,5.829803,14.0,259.25,19.25,3.663575,5.0,0.0
50%,40.0,9.0045,22.0,9101.5,6.96965,17.0,1065.0,25.0,4.24927,6.0,0.0
75%,54.75,10.081028,27.0,9766.0,7.77855,21.0,4063.25,30.75,5.001112,7.0,0.000638
max,102.0,17.059649,90.0,9994.0,10.0502,89.0,8643.0,48.0,8.6318,14.0,0.056466


## Same stats for top 20% languages

In [22]:
good[['bme-02']].join(stats)

Unnamed: 0_level_0,bme-02,alphabet size,infl avglen,inflected maxlen,inflected types,lemma avglen,lemma maxlen,lemma types,tag types,tags avglen,tags maxlen,upper ratio
language,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
bashkir,99.8,55,8.6162,22,9106,5.5044,16,1084,11,3.3361,4,0.00278
bengali,99.0,49,7.771624,23,3746,6.033231,18,136,26,4.122319,5,0.0
classical-syriac,99.0,22,5.796745,10,1927,5.217863,8,160,19,3.077212,4,0.0
crimean-tatar,99.0,40,8.446541,20,7123,6.583949,17,1230,17,2.051135,4,0.001184
friulian,99.0,29,7.479736,15,4953,5.145344,9,168,20,5.000381,6,0.0
galician,99.3,29,8.6716,17,8890,6.9881,12,486,25,5.029,6,0.0
haida,99.0,30,17.059649,39,6605,6.422076,14,41,30,4.308041,7,0.0
hebrew,99.3,29,5.5176,13,8836,3.9745,9,510,24,4.5361,5,0.0
hindi,100.0,55,11.1765,27,8709,6.9513,20,258,25,5.8099,6,0.0
kabardian,99.0,33,8.80982,22,2786,5.725795,16,250,10,3.518326,4,0.002053


# Bottom 20%

In [23]:
print(df['bme-02'].quantile(.2))
bad = df[df['bme-02'] <= df['bme-02'].quantile(.2)]
bad

90.83999999999999


Unnamed: 0_level_0,waseda-01,uzh-02,uzh-01,ua-08,ua-07,ua-06,ua-05,ua-04,ua-03,ua-02,...,iitbhu-iiith-02,iitbhu-iiith-01,iit-varanasi-01,hamburg-01,bme-03,bme-02,bme-01,baseline,axsemantics-02,axsemantics-01
language,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
danish,90.7,94.6,95.5,0,0,0,0,0,0,0,...,91.5,91.5,91.3,76.0,89.4,90.4,89.4,87.0,90.4,92.7
faroese,81.3,86.4,85.6,0,0,0,0,0,0,0,...,83.9,83.9,81.1,56.3,83.8,87.1,85.3,76.1,76.8,79.6
greek,83.7,91.0,91.7,0,0,0,0,0,0,0,...,88.2,88.2,80.8,58.9,86.6,89.1,86.6,78.3,54.8,78.2
hungarian,79.6,86.6,87.2,0,0,0,0,0,0,0,...,85.9,85.9,82.3,59.2,83.7,85.5,83.7,68.8,80.9,76.9
icelandic,81.9,91.1,91.3,0,0,0,0,0,0,0,...,85.0,85.0,83.9,55.2,86.0,87.0,86.0,76.9,79.3,80.9
latin,65.4,74.6,75.9,0,0,0,0,0,0,0,...,73.7,73.7,61.5,18.1,77.4,78.9,77.4,47.6,37.2,46.2
livonian,94.0,100.0,98.0,0,0,0,0,0,0,0,...,98.0,98.0,97.0,68.0,92.0,87.0,92.0,67.0,76.0,82.0
norwegian-bokmaal,90.5,92.1,92.0,0,0,0,0,0,0,0,...,89.0,89.0,89.0,81.7,88.2,88.3,88.2,91.0,87.2,90.8
norwegian-nynorsk,83.4,94.9,94.6,0,0,0,0,0,0,0,...,84.6,84.6,82.6,56.6,90.8,85.3,90.8,74.8,88.0,82.8
old-armenian,88.8,90.4,90.4,0,0,0,0,0,0,0,...,89.1,89.1,87.3,68.0,87.7,89.3,87.7,79.2,82.2,84.9


## Stat table again

In [24]:
stats.describe()

Unnamed: 0,alphabet size,infl avglen,inflected maxlen,inflected types,lemma avglen,lemma maxlen,lemma types,tag types,tags avglen,tags maxlen,upper ratio
count,86.0,86.0,86.0,86.0,86.0,86.0,86.0,86.0,86.0,86.0,86.0
mean,42.755814,9.159997,24.255814,7650.534884,6.883528,18.860465,2308.348837,25.325581,4.415738,6.081395,0.001359
std,15.179658,1.89408,10.857566,2887.857511,1.352391,10.412846,2480.044092,8.282429,1.124486,1.85482,0.006304
min,19.0,5.5176,10.0,631.0,3.8286,6.0,15.0,7.0,2.049266,3.0,0.0
25%,31.0,8.119794,18.25,6648.25,5.829803,14.0,259.25,19.25,3.663575,5.0,0.0
50%,40.0,9.0045,22.0,9101.5,6.96965,17.0,1065.0,25.0,4.24927,6.0,0.0
75%,54.75,10.081028,27.0,9766.0,7.77855,21.0,4063.25,30.75,5.001112,7.0,0.000638
max,102.0,17.059649,90.0,9994.0,10.0502,89.0,8643.0,48.0,8.6318,14.0,0.056466


## Same stats for bottom 20%

In [25]:
bad[['bme-02']].join(stats)

Unnamed: 0_level_0,bme-02,alphabet size,infl avglen,inflected maxlen,inflected types,lemma avglen,lemma maxlen,lemma types,tag types,tags avglen,tags maxlen,upper ratio
language,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
danish,90.4,46,9.1451,27,9768,7.4976,24,3137,16,3.9667,4,0.000547
faroese,87.1,34,8.1166,22,9040,6.9436,20,2959,26,3.9336,5,0.000187
greek,89.1,102,10.3883,23,9674,8.6483,21,5130,31,4.0867,6,0.002694
hungarian,85.5,55,11.3833,27,9954,8.1411,22,7123,35,3.4101,6,0.000454
icelandic,87.0,55,8.1833,26,9416,6.7061,23,4115,21,4.1212,5,0.001327
latin,78.9,55,10.3512,23,9896,8.5249,19,6517,33,4.5863,7,0.000669
livonian,87.0,40,8.590705,21,3090,6.202535,16,203,38,4.095062,10,0.0
norwegian-bokmaal,88.3,40,9.0377,30,9868,7.4367,28,5041,21,2.881,4,0.000148
norwegian-nynorsk,85.3,46,8.5693,22,9222,7.0535,19,4420,23,2.8854,4,0.000482
old-armenian,89.3,49,7.4069,20,9412,6.2015,19,3413,44,5.4392,8,0.0


# bme-02 wins on these languages 

In [26]:
df.loc[df.idxmax(axis=1) == 'bme-02'][['bme-02', 'uzh-01', 'uzh-02']]

Unnamed: 0_level_0,bme-02,uzh-01,uzh-02
language,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
albanian,98.9,97.7,96.5
armenian,96.9,96.4,96.8
faroese,87.1,85.6,86.4
latin,78.9,75.9,74.6
navajo,91.0,88.1,87.4
persian,99.9,99.8,99.8


# bme-02 is worse than best-sigma on these language

In [27]:
df.loc[df.max(axis=1) - df.std(axis=1) > df['bme-02']]

Unnamed: 0_level_0,waseda-01,uzh-02,uzh-01,ua-08,ua-07,ua-06,ua-05,ua-04,ua-03,ua-02,...,iitbhu-iiith-02,iitbhu-iiith-01,iit-varanasi-01,hamburg-01,bme-03,bme-02,bme-01,baseline,axsemantics-02,axsemantics-01
language,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


# bme-02 is worse than mean acc on these language

In [28]:
df.loc[df.mean(axis=1) > df['bme-02']]

Unnamed: 0_level_0,waseda-01,uzh-02,uzh-01,ua-08,ua-07,ua-06,ua-05,ua-04,ua-03,ua-02,...,iitbhu-iiith-02,iitbhu-iiith-01,iit-varanasi-01,hamburg-01,bme-03,bme-02,bme-01,baseline,axsemantics-02,axsemantics-01
language,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


# bme-02 is better than mean + sigma

The best teams are listed.

In [29]:
d = df.loc[df.mean(axis=1) + df.std(axis=1) < df['bme-02']]
d[d.idxmax(axis=1).unique()]

Unnamed: 0_level_0,uzh-01,uzh-02,bme-02,iitbhu-iiith-02
language,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
arabic,93.7,93.5,92.2,93.3
basque,98.9,98.7,98.9,98.6
belarusian,94.9,94.7,93.1,92.1
finnish,95.4,94.9,93.3,92.3
irish,91.4,91.5,91.1,87.6
khaling,99.7,99.6,99.6,99.5
latin,75.9,74.6,78.9,73.7
maltese,96.0,97.0,94.0,95.0
navajo,88.1,87.4,91.0,83.5
slovene,97.4,97.3,94.9,94.8


# bme-02 is better than mean + 2\*sigma

In [30]:
df.loc[df.mean(axis=1) + 2*2*df.std(axis=1) < df['bme-02']]

Unnamed: 0_level_0,waseda-01,uzh-02,uzh-01,ua-08,ua-07,ua-06,ua-05,ua-04,ua-03,ua-02,...,iitbhu-iiith-02,iitbhu-iiith-01,iit-varanasi-01,hamburg-01,bme-03,bme-02,bme-01,baseline,axsemantics-02,axsemantics-01
language,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
