In [1]:
import glob
import gzip
import itertools
import csv
import io
import pickle
import hashlib
import pandas as pd
import numpy as np

# WMT19 system-level data

### Checksums

In [2]:
# Verify the two downloaded WMT19 archives against known-good MD5 digests.
# For each archive the expected digest, the computed digest and whether they
# match are printed (same output format as before).
expected_checksums = [
    ('data/downloads/wmt19-submitted-data-v3.tgz', '93f6c7fa3ca5d81b1837ccf34c1a80aa'),
    ('data/downloads/wmt19-metrics-task-package.tgz', '8c901c91fde19207cf8494c4af541455'),
]

for archive_path, mine in expected_checksums:
    digest = hashlib.md5()
    # Hash in 1 MiB chunks inside a context manager: the handle is always
    # closed and the archive never has to be held in memory in one piece.
    with open(archive_path, 'rb') as archive:
        for chunk in iter(lambda: archive.read(1 << 20), b''):
            digest.update(chunk)
    yours = digest.hexdigest()
    print(mine + '\n' + yours)
    print(mine == yours)

# The archives are NOT unpacked by this notebook; extract them under data/
# before running the cells below, e.g. from a shell:
#   tar -xzf data/downloads/wmt19-submitted-data-v3.tgz -C data/
#   tar -xzf data/downloads/wmt19-metrics-task-package.tgz -C data/
# NOTE(review): later cells read data/wmt19-submitted-data/ and
# data/wmt19-metrics-task-package/ - confirm the extracted folder names match.

93f6c7fa3ca5d81b1837ccf34c1a80aa
93f6c7fa3ca5d81b1837ccf34c1a80aa
True
8c901c91fde19207cf8494c4af541455
8c901c91fde19207cf8494c4af541455
True


### Official system-level DA scores

In [3]:
# Official system-level DA scores from the metrics task package. The file is
# space-separated; its column names are overwritten with the names used
# throughout this notebook.
da_sys_path = 'data/wmt19-metrics-task-package/manual-evaluation/DA-syslevel.csv'
da_sys = pd.read_csv(da_sys_path, sep=' ')
da_sys.columns = ['lp', 'score', 'system']
da_sys

Unnamed: 0,lp,score,system
0,en-cs,0.402,CUNI-DocTransformer-T2T.6751
1,en-cs,0.401,CUNI-Transformer-T2T-2018.6457
2,en-cs,0.388,CUNI-Transformer-T2T-2019.6851
3,en-cs,0.223,CUNI-DocTransformer-Marian.6922
4,en-cs,0.206,uedin.6667
...,...,...,...
220,de-fr,0.019,online-Y.0
221,de-fr,-0.104,TartuNLP-c.6897
222,de-fr,-0.194,online-A.0
223,de-fr,-0.240,online-G.0


In [4]:
# Accumulator for the baseline metric scores: keyed by (lp, system), with one
# extra column per metric added by outer-merging each score file in turn.
sys_scores = pd.DataFrame(data={'lp':[], 'system':[]})

for submission in glob.glob('data/wmt19-metrics-task-package/baselines/*.sys.*'):
    # chrF files are skipped entirely; the mteval file is skipped here because
    # it is parsed separately (filtered for BLEU and NIST) further down.
    if 'chrF' in submission or 'mteval' in submission:
        continue

    # The metric name is the file name without its '.sys.score.gz' suffix.
    metric_name = submission.split('/')[-1][:-len('.sys.score.gz')]
    print(metric_name)

    # Read through a context manager so the gzip handle is closed after each
    # file instead of being left to the garbage collector.
    # Lines mentioning 'hybrid' are dropped and spaces are turned into tabs so
    # the file parses with a single delimiter. Each line keeps its own
    # newline, so the join produces blank lines, which read_csv skips by default.
    with gzip.open(submission, 'rt') as score_file:
        hybrid_filtered = '\n'.join(i.replace(' ', '\t') for i in score_file if 'hybrid' not in i)
    reader = io.StringIO(hybrid_filtered)
    metric_syss = pd.read_csv(reader, delimiter='\t', header=None)

    # Name the first five columns; any further columns keep their positional labels.
    metric_syss.columns = ['name', 'lp', 'testset', 'system', metric_name] + list(metric_syss.columns[5:])

    sys_scores = sys_scores.merge(metric_syss[['lp','system',metric_name]], on=['lp','system'], how='outer')

TER
CDER
sacreBLEU-BLEU
PER
WER


In [5]:
# BLEU and NIST are both read from the same mteval score file: for each metric
# keep only the lines that mention it (and do not mention 'hybrid'), parse
# them, and outer-merge the metric column into sys_scores.
mteval_path = 'data/wmt19-metrics-task-package/baselines/mteval-inter.sys.score.gz'

for metric_name in ('BLEU', 'NIST'):
    # Context manager so the gzip handle is closed after each pass.
    with gzip.open(mteval_path, 'rt') as score_file:
        hybrid_filtered = '\n'.join(
            i.replace(' ', '\t') for i in score_file
            if 'hybrid' not in i and metric_name in i
        )
    reader = io.StringIO(hybrid_filtered)
    metric_syss = pd.read_csv(reader, delimiter='\t', header=None)

    # Name the first five columns; any further columns keep their positional labels.
    metric_syss.columns = ['name', 'lp', 'testset', 'system', metric_name] + list(metric_syss.columns[5:])
    sys_scores = sys_scores.merge(metric_syss[['lp','system',metric_name]], on=['lp','system'], how='outer')

### Join metric and da scores

In [6]:
# Attach the metric scores to the official DA scores (merge on the columns the
# two frames share) and keep a fixed column order; metric columns that are
# not listed here are dropped.
metric_columns = ['BLEU', 'NIST', 'CDER', 'PER', 'TER', 'WER']
joined_scores = da_sys.merge(sys_scores)
sys_scores_da = joined_scores[['lp', 'score', 'system'] + metric_columns]
sys_scores_da

Unnamed: 0,lp,score,system,BLEU,NIST,CDER,PER,TER,WER
0,en-cs,0.402,CUNI-DocTransformer-T2T.6751,0.1904,5.1045,0.5042,0.6061,0.4697,0.4427
1,en-cs,0.401,CUNI-Transformer-T2T-2018.6457,0.1968,5.2036,0.5042,0.6037,0.4714,0.4452
2,en-cs,0.388,CUNI-Transformer-T2T-2019.6851,0.1892,5.0849,0.4977,0.5971,0.4647,0.4386
3,en-cs,0.223,CUNI-DocTransformer-Marian.6922,0.1769,4.8610,0.4877,0.5878,0.4513,0.4243
4,en-cs,0.206,uedin.6667,0.1916,5.1417,0.4895,0.5877,0.4589,0.4323
...,...,...,...,...,...,...,...,...,...
220,de-fr,0.019,online-Y.0,0.3641,8.3309,0.5197,0.6065,0.4936,0.4665
221,de-fr,-0.104,TartuNLP-c.6897,0.3415,7.9147,0.4967,0.5908,0.4629,0.4348
222,de-fr,-0.194,online-A.0,0.3515,7.9717,0.5002,0.6134,0.4508,0.4212
223,de-fr,-0.240,online-G.0,0.3412,8.0219,0.4724,0.5738,0.4308,0.4067


### Validate correlations

In [7]:
# Number of rows (systems) per language pair.
systems_by_lp = sys_scores_da.groupby('lp')
systems_by_lp['lp'].count()

lp
de-cs    11
de-en    16
de-fr    11
en-cs    11
en-de    22
en-fi    12
en-gu    11
en-kk    11
en-lt    12
en-ru    12
en-zh    12
fi-en    12
fr-de    10
gu-en    11
kk-en    11
lt-en    11
ru-en    14
zh-en    15
Name: lp, dtype: int64

In [8]:
# Per-language-pair correlation of every metric with the DA score, for
# language pairs whose code ends in 'en'.
# The numeric columns are selected explicitly and the 'score' rows are picked
# by label (not by a fixed stride), so the result does not depend on how many
# columns the frame has and the string column 'system' never reaches corr().
corr_columns = ['score', 'BLEU', 'NIST', 'CDER', 'PER', 'TER', 'WER']
into_english = sys_scores_da[sys_scores_da.lp.str.endswith('en')]
(
    into_english
    .groupby('lp')[corr_columns]
    .corr()
    .xs('score', level=1, drop_level=False)  # keep only each lp's 'score' row
    .round(3)
    .T
    .sort_index()
)

lp,de-en,fi-en,gu-en,kk-en,lt-en,ru-en,zh-en
Unnamed: 0_level_1,score,score,score,score,score,score,score
BLEU,0.849,0.982,0.834,0.946,0.961,0.879,0.899
CDER,0.89,0.988,0.876,0.967,0.975,0.892,0.917
NIST,0.813,0.986,0.93,0.942,0.944,0.925,0.921
PER,0.883,0.991,0.91,0.737,0.947,0.922,0.952
TER,0.874,0.984,0.89,0.799,0.96,0.917,0.84
WER,0.863,0.983,0.861,0.793,0.961,0.911,0.82
score,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [9]:
# Per-language-pair correlation of every metric with the DA score, for
# language pairs whose code starts with 'en'.
# The numeric columns are selected explicitly and the 'score' rows are picked
# by label (not by a fixed stride), so the result does not depend on how many
# columns the frame has and the string column 'system' never reaches corr().
corr_columns = ['score', 'BLEU', 'NIST', 'CDER', 'PER', 'TER', 'WER']
out_of_english = sys_scores_da[sys_scores_da.lp.str.startswith('en')]
(
    out_of_english
    .groupby('lp')[corr_columns]
    .corr()
    .xs('score', level=1, drop_level=False)  # keep only each lp's 'score' row
    .round(3)
    .T
    .sort_index()
)

lp,en-cs,en-de,en-fi,en-gu,en-kk,en-lt,en-ru,en-zh
Unnamed: 0_level_1,score,score,score,score,score,score,score,score
BLEU,0.897,0.921,0.969,0.737,0.852,0.989,0.986,0.901
CDER,0.985,0.973,0.978,0.84,0.927,0.985,0.993,0.905
NIST,0.896,0.321,0.971,0.786,0.93,0.993,0.988,0.884
PER,0.976,0.97,0.982,0.839,0.921,0.985,0.981,0.895
TER,0.98,0.969,0.981,0.865,0.94,0.994,0.995,0.856
WER,0.982,0.966,0.98,0.861,0.939,0.991,0.994,0.875
score,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [10]:
# Per-language-pair correlation of every metric with the DA score, for
# language pairs whose code neither starts nor ends with 'en'.
# The numeric columns are selected explicitly and the 'score' rows are picked
# by label (not by a fixed stride), so the result does not depend on how many
# columns the frame has and the string column 'system' never reaches corr().
corr_columns = ['score', 'BLEU', 'NIST', 'CDER', 'PER', 'TER', 'WER']
is_non_english = ~sys_scores_da.lp.str.startswith('en') & ~sys_scores_da.lp.str.endswith('en')
(
    sys_scores_da[is_non_english]
    .groupby('lp')[corr_columns]
    .corr()
    .xs('score', level=1, drop_level=False)  # keep only each lp's 'score' row
    .round(3)
    .T
    .sort_index()
)

lp,de-cs,de-fr,fr-de
Unnamed: 0_level_1,score,score,score
BLEU,0.941,0.891,0.864
CDER,0.864,0.949,0.852
NIST,0.954,0.916,0.862
PER,0.875,0.857,0.899
TER,0.89,0.956,0.895
WER,0.872,0.956,0.894
score,1.0,1.0,1.0


### WMT19 system-level data (raw)

In [11]:
# Load every per-language-pair segment-level score file and stack them into
# one frame, tagging each row with the language pair it came from.
seg_score_pattern = 'data/wmt-human-evaluation/newstest2019-humaneval/*-sntlevel-humaneval-newstest2019/analysis/ad-seg-scores-*.csv'

lp_df = []
for file in glob.glob(seg_score_pattern):
    # The language pair is the 5 characters right before the '.csv' extension.
    lp = file[-9:-4]
    print(lp)

    scores = pd.read_csv(file, sep=' ')
    lp_df.append(scores.assign(lp=[lp] * len(scores)))

raw_seg_scores_da = pd.concat(lp_df)

de-cs
de-fr
fr-de
zh-en
kk-en
fi-en
gu-en
ru-en
lt-en


In [12]:
# Average the segment-level scores per (language pair, system) to obtain
# system-level raw and standardised scores.
# Columns are selected and renamed BY NAME (RAW.SCR -> raw_score,
# Z.SCR -> score) rather than by position, so extra or reordered columns in
# the input files cannot silently shift the labels.
raw_sys_scores_da = (
    raw_seg_scores_da
    .groupby(['lp', 'SYS'], as_index=False)[['RAW.SCR', 'Z.SCR']]
    .mean()
    .rename(columns={'SYS': 'system', 'RAW.SCR': 'raw_score', 'Z.SCR': 'score'})
)
raw_sys_scores_da = raw_sys_scores_da[['lp', 'system', 'raw_score', 'score']]

In [13]:
# Add the system-level raw score to the metric/DA table. The merge uses the
# columns the two frames share and keeps only systems present in both, so the
# row count is printed before and after to show how many systems survive.
print(len(sys_scores_da))
raw_score_by_system = raw_sys_scores_da[['lp','system','raw_score']]
sys_scores_da = sys_scores_da.merge(raw_score_by_system)
print(len(sys_scores_da))

# Sanity check: per language pair, correlation between raw and official scores.
score_pairs = sys_scores_da[['lp','raw_score', 'score']]
score_pairs.groupby('lp').corr(method='pearson')

225
81


Unnamed: 0_level_0,Unnamed: 1_level_0,raw_score,score
lp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
de-cs,raw_score,1.0,0.999074
de-cs,score,0.999074,1.0
de-fr,raw_score,1.0,0.997829
de-fr,score,0.997829,1.0
fi-en,raw_score,1.0,0.998487
fi-en,score,0.998487,1.0
fr-de,raw_score,1.0,0.995781
fr-de,score,0.995781,1.0
gu-en,raw_score,1.0,0.996633
gu-en,score,0.996633,1.0


In [14]:
# Only 81 systems are left after the merge above because most of the
# systems were labelled with DA + document context. The remaining 81
# systems were evaluated with vanilla DA.

### WMT system-level (src, ref, out)

In [15]:
# Keep only the columns used downstream, rename them to the notebook's
# lowercase names, and strip a trailing '.zh-en' from system names.
# Everything is built in one chained expression so the 'system' column is
# never assigned into a column-subset of another frame (the pattern behind
# pandas' SettingWithCopyWarning).
zh_en_suffix = '.zh-en'
raw_seg_scores_da = (
    raw_seg_scores_da[['SYS','SID','RAW.SCR','Z.SCR','N','lp']]
    .rename(columns={
        'SYS': 'system',
        'SID': 'sid',
        'RAW.SCR': 'raw_score',
        'Z.SCR': 'score',
    })
    .assign(system=lambda scores: scores['system'].apply(
        lambda x: x[:-len(zh_en_suffix)] if x.endswith(zh_en_suffix) else x
    ))
)

In [16]:
# Distinct system names present in the segment-level scores.
raw_seg_scores_da['system'].unique()

array(['online-Y.0', 'online-B.0', 'NICT.6938', 'CAiRE.6949',
       'online-G.0', 'online-A.0', 'lmu-unsup-nmt-de-cs.6845',
       'Unsupervised.6935', 'Unsupervised.6929', 'NEU_KingSoft.6766',
       'CUNI-Unsupervised-NER-post.6934', 'online-X.0', 'LIUM.6719',
       'MSRA.MADL.6888', 'Kyoto_University_T2T.6679',
       'lingua_custodia_primary.6690', 'MLLP-UPV.6647', 'TartuNLP-c.6897',
       'TartuNLP-c.6514', 'LIUM.6720', 'eTranslation.6262',
       'MSRA.MADL.6893', 'MLLP-UPV.6654', 'NEU.6832', 'Baidu-system.6940',
       'BTRANS.6825', 'NICT.6814', 'MSRA.MASS.6996', 'KSAI-system.6927',
       'Apprentice-c.6706', 'UEDIN.6530', 'BTRANS-ensemble.6992',
       'MSRA.MASS.6942', 'DBMS-KU-KKEN.6726',
       'CUNI-T2T-transfer-kken.6436', 'UMD.6736', 'NICT.6770',
       'rug-kken-morfessor.6677', 'talp-upc-2019-kken.6657', 'NEU.6753',
       'Frank-s-MT.6127', 'NRC-CNRC.6895', 'HUMAN', 'parfda.6526',
       'TartuNLP-c.6905', 'apertium-fin-eng-unconstrained-fien.6449',
       'Helsin

In [17]:
# srcs and refs
# For every language pair still present in sys_scores_da, read the source and
# reference files line by line and attach them to the segment-level scores
# via (lp, sid), where sid is the 1-based line number.
srcs, refs, lps, sids = [], [], [], []
for lp in sys_scores_da.lp.unique():
    fr, to = lp[:2], lp[3:]
    print(lp)

    ref_path = 'data/wmt19-submitted-data/txt/references/newstest2019-%s%s-ref.%s' % (fr, to, to)
    src_path = 'data/wmt19-submitted-data/txt/sources/newstest2019-%s%s-src.%s' % (fr, to, fr)

    # Explicit UTF-8 so non-ASCII text decodes the same on every platform;
    # context managers so the handles are closed. Lines keep their trailing
    # newline, exactly as before.
    with open(ref_path, encoding='utf-8') as ref_file:
        refs_ = list(ref_file)
    with open(src_path, encoding='utf-8') as src_file:
        srcs_ = list(src_file)

    # Check alignment BEFORE touching the accumulators so a mismatch cannot
    # leave them half-updated.
    assert(len(refs_) == len(srcs_))

    sids_ = list(range(1, len(refs_)+1))
    refs.extend(refs_)
    srcs.extend(srcs_)
    sids.extend(sids_)
    lps.extend([lp]*len(refs_))

src_ref_df = pd.DataFrame({'reference' : refs, 'source':srcs, 'lp': lps, 'sid': sids})
print(len(src_ref_df), len(raw_seg_scores_da))
# Inner merge: score rows without a matching (lp, sid) in src_ref_df are dropped.
raw_seg_scores_da = raw_seg_scores_da.merge(src_ref_df, on=['lp','sid'], how='inner')
print(len(raw_seg_scores_da))

de-cs
fi-en
fr-de
ru-en
kk-en
gu-en
lt-en
de-fr
12411 129126
108927


In [18]:
# outs
# Read every submitted system output and attach it to the segment-level
# scores via (lp, sid, system), where sid is the 1-based line number.
lps, outs, sids, syss = [], [], [], []
for file in glob.glob('data/wmt19-submitted-data/txt/system-outputs/newstest2019/*/*'):
    # Split only the file name (not the whole path) on '.': the last part is
    # the language pair, everything between the first and last part is the
    # system name.
    name_parts = file.split('/')[-1].split('.')
    lp = name_parts[-1]
    system = '.'.join(name_parts[1:-1])

    if 'Unsupervised.' in system:
        # Keep the first 13 and the last 4 characters - presumably
        # 'Unsupervised.' plus a 4-digit id, dropping what sits in between.
        # NOTE(review): confirm against the names in the human score files.
        system = system[:13] + system[-4:]
    elif '_' in system and not any(kept in system for kept in ('NEU_KingSoft', 'Kyoto_University', 'lingua_custodia_primary')):
        # All other systems get '_' replaced by '-'; the three listed above
        # keep their underscores.
        system = system.replace('_', '-')

    # Explicit UTF-8 so non-ASCII output decodes the same on every platform;
    # context manager so the handle is closed. Lines keep their trailing
    # newline, exactly as before.
    with open(file, 'rt', encoding='utf-8') as out_file:
        outs_ = list(out_file)
    sids_ = list(range(1, len(outs_)+1))
    lps_ = len(outs_) * [lp]
    syss_ = len(outs_) * [system]

    outs.extend(outs_)
    sids.extend(sids_)
    lps.extend(lps_)
    syss.extend(syss_)

out_df = pd.DataFrame({'lp': lps, 'output':outs, 'sid': sids, 'system': syss})
print(len(out_df), len(raw_seg_scores_da))
# Left merge: score rows without a matching output are kept with output = NaN.
raw_seg_scores_da = raw_seg_scores_da.merge(out_df, on=['lp','sid', 'system'], how='left')
print(len(raw_seg_scores_da))

391536 108927
108927


In [19]:
# Score rows for which no system output was found by the merge above.
output_missing = raw_seg_scores_da['output'].isna()
raw_seg_scores_da.loc[output_missing]

Unnamed: 0,system,sid,raw_score,score,N,lp,reference,source,output
27611,HUMAN,751,99.300000,1.105350,10,kk-en,Three professional boxers from Kazakhstan will...,Қазақстандық үш кәсіпқой боксшы Мәскеуде жекпе...,
27623,HUMAN,243,95.000000,0.352857,2,kk-en,"If our country is independent indeed, I will o...",Егер біздің мемлекетіміз шын мәнінде тәуелсіз ...,
27635,HUMAN,992,74.000000,0.730244,2,kk-en,During teaching lessons teachers and productio...,Оқытушылар мен өндірістік оқыту шеберлері оқу ...,
27647,HUMAN,956,99.333333,1.192517,3,kk-en,"Regulation of archaeological, restoration work...",Тарихи-мәдени мұра нысандарында жүргізілетін а...,
27659,HUMAN,177,70.000000,0.639202,1,kk-en,It means we are glad that people get opportuni...,"Яғни, жұрт газетті оқып қана қоймай, естіп те ...",
...,...,...,...,...,...,...,...,...,...
108856,HUMAN,90,99.000000,0.767574,1,lt-en,But the most interesting thing that happened o...,"Tačiau įdomiausias dalykas, nutikęs aikštelėje...",
108868,HUMAN,713,99.500000,0.991348,6,lt-en,A. Širinskienė proposed to expand this period ...,A. Širinskienė šį terminą pasiūlė pratęsti nur...,
108902,HUMAN,400,100.000000,0.928725,1,lt-en,The Eurogroup will overview draft budgets of t...,Euro grupėje bus apžvelgti Europos Komisijai (...,
108914,HUMAN,525,86.500000,0.453017,4,lt-en,"After her death, government officials were acc...",Po jos mirties valdžios pareigūnai sulaukė vis...,


### Pickle

In [20]:
# Persist both frames. Each file is written inside a context manager so it is
# flushed and closed deterministically instead of relying on garbage
# collection of an anonymous file object.
pickle_targets = [
    (raw_seg_scores_da, 'data/pickles/wmt19-sys_level-all.pkl'),
    (sys_scores_da, 'data/pickles/wmt19-sys_level-agg.pkl'),
]
for frame, pickle_path in pickle_targets:
    with open(pickle_path, 'wb') as pickle_file:
        pickle.dump(frame, pickle_file)