In [1]:
import csv
import glob
import gzip
import hashlib
import io
import itertools
import os
import pickle

import numpy as np
import pandas as pd

# WMT18 system-level data

### Checksums

In [2]:
def md5_of_file(path, chunk_size=1 << 20):
    """Return the hex MD5 digest of the file at `path`.

    The file is read in `chunk_size`-byte pieces so that large archives are
    never held in memory at once, and the handle is closed when done.
    """
    digest = hashlib.md5()
    with open(path, 'rb') as handle:
        for chunk in iter(lambda: handle.read(chunk_size), b''):
            digest.update(chunk)
    return digest.hexdigest()


# (expected digest, downloaded archive) pairs, checked in this order.
expected_checksums = [
    ('5e0446343c0238a36c2b91c548e3c7e9', 'data/downloads/wmt18-submitted-data-v1.0.1.tgz'),
    ('c9941e7139b62a349b19bcf20f8a54ec', 'data/downloads/wmt18-metrics-task-package.tgz'),
]

for mine, archive in expected_checksums:
    yours = md5_of_file(archive)
    print(mine + '\n' + yours)
    print(mine == yours)

# After verifying, extract both archives into data/: later cells read from
# data/wmt18-submitted-data/ and data/wmt18-metrics-task-package/.

5e0446343c0238a36c2b91c548e3c7e9
5e0446343c0238a36c2b91c548e3c7e9
True
c9941e7139b62a349b19bcf20f8a54ec
c9941e7139b62a349b19bcf20f8a54ec
True


### Official system-level DA (direct assessment) scores

In [3]:
# Official system-level DA table: a space-separated file whose three columns
# are renamed to the names used throughout the rest of the notebook.
da_syslevel_path = 'data/wmt18-metrics-task-package/manual-evaluation/DA-syslevel.csv'
da_sys = pd.read_csv(da_syslevel_path, sep=' ')
da_sys.columns = ['lp', 'score', 'system']
da_sys

Unnamed: 0,lp,score,system
0,en-cs,0.594,CUNI-Transformer.5595
1,en-cs,0.384,uedin.5630
2,en-cs,0.101,online-B.0
3,en-cs,-0.115,online-A.0
4,en-cs,-0.246,online-G.0
...,...,...,...
144,cs-en,0.298,CUNI-Transformer.5560
145,cs-en,0.165,uedin.5561
146,cs-en,0.115,online-B.0
147,cs-en,-0.023,online-A.0


In [4]:
def load_sys_scores(path, metric_name):
    """Read one gzipped system-level score file.

    Lines containing the text 'hybrid' are skipped. Spaces are normalised to
    tabs before parsing, so both space- and tab-separated rows are accepted.
    The first five fields are read as name, lp, testset, system and the score.

    Returns a DataFrame with columns ['lp', 'system', metric_name].
    """
    # The context manager closes the gzip handle once the lines are collected.
    with gzip.open(path, 'rt') as handle:
        kept = [line.replace(' ', '\t') for line in handle if 'hybrid' not in line]

    # Lines already carry their own newline, so join without a separator.
    table = pd.read_csv(io.StringIO(''.join(kept)), delimiter='\t', header=None)
    table.columns = ['name', 'lp', 'testset', 'system', metric_name] + list(table.columns[5:])
    return table[['lp', 'system', metric_name]]


# Start from an empty (lp, system) frame and outer-join one column per metric.
sys_scores = pd.DataFrame(data={'lp':[], 'system':[]})

for submission in glob.glob('data/wmt18-metrics-task-package/final-metric-scores/baselines/*.sys.*'):
    file_name = os.path.basename(submission)
    # chrF is skipped here; mteval holds BLEU and NIST and is handled separately.
    if 'chrF' in file_name or 'mteval' in file_name:
        continue

    # The metric name is the file name without its '.sys.score.gz' suffix.
    metric_name = file_name[:-len('.sys.score.gz')]
    print(metric_name)

    sys_scores = sys_scores.merge(
        load_sys_scores(submission, metric_name),
        on=['lp','system'],
        how='outer',
    )

PER
WER
CDER
TER


In [5]:
# BLEU and NIST both live in the single mteval score file. Read it once
# (closing the handle), drop 'hybrid' lines, then parse one metric at a time.
mteval_path = 'data/wmt18-metrics-task-package/final-metric-scores/baselines/mteval.sys.score.gz'
with gzip.open(mteval_path, 'rt') as handle:
    mteval_lines = [line for line in handle if 'hybrid' not in line]

for metric_name in ('BLEU', 'NIST'):
    # Keep only the lines mentioning this metric; normalise spaces to tabs.
    # Lines keep their own newline, so they are joined without a separator.
    hybrid_filtered = ''.join(
        line.replace(' ', '\t') for line in mteval_lines if metric_name in line
    )
    reader = io.StringIO(hybrid_filtered)
    metric_syss = pd.read_csv(reader, delimiter='\t', header=None)

    metric_syss.columns = ['name', 'lp', 'testset', 'system', metric_name] + list(metric_syss.columns[5:])

    # Drop any column left by a previous run of this cell so that re-running
    # does not produce BLEU_x / BLEU_y style duplicates.
    sys_scores = (
        sys_scores
        .drop(columns=[metric_name], errors='ignore')
        .merge(metric_syss[['lp','system',metric_name]], on=['lp','system'], how='outer')
    )

### Join metric and DA scores

In [6]:
# Default merge: inner join on the columns the two frames share.
# The result is then put into a fixed column order.
column_order = ['lp', 'score', 'system', 'BLEU', 'NIST', 'CDER', 'PER', 'TER', 'WER']
sys_scores_da = pd.merge(da_sys, sys_scores)[column_order]
sys_scores_da

Unnamed: 0,lp,score,system,BLEU,NIST,CDER,PER,TER,WER
0,en-cs,0.594,CUNI-Transformer.5595,0.2690,7.2082,0.4732,0.5687,0.4334,0.4094
1,en-cs,0.384,uedin.5630,0.2438,6.8232,0.4498,0.5441,0.4079,0.3824
2,en-cs,0.101,online-B.0,0.2024,6.2197,0.4090,0.5152,0.3686,0.3439
3,en-cs,-0.115,online-A.0,0.1688,5.7414,0.3766,0.4843,0.3322,0.3087
4,en-cs,-0.246,online-G.0,0.1641,5.5616,0.3680,0.4546,0.3085,0.2826
...,...,...,...,...,...,...,...,...,...
144,cs-en,0.298,CUNI-Transformer.5560,0.3569,8.5550,0.5498,0.6581,0.5152,0.4862
145,cs-en,0.165,uedin.5561,0.3363,8.2234,0.5314,0.6470,0.4885,0.4580
146,cs-en,0.115,online-B.0,0.3416,8.4381,0.5363,0.6448,0.5068,0.4759
147,cs-en,-0.023,online-A.0,0.2849,7.6462,0.4905,0.6163,0.4483,0.4175


### WMT18 system-level data (raw)

In [7]:
# One space-separated CSV per language pair; the pair is the five characters
# immediately before the '.csv' extension of each file name.
seg_score_files = glob.glob('data/wmt-human-evaluation/newstest2018-humaneval/analysis/ad-seg-scores-*.csv')

per_lp_frames = []
for csv_path in seg_score_files:
    pair = csv_path[-9:-4]
    print(pair)
    # Tag every row with its language pair before stacking the frames.
    per_lp_frames.append(pd.read_csv(csv_path, sep=' ').assign(lp=pair))

raw_seg_scores_da = pd.concat(per_lp_frames)

de-en
en-fi
en-zh
fi-en
en-cs
ru-en
en-et
en-tr
tr-en
en-de
en-ru
zh-en
cs-en
et-en


In [8]:
# System-level scores = mean of the segment-level raw and z scores per
# (language pair, system). The two score columns are selected and renamed by
# NAME, so the result does not depend on the column order of the CSV files or
# on every remaining column being numeric.
# NOTE: this reads the original SYS / RAW.SCR / Z.SCR column names, so it must
# run before the later cell that renames the columns of raw_seg_scores_da.
raw_sys_scores_da = (
    raw_seg_scores_da
    .groupby(['lp', 'SYS'], as_index=False)[['RAW.SCR', 'Z.SCR']]
    .mean()
    .rename(columns={'SYS': 'system', 'RAW.SCR': 'raw_score', 'Z.SCR': 'score'})
)
raw_sys_scores_da

Unnamed: 0,lp,system,raw_score,score
0,cs-en,CUNI-Transformer.5560,71.822066,0.298055
1,cs-en,HUMAN,92.790392,0.857144
2,cs-en,online-A.0,62.094013,-0.022957
3,cs-en,online-B.0,66.626455,0.114941
4,cs-en,online-G.0,57.518629,-0.183208
...,...,...,...,...
158,zh-en,online-B.0,77.667910,0.111083
159,zh-en,online-F.0,64.429426,-0.376822
160,zh-en,online-G.0,65.927673,-0.326750
161,zh-en,online-Y.0,74.963090,-0.005269


In [9]:
# Attach the averaged raw score to each system, then compare it with the
# official score per language pair using Pearson correlation.
raw_means = raw_sys_scores_da[['lp', 'system', 'raw_score']]
sys_scores_da = pd.merge(sys_scores_da, raw_means)
(
    sys_scores_da[['lp', 'raw_score', 'score']]
    .groupby('lp')
    .corr(method='pearson')
)

Unnamed: 0_level_0,Unnamed: 1_level_0,raw_score,score
lp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
cs-en,raw_score,1.0,0.999646
cs-en,score,0.999646,1.0
de-en,raw_score,1.0,0.999822
de-en,score,0.999822,1.0
en-cs,raw_score,1.0,0.999874
en-cs,score,0.999874,1.0
en-de,raw_score,1.0,0.999731
en-de,score,0.999731,1.0
en-et,raw_score,1.0,0.999853
en-et,score,0.999853,1.0


### Validate correlations

In [10]:
# Row count per language pair after all joins.
(
    sys_scores_da
    .groupby('lp')['lp']
    .count()
)

lp
cs-en     5
de-en    16
en-cs     5
en-de    16
en-et    14
en-fi    12
en-ru     9
en-tr     8
en-zh    14
et-en    14
fi-en     9
ru-en     8
tr-en     5
zh-en    14
Name: lp, dtype: int64

In [11]:
# Per-language-pair correlation of every numeric column with the DA `score`
# column, for pairs whose code ends in 'en'.
# The `score` row of each per-lp correlation matrix is picked by LABEL rather
# than with a fixed stride, so the table stays correct if a metric column is
# added or removed. The non-numeric `system` column is dropped explicitly
# instead of relying on corr() to ignore it.
(
    sys_scores_da[sys_scores_da.lp.str.endswith('en')]
    .drop(columns=['system'])
    .groupby('lp')
    .corr()
    .xs('score', level=1, drop_level=False)
    .round(3)
    .T
    .sort_index()
)

lp,cs-en,de-en,et-en,fi-en,ru-en,tr-en,zh-en
Unnamed: 0_level_1,score,score,score,score,score,score,score
BLEU,0.97,0.971,0.986,0.973,0.979,-0.657,0.978
CDER,0.972,0.98,0.99,0.984,0.98,-0.664,0.982
NIST,0.954,0.984,0.983,0.975,0.973,0.97,0.968
PER,0.97,0.985,0.983,0.993,0.967,0.159,0.931
TER,0.95,0.97,0.99,0.968,0.97,0.533,0.975
WER,0.951,0.961,0.991,0.961,0.968,0.041,0.975
raw_score,1.0,1.0,1.0,0.999,0.998,0.999,0.999
score,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [12]:
# Per-language-pair correlation of every numeric column with the DA `score`
# column, for pairs whose code does NOT end in 'en'.
# The `score` row of each per-lp correlation matrix is picked by LABEL rather
# than with a fixed stride, so the table stays correct if a metric column is
# added or removed. The non-numeric `system` column is dropped explicitly
# instead of relying on corr() to ignore it.
(
    sys_scores_da[~sys_scores_da.lp.str.endswith('en')]
    .drop(columns=['system'])
    .groupby('lp')
    .corr()
    .xs('score', level=1, drop_level=False)
    .round(3)
    .T
    .sort_index()
)

lp,en-cs,en-de,en-et,en-fi,en-ru,en-tr,en-zh
Unnamed: 0_level_1,score,score,score,score,score,score,score
BLEU,0.995,0.981,0.975,0.962,0.983,0.826,0.947
CDER,0.997,0.986,0.984,0.964,0.984,0.861,0.961
NIST,0.999,0.986,0.983,0.949,0.99,0.902,0.95
PER,0.991,0.981,0.958,0.906,0.988,0.859,0.964
TER,0.997,0.988,0.981,0.942,0.987,0.867,0.963
WER,0.997,0.986,0.981,0.945,0.985,0.853,0.957
raw_score,1.0,1.0,1.0,1.0,0.999,0.993,0.997
score,1.0,1.0,1.0,1.0,1.0,1.0,1.0


### WMT18 system-level (src, ref, out)

In [13]:
# Rename the segment-level columns to the notebook's naming and keep only the
# ones used below. rename() ignores keys that are absent, so running this cell
# a second time is a no-op instead of raising KeyError on the old names.
raw_seg_scores_da = (
    raw_seg_scores_da
    .rename(columns={'SYS': 'system', 'SID': 'sid', 'RAW.SCR': 'raw_score', 'Z.SCR': 'score'})
    [['system', 'sid', 'raw_score', 'score', 'N', 'lp']]
)

raw_seg_scores_da

Unnamed: 0,system,sid,raw_score,score,N,lp
0,uedin.5766,638,99.500000,0.970839,2,de-en
1,uedin.5766,2079,80.666667,0.256704,3,de-en
2,uedin.5766,1419,89.666667,0.887826,3,de-en
3,uedin.5766,1154,100.000000,1.461489,2,de-en
4,uedin.5766,1087,56.000000,-0.313921,2,de-en
...,...,...,...,...,...,...
23993,HUMAN,213,100.000000,1.227032,2,et-en
23994,HUMAN,657,100.000000,1.414036,1,et-en
23995,HUMAN,245,100.000000,1.414036,1,et-en
23996,HUMAN,1803,100.000000,1.414036,1,et-en


In [14]:
# srcs and refs
def read_text_lines(path):
    """Return every line of a UTF-8 text file as a list.

    Trailing newlines are kept, and the file is closed before returning.
    """
    with open(path, 'rt', encoding='utf-8') as handle:
        return list(handle)


srcs, refs, lps, sids = [], [], [], []
for lp in sys_scores_da.lp.unique():
    # 'xx-yy' -> source language code 'xx', target language code 'yy'
    fr, to = lp[:2], lp[3:]
    print(lp)

    refs_ = read_text_lines('data/wmt18-submitted-data/txt/references/newstest2018-%s%s-ref.%s' % (fr, to, to))
    srcs_ = read_text_lines('data/wmt18-submitted-data/txt/sources/newstest2018-%s%s-src.%s' % (fr, to, fr))

    # Check alignment BEFORE touching the accumulators, so a mismatch cannot
    # leave the four lists with different lengths.
    assert len(refs_) == len(srcs_), 'source/reference line counts differ for %s' % lp

    refs.extend(refs_)
    srcs.extend(srcs_)
    # Segment ids are 1-based line numbers within each language pair.
    sids.extend(range(1, len(refs_) + 1))
    lps.extend([lp] * len(refs_))

src_ref_df = pd.DataFrame({'reference' : refs, 'source':srcs, 'lp': lps, 'sid': sids})
print(len(src_ref_df), len(raw_seg_scores_da))
raw_seg_scores_da = raw_seg_scores_da.merge(src_ref_df, on=['lp','sid'], how='inner')
print(len(raw_seg_scores_da))

en-cs
tr-en
fi-en
zh-en
en-zh
ru-en
en-de
en-ru
en-fi
et-en
en-et
en-tr
de-en
cs-en
41924 244337
244337


In [15]:
# outs
lps, outs, sids, syss = [], [], [], []
for file in glob.glob('data/wmt18-submitted-data/txt/system-outputs/newstest2018/*/*'):
    # Parse only the file NAME, so dots elsewhere in the path cannot shift the
    # fields. The last dot-separated field is the language pair; the fields
    # between the first and the last form the system name
    # (presumably <testset>.<system>.<lp> -- confirm against the data).
    name_parts = os.path.basename(file).split('.')
    lp = name_parts[-1]
    system = '.'.join(name_parts[1:-1])

    # Explicit encoding and a context manager: same lines on every platform,
    # and no leaked file handles.
    with open(file, 'rt', encoding='utf-8') as handle:
        outs_ = list(handle)
    n_lines = len(outs_)

    outs.extend(outs_)
    # Segment ids are 1-based line numbers within each output file.
    sids.extend(range(1, n_lines + 1))
    lps.extend([lp] * n_lines)
    syss.extend([system] * n_lines)

out_df = pd.DataFrame({'lp': lps, 'output':outs, 'sid': sids, 'system': syss})
print(len(out_df), len(raw_seg_scores_da))

rows_before = len(raw_seg_scores_da)
raw_seg_scores_da = raw_seg_scores_da.merge(out_df, on=['lp','sid', 'system'], how='left')
# A left join must not add rows; more rows would mean duplicate
# (lp, sid, system) keys in out_df.
assert len(raw_seg_scores_da) == rows_before, 'duplicate (lp, sid, system) keys in system outputs'
print(len(raw_seg_scores_da))

449234 244337
244337


In [16]:
# Rows for which the left join found no output text (`output` is NaN).
# We do not have the human-generated translations as an output file, so the
# "system" entries that are really human references have no output here.
missing_output = raw_seg_scores_da['output'].isna()
raw_seg_scores_da.loc[missing_output]

Unnamed: 0,system,sid,raw_score,score,N,lp,reference,source,output
58,HUMAN,1154,100.000000,1.202478,2,de-en,"Arsenal manager Arsene Wenger, whose club is o...","Arsenal-Manager Arsene Wenger, dessen Verein i...",
75,HUMAN,1087,99.333333,0.903733,3,de-en,Minister of Agriculture in interview – How do ...,Landwirtschaftsminister im Interview - Wie sch...,
124,HUMAN,2701,98.000000,1.383347,1,de-en,"The blue ""business survey"" is only filled out ...","Der blaue ""Betriebsbogen"" ist nur auszufüllen,...",
141,HUMAN,366,78.250000,0.302420,4,de-en,"Earlier this year, French journalists Franck E...",Anfang des Jahres wurden die französischen Jou...,
168,HUMAN,637,94.333333,0.572058,3,de-en,"A hundred yards back, a group of about four do...",Ungefähr hundert Meter dahinter wischt sich ei...,
...,...,...,...,...,...,...,...,...,...
244263,HUMAN,137,91.500000,0.852289,2,et-en,The Estonian living in Canada also won a bronz...,Kanadas elav eestlanna võitis veel pronksmedal...,
244312,HUMAN,130,94.000000,0.937040,1,et-en,Gold was just over half a second off Alina Ken...,Alina Kendziorile kuuluvast Eesti rekordist (2...,
244321,HUMAN,825,81.000000,0.774078,4,et-en,The additional 1.6 million tonne aluminium dem...,Sellega on seletatav juba 2013-2016 aastani tä...,
244330,HUMAN,606,99.666667,1.132610,3,et-en,And I like the feeling that I’m on a summer to...,"Ja mulle meeldib see tunne, et olen oma moeloo...",


### Pickle

In [17]:
# Persist both tables. The context managers flush and close each file even if
# pickling raises part-way through.
with open('data/pickles/wmt18-sys_level-all.pkl', 'wb') as handle:
    pickle.dump(raw_seg_scores_da, handle)

with open('data/pickles/wmt18-sys_level-agg.pkl', 'wb') as handle:
    pickle.dump(sys_scores_da, handle)