In [1]:
import glob
import gzip
import itertools
import csv
import io
import pickle
import hashlib
import pandas as pd
import numpy as np

# WMT17 system-level data

### Checksums

In [2]:
mine = '6971c87e22cf24c11bbf6551af64ab13'
yours = hashlib.md5(open('data/downloads/wmt17-submitted-data-v1.0.tgz', 'rb').read()).hexdigest()
print(mine + '\n' + yours)
print(mine == yours)

mine = 'f45f3160ff90e64f275944028739bd41'
yours = hashlib.md5(open('data/downloads/wmt17-metrics-task-package.tgz', 'rb').read()).hexdigest()
print(mine + '\n' + yours)
print(mine == yours)

# unzip to data/
# os.system('tar -xvf data/downloads/newstest2017-segment-level-human.tar.gz -p data/')
# os.system('tar -xvf data/downloads/wmt17-metrics-task-package.tgz -p data/')

6971c87e22cf24c11bbf6551af64ab13
6971c87e22cf24c11bbf6551af64ab13
True
f45f3160ff90e64f275944028739bd41
f45f3160ff90e64f275944028739bd41
True


### Official system-level da scores

In [3]:
official_da_sys = pd.read_csv('data/wmt17-metrics-task-package/manual-evaluation/DA-syslevel.csv', delimiter=' ', header=0)
official_da_sys.columns = ['lp', 'score', 'system']
official_da_sys['system'] = official_da_sys['system'].apply(lambda x: x.split('.')[0])

### Metric scores

In [4]:
sys_scores = pd.DataFrame(data={'lp':[], 'system':[]})

baseline_syss = glob.glob('data/wmt17-metrics-task-package/final-metric-scores/baselines/*.sys.*')

for submission in itertools.chain(baseline_syss):
    metric_name = submission.split('/')[-1]
    metric_name = metric_name[:-len('.sys.score.gz')] if metric_name.endswith('.gz') else metric_name[:-len('.sys.score')]
    print(metric_name)

    if submission.endswith('.gz'):
        hybrid_filtered = '\n'.join(i.replace(' ', '\t') for i in gzip.open(submission, 'rt') if 'hybrid' not in i)
        reader = io.StringIO(hybrid_filtered)
        metric_syss = pd.read_csv(reader, delimiter='\t', header=None)
    else:
        metric_syss = pd.read_csv(open(submission, 'rt'), delimiter='\t', header=None)
    metric_syss.columns = ['name', 'lp', 'testset', 'system', metric_name] + list(metric_syss.columns[5:])
    
    # fix system names
    metric_syss.dropna(inplace=True)
    metric_syss['system'] = metric_syss['system'].apply(lambda x: x.split('.')[0])
    
    sys_scores = sys_scores.merge(metric_syss[['lp','system',metric_name]], on=['lp','system'], how='outer')

CDER
TER
BLEU
PER
WER
NIST


In [5]:
# en-zh
en_zh_sys = pd.read_csv(gzip.open('data/wmt17-metrics-task-package/final-metric-scores/baselines/baselines.en-zh.sys_hide.score.gz', 'rt'), delimiter='\t', header=None)
en_zh_sys.columns = ['metric', 'lp', 'testset', 'system', 'score']
en_zh_sys = en_zh_sys[~en_zh_sys.system.str.contains('hybrid')]
en_zh_sys['system'] = en_zh_sys['system'].apply(lambda x: x.split('.')[0])


for metric_name in en_zh_sys['metric'].unique():
    if metric_name in ['chrF']:
        continue
    print(metric_name)
        
    metric_syss = en_zh_sys[en_zh_sys.metric == metric_name]
    metric_syss.columns = ['metric', 'lp', 'testset', 'system', metric_name]
    sys_scores = sys_scores.merge(metric_syss[['lp','system',metric_name]], on=['lp','system'], how='outer')
    
    # fix _x and _y
    sys_scores[metric_name] = [ x['%s_x' % metric_name] if not np.isnan(x['%s_x' % metric_name]) else x['%s_y' % metric_name] for i, x in sys_scores.iterrows() ]
    sys_scores = sys_scores.drop(['%s_x' % metric_name, '%s_y' % metric_name], axis=1)

BLEU
CDER
NIST
PER
TER
WER


### Join metric scores

In [6]:
sys_scores_da = official_da_sys.merge(sys_scores, on=['lp', 'system'], how='left')
sys_scores_da

Unnamed: 0,lp,score,system,BLEU,CDER,NIST,PER,TER,WER
0,en-lv,0.196,tilde-nc-nmt-smt-hybrid,,,,,,
1,en-lv,0.121,online-B,0.1852,0.3925,5.7343,0.4859,0.3373,0.3139
2,en-lv,0.104,tilde-c-nmt-smt-hybrid,,,,,,
3,en-lv,0.075,limsi-factored-norm,0.1798,0.3799,5.5456,0.4447,0.3028,0.2787
4,en-lv,0.058,usfd-consensus-qt21,0.1927,0.3962,5.8896,0.4690,0.3240,0.2982
...,...,...,...,...,...,...,...,...,...
147,de-en,-0.260,online-F,0.1955,0.4201,6.6679,0.5513,0.3658,0.3285
148,cs-en,0.181,uedin-nmt,0.3248,0.5253,8.0391,0.6385,0.4749,0.4430
149,cs-en,0.068,online-B,0.2877,0.4971,7.7562,0.6169,0.4573,0.4265
150,cs-en,-0.068,online-A,0.2680,0.4765,7.4157,0.6082,0.4257,0.3939


### WMT17 system-level data (raw)

In [7]:
lp_df = []

for file in glob.glob('data/wmt-human-evaluation/newstest2017-system-level-human/anon-proc-hits-sys-combined/analysis/ad-seg-scores-*.csv.gz'):
    lp = file[-12:-7]
    print(lp)
    
    df = pd.read_csv(gzip.open(file, 'rt'), delimiter=' ')
    df['lp'] = [lp] * len(df)
    
    lp_df.append(df)
raw_seg_scores_da = pd.concat(lp_df)

en-de
en-cs
en-ru
tr-en
en-lv
zh-en
fi-en
en-zh
de-en
cs-en
ru-en
lv-en
en-fi
en-tr


In [8]:
raw_sys_scores_da = raw_seg_scores_da.groupby(['lp', 'SYS'], as_index=False).mean()

raw_sys_scores_da.columns = ['lp', 'system', 'sid', 'raw_score', 'score', 'N', '5']
raw_sys_scores_da = raw_sys_scores_da[['lp', 'system', 'raw_score', 'score']]
raw_sys_scores_da['system'] = raw_sys_scores_da['system'].apply(lambda x: x.split('.')[0])
raw_sys_scores_da

Unnamed: 0,lp,system,raw_score,score
0,cs-en,PJATK,62.723466,-0.267913
1,cs-en,online-A,68.261460,-0.068109
2,cs-en,online-B,71.877972,0.068185
3,cs-en,uedin-nmt,74.569465,0.180920
4,de-en,C-3MA,68.607824,-0.102653
...,...,...,...,...
147,zh-en,online-B,69.885869,0.112879
148,zh-en,online-F,59.622814,-0.278690
149,zh-en,online-G,59.258396,-0.304650
150,zh-en,uedin-nmt,73.799021,0.208171


In [9]:
sys_scores_da = sys_scores_da.merge(raw_sys_scores_da[['lp','system','raw_score']])
sys_scores_da[['lp','raw_score', 'score']].groupby('lp').corr('pearson')

Unnamed: 0_level_0,Unnamed: 1_level_0,raw_score,score
lp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
cs-en,raw_score,1.0,0.999526
cs-en,score,0.999526,1.0
de-en,raw_score,1.0,0.999429
de-en,score,0.999429,1.0
en-cs,raw_score,1.0,0.999788
en-cs,score,0.999788,1.0
en-de,raw_score,1.0,0.999175
en-de,score,0.999175,1.0
en-fi,raw_score,1.0,0.999757
en-fi,score,0.999757,1.0


### Validate correlations

In [10]:
official_da_sys \
    .groupby('lp') \
    ['system'] \
    .count()

lp
cs-en     4
de-en    11
en-cs    14
en-de    16
en-fi    12
en-lv    17
en-ru     9
en-tr     8
en-zh    11
fi-en     6
lv-en     9
ru-en     9
tr-en    10
zh-en    16
Name: system, dtype: int64

In [11]:
# can't be reproduced:
# lv-en (differences +-0.002)
# en-lv (differences +-0.02)

In [12]:
sys_scores_da[sys_scores_da.lp.str.endswith('en')] \
    .groupby('lp') \
    .corr()[::9] \
    .round(3) \
    .T \
    .sort_index()

lp,cs-en,de-en,fi-en,lv-en,ru-en,tr-en,zh-en
Unnamed: 0_level_1,score,BLEU,CDER,NIST,PER,TER,WER
BLEU,0.971,1.0,0.996,0.971,0.995,0.964,0.912
CDER,0.989,0.998,1.0,0.964,0.993,0.986,0.945
NIST,1.0,0.995,0.99,1.0,0.993,0.888,0.893
PER,0.968,0.994,0.983,0.991,1.0,0.797,0.884
TER,0.989,0.982,0.971,0.966,0.995,1.0,0.995
WER,0.987,0.973,0.953,0.945,0.994,0.996,1.0
raw_score,1.0,0.921,0.931,0.931,0.923,0.957,0.844
score,1.0,0.923,0.927,0.946,0.911,0.954,0.839


In [13]:
sys_scores_da[~sys_scores_da.lp.str.endswith('en')] \
    .groupby('lp') \
    .corr()[::9] \
    .round(3) \
    .T \
    .sort_index()

lp,en-cs,en-de,en-fi,en-lv,en-ru,en-tr,en-zh
Unnamed: 0_level_1,score,BLEU,CDER,NIST,PER,TER,WER
BLEU,0.956,1.0,0.973,0.966,0.991,0.988,0.952
CDER,0.968,0.998,1.0,0.989,0.987,0.998,0.953
NIST,0.962,0.997,0.997,1.0,0.99,0.958,0.949
PER,0.954,0.969,0.988,0.935,1.0,0.949,0.862
TER,0.955,0.993,0.988,0.972,0.991,1.0,0.989
WER,0.954,0.988,0.988,0.965,0.991,0.998,1.0
raw_score,1.0,0.792,0.966,0.92,0.881,0.968,0.95
score,1.0,0.804,0.965,0.922,0.887,0.967,0.954


### WMT17 system-level (src, ref, out)

In [14]:
raw_seg_scores_da = raw_seg_scores_da[['SYS','SID','RAW.SCR','Z.SCR','N','lp']]
raw_seg_scores_da.columns = ['system', 'sid', 'raw_score', 'score', 'N', 'lp']

raw_seg_scores_da['system'] = raw_seg_scores_da['system'].apply(lambda x: x.split('.')[0]) 
raw_seg_scores_da

Unnamed: 0,system,sid,raw_score,score,N,lp
0,RWTH-nmt-ensemble,1467,76.0,0.225732,1,en-de
1,RWTH-nmt-ensemble,1913,23.0,-1.547248,1,en-de
2,RWTH-nmt-ensemble,2766,53.0,-0.677767,1,en-de
3,RWTH-nmt-ensemble,823,67.0,-0.127811,1,en-de
4,RWTH-nmt-ensemble,881,85.0,0.875052,2,en-de
...,...,...,...,...,...,...
2034,uedin-nmt,1470,42.0,0.267117,1,en-tr
2035,uedin-nmt,1213,4.0,-0.814587,1,en-tr
2036,uedin-nmt,960,2.0,-0.871518,1,en-tr
2037,uedin-nmt,833,49.0,0.466378,1,en-tr


In [15]:
# srcs and refs
srcs, refs, lps, sids = [], [], [], []
for lp in sys_scores_da.lp.unique():
    fr, to = lp[:2], lp[3:]
    
    refs_ = list(open('data/wmt17-submitted-data/txt/references/newstest2017-%s%s-ref.%s' % (fr, to, to)))
    srcs_ = list(open('data/wmt17-submitted-data/txt/sources/newstest2017-%s%s-src.%s' % (fr, to, fr)))
    sids_ = list(range(1, len(refs_)+1))
    refs.extend(refs_)
    srcs.extend(srcs_)
    sids.extend(sids_)
    
    assert(len(refs_) == len(srcs_))
    lps.extend([lp]*len(refs_))
    
src_ref_df = pd.DataFrame({'reference' : refs, 'source':srcs, 'lp': lps, 'sid': sids})
print('# of entries before merge: %d' % len(raw_seg_scores_da))
raw_seg_scores_da = raw_seg_scores_da.merge(src_ref_df, on=['lp','sid'], how='inner')
print('# of entries after merge: %d' % len(raw_seg_scores_da))
print('These two should be equal.')

# of entries before merge: 203012
# of entries after merge: 203012
These two should be equal.


In [16]:
# outs
lps, outs, sids, syss = [], [], [], []
for file in glob.glob('data/wmt17-submitted-data/txt/system-outputs/newstest2017/*/*'):
    lp = file.split('.')[-1]
    system = file.split('.')[-3]
    
    # manual fixes
    if 'tuning' in file:
        system = file.split('.')[1]
    elif system == 'CASICT-DCU-NMT':
        system = 'CASICT-cons'
    elif system == 'FBK':
        system = 'fbk-nmt-combination'    
    
    outs_ = list(open(file, 'rt'))
    sids_ = list(range(1, len(outs_)+1))
    lps_ = len(outs_) * [lp]
    syss_ = len(outs_) * [system]
    
    outs.extend(outs_)
    sids.extend(sids_)
    lps.extend(lps_)
    syss.extend(syss_)

out_df = pd.DataFrame({'lp': lps, 'output':outs, 'sid': sids, 'system': syss})
print('# of entries before merge: %d' % len(raw_seg_scores_da))
raw_seg_scores_da = raw_seg_scores_da.merge(out_df, on=['lp','sid', 'system'], how='inner')
print('# of entries after merge: %d' % len(raw_seg_scores_da))
print('These two should be equal.')

# of entries before merge: 203012
# of entries after merge: 203012
These two should be equal.


### Pickle

In [17]:
pickle.dump(raw_seg_scores_da, open('data/pickles/wmt17-sys_level-all.pkl', 'wb'))
pickle.dump(sys_scores_da, open('data/pickles/wmt17-sys_level-agg.pkl', 'wb'))

# WMT17 segment-level data

In [18]:
seg_scores_da = pd.read_csv('data/wmt17-metrics-task-package/manual-evaluation/DA-seglevel.csv', delimiter=' ')
seg_scores_da.columns = ['lp', 'testset', 'system', 'sid', 'score']
seg_scores_da['system'] = seg_scores_da['system'].apply(lambda x: x.split('.')[0]) 
print(len(seg_scores_da))
seg_scores_da = seg_scores_da.merge(out_df[['lp', 'system', 'sid', 'output']], on=['lp', 'system', 'sid'], how='inner')
seg_scores_da = seg_scores_da.merge(src_ref_df[['lp', 'sid', 'source', 'reference']], on=['lp', 'sid'], how='inner')
print(len(seg_scores_da))

5040
5040


In [19]:
seg_scores_da = seg_scores_da[['lp', 'system', 'sid', 'output', 'source', 'reference', 'score']]

In [20]:
seg_scores_da

Unnamed: 0,lp,system,sid,output,source,reference,score
0,en-zh,UU-HNMT,11,“他找到了一个公寓，他和约会一个女孩，”路易斯·卡利亚说。\n,"""He found an apartment, he was dating a girl,""...",Louis Galicia 告诉 KGO：“Frank找到一间公寓，同时在跟一个女孩交往。”\n,-0.522268
1,en-zh,jhu-nmt,28,这场纠纷导致了今年参加六次罢工的初级医生，其中包括NHS历史上第一次停工。\n,The dispute has led to junior doctors taking p...,该纠纷已导致初级医生今年共参与六次罢工，包括英国国家医疗服务体系历史上的首次全面罢工。\n,-0.150441
2,en-zh,CASICT-cons,39,我迷失了。\n,I lost count.\n,我记不清了。\n,-0.330678
3,en-zh,online-A,49,"自11月以来, 俄罗斯公众舆论也出现了转变。\n",Russian public opinion has also turned since N...,自11月份开始，俄罗斯民意也有所扭转。\n,0.490614
4,en-zh,CASICT-cons,68,安卡拉对西方感到愤怒，因为它认为对企图收购的反应是微弱的。\n,Ankara is angry with the West for what it cons...,安卡拉对于西方世界对接管意图的微弱反应感到愤怒。\n,-0.431865
...,...,...,...,...,...,...,...
5035,en-ru,jhu-pbmt,2642,"Кендалл, который является послом марки Estee Л...","Kendall, who is an Estee Lauder brand ambassad...","Кендалл, которая является лицом бренда Estee L...",-0.140292
5036,en-ru,jhu-pbmt,2719,"Во вторник, в ""Фейсбуке"" сообщил, что его наст...","On Tuesday, Facebook said that its desktop web...","Во вторник Facebook объявил, что настольная ве...",0.087274
5037,en-ru,online-F,2760,Результат средние температуры климата которые ...,The result is mean climate temperatures that a...,Результатом являются средние климатические тем...,-0.836368
5038,en-ru,uedin-nmt,2803,"По их пиковым ставкам, эти матовые звезды отни...","At their peak spin rates, these maturing stars...",На своих пиковых скоростях вращения эти взросл...,-1.162147


In [21]:
seg_scores_da.groupby('lp').count()

Unnamed: 0_level_0,system,sid,output,source,reference,score
lp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
cs-en,560,560,560,560,560,560
de-en,560,560,560,560,560,560
en-ru,560,560,560,560,560,560
en-zh,560,560,560,560,560,560
fi-en,560,560,560,560,560,560
lv-en,560,560,560,560,560,560
ru-en,560,560,560,560,560,560
tr-en,560,560,560,560,560,560
zh-en,560,560,560,560,560,560


### Pickle

In [22]:
pickle.dump(seg_scores_da, open('data/pickles/wmt17-seg_level-agg.pkl', 'wb'))