In [23]:
import glob
import gzip
import itertools
import csv
import io
import pickle
import pandas as pd
import numpy as np

# WMT17 system-level data

### Official system-level DA scores

In [24]:
# Read the official system-level DA scores (space-separated, first row is the
# header) and give the three columns short, fixed names.
da_syslevel_path = 'data/wmt17-metrics-task-package/manual-evaluation/DA-syslevel.csv'
official_da_sys = pd.read_csv(da_syslevel_path, delimiter=' ', header=0)
official_da_sys.columns = ['lp', 'score', 'system']
# Keep only the part of each system name that precedes the first '.'.
official_da_sys['system'] = official_da_sys['system'].map(lambda name: name.split('.')[0])

In [25]:
# Count the non-null entries of every column for each language pair (lp).
(
    official_da_sys
    .groupby('lp')
    .count()
)

Unnamed: 0_level_0,score,system
lp,Unnamed: 1_level_1,Unnamed: 2_level_1
cs-en,4,4
de-en,11,11
en-cs,14,14
en-de,16,16
en-fi,12,12
en-lv,17,17
en-ru,9,9
en-tr,8,8
en-zh,11,11
fi-en,6,6


### Metric scores

In [26]:
# Accumulator for the metric scores: one row per (lp, system) and one column
# per metric. Every metric file read below is outer-merged onto it.
sys_scores = pd.DataFrame(data={'lp':[], 'system':[]})

baseline_syss = glob.glob('data/wmt17-metrics-task-package/final-metric-scores/baselines/*.sys.*')

for submission in baseline_syss:
    # The metric name is the file name without its '.sys.score[.gz]' suffix.
    metric_name = submission.split('/')[-1]
    metric_name = metric_name[:-len('.sys.score.gz')] if metric_name.endswith('.gz') else metric_name[:-len('.sys.score')]
    print(metric_name)

    if submission.endswith('.gz'):
        # Skip every line that mentions 'hybrid' and turn spaces into tabs so
        # the remainder parses as tab-separated text. The context manager
        # closes the gzip handle as soon as the lines have been read.
        # NOTE(review): this is a plain substring test, so it also removes any
        # real system whose name contains 'hybrid' (e.g. the
        # 'tilde-*-nmt-smt-hybrid' rows end up with NaN metric scores after the
        # join further down) -- confirm that this is intended.
        with gzip.open(submission, 'rt') as score_file:
            hybrid_filtered = '\n'.join(line.replace(' ', '\t') for line in score_file if 'hybrid' not in line)
        metric_syss = pd.read_csv(io.StringIO(hybrid_filtered), delimiter='\t', header=None)
    else:
        # Uncompressed files are read as-is (no 'hybrid' filtering here).
        with open(submission, 'rt') as score_file:
            metric_syss = pd.read_csv(score_file, delimiter='\t', header=None)
    # Name the first five columns; the score column is named after the metric.
    # Any further columns keep their positional labels.
    metric_syss.columns = ['name', 'lp', 'testset', 'system', metric_name] + list(metric_syss.columns[5:])

    # fix system names: drop incomplete rows, then keep only the part of the
    # system name that precedes the first '.'
    metric_syss.dropna(inplace=True)
    metric_syss['system'] = metric_syss['system'].apply(lambda x: x.split('.')[0])

    # Outer merge so systems seen in only some metric files are still kept.
    sys_scores = sys_scores.merge(metric_syss[['lp','system',metric_name]], on=['lp','system'], how='outer')

CDER
TER
BLEU
PER
WER
NIST


In [27]:
# en-zh
# The en-zh baseline scores come in a single gzip file in long format: one
# row per (metric, system) pair. The context manager closes the handle once
# pandas has finished reading.
with gzip.open('data/wmt17-metrics-task-package/final-metric-scores/baselines/baselines.en-zh.sys_hide.score.gz', 'rt') as en_zh_file:
    en_zh_sys = pd.read_csv(en_zh_file, delimiter='\t', header=None)
en_zh_sys.columns = ['metric', 'lp', 'testset', 'system', 'score']
# Drop rows whose system name contains 'hybrid'. The explicit copy makes the
# column assignment below write to an independent frame instead of a slice.
en_zh_sys = en_zh_sys[~en_zh_sys.system.str.contains('hybrid')].copy()
# Keep only the part of each system name that precedes the first '.'.
en_zh_sys['system'] = en_zh_sys['system'].apply(lambda x: x.split('.')[0])


for metric_name in en_zh_sys['metric'].unique():
    # chrF is skipped explicitly.
    # NOTE(review): presumably because it is not among the baseline metrics
    # loaded earlier -- confirm.
    if metric_name in ['chrF']:
        continue
    print(metric_name)

    # Rows of this metric only, with the score column named after the metric.
    metric_syss = en_zh_sys[en_zh_sys.metric == metric_name].rename(columns={'score': metric_name})
    sys_scores = sys_scores.merge(metric_syss[['lp','system',metric_name]], on=['lp','system'], how='outer')

    # fix _x and _y: when the metric column already existed, the merge yields
    # '<metric>_x' (previous values) and '<metric>_y' (en-zh values). Prefer
    # the previous value and fall back to the en-zh one where it is missing.
    # When the metric was not present before, the merge adds a plain column
    # and there is nothing to coalesce.
    x_col, y_col = '%s_x' % metric_name, '%s_y' % metric_name
    if x_col in sys_scores.columns:
        sys_scores[metric_name] = sys_scores[x_col].fillna(sys_scores[y_col])
        sys_scores = sys_scores.drop([x_col, y_col], axis=1)

BLEU
CDER
NIST
PER
TER
WER


### Join metric scores

In [28]:
# Attach the metric scores to the official DA scores. The left join keeps
# every row of the DA table, even when no metric score matched it.
sys_scores_da = pd.merge(official_da_sys, sys_scores, how='left', on=['lp', 'system'])
sys_scores_da

Unnamed: 0,lp,score,system,BLEU,CDER,NIST,PER,TER,WER
0,en-lv,0.196,tilde-nc-nmt-smt-hybrid,,,,,,
1,en-lv,0.121,online-B,0.1852,0.3925,5.7343,0.4859,0.3373,0.3139
2,en-lv,0.104,tilde-c-nmt-smt-hybrid,,,,,,
3,en-lv,0.075,limsi-factored-norm,0.1798,0.3799,5.5456,0.4447,0.3028,0.2787
4,en-lv,0.058,usfd-consensus-qt21,0.1927,0.3962,5.8896,0.4690,0.3240,0.2982
...,...,...,...,...,...,...,...,...,...
147,de-en,-0.260,online-F,0.1955,0.4201,6.6679,0.5513,0.3658,0.3285
148,cs-en,0.181,uedin-nmt,0.3248,0.5253,8.0391,0.6385,0.4749,0.4430
149,cs-en,0.068,online-B,0.2877,0.4971,7.7562,0.6169,0.4573,0.4265
150,cs-en,-0.068,online-A,0.2680,0.4765,7.4157,0.6082,0.4257,0.3939


In [29]:
# Results that can't be reproduced exactly:
# lv-en (differences of ±0.002)
# en-lv (differences of ±0.02)

In [30]:
# Correlation of every metric with the DA score, computed separately for each
# language pair whose code ends in 'en'.
# The non-numeric 'system' column is dropped explicitly instead of relying on
# corr() to discard it, and the 'score' row of each per-lp correlation matrix
# is selected by label rather than by a fixed stride, so the result does not
# depend on how many metric columns are present.
sys_scores_da[sys_scores_da.lp.str.endswith('en')] \
    .drop('system', axis=1) \
    .groupby('lp') \
    .corr() \
    .xs('score', level=1, drop_level=False) \
    .round(3) \
    .T \
    .sort_index()

lp,cs-en,de-en,fi-en,lv-en,ru-en,tr-en,zh-en
Unnamed: 0_level_1,score,score,score,score,score,score,score
BLEU,0.971,0.923,0.903,0.975,0.912,0.976,0.864
CDER,0.989,0.93,0.927,0.986,0.922,0.973,0.904
NIST,1.0,0.931,0.931,0.946,0.912,0.971,0.849
PER,0.968,0.951,0.896,0.949,0.911,0.932,0.877
TER,0.989,0.906,0.952,0.965,0.912,0.954,0.847
WER,0.987,0.896,0.948,0.967,0.907,0.925,0.839
score,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [31]:
# Correlation of every metric with the DA score, computed separately for each
# language pair whose code does not end in 'en'.
# The non-numeric 'system' column is dropped explicitly instead of relying on
# corr() to discard it, and the 'score' row of each per-lp correlation matrix
# is selected by label rather than by a fixed stride, so the result does not
# depend on how many metric columns are present.
sys_scores_da[~sys_scores_da.lp.str.endswith('en')] \
    .drop('system', axis=1) \
    .groupby('lp') \
    .corr() \
    .xs('score', level=1, drop_level=False) \
    .round(3) \
    .T \
    .sort_index()

lp,en-cs,en-de,en-fi,en-lv,en-ru,en-tr,en-zh
Unnamed: 0_level_1,score,score,score,score,score,score,score
BLEU,0.956,0.804,0.92,0.839,0.898,0.924,0.981
CDER,0.968,0.813,0.965,0.916,0.924,0.957,0.983
NIST,0.962,0.769,0.957,0.922,0.92,0.986,0.976
PER,0.954,0.687,0.949,0.819,0.887,0.963,0.934
TER,0.955,0.796,0.961,0.893,0.933,0.967,0.97
WER,0.954,0.802,0.96,0.89,0.934,0.956,0.954
score,1.0,1.0,1.0,1.0,1.0,1.0,1.0


### Pickle

In [32]:
# Persist the joined DA + metric score table. The context manager makes sure
# the file is flushed and closed as soon as the dump has finished.
with open('data/pickles/wmt17-data.pkl', 'wb') as pickle_file:
    pickle.dump(sys_scores_da, pickle_file)