In [1]:
import glob
import gzip
import itertools
import csv
import io
import pickle
import pandas as pd
import numpy as np

# WMT18 system-level data

### Official system-level DA (direct assessment) scores

In [2]:
# The file is space-separated. read_csv takes its first line as the header;
# those labels are then overwritten with the short names used downstream.
da_syslevel_path = 'data/wmt18-metrics-task-package/manual-evaluation/DA-syslevel.csv'
da_sys = pd.read_csv(da_syslevel_path, sep=' ')
da_sys.columns = ['lp', 'score', 'system']
da_sys

Unnamed: 0,lp,score,system
0,en-cs,0.594,CUNI-Transformer.5595
1,en-cs,0.384,uedin.5630
2,en-cs,0.101,online-B.0
3,en-cs,-0.115,online-A.0
4,en-cs,-0.246,online-G.0
...,...,...,...
144,cs-en,0.298,CUNI-Transformer.5560
145,cs-en,0.165,uedin.5561
146,cs-en,0.115,online-B.0
147,cs-en,-0.023,online-A.0


In [3]:
def read_sys_scores(path, metric_name):
    """Load one gzipped system-level score file into a three-column frame.

    Lines containing 'hybrid' are skipped and every space is turned into a
    tab before parsing, so both space- and tab-separated lines are accepted.
    The fifth field of each line is exposed under ``metric_name``.

    Parameters
    ----------
    path : str
        Path to a gzip-compressed text file with at least five fields per line.
    metric_name : str
        Column label to give the fifth field.

    Returns
    -------
    pandas.DataFrame
        Columns ``['lp', 'system', metric_name]``, one row per retained line.
    """
    # Context manager so the gzip handle is closed as soon as reading is done.
    with gzip.open(path, 'rt') as handle:
        # Each line still carries its own trailing newline, so concatenate
        # with '' (joining on '\n' would insert a blank line between records).
        hybrid_filtered = ''.join(
            line.replace(' ', '\t') for line in handle if 'hybrid' not in line
        )

    metric_syss = pd.read_csv(io.StringIO(hybrid_filtered), delimiter='\t', header=None)
    # Name the first five fields; any further fields keep their positional labels.
    metric_syss.columns = ['name', 'lp', 'testset', 'system', metric_name] + list(metric_syss.columns[5:])
    return metric_syss[['lp', 'system', metric_name]]


# Accumulator: one row per (lp, system), one column per metric.
sys_scores = pd.DataFrame(data={'lp':[], 'system':[]})

for submission in glob.glob('data/wmt18-metrics-task-package/final-metric-scores/baselines/*.sys.*'):
    # Files whose path mentions chrF or mteval are not loaded by this loop.
    if 'chrF' in submission or 'mteval' in submission:
        continue

    # The metric name is the file name minus its '.sys.score.gz' suffix.
    metric_name = submission.split('/')[-1][:-len('.sys.score.gz')]
    print(metric_name)

    # Outer merge keeps systems that are missing from some of the metric files.
    sys_scores = sys_scores.merge(read_sys_scores(submission, metric_name), on=['lp','system'], how='outer')

PER
WER
CDER
TER


In [4]:
# BLEU and NIST are both read from the same mteval archive: for each metric,
# only the lines that contain the metric's name (and not 'hybrid') are kept.
mteval_path = 'data/wmt18-metrics-task-package/final-metric-scores/baselines/mteval.sys.score.gz'

for metric_name in ('BLEU', 'NIST'):
    # Context manager so the gzip handle is closed after each pass.
    with gzip.open(mteval_path, 'rt') as handle:
        # Lines keep their trailing newline, so concatenate with '' rather
        # than '\n' (which would interleave blank lines between records).
        hybrid_filtered = ''.join(
            line.replace(' ', '\t')
            for line in handle
            if 'hybrid' not in line and metric_name in line
        )
    reader = io.StringIO(hybrid_filtered)
    metric_syss = pd.read_csv(reader, delimiter='\t', header=None)

    # Name the first five fields; any further fields keep their positional labels.
    metric_syss.columns = ['name', 'lp', 'testset', 'system', metric_name] + list(metric_syss.columns[5:])
    sys_scores = sys_scores.merge(metric_syss[['lp','system',metric_name]], on=['lp','system'], how='outer')

### Join metric and DA scores

In [5]:
# Default merge: inner join on the columns the two frames have in common.
metric_columns = ['BLEU', 'NIST', 'CDER', 'PER', 'TER', 'WER']
da_with_metrics = da_sys.merge(sys_scores)

# Fix the column order: identifiers and human score first, then the metrics.
sys_scores_da = da_with_metrics[['lp', 'score', 'system'] + metric_columns]
sys_scores_da

Unnamed: 0,lp,score,system,BLEU,NIST,CDER,PER,TER,WER
0,en-cs,0.594,CUNI-Transformer.5595,0.2690,7.2082,0.4732,0.5687,0.4334,0.4094
1,en-cs,0.384,uedin.5630,0.2438,6.8232,0.4498,0.5441,0.4079,0.3824
2,en-cs,0.101,online-B.0,0.2024,6.2197,0.4090,0.5152,0.3686,0.3439
3,en-cs,-0.115,online-A.0,0.1688,5.7414,0.3766,0.4843,0.3322,0.3087
4,en-cs,-0.246,online-G.0,0.1641,5.5616,0.3680,0.4546,0.3085,0.2826
...,...,...,...,...,...,...,...,...,...
144,cs-en,0.298,CUNI-Transformer.5560,0.3569,8.5550,0.5498,0.6581,0.5152,0.4862
145,cs-en,0.165,uedin.5561,0.3363,8.2234,0.5314,0.6470,0.4885,0.4580
146,cs-en,0.115,online-B.0,0.3416,8.4381,0.5363,0.6448,0.5068,0.4759
147,cs-en,-0.023,online-A.0,0.2849,7.6462,0.4905,0.6163,0.4483,0.4175


### Validate correlations

In [6]:
# Row count per language pair in the joined table.
(
    sys_scores_da
    .groupby('lp')['lp']
    .count()
)

lp
cs-en     5
de-en    16
en-cs     5
en-de    16
en-et    14
en-fi    12
en-ru     9
en-tr     8
en-zh    14
et-en    14
fi-en     9
ru-en     8
tr-en     5
zh-en    14
Name: lp, dtype: int64

In [7]:
# Per-language-pair correlation of every metric with the human `score`,
# restricted to language pairs whose code ends in 'en'.
into_english = sys_scores_da[sys_scores_da.lp.str.endswith('en')]

(
    into_english
    # `system` is a string column; drop it explicitly so corr() only ever
    # sees numeric data (newer pandas raises instead of silently skipping it).
    .drop(columns='system')
    .groupby('lp')
    .corr()
    # Keep the `score` row of each per-lp correlation matrix by label rather
    # than by a fixed stride, so adding or removing a metric cannot shift it.
    .xs('score', level=1, drop_level=False)
    .round(3)
    .T
    .sort_index()
)

lp,cs-en,de-en,et-en,fi-en,ru-en,tr-en,zh-en
Unnamed: 0_level_1,score,score,score,score,score,score,score
BLEU,0.97,0.971,0.986,0.973,0.979,-0.657,0.978
CDER,0.972,0.98,0.99,0.984,0.98,-0.664,0.982
NIST,0.954,0.984,0.983,0.975,0.973,0.97,0.968
PER,0.97,0.985,0.983,0.993,0.967,0.159,0.931
TER,0.95,0.97,0.99,0.968,0.97,0.533,0.975
WER,0.951,0.961,0.991,0.961,0.968,0.041,0.975
score,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [8]:
# Per-language-pair correlation of every metric with the human `score`,
# restricted to language pairs whose code does NOT end in 'en'.
out_of_english = sys_scores_da[~sys_scores_da.lp.str.endswith('en')]

(
    out_of_english
    # `system` is a string column; drop it explicitly so corr() only ever
    # sees numeric data (newer pandas raises instead of silently skipping it).
    .drop(columns='system')
    .groupby('lp')
    .corr()
    # Keep the `score` row of each per-lp correlation matrix by label rather
    # than by a fixed stride, so adding or removing a metric cannot shift it.
    .xs('score', level=1, drop_level=False)
    .round(3)
    .T
    .sort_index()
)

lp,en-cs,en-de,en-et,en-fi,en-ru,en-tr,en-zh
Unnamed: 0_level_1,score,score,score,score,score,score,score
BLEU,0.995,0.981,0.975,0.962,0.983,0.826,0.947
CDER,0.997,0.986,0.984,0.964,0.984,0.861,0.961
NIST,0.999,0.986,0.983,0.949,0.99,0.902,0.95
PER,0.991,0.981,0.958,0.906,0.988,0.859,0.964
TER,0.997,0.988,0.981,0.942,0.987,0.867,0.963
WER,0.997,0.986,0.981,0.945,0.985,0.853,0.957
score,1.0,1.0,1.0,1.0,1.0,1.0,1.0


### Pickle

In [9]:
# Persist the joined DA + metric table. The `with` block guarantees the file
# is flushed and closed even if pickling raises part-way through.
with open('data/pickles/wmt18-data.pkl', 'wb') as pickle_file:
    pickle.dump(sys_scores_da, pickle_file)