In [1]:
import glob
import gzip
import itertools
import csv
import io
import pickle
import pandas as pd
import numpy as np

# WMT19 system-level data

### Official system-level DA scores

In [2]:
# Official system-level DA scores, stored as a space-separated table.
da_sys_path = 'data/wmt19-metrics-task-package/manual-evaluation/DA-syslevel.csv'
da_sys = pd.read_csv(da_sys_path, sep=' ')

# Replace the header names read from the file with the names used in the rest of the notebook.
da_sys.columns = ['lp', 'score', 'system']
da_sys

Unnamed: 0,lp,score,system
0,en-cs,0.402,CUNI-DocTransformer-T2T.6751
1,en-cs,0.401,CUNI-Transformer-T2T-2018.6457
2,en-cs,0.388,CUNI-Transformer-T2T-2019.6851
3,en-cs,0.223,CUNI-DocTransformer-Marian.6922
4,en-cs,0.206,uedin.6667
...,...,...,...
220,de-fr,0.019,online-Y.0
221,de-fr,-0.104,TartuNLP-c.6897
222,de-fr,-0.194,online-A.0
223,de-fr,-0.240,online-G.0


In [3]:
def read_metric_scores(path, metric_name):
    """Load one gzipped system-level score file as an (lp, system, score) table.

    Lines containing the substring 'hybrid' are discarded and every space is
    turned into a tab, so space- and tab-separated files parse the same way.

    Parameters
    ----------
    path : str
        Location of the gzip-compressed score file.
    metric_name : str
        Label given to the fifth column, which holds the metric score.

    Returns
    -------
    pandas.DataFrame
        Columns ``lp``, ``system`` and ``metric_name``.

    Raises
    ------
    pandas.errors.EmptyDataError
        If no line survives the 'hybrid' filter.
    """
    # The context manager closes the gzip handle as soon as the lines are read.
    with gzip.open(path, 'rt') as handle:
        kept = [line.replace(' ', '\t') for line in handle if 'hybrid' not in line]

    # Each line still carries its own newline, so plain concatenation is valid input.
    table = pd.read_csv(io.StringIO(''.join(kept)), delimiter='\t', header=None)

    # Name the first five columns; any extra columns keep their positional labels.
    table.columns = ['name', 'lp', 'testset', 'system', metric_name] + list(table.columns[5:])
    return table[['lp', 'system', metric_name]]


# Empty frame holding only the join keys; each metric is outer-merged onto it.
sys_scores = pd.DataFrame(data={'lp':[], 'system':[]})

# sorted(): glob order depends on the filesystem, sorting keeps runs reproducible.
for submission in sorted(glob.glob('data/wmt19-metrics-task-package/baselines/*.sys.*')):
    # chrF files are skipped; the mteval-inter file is read separately for BLEU and NIST.
    if 'chrF' in submission or 'mteval' in submission:
        continue

    # File name without its '.sys.score.gz' suffix is the metric name.
    metric_name = submission.split('/')[-1][:-len('.sys.score.gz')]
    print(metric_name)

    sys_scores = sys_scores.merge(read_metric_scores(submission, metric_name), on=['lp','system'], how='outer')

TER
CDER
sacreBLEU-BLEU
PER
WER


In [4]:
# BLEU and NIST are both taken from the mteval-inter file: read it once, drop
# lines containing 'hybrid', and normalise spaces to tabs. The context manager
# closes the gzip handle.
mteval_path = 'data/wmt19-metrics-task-package/baselines/mteval-inter.sys.score.gz'
with gzip.open(mteval_path, 'rt') as handle:
    mteval_lines = [line.replace(' ', '\t') for line in handle if 'hybrid' not in line]

for metric_name in ('BLEU', 'NIST'):
    # A line is attributed to a metric when it contains the metric name as a substring.
    reader = io.StringIO(''.join(line for line in mteval_lines if metric_name in line))
    metric_syss = pd.read_csv(reader, delimiter='\t', header=None)

    # Name the first five columns; any extra columns keep their positional labels.
    metric_syss.columns = ['name', 'lp', 'testset', 'system', metric_name] + list(metric_syss.columns[5:])

    # Drop a column left from an earlier run of this cell, so re-running it
    # replaces the metric instead of producing '_x' / '_y' duplicates.
    sys_scores = sys_scores \
        .drop(columns=[metric_name], errors='ignore') \
        .merge(metric_syss[['lp','system',metric_name]], on=['lp','system'], how='outer')

### Join metric and DA scores

In [5]:
# Default merge: inner join on the columns the two frames have in common.
# Afterwards keep the DA score plus the six metrics used below, in a fixed order.
sys_scores_da = pd.merge(da_sys, sys_scores)[
    ['lp', 'score', 'system', 'BLEU', 'NIST', 'CDER', 'PER', 'TER', 'WER']
]
sys_scores_da

Unnamed: 0,lp,score,system,BLEU,NIST,CDER,PER,TER,WER
0,en-cs,0.402,CUNI-DocTransformer-T2T.6751,0.1904,5.1045,0.5042,0.6061,0.4697,0.4427
1,en-cs,0.401,CUNI-Transformer-T2T-2018.6457,0.1968,5.2036,0.5042,0.6037,0.4714,0.4452
2,en-cs,0.388,CUNI-Transformer-T2T-2019.6851,0.1892,5.0849,0.4977,0.5971,0.4647,0.4386
3,en-cs,0.223,CUNI-DocTransformer-Marian.6922,0.1769,4.8610,0.4877,0.5878,0.4513,0.4243
4,en-cs,0.206,uedin.6667,0.1916,5.1417,0.4895,0.5877,0.4589,0.4323
...,...,...,...,...,...,...,...,...,...
220,de-fr,0.019,online-Y.0,0.3641,8.3309,0.5197,0.6065,0.4936,0.4665
221,de-fr,-0.104,TartuNLP-c.6897,0.3415,7.9147,0.4967,0.5908,0.4629,0.4348
222,de-fr,-0.194,online-A.0,0.3515,7.9717,0.5002,0.6134,0.4508,0.4212
223,de-fr,-0.240,online-G.0,0.3412,8.0219,0.4724,0.5738,0.4308,0.4067


### Validate correlations

In [6]:
# Number of joined rows (systems) per language pair.
(
    sys_scores_da
    .groupby('lp')['lp']
    .count()
)

lp
de-cs    11
de-en    16
de-fr    11
en-cs    11
en-de    22
en-fi    12
en-gu    11
en-kk    11
en-lt    12
en-ru    12
en-zh    12
fi-en    12
fr-de    10
gu-en    11
kk-en    11
lt-en    11
ru-en    14
zh-en    15
Name: lp, dtype: int64

In [7]:
# Per-language-pair correlation of every metric with the DA score,
# restricted to language pairs whose code ends in 'en'.
(
    sys_scores_da[sys_scores_da.lp.str.endswith('en')]
    # corr() needs numeric data only; recent pandas raises on the string column.
    .drop(columns='system')
    .groupby('lp')
    .corr()
    # Select the 'score' row of each group's matrix by label rather than by a
    # fixed stride, so the result does not depend on the number of metrics.
    .xs('score', level=1, drop_level=False)
    .round(3)
    .T
    .sort_index()
)

lp,de-en,fi-en,gu-en,kk-en,lt-en,ru-en,zh-en
Unnamed: 0_level_1,score,score,score,score,score,score,score
BLEU,0.849,0.982,0.834,0.946,0.961,0.879,0.899
CDER,0.89,0.988,0.876,0.967,0.975,0.892,0.917
NIST,0.813,0.986,0.93,0.942,0.944,0.925,0.921
PER,0.883,0.991,0.91,0.737,0.947,0.922,0.952
TER,0.874,0.984,0.89,0.799,0.96,0.917,0.84
WER,0.863,0.983,0.861,0.793,0.961,0.911,0.82
score,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [8]:
# Per-language-pair correlation of every metric with the DA score,
# restricted to language pairs whose code starts with 'en'.
(
    sys_scores_da[sys_scores_da.lp.str.startswith('en')]
    # corr() needs numeric data only; recent pandas raises on the string column.
    .drop(columns='system')
    .groupby('lp')
    .corr()
    # Select the 'score' row of each group's matrix by label rather than by a
    # fixed stride, so the result does not depend on the number of metrics.
    .xs('score', level=1, drop_level=False)
    .round(3)
    .T
    .sort_index()
)

lp,en-cs,en-de,en-fi,en-gu,en-kk,en-lt,en-ru,en-zh
Unnamed: 0_level_1,score,score,score,score,score,score,score,score
BLEU,0.897,0.921,0.969,0.737,0.852,0.989,0.986,0.901
CDER,0.985,0.973,0.978,0.84,0.927,0.985,0.993,0.905
NIST,0.896,0.321,0.971,0.786,0.93,0.993,0.988,0.884
PER,0.976,0.97,0.982,0.839,0.921,0.985,0.981,0.895
TER,0.98,0.969,0.981,0.865,0.94,0.994,0.995,0.856
WER,0.982,0.966,0.98,0.861,0.939,0.991,0.994,0.875
score,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [9]:
# Per-language-pair correlation of every metric with the DA score,
# restricted to language pairs that neither start nor end with 'en'.
(
    sys_scores_da[~sys_scores_da.lp.str.startswith('en') & ~sys_scores_da.lp.str.endswith('en')]
    # corr() needs numeric data only; recent pandas raises on the string column.
    .drop(columns='system')
    .groupby('lp')
    .corr()
    # Select the 'score' row of each group's matrix by label rather than by a
    # fixed stride, so the result does not depend on the number of metrics.
    .xs('score', level=1, drop_level=False)
    .round(3)
    .T
    .sort_index()
)

lp,de-cs,de-fr,fr-de
Unnamed: 0_level_1,score,score,score
BLEU,0.941,0.891,0.864
CDER,0.864,0.949,0.852
NIST,0.954,0.916,0.862
PER,0.875,0.857,0.899
TER,0.89,0.956,0.895
WER,0.872,0.956,0.894
score,1.0,1.0,1.0


### Pickle

In [10]:
# Persist the joined table. The context manager flushes and closes the file
# even if pickling fails, instead of leaving the handle to the garbage collector.
with open('data/pickles/wmt19-data.pkl', 'wb') as pickle_file:
    pickle.dump(sys_scores_da, pickle_file)