In [1]:
import glob
import gzip
import itertools
import csv
import io
import tqdm
import pickle
import pandas as pd
import seaborn as sns
sns.set()
import scipy
import matplotlib.pyplot as plt
import numpy as np

### WMT16

In [2]:
# Load the WMT16 system-level and segment-level pickles.
# NOTE: unpickling can execute arbitrary code -- only load trusted files.
# The context managers close the file handles, which a bare open() inside
# pickle.load(...) would leave open.
with open('data/pickles/wmt16-sys_level-agg.pkl', 'rb') as sys_level_file:
    wmt16 = pickle.load(sys_level_file)
with open('data/pickles/wmt16-seg_level-agg.pkl', 'rb') as seg_level_file:
    wmt16_seg = pickle.load(seg_level_file)

##### System level

In [3]:
# Count the non-null `system` entries for each language pair (`lp`).
(
    wmt16
    .groupby('lp')['system']
    .count()
)

lp
cs-en     6
de-en    10
en-ru    12
fi-en     9
ro-en     7
ru-en    10
tr-en     8
Name: system, dtype: int64

In [4]:
# Correlation of every numeric column with `score`, computed separately for
# each language pair whose `lp` ends in 'en'.
# The `score` row of each per-pair correlation matrix is selected by label
# (instead of the positional slice [6::8]) so the result does not depend on
# how many numeric columns the frame has or on their order.
# drop_level=False keeps the (lp, 'score') MultiIndex, so the transposed
# table has the same two-level column header as before.
(
    wmt16[wmt16.lp.str.endswith('en')]
    .groupby('lp')
    .corr()
    .xs('score', level=1, drop_level=False)
    .round(3)
    .T
    .sort_index()
)

lp,cs-en,de-en,fi-en,ro-en,ru-en,tr-en
Unnamed: 0_level_1,score,score,score,score,score,score
BLEU,0.989,0.808,0.864,0.84,0.837,0.895
CDER,0.988,0.827,0.86,0.8,0.855,0.826
NIST,0.978,0.801,0.929,0.807,0.854,0.938
PER,0.97,0.73,0.767,0.748,0.887,0.94
TER,0.969,0.834,0.846,0.793,0.847,0.788
WER,0.967,0.822,0.768,0.762,0.837,0.651
raw_score,1.0,0.999,1.0,0.999,0.997,0.999
score,1.0,1.0,1.0,1.0,1.0,1.0


In [5]:
# Correlation of every numeric column with `score`, computed separately for
# each language pair whose `lp` does NOT end in 'en'.
# The `score` row of each per-pair correlation matrix is selected by label
# (instead of the positional slice [6::8]) so the result does not depend on
# how many numeric columns the frame has or on their order.
# drop_level=False keeps the (lp, 'score') MultiIndex, so the transposed
# table has the same two-level column header as before.
(
    wmt16[~wmt16.lp.str.endswith('en')]
    .groupby('lp')
    .corr()
    .xs('score', level=1, drop_level=False)
    .round(3)
    .T
    .sort_index()
)

lp,en-ru
Unnamed: 0_level_1,score
BLEU,0.838
CDER,0.874
NIST,0.897
PER,0.854
TER,0.879
WER,0.876
raw_score,0.994
score,1.0


##### Segment level

In [6]:
# Segment-level correlation between `sentBLEU` and `score` for each language
# pair whose `lp` ends in 'en'.
# The `score` row is picked by label rather than with the positional slice
# [2::3], which only works while the correlation matrix has exactly three
# rows per pair with `score` last.
(
    wmt16_seg[wmt16_seg.lp.str.endswith('en')]
    .groupby('lp')
    .corr()
    .xs('score', level=1, drop_level=False)
    .loc[:, ['sentBLEU']]
    .T
)

lp,cs-en,de-en,fi-en,ro-en,ru-en,tr-en
Unnamed: 0_level_1,score,score,score,score,score,score
sentBLEU,0.556577,0.483888,0.448357,0.498979,0.501937,0.531602


The de-en and fi-en columns are switched in our results compared to the paper. I will assume there is a mistake in the paper.

### WMT17

In [7]:
# Load the WMT17 system-level and segment-level pickles.
# NOTE: unpickling can execute arbitrary code -- only load trusted files.
# The context managers close the file handles, which a bare open() inside
# pickle.load(...) would leave open.
with open('data/pickles/wmt17-sys_level-agg.pkl', 'rb') as sys_level_file:
    wmt17 = pickle.load(sys_level_file)
with open('data/pickles/wmt17-seg_level-agg.pkl', 'rb') as seg_level_file:
    wmt17_seg = pickle.load(seg_level_file)

##### System level

In [8]:
# Count the non-null `system` entries for each language pair (`lp`).
(
    wmt17
    .groupby('lp')['system']
    .count()
)

lp
cs-en     4
de-en    11
en-cs    14
en-de    16
en-fi    12
en-lv    17
en-ru     9
en-tr     8
en-zh    11
fi-en     6
lv-en     9
ru-en     9
tr-en    10
zh-en    16
Name: system, dtype: int64

The following language pairs cannot be reproduced exactly:

- lv-en (differences of ±0.002)
- en-lv (differences of ±0.02)

In [9]:
# Correlation of every numeric column with `score`, computed separately for
# each language pair whose `lp` ends in 'en'.
# The `score` row of each per-pair correlation matrix is selected by label
# (instead of the positional slice [::8]) so the result does not depend on
# how many numeric columns the frame has or on their order.
# drop_level=False keeps the (lp, 'score') MultiIndex, so the transposed
# table has the same two-level column header as before.
(
    wmt17[wmt17.lp.str.endswith('en')]
    .groupby('lp')
    .corr()
    .xs('score', level=1, drop_level=False)
    .round(3)
    .T
    .sort_index()
)

lp,cs-en,de-en,fi-en,lv-en,ru-en,tr-en,zh-en
Unnamed: 0_level_1,score,score,score,score,score,score,score
BLEU,0.971,0.923,0.903,0.975,0.912,0.976,0.864
CDER,0.989,0.93,0.927,0.986,0.922,0.973,0.904
NIST,1.0,0.931,0.931,0.946,0.912,0.971,0.849
PER,0.968,0.951,0.896,0.949,0.911,0.932,0.877
TER,0.989,0.906,0.952,0.965,0.912,0.954,0.847
WER,0.987,0.896,0.948,0.967,0.907,0.925,0.839
raw_score,1.0,0.999,1.0,0.999,0.997,0.999,0.997
score,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [10]:
# Correlation of every numeric column with `score`, computed separately for
# each language pair whose `lp` does NOT end in 'en'.
# The `score` row of each per-pair correlation matrix is selected by label
# (instead of the positional slice [::8]) so the result does not depend on
# how many numeric columns the frame has or on their order.
# drop_level=False keeps the (lp, 'score') MultiIndex, so the transposed
# table has the same two-level column header as before.
(
    wmt17[~wmt17.lp.str.endswith('en')]
    .groupby('lp')
    .corr()
    .xs('score', level=1, drop_level=False)
    .round(3)
    .T
    .sort_index()
)

lp,en-cs,en-de,en-fi,en-lv,en-ru,en-tr,en-zh
Unnamed: 0_level_1,score,score,score,score,score,score,score
BLEU,0.956,0.804,0.92,0.839,0.898,0.924,0.981
CDER,0.968,0.813,0.965,0.916,0.924,0.957,0.983
NIST,0.962,0.769,0.957,0.922,0.92,0.986,0.976
PER,0.954,0.687,0.949,0.819,0.887,0.963,0.934
TER,0.955,0.796,0.961,0.893,0.933,0.967,0.97
WER,0.954,0.802,0.96,0.89,0.934,0.956,0.954
raw_score,1.0,0.999,1.0,0.999,1.0,1.0,0.999
score,1.0,1.0,1.0,1.0,1.0,1.0,1.0


##### Segment level

In [11]:
# Segment-level correlation between `sentBLEU` and `score` for each language
# pair whose `lp` ends in 'en'.
# The `score` row is picked by label rather than with the positional slice
# [1::3], which only works while the correlation matrix has exactly three
# rows per pair with `score` in the middle.
(
    wmt17_seg[wmt17_seg.lp.str.endswith('en')]
    .groupby('lp')
    .corr()
    .xs('score', level=1, drop_level=False)
    .loc[:, ['sentBLEU']]
    .T
)

lp,cs-en,de-en,fi-en,lv-en,ru-en,tr-en,zh-en
Unnamed: 0_level_1,score,score,score,score,score,score,score
sentBLEU,0.434955,0.432482,0.571167,0.392805,0.484211,0.538433,0.523828


In [12]:
# Segment-level correlation between `sentBLEU` and `score` for each language
# pair whose `lp` does NOT end in 'en'.
# The `score` row is picked by label rather than with the positional slice
# [1::3], which only works while the correlation matrix has exactly three
# rows per pair with `score` in the middle.
(
    wmt17_seg[~wmt17_seg.lp.str.endswith('en')]
    .groupby('lp')
    .corr()
    .xs('score', level=1, drop_level=False)
    .loc[:, ['sentBLEU']]
    .T
)

lp,en-ru,en-zh
Unnamed: 0_level_1,score,score
sentBLEU,0.467901,


Two language pairs cannot be reproduced: en-zh (there are two scores per segment) and zh-en (difference of 0.01).