In [1]:
import glob
import gzip
import itertools
import csv
import io
import pickle
import hashlib
import pandas as pd
import numpy as np

# WMT19 system-level data

### Checksums

In [2]:
mine = '93f6c7fa3ca5d81b1837ccf34c1a80aa'
'./data/'
yours = hashlib.md5(open('data/downloads/wmt19-submitted-data-v3.tgz', 'rb').read()).hexdigest()
print(mine + '\n' + yours)
print(mine == yours)

mine = '8c901c91fde19207cf8494c4af541455'
yours = hashlib.md5(open('data/downloads/wmt19-metrics-task-package.tgz', 'rb').read()).hexdigest()
print(mine + '\n' + yours)
print(mine == yours)

# unzip to data/
# os.system('tar -xvf data/downloads/newstest2017-segment-level-human.tar.gz -p data/')
# os.system('tar -xvf data/downloads/wmt17-metrics-task-package.tgz -p data/')

93f6c7fa3ca5d81b1837ccf34c1a80aa
93f6c7fa3ca5d81b1837ccf34c1a80aa
True


FileNotFoundError: [Errno 2] No such file or directory: 'data/downloads/wmt19-metrics-task-package.tgz'

### Official system-level da scores

In [3]:
da_sys = pd.read_csv('data/wmt19-metrics-task-package/manual-evaluation/DA-syslevel.csv', delimiter=' ')
da_sys.columns = ['lp', 'score', 'system']
da_sys

Unnamed: 0,lp,score,system
0,en-cs,0.402,CUNI-DocTransformer-T2T.6751
1,en-cs,0.401,CUNI-Transformer-T2T-2018.6457
2,en-cs,0.388,CUNI-Transformer-T2T-2019.6851
3,en-cs,0.223,CUNI-DocTransformer-Marian.6922
4,en-cs,0.206,uedin.6667
...,...,...,...
220,de-fr,0.019,online-Y.0
221,de-fr,-0.104,TartuNLP-c.6897
222,de-fr,-0.194,online-A.0
223,de-fr,-0.240,online-G.0


In [4]:
sys_scores = pd.DataFrame(data={'lp':[], 'system':[]})

for submission in glob.glob('data/wmt19-metrics-task-package/baselines/*.sys.*'):
    if 'chrF' in submission or 'mteval' in submission:
        continue

    metric_name = submission.split('/')[-1][:-len('.sys.score.gz')]
    print(metric_name)

    hybrid_filtered = '\n'.join(i.replace(' ', '\t') for i in gzip.open(submission, 'rt') if 'hybrid' not in i)
    reader = io.StringIO(hybrid_filtered)
    metric_syss = pd.read_csv(reader, delimiter='\t', header=None)
    
    metric_syss.columns = ['name', 'lp', 'testset', 'system', metric_name] + list(metric_syss.columns[5:])
    
    sys_scores = sys_scores.merge(metric_syss[['lp','system',metric_name]], on=['lp','system'], how='outer')

TER
CDER
sacreBLEU-BLEU
PER
WER


In [5]:
# BLEU
metric_name = 'BLEU'
hybrid_filtered = '\n'.join(i.replace(' ', '\t') for i in gzip.open('data/wmt19-metrics-task-package/baselines/mteval-inter.sys.score.gz', 'rt') if 'hybrid' not in i and 'BLEU' in i)
reader = io.StringIO(hybrid_filtered)
metric_syss = pd.read_csv(reader, delimiter='\t', header=None)

metric_syss.columns = ['name', 'lp', 'testset', 'system', metric_name] + list(metric_syss.columns[5:])
sys_scores = sys_scores.merge(metric_syss[['lp','system',metric_name]], on=['lp','system'], how='outer')

# NIST
metric_name = 'NIST'
hybrid_filtered = '\n'.join(i.replace(' ', '\t') for i in gzip.open('data/wmt19-metrics-task-package/baselines/mteval-inter.sys.score.gz', 'rt') if 'hybrid' not in i and 'NIST' in i)
reader = io.StringIO(hybrid_filtered)
metric_syss = pd.read_csv(reader, delimiter='\t', header=None)

metric_syss.columns = ['name', 'lp', 'testset', 'system', metric_name] + list(metric_syss.columns[5:])
sys_scores = sys_scores.merge(metric_syss[['lp','system',metric_name]], on=['lp','system'], how='outer')

### Join metric and da scores

In [6]:
sys_scores_da = da_sys.merge(sys_scores)
sys_scores_da = sys_scores_da[['lp', 'score', 'system', 'BLEU', 'NIST', 'CDER', 'PER', 'TER', 'WER']]
sys_scores_da

Unnamed: 0,lp,score,system,BLEU,NIST,CDER,PER,TER,WER
0,en-cs,0.402,CUNI-DocTransformer-T2T.6751,0.1904,5.1045,0.5042,0.6061,0.4697,0.4427
1,en-cs,0.401,CUNI-Transformer-T2T-2018.6457,0.1968,5.2036,0.5042,0.6037,0.4714,0.4452
2,en-cs,0.388,CUNI-Transformer-T2T-2019.6851,0.1892,5.0849,0.4977,0.5971,0.4647,0.4386
3,en-cs,0.223,CUNI-DocTransformer-Marian.6922,0.1769,4.8610,0.4877,0.5878,0.4513,0.4243
4,en-cs,0.206,uedin.6667,0.1916,5.1417,0.4895,0.5877,0.4589,0.4323
...,...,...,...,...,...,...,...,...,...
220,de-fr,0.019,online-Y.0,0.3641,8.3309,0.5197,0.6065,0.4936,0.4665
221,de-fr,-0.104,TartuNLP-c.6897,0.3415,7.9147,0.4967,0.5908,0.4629,0.4348
222,de-fr,-0.194,online-A.0,0.3515,7.9717,0.5002,0.6134,0.4508,0.4212
223,de-fr,-0.240,online-G.0,0.3412,8.0219,0.4724,0.5738,0.4308,0.4067


### Validate correlations

In [7]:
sys_scores_da.groupby('lp') \
    ['lp'] \
    .count()

lp
de-cs    11
de-en    16
de-fr    11
en-cs    11
en-de    22
en-fi    12
en-gu    11
en-kk    11
en-lt    12
en-ru    12
en-zh    12
fi-en    12
fr-de    10
gu-en    11
kk-en    11
lt-en    11
ru-en    14
zh-en    15
Name: lp, dtype: int64

In [8]:
sys_scores_da[sys_scores_da.lp.str.endswith('en')] \
    .groupby('lp') \
    .corr()[::7] \
    .round(3) \
    .T \
    .sort_index()

lp,de-en,fi-en,gu-en,kk-en,lt-en,ru-en,zh-en
Unnamed: 0_level_1,score,score,score,score,score,score,score
BLEU,0.849,0.982,0.834,0.946,0.961,0.879,0.899
CDER,0.89,0.988,0.876,0.967,0.975,0.892,0.917
NIST,0.813,0.986,0.93,0.942,0.944,0.925,0.921
PER,0.883,0.991,0.91,0.737,0.947,0.922,0.952
TER,0.874,0.984,0.89,0.799,0.96,0.917,0.84
WER,0.863,0.983,0.861,0.793,0.961,0.911,0.82
score,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [9]:
sys_scores_da[sys_scores_da.lp.str.startswith('en')] \
    .groupby('lp') \
    .corr()[::7] \
    .round(3) \
    .T \
    .sort_index()

lp,en-cs,en-de,en-fi,en-gu,en-kk,en-lt,en-ru,en-zh
Unnamed: 0_level_1,score,score,score,score,score,score,score,score
BLEU,0.897,0.921,0.969,0.737,0.852,0.989,0.986,0.901
CDER,0.985,0.973,0.978,0.84,0.927,0.985,0.993,0.905
NIST,0.896,0.321,0.971,0.786,0.93,0.993,0.988,0.884
PER,0.976,0.97,0.982,0.839,0.921,0.985,0.981,0.895
TER,0.98,0.969,0.981,0.865,0.94,0.994,0.995,0.856
WER,0.982,0.966,0.98,0.861,0.939,0.991,0.994,0.875
score,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [10]:
sys_scores_da[~sys_scores_da.lp.str.startswith('en') & ~sys_scores_da.lp.str.endswith('en')] \
    .groupby('lp') \
    .corr()[::7] \
    .round(3) \
    .T \
    .sort_index()

lp,de-cs,de-fr,fr-de
Unnamed: 0_level_1,score,score,score
BLEU,0.941,0.891,0.864
CDER,0.864,0.949,0.852
NIST,0.954,0.916,0.862
PER,0.875,0.857,0.899
TER,0.89,0.956,0.895
WER,0.872,0.956,0.894
score,1.0,1.0,1.0


### WMT19 system-level data (raw)

In [26]:
lp_df = []

for file in glob.glob('data/newstest2019-humaneval/*-sntlevel-humaneval-newstest2019/analysis/ad-seg-scores-*.csv'):
    lp = file[-9:-4]
    print(lp)
    
    df = pd.read_csv(file, delimiter=' ')
    df['lp'] = [lp] * len(df)
    
    lp_df.append(df)
raw_seg_scores_da = pd.concat(lp_df)

de-cs
de-fr
zh-en
fr-de
lt-en
ru-en
kk-en
fi-en
gu-en


mturk = pd.read_csv('data/newstest2019-humaneval/mturk-sntlevel-humaneval-newstest2019/analysis/ad-good-raw-redup.csv', sep='\t')
print(len(mturk))
turkle = pd.read_csv('data/newstest2019-humaneval/turkle-sntlevel-humaneval-newstest2019/analysis/ad-good-raw-redup.csv', sep='\t')
print(len(turkle))
raw_seg_scores_da = pd.concat([mturk, turkle])
print(len(raw_seg_scores_da))

In [28]:
raw_seg_scores_da.head(1)

Unnamed: 0,SYS,SID,RAW.SCR,Z.SCR,N,Unnamed: 5,lp
0,online-Y.0,1478,50.0,-0.444243,1,,de-cs


In [29]:
raw_sys_scores_da = raw_seg_scores_da.groupby(['lp', 'SYS'], as_index=False).mean()

raw_sys_scores_da.columns = ['lp', 'system', 'sid', 'raw_score', 'score', 'N', '5']
raw_sys_scores_da = raw_sys_scores_da[['lp', 'system', 'raw_score', 'score']]

In [30]:
print(len(sys_scores_da))
sys_scores_da = sys_scores_da.merge(raw_sys_scores_da[['lp','system','raw_score']])
print(len(sys_scores_da))
sys_scores_da[['lp','raw_score', 'score']].groupby('lp').corr('pearson')

NameError: name 'sys_scores_da' is not defined

In [31]:
# we only have 81 systems left because most of the systems were
# labelled with DA + document context. The remaining 81 systems
# are evaluated with vanilla DA.

### WMT system-level (src, ref, out)

In [32]:
raw_seg_scores_da.head(1)

Unnamed: 0,SYS,SID,RAW.SCR,Z.SCR,N,Unnamed: 5,lp
0,online-Y.0,1478,50.0,-0.444243,1,,de-cs


In [33]:
raw_seg_scores_da = raw_seg_scores_da[['SYS','SID','RAW.SCR','Z.SCR','N','lp']]
raw_seg_scores_da.columns = ['system', 'sid', 'raw_score', 'score', 'N', 'lp']

raw_seg_scores_da['system'] = raw_seg_scores_da['system'].apply(lambda x: x[:-6] if x.endswith('.zh-en') else x)

In [34]:
raw_seg_scores_da.head(1)

Unnamed: 0,system,sid,raw_score,score,N,lp
0,online-Y.0,1478,50.0,-0.444243,1,de-cs


In [118]:
# srcs and refs
srcs, refs, lps, sids = [], [], [], []
for lp in raw_seg_scores_da.lp.unique():
    fr, to = lp[:2], lp[3:]
    print(lp)
    
    refs_ = list(open('data/wmt19-submitted-data/txt/references/newstest2019-%s%s-ref.%s' % (fr, to, to)))
    srcs_ = list(open('data/wmt19-submitted-data/txt/sources/newstest2019-%s%s-src.%s' % (fr, to, fr)))
    sids_ = list(range(1, len(refs_)+1))
    refs.extend(refs_)
    srcs.extend(srcs_)
    sids.extend(sids_)
    
    assert(len(refs_) == len(srcs_))
    lps.extend([lp]*len(refs_))
    
src_ref_df = pd.DataFrame({'reference' : refs, 'source':srcs, 'lp': lps, 'sid': sids})
print(len(src_ref_df), len(raw_seg_scores_da))
raw_seg_scores_da = raw_seg_scores_da.merge(src_ref_df, on=['lp','sid'], how='inner')
print(len(raw_seg_scores_da))

de-cs
de-fr
zh-en
fr-de
lt-en
ru-en
kk-en
fi-en
gu-en
14411 129126
129126


In [119]:
# outs
lps, outs, sids, syss = [], [], [], []
for file in glob.glob('data/wmt19-submitted-data/txt/system-outputs/newstest2019/*/*'):
    lp = file.split('.')[-1]
    system = '.'.join(file.split('.')[1:-1])
    print(lp, system)
    
    if 'Unsupervised.' in system:
        system = system[:13] + system[-4:]
    elif '_' in system and not any(['NEU_KingSoft' in system, 'Kyoto_University' in system, 'lingua_custodia_primary' in system]):
        system = system.replace('_', '-')
    
    outs_ = list(open(file, 'rt'))
    sids_ = list(range(1, len(outs_)+1))
    lps_ = len(outs_) * [lp]
    syss_ = len(outs_) * [system]
    
    outs.extend(outs_)
    sids.extend(sids_)
    lps.extend(lps_)
    syss.extend(syss_)

out_df = pd.DataFrame({'lp': lps, 'output':outs, 'sid': sids, 'system': syss})
print(len(out_df), len(raw_seg_scores_da))
raw_seg_scores_da = raw_seg_scores_da.merge(out_df, on=['lp','sid', 'system'], how='left')
print(len(raw_seg_scores_da))

cs-de online-B.0
cs-de Unsupervised.cs-de.6979
cs-de online-G.0
cs-de NEU_KingSoft.6783
cs-de online-A.0
cs-de online-Y.0
cs-de NICT.6948
en-gu online-B.0
en-gu MSRA.CrossBERT.6991
en-gu CUNI-T2T-transfer-engu.6466
en-gu GTCOM-Primary.6970
en-gu UdS-DFKI.6866
en-gu online-G.0
en-gu Ju_Saarland_clean_num_135_bpe.6617
en-gu UEDIN.6849
en-gu NICT.6604
en-gu online-X.0
en-gu IITP-MT.6827
kk-en rug_kken_morfessor.6677
kk-en DBMS-KU_KKEN.6726
kk-en online-G.0
kk-en NICT.6770
kk-en online-B.0
kk-en NEU.6753
kk-en talp_upc_2019_kken.6657
kk-en CUNI-T2T-transfer-kken.6436
kk-en NRC-CNRC.6895
kk-en UMD.6736
kk-en Frank_s_MT.6127
en-lt MSRA.MASS.6931
en-lt tilde-nc-nmt.6696
en-lt eTranslation.6836
en-lt TartuNLP-c.6510
en-lt NEU.6760
en-lt online-X.0
en-lt online-A.0
en-lt GTCOM-Primary.7001
en-lt online-G.0
en-lt MSRA.MASS.6954
en-lt tilde-c-nmt.6695
en-lt online-B.0
gu-en NICT.6603
gu-en GTCOM-Primary.6969
gu-en UdS-DFKI.6861
gu-en online-B.0
gu-en IITP-MT.6824
gu-en IIITH-MT.6688
gu-en NEU.675

In [120]:
raw_seg_scores_da.lp.value_counts()

ru-en    25310
fi-en    20420
zh-en    20199
de-cs    16900
gu-en    11996
kk-en    11805
lt-en    11796
de-fr     6700
fr-de     4000
Name: lp, dtype: int64

### Pickle

In [31]:
pickle.dump(raw_seg_scores_da, open('data/pickles/wmt19-sys_level-all.pkl', 'wb'))
#pickle.dump(sys_scores_da, open('data/pickles/wmt19-sys_level-agg.pkl', 'wb'))