In [4]:
import glob
import gzip
import itertools
import csv
import io
import pickle
import hashlib
import pandas as pd
import numpy as np

# WMT17 system-level data

### Checksums

In [7]:
mine = '6971c87e22cf24c11bbf6551af64ab13'
yours = hashlib.md5(open('data/downloads/wmt17-submitted-data-v1.0.tgz', 'rb').read()).hexdigest()
print(mine + '\n' + yours)
print(mine == yours)

mine = 'f45f3160ff90e64f275944028739bd41'
# yours = hashlib.md5(open('data/downloads/wmt17-metrics-task-package.tgz', 'rb').read()).hexdigest()
print(mine + '\n' + yours)
print(mine == yours)

# unzip to data/
# os.system('tar -xvf data/downloads/newstest2017-segment-level-human.tar.gz -p data/')
# os.system('tar -xvf data/downloads/wmt17-metrics-task-package.tgz -p data/')

6971c87e22cf24c11bbf6551af64ab13
6971c87e22cf24c11bbf6551af64ab13
True
f45f3160ff90e64f275944028739bd41
6971c87e22cf24c11bbf6551af64ab13
False


### Official system-level da scores

In [3]:
official_da_sys = pd.read_csv('data/wmt17-metrics-task-package/manual-evaluation/DA-syslevel.csv', delimiter=' ', header=0)
official_da_sys.columns = ['lp', 'score', 'system']
official_da_sys['system'] = official_da_sys['system'].apply(lambda x: x.split('.')[0])

print('Duplications: %d' % sum(official_da_sys.drop('score', axis=1).duplicated()))
official_da_sys

Duplications: 0


Unnamed: 0,lp,score,system
0,en-lv,0.196,tilde-nc-nmt-smt-hybrid
1,en-lv,0.121,online-B
2,en-lv,0.104,tilde-c-nmt-smt-hybrid
3,en-lv,0.075,limsi-factored-norm
4,en-lv,0.058,usfd-consensus-qt21
...,...,...,...
147,de-en,-0.260,online-F
148,cs-en,0.181,uedin-nmt
149,cs-en,0.068,online-B
150,cs-en,-0.068,online-A


### Metric scores

In [4]:
sys_scores = pd.DataFrame(data={'lp':[], 'system':[]})

submission = 'data/wmt17-metrics-task-package/final-metric-scores/baselines/mteval.sys.score.gz'
hybrid_filtered = '\n'.join(i.replace(' ', '\t') for i in gzip.open(submission, 'rt') if 'hybrid' not in i)
reader = io.StringIO(hybrid_filtered)
metric_syss = pd.read_csv(reader, delimiter='\t', header=None)
metric_syss.iloc[:,3] = metric_syss.iloc[:,3].apply(lambda x: x.split('.')[0])

bleu_scores = metric_syss[metric_syss.iloc[:,0] == 'BLEU']
nist_scores = metric_syss[metric_syss.iloc[:,0] == 'NIST']

bleu_scores.columns = ['name', 'lp', 'testset', 'system', 'BLEU']
nist_scores.columns = ['name', 'lp', 'testset', 'system', 'NIST']

sys_scores = sys_scores.merge(bleu_scores[['lp','system', 'BLEU']], on=['lp','system'], how='outer')
sys_scores = sys_scores.merge(nist_scores[['lp','system', 'NIST']], on=['lp','system'], how='outer')

In [5]:
baseline_syss = glob.glob('data/wmt17-metrics-task-package/final-metric-scores/baselines/*.sys.*')

for submission in itertools.chain(baseline_syss):
    if 'baselines.en-zh' in submission:
        continue
    if 'mteval' in submission:
        continue
        
    metric_name = submission.split('/')[-1]
    metric_name = metric_name[:-len('.sys.score.gz')] if metric_name.endswith('.gz') else metric_name[:-len('.sys.score')]
    print(metric_name)

    if submission.endswith('.gz'):
        hybrid_filtered = '\n'.join(i.replace(' ', '\t') for i in gzip.open(submission, 'rt') if 'hybrid' not in i)
        reader = io.StringIO(hybrid_filtered)
        metric_syss = pd.read_csv(reader, delimiter='\t', header=None)
    else:
        metric_syss = pd.read_csv(open(submission, 'rt'), delimiter='\t', header=None)
    metric_syss.columns = ['name', 'lp', 'testset', 'system', metric_name] + list(metric_syss.columns[5:])
    
    # fix system names
    metric_syss.dropna(inplace=True)
    metric_syss['system'] = metric_syss['system'].apply(lambda x: x.split('.')[0])
    
    sys_scores = sys_scores.merge(metric_syss[['lp','system',metric_name]], on=['lp','system'], how='outer')

PER
WER
CDER
TER


In [6]:
# en-zh
en_zh_sys = pd.read_csv(gzip.open('data/wmt17-metrics-task-package/final-metric-scores/baselines/baselines.en-zh.sys.score.gz', 'rt'), delimiter='\t', header=None)
en_zh_sys.columns = ['metric', 'lp', 'testset', 'system', 'score']
en_zh_sys = en_zh_sys[~en_zh_sys.system.str.contains('hybrid')]
en_zh_sys['system'] = en_zh_sys['system'].apply(lambda x: x.split('.')[0])


for metric_name in en_zh_sys['metric'].unique():
    if metric_name in ['chrF']:
        continue
    print(metric_name)
        
    metric_syss = en_zh_sys[en_zh_sys.metric == metric_name]
    metric_syss.columns = ['metric', 'lp', 'testset', 'system', metric_name]
    sys_scores = sys_scores.merge(metric_syss[['lp','system',metric_name]], on=['lp','system'], how='outer')
    
    # fix _x and _y
    if '%s_x' % metric_name in sys_scores.columns:
        sys_scores[metric_name] = [ x['%s_x' % metric_name] if not np.isnan(x['%s_x' % metric_name]) else x['%s_y' % metric_name] for i, x in sys_scores.iterrows() ]
        sys_scores = sys_scores.drop(['%s_x' % metric_name, '%s_y' % metric_name], axis=1)

BLEU
CDER
NIST
PER
TER
WER


In [7]:
sys_scores

Unnamed: 0,lp,system,BLEU,CDER,NIST,PER,TER,WER
0,cs-en,online-A,0.2680,0.4765,7.4157,0.6082,0.4257,0.3939
1,cs-en,online-B,0.2877,0.4971,7.7562,0.6169,0.4573,0.4265
2,cs-en,PJATK,0.2432,0.4504,6.9479,0.5478,0.3599,0.3264
3,cs-en,uedin-nmt,0.3248,0.5253,8.0391,0.6385,0.4749,0.4430
4,de-en,C-3MA,0.3073,0.5170,8.1492,0.6240,0.4908,0.4613
...,...,...,...,...,...,...,...,...
281,en-zh,Oregon-State-University-S,0.2594,0.4339,7.0327,0.5618,0.3791,0.3208
282,en-zh,SogouKnowing-nmt,0.3489,0.5190,8.2129,0.6434,0.4625,0.3974
283,en-zh,uedin-nmt,0.3601,0.5315,8.3343,0.6548,0.4805,0.4196
284,en-zh,UU-HNMT,0.2392,0.4102,6.7102,0.5270,0.3798,0.3284


### Join metric scores

In [8]:
sys_scores_da = official_da_sys.merge(sys_scores, on=['lp', 'system'], how='left')
sys_scores_da   # 4 na

Unnamed: 0,lp,score,system,BLEU,CDER,NIST,PER,TER,WER
0,en-lv,0.196,tilde-nc-nmt-smt-hybrid,,,,,,
1,en-lv,0.121,online-B,0.1852,0.3925,5.7343,0.4859,0.3373,0.3139
2,en-lv,0.104,tilde-c-nmt-smt-hybrid,,,,,,
3,en-lv,0.075,limsi-factored-norm,0.1798,0.3799,5.5456,0.4447,0.3028,0.2787
4,en-lv,0.058,usfd-consensus-qt21,0.1927,0.3962,5.8896,0.4690,0.3240,0.2982
...,...,...,...,...,...,...,...,...,...
147,de-en,-0.260,online-F,0.1955,0.4201,6.6679,0.5513,0.3658,0.3285
148,cs-en,0.181,uedin-nmt,0.3248,0.5253,8.0391,0.6385,0.4749,0.4430
149,cs-en,0.068,online-B,0.2877,0.4971,7.7562,0.6169,0.4573,0.4265
150,cs-en,-0.068,online-A,0.2680,0.4765,7.4157,0.6082,0.4257,0.3939


### WMT17 system-level data (raw)

In [9]:
lp_df = []

for file in glob.glob('data/newstest2017-system-level-human/anon-proc-hits-sys-combined/analysis/ad-seg-scores-*.csv.gz'):
    lp = file[-12:-7]
    print(lp)
    
    df = pd.read_csv(gzip.open(file, 'rt'), delimiter=' ')
    df['lp'] = [lp] * len(df)
    
    lp_df.append(df)
raw_seg_scores_da = pd.concat(lp_df)

tr-en
en-zh
de-en
fi-en
lv-en
en-ru
en-cs
en-lv
zh-en
cs-en
en-tr
en-fi
ru-en
en-de


In [10]:
raw_sys_scores_da = raw_seg_scores_da.groupby(['lp', 'SYS'], as_index=False).mean()

raw_sys_scores_da.columns = ['lp', 'system', 'sid', 'raw_score', 'score', 'N', '5']
raw_sys_scores_da = raw_sys_scores_da[['lp', 'system', 'raw_score', 'score']]
raw_sys_scores_da['system'] = raw_sys_scores_da['system'].apply(lambda x: x.split('.')[0])
raw_sys_scores_da

Unnamed: 0,lp,system,raw_score,score
0,cs-en,PJATK,62.723466,-0.267913
1,cs-en,online-A,68.261460,-0.068109
2,cs-en,online-B,71.877972,0.068185
3,cs-en,uedin-nmt,74.569465,0.180920
4,de-en,C-3MA,68.607824,-0.102653
...,...,...,...,...
147,zh-en,online-B,69.885869,0.112879
148,zh-en,online-F,59.622814,-0.278690
149,zh-en,online-G,59.258396,-0.304650
150,zh-en,uedin-nmt,73.799021,0.208171


In [11]:
sys_scores_da = sys_scores_da.merge(raw_sys_scores_da[['lp','system','raw_score']])
sys_scores_da[['lp','raw_score', 'score']].groupby('lp').corr('pearson')

Unnamed: 0_level_0,Unnamed: 1_level_0,raw_score,score
lp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
cs-en,raw_score,1.0,0.999526
cs-en,score,0.999526,1.0
de-en,raw_score,1.0,0.999429
de-en,score,0.999429,1.0
en-cs,raw_score,1.0,0.999788
en-cs,score,0.999788,1.0
en-de,raw_score,1.0,0.999175
en-de,score,0.999175,1.0
en-fi,raw_score,1.0,0.999757
en-fi,score,0.999757,1.0


### WMT17 system-level (src, ref, out)

In [12]:
raw_seg_scores_da = raw_seg_scores_da[['SYS','SID','RAW.SCR','Z.SCR','N','lp']]
raw_seg_scores_da.columns = ['system', 'sid', 'raw_score', 'score', 'N', 'lp']

raw_seg_scores_da['system'] = raw_seg_scores_da['system'].apply(lambda x: x.split('.')[0]) 
raw_seg_scores_da

Unnamed: 0,system,sid,raw_score,score,N,lp
0,online-A,2524,93.333333,1.448358,3,tr-en
1,online-A,1633,31.600000,-0.310297,5,tr-en
2,online-A,2872,50.500000,0.214607,2,tr-en
3,online-A,2828,36.500000,-0.141036,2,tr-en
4,online-A,2284,35.000000,-0.171877,2,tr-en
...,...,...,...,...,...,...
7020,KIT,1517,100.000000,1.409500,1,en-de
7021,KIT,560,63.000000,-0.252765,1,en-de
7022,KIT,1242,89.000000,0.761543,1,en-de
7023,KIT,2016,56.000000,-0.525138,1,en-de


In [13]:
# srcs and refs
srcs, refs, lps, sids = [], [], [], []
for lp in sys_scores_da.lp.unique():
    fr, to = lp[:2], lp[3:]
    
    refs_ = list(open('data/wmt17-submitted-data/txt/references/newstest2017-%s%s-ref.%s' % (fr, to, to)))
    srcs_ = list(open('data/wmt17-submitted-data/txt/sources/newstest2017-%s%s-src.%s' % (fr, to, fr)))
    sids_ = list(range(1, len(refs_)+1))
    refs.extend(refs_)
    srcs.extend(srcs_)
    sids.extend(sids_)
    
    assert(len(refs_) == len(srcs_))
    lps.extend([lp]*len(refs_))
    
src_ref_df = pd.DataFrame({'reference' : refs, 'source':srcs, 'lp': lps, 'sid': sids})
print('# of entries before merge: %d' % len(raw_seg_scores_da))
raw_seg_scores_da = raw_seg_scores_da.merge(src_ref_df, on=['lp','sid'], how='inner')
print('# of entries after merge: %d' % len(raw_seg_scores_da))
print('These two should be equal.')

# of entries before merge: 203012
# of entries after merge: 203012
These two should be equal.


In [14]:
# outs
lps, outs, sids, syss, syss_full = [], [], [], [], []
for file in glob.glob('data/wmt17-submitted-data/txt/system-outputs/newstest2017/*/*'):
    lp = file.split('.')[-1]
    system_full = '.'.join(file.split('.')[1:-1])
    system = system_full.split('.')[0]
    
    # manual fixes
    if 'tuning' in file:
        system = file.split('.')[1]
    elif system == 'CASICT-DCU-NMT':
        system = 'CASICT-cons'
    elif system == 'FBK':
        system = 'fbk-nmt-combination'    
    
    outs_ = list(open(file, 'rt'))
    sids_ = list(range(1, len(outs_)+1))
    lps_ = len(outs_) * [lp]
    syss_ = len(outs_) * [system]
    syss_full_ = len(outs_) * [system_full]
    
    outs.extend(outs_)
    sids.extend(sids_)
    lps.extend(lps_)
    syss.extend(syss_)
    syss_full.extend(syss_full_)

out_df = pd.DataFrame({'lp': lps, 'output':outs, 'sid': sids, 'system': syss, 'system_full':syss_full})
print('# of entries before merge: %d' % len(raw_seg_scores_da))
raw_seg_scores_da = raw_seg_scores_da.merge(out_df, on=['lp','sid', 'system'], how='inner')
print('# of entries after merge: %d' % len(raw_seg_scores_da))
print('These two should be equal.')

# of entries before merge: 203012
# of entries after merge: 203012
These two should be equal.


### Pickle

In [15]:
pickle.dump(raw_seg_scores_da, open('data/pickles/wmt17-sys_level-all.pkl', 'wb'))
pickle.dump(sys_scores_da, open('data/pickles/wmt17-sys_level-agg.pkl', 'wb'))

# WMT17 segment-level data

In [16]:
seg_scores_da = pd.read_csv('data/wmt17-metrics-task-package/manual-evaluation/DA-seglevel.csv', delimiter=' ')
seg_scores_da.columns = ['lp', 'testset', 'system', 'sid', 'score']
seg_scores_da['system'] = seg_scores_da['system'].apply(lambda x: x.split('+')[0])

In [17]:
# there is no ROCNMT.5167 only ROCNMT.5183 ???
seg_scores_da = seg_scores_da.replace('ROCMT.5167', 'ROCMT.5183')
seg_scores_da = seg_scores_da.replace('CASICT-cons.5157', 'CASICT-DCU-NMT.5157')
seg_scores_da = seg_scores_da.replace('CASICT-cons.5144', 'CASICT-DCU-NMT.5144')

In [18]:
print(len(seg_scores_da))
out_df['system'] = out_df['system_full']

# need to have two merges, raw_seg_scores_da does not contain all that we need
seg_scores_da = seg_scores_da.merge(out_df[['lp', 'system', 'sid', 'output']], on=['lp', 'system', 'sid'], how='inner')
seg_scores_da = seg_scores_da.merge(src_ref_df[['lp', 'sid', 'source', 'reference']], on=['lp', 'sid'], how='inner')
print(len(seg_scores_da))

5040
5040


In [19]:
seg_scores_da = seg_scores_da[['lp', 'system', 'sid', 'output', 'source', 'reference', 'score']]

In [20]:
seg_scores_da

Unnamed: 0,lp,system,sid,output,source,reference,score
0,en-zh,UU-HNMT.5134,11,“他找到了一个公寓，他和约会一个女孩，”路易斯·卡利亚说。\n,"""He found an apartment, he was dating a girl,""...",Louis Galicia 告诉 KGO：“Frank找到一间公寓，同时在跟一个女孩交往。”\n,-0.522268
1,en-zh,jhu-nmt.5153,28,这场纠纷导致了今年参加六次罢工的初级医生，其中包括NHS历史上第一次停工。\n,The dispute has led to junior doctors taking p...,该纠纷已导致初级医生今年共参与六次罢工，包括英国国家医疗服务体系历史上的首次全面罢工。\n,-0.150441
2,en-zh,CASICT-DCU-NMT.5157,39,我迷失了。\n,I lost count.\n,我记不清了。\n,-0.330678
3,en-zh,online-A.0,49,"自11月以来, 俄罗斯公众舆论也出现了转变。\n",Russian public opinion has also turned since N...,自11月份开始，俄罗斯民意也有所扭转。\n,0.490614
4,en-zh,CASICT-DCU-NMT.5157,68,安卡拉对西方感到愤怒，因为它认为对企图收购的反应是微弱的。\n,Ankara is angry with the West for what it cons...,安卡拉对于西方世界对接管意图的微弱反应感到愤怒。\n,-0.431865
...,...,...,...,...,...,...,...
5035,en-ru,jhu-pbmt.4986,2642,"Кендалл, который является послом марки Estee Л...","Kendall, who is an Estee Lauder brand ambassad...","Кендалл, которая является лицом бренда Estee L...",-0.140292
5036,en-ru,jhu-pbmt.4986,2719,"Во вторник, в ""Фейсбуке"" сообщил, что его наст...","On Tuesday, Facebook said that its desktop web...","Во вторник Facebook объявил, что настольная ве...",0.087274
5037,en-ru,online-F.0,2760,Результат средние температуры климата которые ...,The result is mean climate temperatures that a...,Результатом являются средние климатические тем...,-0.836368
5038,en-ru,uedin-nmt.4756,2803,"По их пиковым ставкам, эти матовые звезды отни...","At their peak spin rates, these maturing stars...",На своих пиковых скоростях вращения эти взросл...,-1.162147


In [21]:
seg_scores_da.groupby('lp').count()

Unnamed: 0_level_0,system,sid,output,source,reference,score
lp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
cs-en,560,560,560,560,560,560
de-en,560,560,560,560,560,560
en-ru,560,560,560,560,560,560
en-zh,560,560,560,560,560,560
fi-en,560,560,560,560,560,560
lv-en,560,560,560,560,560,560
ru-en,560,560,560,560,560,560
tr-en,560,560,560,560,560,560
zh-en,560,560,560,560,560,560


### Add baselines

In [22]:
sentbleu = pd.read_csv('data/wmt17-metrics-task-package/final-metric-scores/baselines/sentence-BLEU.seg.score.gz', 
                    delimiter='\t',
                    header=None)
sentbleu.columns = [ 'name', 'lp', 'testset', 'system', 'sid', 'sentBLEU' ]
sentbleu = sentbleu[sentbleu.testset == 'newstest2017']
sentbleu = sentbleu[~(sentbleu.lp == 'en-zh')]
sentbleu

Unnamed: 0,name,lp,testset,system,sid,sentBLEU
3494,sentBLEU,cs-en,newstest2017,online-A.0,1,0.265386
3495,sentBLEU,cs-en,newstest2017,online-A.0,2,0.189448
3496,sentBLEU,cs-en,newstest2017,online-A.0,3,0.249669
3497,sentBLEU,cs-en,newstest2017,online-A.0,4,0.312581
3498,sentBLEU,cs-en,newstest2017,online-A.0,5,0.173457
...,...,...,...,...,...,...
428931,sentBLEU,en-lv,newstest2017,tilde-nc-nmt-smt-hybrid.5047,1997,0.120454
428932,sentBLEU,en-lv,newstest2017,tilde-nc-nmt-smt-hybrid.5047,1998,0.162550
428933,sentBLEU,en-lv,newstest2017,tilde-nc-nmt-smt-hybrid.5047,1999,0.061098
428934,sentBLEU,en-lv,newstest2017,tilde-nc-nmt-smt-hybrid.5047,2000,0.215490


In [23]:
print(len(seg_scores_da))
seg_scores_da = seg_scores_da.merge(sentbleu[['lp', 'system', 'sid', 'sentBLEU']], on=['lp', 'system', 'sid'], how='left')
print(len(seg_scores_da))

5040
5040


### Pickle

In [26]:
pickle.dump(seg_scores_da, open('data/pickles/wmt17-seg_level-agg.pkl', 'wb'))