In [1]:
import csv
import glob
import gzip
import hashlib
import io
import itertools
import os
import pickle

import numpy as np
import pandas as pd

# WMT16 system-level data

### Checksums

In [2]:
# Reference MD5 digests for the two downloaded archives. To verify a download,
# uncomment the `yours = ...` lines below; each hashes the archive on disk and
# compares the result with `mine`.
mine = 'c983b60fa311b60c04c0293aaf1a2bc4'
#yours = hashlib.md5(open("data/downloads/wmt16-submitted-data-v2.tgz", 'rb').read()).hexdigest()
#print(mine + '\n' + yours)
#print(mine == yours)


mine = '2acd4f1d8fcc07115cc06bcaed4ff236'
#yours = hashlib.md5(open("data/downloads/wmt16-metrics-results.tar.gz", 'rb').read()).hexdigest()
#print(mine + '\n' + yours)
#print(mine == yours)

# unzip to data/
# Disabled by default; switch the guard to True to unpack the archives.
if False:
    # `-C data/` selects the extraction directory. (`-p` only preserves file
    # permissions and takes no argument, so it cannot be used for this.)
    os.system('tar -xvf data/downloads/wmt16-submitted-data-v2.tgz -C data/')
    os.system('tar -xvf data/downloads/wmt16-metrics-results.tar.gz -C data/')

### Aggregate system-level data

In [3]:
# Collect the official system-level result files, skipping every path that
# contains 'noDA'.
results_pattern = 'data/wmt16-metrics-results/sys-level-results/standard/results-official/newstest2016*'
da_files = []
for path in glob.glob(results_pattern):
    if 'noDA' in path:
        continue
    da_files.append(path)

# Read each space-delimited file and tag its rows with the language pair.
lp_df = []
for path in da_files:
    # Characters -8..-4 of the path are taken as a four-letter pair code and
    # rewritten as 'xx-yy' (assumes a four-character extension such as '.csv'
    # follows the code - TODO confirm against the actual file names).
    pair_code = path[-8:-4]
    table = pd.read_csv(path, delimiter=' ')
    table['lp'] = pair_code[0:2] + '-' + pair_code[2:]
    lp_df.append(table)

sys_scores_da = pd.concat(lp_df)

Rename columns:

In [4]:
# Map the raw column labels to the short names used from here on; labels that
# are not listed keep their original name.
replace = {
    'DA': 'score',
    'MT': 'system',
    'mtevalBLEU': 'BLEU',
    'mtevalNIST': 'NIST',
    'mosesCDER': 'CDER',
    'mosesPER': 'PER',
    'mosesWER': 'WER',
}
sys_scores_da = sys_scores_da.rename(columns=replace)

# Keep only the columns used downstream, in a fixed order.
kept_columns = ['lp', 'system', 'BLEU', 'NIST', 'CDER', 'PER', 'WER', 'TER', 'score']
sys_scores_da = sys_scores_da[kept_columns]
sys_scores_da

Unnamed: 0,lp,system,BLEU,NIST,CDER,PER,WER,TER,score
0,fi-en,UH-opus,0.2399,7.1450,0.4319,0.5509,0.3435,0.3933,0.064852
1,fi-en,online-B,0.2495,7.4056,0.4446,0.5889,0.3707,0.4141,0.095311
2,fi-en,online-A,0.2087,6.5204,0.4053,0.4948,0.2883,0.3351,-0.125989
3,fi-en,UH-factored,0.2095,6.6405,0.4010,0.5206,0.3106,0.3588,-0.097757
4,fi-en,PROMT-SMT,0.2183,6.8978,0.4185,0.5402,0.3226,0.3711,-0.036827
...,...,...,...,...,...,...,...,...,...
7,en-ru,online-G,0.2761,7.1012,0.4614,0.5537,0.4063,0.4263,0.101016
8,en-ru,AFRL-MITLL-verb-annot,0.2160,6.1589,0.4144,0.4928,0.3461,0.3656,-0.092687
9,en-ru,PROMT-Rule-based,0.2346,6.7571,0.4405,0.5246,0.3839,0.4027,0.257558
10,en-ru,AFRL-MITLL-phrase-based,0.2449,6.6385,0.4367,0.5295,0.3786,0.3976,-0.076716


### WMT16 system-level data (all/raw)

In [5]:
# Read every per-language-pair judgment file and stack them into one frame.
seg_score_paths = glob.glob('data/da-human-judgments/ad-seg-scores-*.csv')

lp_df = []
for path in seg_score_paths:
    table = pd.read_csv(path, delimiter=' ')
    # The five characters before the four-character extension are used as the
    # language-pair label and repeated once per row.
    table['lp'] = [path[-9:-4]] * len(table)
    lp_df.append(table)

raw_seg_scores_da = pd.concat(lp_df)

In [6]:
# Group once per (language pair, system) and derive both aggregates from it.
per_system = raw_seg_scores_da.groupby(['lp', 'SYS'], as_index=False)

# Column means per group, with the 'N' column replaced by the number of rows
# that have a value in 'N'.
N = per_system.count()['N']
raw_sys_scores_da = per_system.mean()
raw_sys_scores_da['N'] = N

# Positional relabel of the aggregated columns. The trailing '5' is only a
# placeholder label for the last column, which the selection below discards.
aggregated_labels = ['lp', 'system', 'sid', 'raw_score', 'score', 'N', '5']
raw_sys_scores_da.columns = aggregated_labels

reported_labels = ['lp', 'system', 'raw_score', 'score', 'N']
raw_sys_scores_da = raw_sys_scores_da[reported_labels]
raw_sys_scores_da

Unnamed: 0,lp,system,raw_score,score,N
0,cs-en,PJATK,68.954545,-0.024469,2464
1,cs-en,cu-mergedtrees,55.783751,-0.502992,2437
2,cs-en,jhu-pbmt,72.568873,0.101488,2432
3,cs-en,online-A,69.455488,0.000170,2460
4,cs-en,online-B,70.844961,0.050822,2451
...,...,...,...,...,...
58,tr-en,jhu-syntax,40.803632,-0.363599,1762
59,tr-en,online-A,52.190503,0.002009,1790
60,tr-en,online-B,57.128603,0.163079,1804
61,tr-en,online-G,54.964850,0.108645,1835


In [7]:
# Attach the per-system raw mean to the official table. No `on=` is given, so
# the join uses every column name the two frames have in common.
raw_means = raw_sys_scores_da[['lp', 'system', 'raw_score']]
sys_scores_da = pd.merge(sys_scores_da, raw_means)

# Pearson correlation between the raw mean and the official score, computed
# separately for each language pair.
score_pairs = sys_scores_da[['lp', 'raw_score', 'score']]
score_pairs.groupby('lp').corr('pearson')

Unnamed: 0_level_0,Unnamed: 1_level_0,raw_score,score
lp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
cs-en,raw_score,1.0,0.999818
cs-en,score,0.999818,1.0
de-en,raw_score,1.0,0.999354
de-en,score,0.999354,1.0
en-ru,raw_score,1.0,0.993851
en-ru,score,0.993851,1.0
fi-en,raw_score,1.0,0.999598
fi-en,score,0.999598,1.0
ro-en,raw_score,1.0,0.998583
ro-en,score,0.998583,1.0


### WMT16 sources, references, and system outputs (src, ref, out)

In [8]:
# srcs and refs
srcs, refs, lps, sids = [], [], [], []
for lp in sys_scores_da.lp.unique():
    fr, to = lp[:2], lp[3:]
    
    refs_ = list(open('data/wmt16-submitted-data/txt/references/newstest2016-%s%s-ref.%s' % (fr, to, to)))
    srcs_ = list(open('data/wmt16-submitted-data/txt/sources/newstest2016-%s%s-src.%s' % (fr, to, fr)))
    sids_ = list(range(1, len(refs_)+1))
    refs.extend(refs_)
    srcs.extend(srcs_)
    sids.extend(sids_)
    
    assert(len(refs_) == len(srcs_))
    lps.extend([lp]*len(refs_))
    
src_ref_df = pd.DataFrame({'reference' : refs, 'source':srcs, 'lp': lps, 'SID': sids})
print('# of entries before merge: %d' % len(raw_seg_scores_da))
raw_seg_scores_da = raw_seg_scores_da.merge(src_ref_df, on=['lp','SID'], how='inner')
print('# of entries after merge: %d' % len(raw_seg_scores_da))
print('These two should be equal.')

# of entries before merge: 141905
# of entries after merge: 141905
These two should be equal.


In [9]:
# outs
lps, outs, sids, syss = [], [], [], []
for file in glob.glob('data/wmt16-submitted-data/txt/system-outputs/newstest2016/*/*'):
    lp = file.split('.')[-1]
    system = file.split('.')[-3]
    
    outs_ = list(open(file, 'rt'))
    sids_ = list(range(1, len(outs_)+1))
    lps_ = len(outs_) * [lp]
    syss_ = len(outs_) * [system]
    
    outs.extend(outs_)
    sids.extend(sids_)
    lps.extend(lps_)
    syss.extend(syss_)

out_df = pd.DataFrame({'lp': lps, 'output':outs, 'SID': sids, 'SYS': syss})
print('# of entries before merge: %d' % len(raw_seg_scores_da))
raw_seg_scores_da = raw_seg_scores_da.merge(out_df, on=['lp','SID', 'SYS'], how='inner')
print('# of entries after merge: %d' % len(raw_seg_scores_da))
print('These two should be equal.')

# of entries before merge: 141905
# of entries after merge: 141905
These two should be equal.


In [10]:
# Positional relabel of the merged frame. The sixth column gets the placeholder
# label 'nan' and is the only one left out of the final selection.
merged_labels = ['system', 'sid', 'raw_score', 'score', 'N', 'nan', 'lp', 'reference', 'source', 'output']
raw_seg_scores_da.columns = merged_labels

kept_labels = [label for label in merged_labels if label != 'nan']
raw_seg_scores_da = raw_seg_scores_da[kept_labels]

### Pickle the system-level data

In [11]:
# Persist both frames. The output directory is created if missing, and each
# file is written inside a context manager so it is flushed and closed before
# the cell finishes.
os.makedirs('data/pickles', exist_ok=True)
with open('data/pickles/wmt16-sys_level-all.pkl', 'wb') as all_file:
    pickle.dump(raw_seg_scores_da, all_file)
with open('data/pickles/wmt16-sys_level-agg.pkl', 'wb') as agg_file:
    pickle.dump(sys_scores_da, agg_file)

# WMT16 segment-level data

In [12]:
# Collect the segment-level metric files, skipping every path that contains
# 'noDA'.
metrics_pattern = 'data/wmt16-metrics-results/seg-level-results/da-results/metrics.*.csv'
da_files = []
for path in glob.glob(metrics_pattern):
    if 'noDA' in path:
        continue
    da_files.append(path)

lp_df = []
for path in da_files:
    table = pd.read_csv(path, delimiter=' ')
    # The five characters before the four-character extension are used as the
    # language-pair label.
    table['lp'] = path[-9:-4]
    lp_df.append(table)

    # Disabled alternative: read the text columns from the da-results files.
    #df['source'] = list(open('data/wmt16-metrics-results/seg-level-results/da-results/src.%s' % lp, 'rt'))
    #df['output'] = list(open('data/wmt16-metrics-results/seg-level-results/da-results/snt.%s' % lp, 'rt'))
    #df['reference'] = list(open('data/wmt16-metrics-results/seg-level-results/da-results/ref.%s' % lp, 'rt'))

seg_scores_da = pd.concat(lp_df)

In [13]:
# Number of rows with a value in 'SID' for each language pair.
seg_scores_da.groupby('lp')[['SID']].count()

Unnamed: 0_level_0,SID
lp,Unnamed: 1_level_1
cs-en,560
de-en,560
en-ru,560
fi-en,560
ro-en,560
ru-en,560
tr-en,560


Rename columns:

In [14]:
# Map the raw column labels to the short names used from here on; labels that
# are not listed keep their original name.
replace = {
    'HUMAN.Z': 'score',
    'SID': 'sid',
    'MT': 'system',
    'mtevalBLEU': 'BLEU',
    'mtevalNIST': 'NIST',
    'mosesCDER': 'CDER',
    'mosesPER': 'PER',
    'mosesWER': 'WER',
}
seg_scores_da = seg_scores_da.rename(columns=replace)

# Disabled alternative that also keeps the text columns:
#seg_scores_da = seg_scores_da[['lp', 'system', 'sid', 'source', 'reference', 'output', 'sentBLEU', 'score']]
kept_columns = ['lp', 'system', 'sid', 'sentBLEU', 'score']
seg_scores_da = seg_scores_da[kept_columns]
seg_scores_da

Unnamed: 0,lp,system,sid,sentBLEU,score
0,en-ru,jhu-pbmt,1092,0.273012,0.363122
1,en-ru,online-G,750,0.076668,-0.450232
2,en-ru,AFRL-MITLL-phrase-based,2786,0.252464,0.113451
3,en-ru,LIMSI,250,0.531697,-0.257524
4,en-ru,AFRL-MITLL-phrase-based,88,0.097414,-0.695001
...,...,...,...,...,...
555,de-en,uedin-syntax,2921,0.316149,-0.893829
556,de-en,KIT,2090,0.772290,1.019740
557,de-en,KIT,2158,0.526244,0.934524
558,de-en,KIT,2097,0.292517,0.867003


In [15]:
# Build a (lp, system, sid) -> output/reference/source lookup and attach the
# text to the segment-level scores.
# The join keys are spelled out and the two key columns are renamed by name,
# so the labels stay correct even if the column order of either input changes.
src_ref_out_df = (
    out_df
    .merge(src_ref_df, on=['lp', 'SID'], how='left')
    .rename(columns={'SID': 'sid', 'SYS': 'system'})
)

# The two printed lengths should match: the inner join is expected to find
# text for every scored segment.
print(len(seg_scores_da))
seg_scores_da = seg_scores_da.merge(src_ref_out_df, on=['lp', 'system', 'sid'], how='inner')
print(len(seg_scores_da))

3920
3920


### Pickle the segment-level data

In [16]:
# Persist the segment-level frame. The output directory is created if missing,
# and the file is written inside a context manager so it is flushed and closed
# before the cell finishes.
os.makedirs('data/pickles', exist_ok=True)
with open('data/pickles/wmt16-seg_level-agg.pkl', 'wb') as seg_file:
    pickle.dump(seg_scores_da, seg_file)