In [1]:
import csv
import glob
import gzip
import hashlib
import io
import itertools
import os
import pickle

import numpy as np
import pandas as pd

# WMT16 system-level data

### Checksums

In [None]:
def md5_of_file(path, chunk_size=1 << 20):
    """Return the hex MD5 digest of the file at `path`.

    The file is read in `chunk_size`-byte pieces inside a `with` block, so the
    archive is never held in memory in full and the handle is always closed.
    """
    digest = hashlib.md5()
    with open(path, 'rb') as handle:
        for chunk in iter(lambda: handle.read(chunk_size), b''):
            digest.update(chunk)
    return digest.hexdigest()


# Expected MD5 digest of each downloaded archive.
expected_md5 = {
    'data/downloads/wmt16-submitted-data-v2.tgz': 'c983b60fa311b60c04c0293aaf1a2bc4',
    'data/downloads/wmt16-metrics-results.tar.gz': '2acd4f1d8fcc07115cc06bcaed4ff236',
}

# Print the expected digest, the computed digest, and whether they match.
for archive, mine in expected_md5.items():
    yours = md5_of_file(archive)
    print(mine + '\n' + yours)
    print(mine == yours)

# unzip to data/ (disabled by default; change the condition to extract)
if False:
    for archive in expected_md5:
        # `-C data/` makes tar change into data/ before extracting. The previous
        # `-p data/` only set "preserve permissions" and passed `data/` as the
        # name of a member to extract.
        os.system('tar -xvf %s -C data/' % archive)

### Aggregate system-level data

In [None]:
# Official system-level result files, leaving out every path that contains 'noDA'.
da_files = [
    path
    for path in glob.glob('data/wmt16-metrics-results/sys-level-results/standard/results-official/newstest2016*')
    if 'noDA' not in path
]

lp_df = []
for file in da_files:
    # Characters -8..-5 of the path are taken as the four-letter pair code
    # (presumably e.g. 'csen' - TODO confirm against the file names) and are
    # rewritten as 'xx-yy'.
    code = file[-8:-4]
    lp = '-'.join((code[:2], code[2:]))

    df = pd.read_csv(file, sep=' ')
    df['lp'] = lp
    lp_df.append(df)

# One frame holding the rows of every language pair.
sys_scores_da = pd.concat(lp_df)

Rename columns:

In [None]:
# Raw column header -> short name used in the rest of the notebook.
replace = {
    'DA': 'score',
    'MT': 'system',
    'mtevalBLEU': 'BLEU',
    'mtevalNIST': 'NIST',
    'mosesCDER': 'CDER',
    'mosesPER': 'PER',
    'mosesWER': 'WER',
}
# Headers without an entry in the mapping are kept unchanged.
sys_scores_da.columns = [replace.get(name, name) for name in sys_scores_da.columns]

# Keep only these columns, in this order.
sys_scores_da = sys_scores_da[
    ['lp', 'system', 'BLEU', 'NIST', 'CDER', 'PER', 'WER', 'TER', 'score']
]
sys_scores_da

### WMT16 system-level data (all/raw)

In [None]:
lp_df = []

# One CSV per language pair; characters -9..-5 of the path are used as the
# pair label (presumably of the form 'xx-yy' - TODO confirm).
for file in glob.glob('data/wmt-human-evaluation/da-human-judgments/ad-seg-scores-*.csv'):
    lp = file[-9:-4]

    df = pd.read_csv(file, sep=' ')
    # Tag every row with the language pair it was read from.
    df['lp'] = [lp for _ in range(len(df))]

    lp_df.append(df)

raw_seg_scores_da = pd.concat(lp_df)

In [None]:
# Group the raw judgments by (language pair, system) once and reuse the grouping.
by_system = raw_seg_scores_da.groupby(['lp', 'SYS'], as_index=False)

# Per-group count of the 'N' column and per-group mean of every other column.
N = by_system.count()['N']
raw_sys_scores_da = by_system.mean()
# Replace the averaged 'N' with the per-group count.
raw_sys_scores_da['N'] = N

# Positional rename: this relies on the aggregated frame having exactly these
# seven columns in this order. The last column is labelled '5' and dropped below.
raw_sys_scores_da.columns = [
    'lp', 'system', 'sid', 'raw_score', 'score', 'N', '5',
]
raw_sys_scores_da = raw_sys_scores_da[
    ['lp', 'system', 'raw_score', 'score', 'N']
]
raw_sys_scores_da

In [None]:
# Attach the averaged raw score to the official system-level table. No `on=`
# is given, so pandas joins on the columns the two frames have in common.
raw_means = raw_sys_scores_da[['lp', 'system', 'raw_score']]
sys_scores_da = sys_scores_da.merge(raw_means)

# Pearson correlation between the raw and the official score, per language pair.
score_pairs = sys_scores_da[['lp', 'raw_score', 'score']]
score_pairs.groupby('lp').corr(method='pearson')

### WMT16 (src, ref, out)

In [None]:
# srcs and refs
srcs, refs, lps, sids = [], [], [], []
for lp in sys_scores_da.lp.unique():
    # 'xx-yy' -> source language 'xx', target language 'yy'
    fr, to = lp[:2], lp[3:]

    ref_path = 'data/wmt16-submitted-data/txt/references/newstest2016-%s%s-ref.%s' % (fr, to, to)
    src_path = 'data/wmt16-submitted-data/txt/sources/newstest2016-%s%s-src.%s' % (fr, to, fr)

    # Read with an explicit encoding so the result does not depend on the
    # platform locale, and close each handle as soon as the lines are loaded.
    with open(ref_path, 'rt', encoding='utf-8') as ref_file:
        refs_ = list(ref_file)
    with open(src_path, 'rt', encoding='utf-8') as src_file:
        srcs_ = list(src_file)

    # Check alignment before anything is added to the accumulators.
    assert len(refs_) == len(srcs_), 'source/reference line counts differ for %s' % lp

    # Segment ids are the 1-based line numbers within each file.
    sids_ = list(range(1, len(refs_) + 1))
    refs.extend(refs_)
    srcs.extend(srcs_)
    sids.extend(sids_)
    lps.extend([lp] * len(refs_))

df = pd.DataFrame({'reference' : refs, 'source':srcs, 'lp': lps, 'SID': sids})
print('# of entries before merge: %d' % len(raw_seg_scores_da))
# Inner join: a judgment without a matching (lp, SID) line is dropped.
raw_seg_scores_da = raw_seg_scores_da.merge(df, on=['lp','SID'], how='inner')
print('# of entries after merge: %d' % len(raw_seg_scores_da))
print('These two should be equal.')

In [None]:
# outs
lps, outs, sids, syss = [], [], [], []
for file in glob.glob('data/wmt16-submitted-data/txt/system-outputs/newstest2016/*/*'):
    # Dot-separated fields of the path: the last one is used as the language
    # pair and the third from last as the system name.
    fields = file.split('.')
    lp = fields[-1]
    system = fields[-3]

    # Explicit encoding keeps the read independent of the platform locale;
    # the `with` block closes the handle once the lines are loaded.
    with open(file, 'rt', encoding='utf-8') as out_file:
        outs_ = list(out_file)

    # Segment ids are the 1-based line numbers within the file.
    sids_ = list(range(1, len(outs_) + 1))
    lps_ = len(outs_) * [lp]
    syss_ = len(outs_) * [system]

    outs.extend(outs_)
    sids.extend(sids_)
    lps.extend(lps_)
    syss.extend(syss_)

df = pd.DataFrame({'lp': lps, 'output':outs, 'SID': sids, 'SYS': syss})
print('# of entries before merge: %d' % len(raw_seg_scores_da))
# Inner join: a judgment without a matching (lp, SID, SYS) output line is dropped.
raw_seg_scores_da = raw_seg_scores_da.merge(df, on=['lp','SID', 'SYS'], how='inner')
print('# of entries after merge: %d' % len(raw_seg_scores_da))
print('These two should be equal.')

In [None]:
# Positional rename: this relies on the merged frame having exactly these ten
# columns in this order. The sixth is labelled 'nan' and removed right after.
raw_seg_scores_da.columns = [
    'system', 'sid', 'raw_score', 'score', 'N', 'nan',
    'lp', 'reference', 'source', 'output',
]
raw_seg_scores_da = raw_seg_scores_da.drop(columns=['nan'])

### Validate correlations

In [None]:
# Number of rows per language pair.
(
    sys_scores_da
    .groupby('lp')['lp']
    .count()
)

In [None]:
# Correlation of every numeric column with `score`, for language pairs whose
# label ends in 'en'.
into_english = sys_scores_da[sys_scores_da.lp.str.endswith('en')]

# Drop the text column `system` before correlating: pandas >= 2.0 no longer
# skips non-numeric columns silently in `corr()`.
corr_by_lp = into_english.drop(columns=['system']).groupby('lp').corr()

# Pick the 'score' row of each language pair by label. The earlier positional
# slice `[6::8]` was only right while the frame had exactly eight numeric
# columns with `score` as the seventh.
score_rows = corr_by_lp[corr_by_lp.index.get_level_values(-1) == 'score']

score_rows.round(3).T.sort_index()

In [None]:
# Correlation of every numeric column with `score`, for language pairs whose
# label does not end in 'en'.
out_of_english = sys_scores_da[~sys_scores_da.lp.str.endswith('en')]

# Drop the text column `system` before correlating: pandas >= 2.0 no longer
# skips non-numeric columns silently in `corr()`.
corr_by_lp = out_of_english.drop(columns=['system']).groupby('lp').corr()

# Pick the 'score' row of each language pair by label. The earlier positional
# slice `[6::8]` was only right while the frame had exactly eight numeric
# columns with `score` as the seventh.
score_rows = corr_by_lp[corr_by_lp.index.get_level_values(-1) == 'score']

score_rows.round(3).T.sort_index()

### Pickle

In [None]:
# Persist both system-level frames. The `with` blocks make sure each file is
# flushed and closed even if pickling raises.
with open('data/pickles/wmt16-sys_level-all.pkl', 'wb') as handle:
    pickle.dump(raw_seg_scores_da, handle)
with open('data/pickles/wmt16-sys_level-agg.pkl', 'wb') as handle:
    pickle.dump(sys_scores_da, handle)

# WMT16 segment-level data

In [2]:
def _read_lines(path):
    """Return the lines of the text file at `path`, newlines included.

    The file is decoded as UTF-8 regardless of the platform locale and is
    closed before returning.
    """
    with open(path, 'rt', encoding='utf-8') as handle:
        return list(handle)


# Segment-level metric files, leaving out every path that contains 'noDA'.
da_files = [ i for i in glob.glob('data/wmt16-metrics-results/seg-level-results/da-results/metrics.*.csv') if 'noDA' not in i ]
lp_df = []

for file in da_files:
    # Characters -9..-5 of the path are used as the language-pair label.
    lp = file[-9:-4]

    df = pd.read_csv(file, delimiter=' ')
    df['lp'] = lp

    # Line i of each text file is assigned to row i of the metrics table;
    # pandas raises if a file's line count differs from the row count.
    df['source'] = _read_lines('data/wmt16-metrics-results/seg-level-results/da-results/src.%s' % lp)
    df['output'] = _read_lines('data/wmt16-metrics-results/seg-level-results/da-results/snt.%s' % lp)
    df['reference'] = _read_lines('data/wmt16-metrics-results/seg-level-results/da-results/ref.%s' % lp)

    # Append only once the frame is fully populated.
    lp_df.append(df)

seg_scores_da = pd.concat(lp_df)

In [3]:
# Number of non-null segment ids per language pair.
seg_scores_da.groupby('lp')[['SID']].count()

Unnamed: 0_level_0,SID
lp,Unnamed: 1_level_1
cs-en,560
de-en,560
en-ru,560
fi-en,560
ro-en,560
ru-en,560
tr-en,560


Rename columns:

In [4]:
# Raw column header -> short name used in the rest of the notebook.
replace = {
    'HUMAN.Z': 'score',
    'SID': 'sid',
    'MT': 'system',
    'mtevalBLEU': 'BLEU',
    'mtevalNIST': 'NIST',
    'mosesCDER': 'CDER',
    'mosesPER': 'PER',
    'mosesWER': 'WER',
}
# Headers without an entry in the mapping are kept unchanged.
seg_scores_da.columns = [replace.get(name, name) for name in seg_scores_da.columns]

# Keep only these columns, in this order.
seg_scores_da = seg_scores_da[
    ['lp', 'system', 'sid', 'source', 'reference', 'output', 'sentBLEU', 'chrF1', 'BEER', 'score']
]
seg_scores_da

Unnamed: 0,lp,system,sid,source,reference,output,sentBLEU,chrF1,BEER,score
0,en-ru,jhu-pbmt,1092,"To recall, Luis Enrique trained the ""wolves"" d...","Напомним, что Луис Энрике тренировал ""волков"" ...","Напомним, Луис Энрике тренировал ""волков"" в се...",0.273012,73.3096,0.658724,0.363122
1,en-ru,online-G,750,It could have had it last night.\n,Она могла получить компресс вчера вечером.\n,Оно смогло иметь его вчера вечером.\n,0.076668,61.0558,0.449425,-0.450232
2,en-ru,AFRL-MITLL-phrase-based,2786,"Police asked the caller his name, but he didn'...","Полиция поинтересовалась именем звонившего, но...","Полиция попросила звонящему его имя, но он не ...",0.252464,69.0143,0.542931,0.113451
3,en-ru,LIMSI,250,"In a report this week, Morgan Stanley analyst ...","В отчете, опубликованном на этой неделе, Том К...","В отчете, опубликованном на этой неделе, по оц...",0.531697,69.1746,0.611773,-0.257524
4,en-ru,AFRL-MITLL-phrase-based,88,There is a potential investor who is ready to ...,"Есть потенциальный инвестор, готовый вложить в...","Есть потенциальный инвестор, который готов вло...",0.097414,61.5655,0.485377,-0.695001
...,...,...,...,...,...,...,...,...,...,...
555,de-en,uedin-syntax,2921,Ursprünglich hatte dieser Punkt auf dem nicht-...,Originally this point was scheduled as a part ...,This point had originally stood on the non pub...,0.316149,60.5717,0.590319,-0.893829
556,de-en,KIT,2090,"Metcash lehnte es ab, öffentlich auf die Komme...",Metcash has declined to respond publicly to Mr...,Metcash refused to respond publicly to the com...,0.772290,89.4910,0.832221,1.019740
557,de-en,KIT,2158,"Jede Wohneinheit hat zwei Schlafzimmer, ein Ba...","Each living unit has two bedrooms, one bathroo...","Each housing unit has two bedrooms, a bath, a ...",0.526244,78.9509,0.735487,0.934524
558,de-en,KIT,2097,"Wir brauchen wirklich, wirklich die Unterstütz...","We really, really need the support of Celtic b...",We really need to really support Celtic becaus...,0.292517,61.0922,0.645676,0.867003


### Validate correlations

In [5]:
# Correlate only the numeric columns. The frame also holds text columns
# (system, source, reference, output), which pandas >= 2.0 no longer skips
# silently in `corr()`.
numeric_cols = ['sid', 'sentBLEU', 'chrF1', 'BEER', 'score']
seg_scores_da.groupby('lp')[numeric_cols].corr()['score']

lp             
cs-en  sid        -0.024384
       sentBLEU    0.556577
       chrF1       0.643857
       BEER        0.661487
       score       1.000000
de-en  sid        -0.022854
       sentBLEU    0.483888
       chrF1       0.451662
       BEER        0.470757
       score       1.000000
en-ru  sid         0.055816
       sentBLEU    0.550092
       chrF1       0.641864
       BEER        0.665764
       score       1.000000
fi-en  sid        -0.012614
       sentBLEU    0.448357
       chrF1       0.454240
       BEER        0.461745
       score       1.000000
ro-en  sid        -0.133624
       sentBLEU    0.498979
       chrF1       0.570245
       BEER        0.551429
       score       1.000000
ru-en  sid        -0.022056
       sentBLEU    0.501937
       chrF1       0.521555
       BEER        0.532879
       score       1.000000
tr-en  sid         0.029222
       sentBLEU    0.531602
       chrF1       0.550505
       BEER        0.544962
       score       1.000000
Name

Compared with the paper, the de-en and fi-en columns appear to be swapped in our results. I will assume that the mistake is in the paper.

### Pickle

In [6]:
# Persist the segment-level frame. The `with` block makes sure the file is
# flushed and closed even if pickling raises.
with open('data/pickles/wmt16-seg_level-agg.pkl', 'wb') as handle:
    pickle.dump(seg_scores_da, handle)