In [55]:
import glob
import gzip
import itertools
import csv
import io
import pickle
import hashlib
import pandas as pd
import numpy as np

# WMT17 system-level data

### Checksums

In [56]:
mine = '6971c87e22cf24c11bbf6551af64ab13'
yours = hashlib.md5(open('data/downloads/wmt17-submitted-data-v1.0.tgz', 'rb').read()).hexdigest()
print(mine + '\n' + yours)
print(mine == yours)

mine = 'f45f3160ff90e64f275944028739bd41'
yours = hashlib.md5(open('data/downloads/wmt17-metrics-task-package.tgz', 'rb').read()).hexdigest()
print(mine + '\n' + yours)
print(mine == yours)

# unzip to data/
# os.system('tar -xvf data/downloads/newstest2017-segment-level-human.tar.gz -p data/')
# os.system('tar -xvf data/downloads/wmt17-metrics-task-package.tgz -p data/')

6971c87e22cf24c11bbf6551af64ab13
6971c87e22cf24c11bbf6551af64ab13
True


FileNotFoundError: [Errno 2] No such file or directory: 'data/downloads/wmt17-metrics-task-package.tgz'

### Official system-level da scores

In [57]:
official_da_sys = pd.read_csv('data/wmt17-metrics-task-package/manual-evaluation/DA-syslevel.csv', delimiter=' ', header=0)
official_da_sys.columns = ['lp', 'score', 'system']
official_da_sys['system'] = official_da_sys['system'].apply(lambda x: x.split('.')[0])

### Metric scores

In [58]:
sys_scores = pd.DataFrame(data={'lp':[], 'system':[]})

baseline_syss = glob.glob('data/wmt17-metrics-task-package/final-metric-scores/baselines/*.sys.*')

for submission in itertools.chain(baseline_syss):
    metric_name = submission.split('/')[-1]
    metric_name = metric_name[:-len('.sys.score.gz')] if metric_name.endswith('.gz') else metric_name[:-len('.sys.score')]
    print(metric_name)

    if submission.endswith('.gz'):
        hybrid_filtered = '\n'.join(i.replace(' ', '\t') for i in gzip.open(submission, 'rt') if 'hybrid' not in i)
        reader = io.StringIO(hybrid_filtered)
        metric_syss = pd.read_csv(reader, delimiter='\t', header=None)
    else:
        metric_syss = pd.read_csv(open(submission, 'rt'), delimiter='\t', header=None)
    metric_syss.columns = ['name', 'lp', 'testset', 'system', metric_name] + list(metric_syss.columns[5:])
    
    # fix system names
    metric_syss.dropna(inplace=True)
    metric_syss['system'] = metric_syss['system'].apply(lambda x: x.split('.')[0])
    
    sys_scores = sys_scores.merge(metric_syss[['lp','system',metric_name]], on=['lp','system'], how='outer')

PER
WER
CDER
baselines.en-zh
mteval
TER


In [59]:
# en-zh
en_zh_sys = pd.read_csv(gzip.open('data/wmt17-metrics-task-package/final-metric-scores/baselines/baselines.en-zh.sys_hide.score.gz', 'rt'), delimiter='\t', header=None)
en_zh_sys.columns = ['metric', 'lp', 'testset', 'system', 'score']
en_zh_sys = en_zh_sys[~en_zh_sys.system.str.contains('hybrid')]
en_zh_sys['system'] = en_zh_sys['system'].apply(lambda x: x.split('.')[0])


for metric_name in en_zh_sys['metric'].unique():
    if metric_name in ['chrF']:
        continue
    print(metric_name)
        
    metric_syss = en_zh_sys[en_zh_sys.metric == metric_name]
    metric_syss.columns = ['metric', 'lp', 'testset', 'system', metric_name]
    sys_scores = sys_scores.merge(metric_syss[['lp','system',metric_name]], on=['lp','system'], how='outer')
    
    # fix _x and _y
    sys_scores[metric_name] = [ x['%s_x' % metric_name] if not np.isnan(x['%s_x' % metric_name]) else x['%s_y' % metric_name] for i, x in sys_scores.iterrows() ]
    sys_scores = sys_scores.drop(['%s_x' % metric_name, '%s_y' % metric_name], axis=1)

FileNotFoundError: [Errno 2] No such file or directory: 'data/wmt17-metrics-task-package/final-metric-scores/baselines/baselines.en-zh.sys_hide.score.gz'

### Join metric scores

In [None]:
sys_scores_da = official_da_sys.merge(sys_scores, on=['lp', 'system'], how='left')
sys_scores_da

### WMT17 system-level data (raw)

In [None]:
lp_df = []

for file in glob.glob('data/wmt-human-evaluation/newstest2017-system-level-human/anon-proc-hits-sys-combined/analysis/ad-seg-scores-*.csv.gz'):
    lp = file[-12:-7]
    print(lp)
    
    df = pd.read_csv(gzip.open(file, 'rt'), delimiter=' ')
    df['lp'] = [lp] * len(df)
    
    lp_df.append(df)
raw_seg_scores_da = pd.concat(lp_df)

In [None]:
raw_sys_scores_da = raw_seg_scores_da.groupby(['lp', 'SYS'], as_index=False).mean()

raw_sys_scores_da.columns = ['lp', 'system', 'sid', 'raw_score', 'score', 'N', '5']
raw_sys_scores_da = raw_sys_scores_da[['lp', 'system', 'raw_score', 'score']]
raw_sys_scores_da['system'] = raw_sys_scores_da['system'].apply(lambda x: x.split('.')[0])
raw_sys_scores_da

In [None]:
sys_scores_da = sys_scores_da.merge(raw_sys_scores_da[['lp','system','raw_score']])
sys_scores_da[['lp','raw_score', 'score']].groupby('lp').corr('pearson')

### Validate correlations

In [None]:
official_da_sys \
    .groupby('lp') \
    ['system'] \
    .count()

In [None]:
# can't be reproduced:
# lv-en (differences +-0.002)
# en-lv (differences +-0.02)

In [None]:
sys_scores_da[sys_scores_da.lp.str.endswith('en')] \
    .groupby('lp') \
    .corr()[::9] \
    .round(3) \
    .T \
    .sort_index()

In [None]:
sys_scores_da[~sys_scores_da.lp.str.endswith('en')] \
    .groupby('lp') \
    .corr()[::9] \
    .round(3) \
    .T \
    .sort_index()

### WMT17 system-level (src, ref, out)

In [None]:
raw_seg_scores_da = raw_seg_scores_da[['SYS','SID','RAW.SCR','Z.SCR','N','lp']]
raw_seg_scores_da.columns = ['system', 'sid', 'raw_score', 'score', 'N', 'lp']

raw_seg_scores_da['system'] = raw_seg_scores_da['system'].apply(lambda x: x.split('.')[0]) 
raw_seg_scores_da

In [None]:
# srcs and refs
srcs, refs, lps, sids = [], [], [], []
for lp in sys_scores_da.lp.unique():
    fr, to = lp[:2], lp[3:]
    
    refs_ = list(open('data/wmt17-submitted-data/txt/references/newstest2017-%s%s-ref.%s' % (fr, to, to)))
    srcs_ = list(open('data/wmt17-submitted-data/txt/sources/newstest2017-%s%s-src.%s' % (fr, to, fr)))
    sids_ = list(range(1, len(refs_)+1))
    refs.extend(refs_)
    srcs.extend(srcs_)
    sids.extend(sids_)
    
    assert(len(refs_) == len(srcs_))
    lps.extend([lp]*len(refs_))
    
src_ref_df = pd.DataFrame({'reference' : refs, 'source':srcs, 'lp': lps, 'sid': sids})
print('# of entries before merge: %d' % len(raw_seg_scores_da))
raw_seg_scores_da = raw_seg_scores_da.merge(src_ref_df, on=['lp','sid'], how='inner')
print('# of entries after merge: %d' % len(raw_seg_scores_da))
print('These two should be equal.')

In [None]:
# outs
lps, outs, sids, syss = [], [], [], []
for file in glob.glob('data/wmt17-submitted-data/txt/system-outputs/newstest2017/*/*'):
    lp = file.split('.')[-1]
    system = file.split('.')[-3]
    
    # manual fixes
    if 'tuning' in file:
        system = file.split('.')[1]
    elif system == 'CASICT-DCU-NMT':
        system = 'CASICT-cons'
    elif system == 'FBK':
        system = 'fbk-nmt-combination'    
    
    outs_ = list(open(file, 'rt'))
    sids_ = list(range(1, len(outs_)+1))
    lps_ = len(outs_) * [lp]
    syss_ = len(outs_) * [system]
    
    outs.extend(outs_)
    sids.extend(sids_)
    lps.extend(lps_)
    syss.extend(syss_)

out_df = pd.DataFrame({'lp': lps, 'output':outs, 'sid': sids, 'system': syss})
print('# of entries before merge: %d' % len(raw_seg_scores_da))
raw_seg_scores_da = raw_seg_scores_da.merge(out_df, on=['lp','sid', 'system'], how='inner')
print('# of entries after merge: %d' % len(raw_seg_scores_da))
print('These two should be equal.')

### Pickle

In [None]:
pickle.dump(raw_seg_scores_da, open('data/pickles/wmt17-sys_level-all.pkl', 'wb'))
pickle.dump(sys_scores_da, open('data/pickles/wmt17-sys_level-agg.pkl', 'wb'))

# WMT17 segment-level data

In [71]:
seg_scores_da = pd.read_csv('data/wmt17-metrics-task-package/manual-evaluation/DA-seglevel.csv', delimiter=' ')
seg_scores_da.columns = ['lp', 'testset', 'system', 'sid', 'score']
seg_scores_da['system'] = seg_scores_da['system'].apply(lambda x: x.split('.')[0]) 
print(len(seg_scores_da))
seg_scores_da = seg_scores_da.merge(out_df[['lp', 'system', 'sid', 'output']], on=['lp', 'system', 'sid'], how='inner')
seg_scores_da = seg_scores_da.merge(src_ref_df[['lp', 'sid', 'source', 'reference']], on=['lp', 'sid'], how='inner')
print(len(seg_scores_da))

5040


NameError: name 'out_df' is not defined

In [None]:
seg_scores_da = seg_scores_da[['lp', 'system', 'sid', 'output', 'source', 'reference', 'score']]

In [None]:
seg_scores_da

In [None]:
seg_scores_da.groupby('lp').count()

### Add baselines

In [72]:
seg_scores_da = pickle.load(open('data/pickles/wmt17-seg_level-agg.pkl', 'rb'))
seg_scores_da.head()

Unnamed: 0,lp,system,sid,output,source,reference,score
0,en-zh,UU-HNMT,11,“他找到了一个公寓，他和约会一个女孩，”路易斯·卡利亚说。\n,"""He found an apartment, he was dating a girl,""...",Louis Galicia 告诉 KGO：“Frank找到一间公寓，同时在跟一个女孩交往。”\n,-0.522268
1,en-zh,jhu-nmt,28,这场纠纷导致了今年参加六次罢工的初级医生，其中包括NHS历史上第一次停工。\n,The dispute has led to junior doctors taking p...,该纠纷已导致初级医生今年共参与六次罢工，包括英国国家医疗服务体系历史上的首次全面罢工。\n,-0.150441
2,en-zh,CASICT-cons,39,我迷失了。\n,I lost count.\n,我记不清了。\n,-0.330678
3,en-zh,online-A,49,"自11月以来, 俄罗斯公众舆论也出现了转变。\n",Russian public opinion has also turned since N...,自11月份开始，俄罗斯民意也有所扭转。\n,0.490614
4,en-zh,CASICT-cons,68,安卡拉对西方感到愤怒，因为它认为对企图收购的反应是微弱的。\n,Ankara is angry with the West for what it cons...,安卡拉对于西方世界对接管意图的微弱反应感到愤怒。\n,-0.431865


In [73]:
sentbleu = pd.read_csv('data/wmt17-metrics-task-package/final-metric-scores/baselines/sentence-BLEU.seg.score.gz', 
                    delimiter='\t',
                    header=None)
sentbleu.columns = [ 'name', 'lp', 'testset', 'system', 'sid', 'sentBLEU' ]
sentbleu = sentbleu[sentbleu.testset == 'newstest2017']
sentbleu['system'] = sentbleu['system'].apply(lambda x: x.split('.')[0])
sentbleu.drop_duplicates(inplace=True)
sentbleu.head()

Unnamed: 0,name,lp,testset,system,sid,sentBLEU
3494,sentBLEU,cs-en,newstest2017,online-A,1,0.265386
3495,sentBLEU,cs-en,newstest2017,online-A,2,0.189448
3496,sentBLEU,cs-en,newstest2017,online-A,3,0.249669
3497,sentBLEU,cs-en,newstest2017,online-A,4,0.312581
3498,sentBLEU,cs-en,newstest2017,online-A,5,0.173457


In [74]:
print(len(seg_scores_da))
seg_scores_da = seg_scores_da.merge(sentbleu[['lp', 'system', 'sid', 'sentBLEU']], on=['lp', 'system', 'sid'], how='inner')
print(len(seg_scores_da))

5040
5600


In [78]:
seg_scores_da[seg_scores_da.lp.str.endswith('en')].groupby('lp').corr()[1::3]['sentBLEU']

lp          
cs-en  score    0.434955
de-en  score    0.432482
fi-en  score    0.571167
lv-en  score    0.392805
ru-en  score    0.484211
tr-en  score    0.538433
zh-en  score    0.511674
Name: sentBLEU, dtype: float64

In [79]:
seg_scores_da[~seg_scores_da.lp.str.endswith('en')].groupby('lp').corr()[1::3]['sentBLEU']

lp          
en-ru  score    0.467901
en-zh  score    0.377060
Name: sentBLEU, dtype: float64

### Pickle

In [None]:
pickle.dump(seg_scores_da, open('data/pickles/wmt17-seg_level-agg.pkl', 'wb'))