In [1]:
import glob
import gzip
import itertools
import csv
import io
import pickle
import hashlib
import pandas as pd
import numpy as np

# WMT19 system-level data

### WMT19 system-level data (raw)

In [2]:
mturk = pd.read_csv('data/newstest2019-humaneval/mturk-sntlevel-humaneval-newstest2019/analysis/ad-good-raw-redup.csv', sep='\t')
print(len(mturk))
turkle = pd.read_csv('data/newstest2019-humaneval/turkle-sntlevel-humaneval-newstest2019/analysis/ad-good-raw-redup.csv', sep='\t')
print(len(turkle))
raw_seg_scores_da = pd.concat([mturk, turkle])
print(len(raw_seg_scores_da))

131720
47799
179519


In [3]:
raw_seg_scores_da['lp'] = raw_seg_scores_da.apply(lambda x: '%s-%s' % (x['Input.src'], x['Input.trg']), axis=1)
raw_seg_scores_da.head(1)

Unnamed: 0,HITId,WorkerId,Input.src,Input.trg,Input.item,hit,sys_id,rid,type,sid,score,time,lp
0,3MA5N0ATTCBZLJ39L2ULJO4IBBWWK2,M0765,fi,en,ad,1,parfda.6526,0,SYSTEM,367,71,626,fi-en


In [4]:
raw_seg_scores_da.sys_id.unique()

array(['parfda.6526', 'TartuNLP-c.6905',
       'apertium-fin-eng-unconstrained-fien.6449', 'Helsinki-NLP.6889',
       'online-Y.0', 'GTCOM-Primary.6946', 'online-X.0', 'MSRA.NAO.6983',
       'online-B.0', 'online-G.0', 'online-A.0', 'USYD.6995',
       'UdS-DFKI.6861', 'UEDIN.6534', 'CUNI-T2T-transfer-guen.6431',
       'NEU.6756', 'IITP-MT.6824', 'Ju-Saarland.6525',
       'GTCOM-Primary.6969', 'aylien-mt-gu-en-multilingual.6826',
       'IIITH-MT.6688', 'NICT.6603', 'NEU.6803', 'rerank-re.6540',
       'eTranslation.6598', 'afrl-ewc.6659', 'MSRA.SCA.6976',
       'Facebook-FAIR.6937', 'afrl-syscomb19.6782', 'TartuNLP-u.6650',
       'NICT.6561', 'tilde-c-nmt.6876', 'NEU.6759', 'GTCOM-Primary.6998',
       'TartuNLP-c.6908', 'tilde-nc-nmt.6881', 'MSRA.MASS.6945',
       'JUMT.6616', 'talp-upc-2019-kken.6657', 'DBMS-KU-KKEN.6726',
       'CUNI-T2T-transfer-kken.6436', 'UMD.6736', 'NICT.6770',
       'rug-kken-morfessor.6677', 'Frank-s-MT.6127', 'NEU.6753',
       'NRC-CNRC.6895', 'o

In [5]:
def rchop(s, sub):
    return s[:-len(sub)] if s.endswith(sub) else s

def cut_trailing_lp(x):
    for i in ['.de-fr', '.fr-de', '.de-cs', '.cs-de', '.zh-en']:
        if x.endswith(i):            
            return rchop(x, i)
    return x

def fix_unsupervised(x):
    if x.startswith('Unsupervised'):
        parts = x.split('.')
        return 'Unsupervised.%s' % parts[-1]
    return x
raw_seg_scores_da['sys_id'] = raw_seg_scores_da['sys_id'].apply(cut_trailing_lp)
raw_seg_scores_da['sys_id'] = raw_seg_scores_da['sys_id'].apply(fix_unsupervised)
print(raw_seg_scores_da.sys_id.unique())

['parfda.6526' 'TartuNLP-c.6905'
 'apertium-fin-eng-unconstrained-fien.6449' 'Helsinki-NLP.6889'
 'online-Y.0' 'GTCOM-Primary.6946' 'online-X.0' 'MSRA.NAO.6983'
 'online-B.0' 'online-G.0' 'online-A.0' 'USYD.6995' 'UdS-DFKI.6861'
 'UEDIN.6534' 'CUNI-T2T-transfer-guen.6431' 'NEU.6756' 'IITP-MT.6824'
 'Ju-Saarland.6525' 'GTCOM-Primary.6969'
 'aylien-mt-gu-en-multilingual.6826' 'IIITH-MT.6688' 'NICT.6603'
 'NEU.6803' 'rerank-re.6540' 'eTranslation.6598' 'afrl-ewc.6659'
 'MSRA.SCA.6976' 'Facebook-FAIR.6937' 'afrl-syscomb19.6782'
 'TartuNLP-u.6650' 'NICT.6561' 'tilde-c-nmt.6876' 'NEU.6759'
 'GTCOM-Primary.6998' 'TartuNLP-c.6908' 'tilde-nc-nmt.6881'
 'MSRA.MASS.6945' 'JUMT.6616' 'talp-upc-2019-kken.6657'
 'DBMS-KU-KKEN.6726' 'CUNI-T2T-transfer-kken.6436' 'UMD.6736' 'NICT.6770'
 'rug-kken-morfessor.6677' 'Frank-s-MT.6127' 'NEU.6753' 'NRC-CNRC.6895'
 'NICT.6938' 'CAiRE.6949' 'lmu-unsup-nmt-de-cs.6845' 'Unsupervised.6935'
 'Unsupervised.6929' 'NEU_KingSoft.6766' 'CUNI-Unsupervised-NER-post.6934'

### WMT system-level (src, ref, out)

In [6]:
raw_seg_scores_da['lp'] = raw_seg_scores_da.apply(lambda x: '%s-%s' % (x['Input.src'], x['Input.trg']), axis=1)
raw_seg_scores_da.head(1)

Unnamed: 0,HITId,WorkerId,Input.src,Input.trg,Input.item,hit,sys_id,rid,type,sid,score,time,lp
0,3MA5N0ATTCBZLJ39L2ULJO4IBBWWK2,M0765,fi,en,ad,1,parfda.6526,0,SYSTEM,367,71,626,fi-en


In [7]:
raw_seg_scores_da.head(1)

Unnamed: 0,HITId,WorkerId,Input.src,Input.trg,Input.item,hit,sys_id,rid,type,sid,score,time,lp
0,3MA5N0ATTCBZLJ39L2ULJO4IBBWWK2,M0765,fi,en,ad,1,parfda.6526,0,SYSTEM,367,71,626,fi-en


In [8]:
# srcs and refs
srcs, refs, lps, sids = [], [], [], []
for lp in raw_seg_scores_da.lp.unique():
    fr, to = lp[:2], lp[3:]
    print(lp)
    
    refs_ = list(open('data/wmt19-submitted-data/txt/references/newstest2019-%s%s-ref.%s' % (fr, to, to)))
    srcs_ = list(open('data/wmt19-submitted-data/txt/sources/newstest2019-%s%s-src.%s' % (fr, to, fr)))
    sids_ = list(range(1, len(refs_)+1))
    refs.extend(refs_)
    srcs.extend(srcs_)
    sids.extend(sids_)
    
    assert(len(refs_) == len(srcs_))
    lps.extend([lp]*len(refs_))
    
src_ref_df = pd.DataFrame({'reference' : refs, 'source':srcs, 'lp': lps, 'sid': sids})
print(len(src_ref_df), len(raw_seg_scores_da))
raw_seg_scores_da = raw_seg_scores_da.merge(src_ref_df, on=['lp','sid'], how='left')
print(len(raw_seg_scores_da))

fi-en
gu-en
ru-en
lt-en
kk-en
de-cs
de-fr
fr-de
zh-en
14411 179519
179519


In [9]:
# outs
lps, outs, sids, syss = [], [], [], []
for file in glob.glob('data/wmt19-submitted-data/txt/system-outputs/newstest2019/*/*'):
    lp = file.split('.')[-1]
    system = '.'.join(file.split('.')[1:-1])
    #print(lp, system)
    
    if 'Unsupervised.' in system:
        system = system[:13] + system[-4:]
    elif '_' in system and not any(['NEU_KingSoft' in system, 'Kyoto_University' in system, 'lingua_custodia_primary' in system]):
        system = system.replace('_', '-')
    
    outs_ = list(open(file, 'rt'))
    if lp in ['de-fr', 'fr-de', 'de-cs', 'cs-de', 'zh-en']:
        sids_ = list(range(0, len(outs_)))
    else:
        sids_ = list(range(1, len(outs_)+1))
    lps_ = len(outs_) * [lp]
    syss_ = len(outs_) * [system]
    
    outs.extend(outs_)
    sids.extend(sids_)
    lps.extend(lps_)
    syss.extend(syss_)

out_df = pd.DataFrame({'lp': lps, 'output':outs, 'sid': sids, 'system': syss})
print(len(out_df), len(raw_seg_scores_da))
raw_seg_scores_da = raw_seg_scores_da.merge(out_df, left_on=['lp','sid', 'sys_id'], right_on=['lp', 'sid', 'system'], how='left')
print(len(raw_seg_scores_da))

391536 179519
179519


In [10]:
raw_seg_scores_da.count()

HITId         179519
WorkerId      179519
Input.src     179519
Input.trg     179519
Input.item    179519
hit           179519
sys_id        179519
rid           179519
type          179519
sid           179519
score         179519
time          179519
lp            179519
reference     179490
source        179490
output        179519
system        179519
dtype: int64

In [11]:
raw_seg_scores_da[raw_seg_scores_da.system.isna()]

Unnamed: 0,HITId,WorkerId,Input.src,Input.trg,Input.item,hit,sys_id,rid,type,sid,score,time,lp,reference,source,output,system


### Pickle

In [12]:
pickle.dump(raw_seg_scores_da, open('./pickles/wmt19_sys_metadata.pkl', 'wb'))
#pickle.dump(sys_scores_da, open('data/pickles/wmt19-sys_level-agg.pkl', 'wb'))