In [1]:
import pandas as pd
import glob
import os
import pickle

### System-level data

In [2]:
# Load the system-level table; it supplies the reference / source / output
# text that is joined onto the raw judgments further down.
# NOTE(review): pickle.load can execute arbitrary code -- only load pickles
# produced by this project.
# The context manager closes the handle that a bare open() would leak.
with open('../data/pickles/wmt16-sys_level-all.pkl', 'rb') as pkl_file:
    wmt16_no_metadata = pickle.load(pkl_file)
wmt16_no_metadata.count()

system       141905
sid          141905
raw_score    141905
score        141905
N            141905
lp           141905
reference    141905
source       141905
output       141905
dtype: int64

In [3]:
# Read the raw system-level judgments (tab-separated despite the .csv
# extension) and show the non-null count of every column.
df = pd.read_csv(
    './data/da-human-judgments/wmt2016-DA-sys-anon/adequacy/analysis/ad-good-raw.csv',
    sep='\t',
)
df.count()

HITId         210000
WorkerId      210000
Input.src     210000
Input.trg     210000
Input.item    210000
hit           210000
sys_id        210000
rid           210000
type          210000
sid           210000
score         210000
time          210000
dtype: int64

In [4]:
# Build the language-pair key (e.g. 'en-ru') from the source and target codes.
# Vectorised concatenation replaces the row-wise apply: no Python call per row,
# and it also works on an empty frame. astype(str) keeps the '%s'-style string
# formatting for any non-string values.
df['lp'] = df['Input.src'].astype(str) + '-' + df['Input.trg'].astype(str)
df.head(1)

Unnamed: 0,HITId,WorkerId,Input.src,Input.trg,Input.item,hit,sys_id,rid,type,sid,score,time,lp
0,3XJOUITW8URT45XA3297SQ9SRI6QT9,A0447,en,ru,ad,0,PROMT-Rule-based,0,SYSTEM,52,79,1661,en-ru


In [5]:
# Attach the sentence text (reference / source / output) to every judgment by
# matching on language pair, system name and segment id. The system name is
# called 'sys_id' on the judgment side and 'system' on the text side.
wmt16_metadata = df.merge(
    wmt16_no_metadata[['system', 'sid', 'lp', 'reference', 'source', 'output']],
    left_on=['lp', 'sys_id', 'sid'],
    right_on=['lp', 'system', 'sid'],
    how='left',
)
# A left join keeps every judgment, but duplicated keys on the right side
# would silently multiply rows; fail loudly instead of pickling inflated data.
assert len(wmt16_metadata) == len(df), 'merge changed the number of judgment rows'
wmt16_metadata.count()

HITId         210000
WorkerId      210000
Input.src     210000
Input.trg     210000
Input.item    210000
hit           210000
sys_id        210000
rid           210000
type          210000
sid           210000
score         210000
time          210000
lp            210000
system        210000
reference     210000
source        210000
output        210000
dtype: int64

In [6]:
# Keep only the listed columns, in this order.
wmt16_metadata = wmt16_metadata.loc[
    :,
    [
        'lp', 'HITId', 'WorkerId', 'score', 'time', 'system',
        'type', 'sid', 'reference', 'source', 'output',
    ],
]

In [7]:
# Persist the system-level judgments together with their sentence text.
# The context manager guarantees the file is flushed and closed; a bare
# open() handle passed inline is never closed explicitly.
with open('pickles/wmt16_sys_metadata.pkl', 'wb') as pkl_file:
    pickle.dump(wmt16_metadata, pkl_file)

### Segment-level data

In [8]:
# Load the segment-level table; it supplies the reference / source / output
# text that is joined onto the raw judgments further down. This rebinds
# wmt16_no_metadata, replacing the system-level table loaded earlier.
# NOTE(review): pickle.load can execute arbitrary code -- only load pickles
# produced by this project.
# The context manager closes the handle that a bare open() would leak.
with open('../data/pickles/wmt16-seg_level-agg.pkl', 'rb') as pkl_file:
    wmt16_no_metadata = pickle.load(pkl_file)
wmt16_no_metadata.count()

lp           3920
system       3920
sid          3920
sentBLEU     3920
score        3920
output       3920
reference    3920
source       3920
dtype: int64

In [9]:
# Read the raw segment-level judgments (tab-separated despite the .csv
# extension).
df = pd.read_csv(
    './data/da-human-judgments/wmt2016-DA-seg-anon/analysis/ad-good-raw.csv',
    sep='\t',
)
# Per (source language, target language, system, segment) group, count the
# non-null values of every remaining column.
df.groupby(
    ['Input.src', 'Input.trg', 'sys_id', 'sid'],
    as_index=False,
).count()

Unnamed: 0,Input.src,Input.trg,sys_id,sid,HITId,WorkerId,Input.item,hit,rid,type,score,time
0,cs,en,PJATK,51,16,16,16,16,16,16,16,16
1,cs,en,PJATK,61,16,16,16,16,16,16,16,16
2,cs,en,PJATK,66,32,32,32,32,32,32,32,32
3,cs,en,PJATK,69,17,17,17,17,17,17,17,17
4,cs,en,PJATK,89,15,15,15,15,15,15,15,15
...,...,...,...,...,...,...,...,...,...,...,...,...
3915,tr,en,tbtk-syscomb,2622,30,30,30,30,30,30,30,30
3916,tr,en,tbtk-syscomb,2641,19,19,19,19,19,19,19,19
3917,tr,en,tbtk-syscomb,2674,15,15,15,15,15,15,15,15
3918,tr,en,tbtk-syscomb,2700,34,34,34,34,34,34,34,34


In [10]:
# Build the language-pair key (e.g. 'cs-en') from the source and target codes.
# Vectorised concatenation replaces the row-wise apply: no Python call per row,
# and it also works on an empty frame. astype(str) keeps the '%s'-style string
# formatting for any non-string values.
df['lp'] = df['Input.src'].astype(str) + '-' + df['Input.trg'].astype(str)
df.head(1)

Unnamed: 0,HITId,WorkerId,Input.src,Input.trg,Input.item,hit,sys_id,rid,type,sid,score,time,lp
0,3XAOZ9UYRZRSVLL11JPXMKQXZYP1QJ,F0358,cs,en,ad,0,cu-mergedtrees,0,SYSTEM,1158,100,1328,cs-en


In [11]:
# Attach the sentence text (reference / source / output) to every judgment by
# matching on language pair, system name and segment id. The system name is
# called 'sys_id' on the judgment side and 'system' on the text side.
wmt16_metadata = df.merge(
    wmt16_no_metadata[['system', 'sid', 'lp', 'reference', 'source', 'output']],
    left_on=['lp', 'sys_id', 'sid'],
    right_on=['lp', 'system', 'sid'],
    how='left',
)
# A left join keeps every judgment, but duplicated keys on the right side
# would silently multiply rows; fail loudly instead of pickling inflated data.
assert len(wmt16_metadata) == len(df), 'merge changed the number of judgment rows'
wmt16_metadata.count()

HITId         100300
WorkerId      100300
Input.src     100300
Input.trg     100300
Input.item    100300
hit           100300
sys_id        100300
rid           100300
type          100300
sid           100300
score         100300
time          100300
lp            100300
system        100300
reference     100300
source        100300
output        100300
dtype: int64

In [12]:
# Keep only the listed columns, in this order.
wmt16_metadata = wmt16_metadata.loc[
    :,
    [
        'lp', 'HITId', 'WorkerId', 'score', 'time', 'system',
        'type', 'sid', 'reference', 'source', 'output',
    ],
]

In [13]:
# Persist the segment-level judgments together with their sentence text.
# The context manager guarantees the file is flushed and closed; a bare
# open() handle passed inline is never closed explicitly.
with open('pickles/wmt16_seg_metadata.pkl', 'wb') as pkl_file:
    pickle.dump(wmt16_metadata, pkl_file)