In [2]:
import pickle
import pandas as pd
from tqdm import tqdm
import numpy as np
import json

### Data

In [86]:
# Load the per-year WMT system metadata frames.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load these files if they come from a trusted source.
# Each file is opened in a context manager so its handle is closed right after
# the load instead of lingering until garbage collection.
with open('../wmt16-19-metrics-shared-task/wmt_metadata/pickles/wmt16_sys_metadata.pkl', 'rb') as pkl_file:
    wmt16 = pickle.load(pkl_file)
with open('../wmt16-19-metrics-shared-task/wmt_metadata/pickles/wmt17_sys_metadata.pkl', 'rb') as pkl_file:
    wmt17 = pickle.load(pkl_file)
with open('../wmt16-19-metrics-shared-task/wmt_metadata/pickles/wmt18_sys_metadata.pkl', 'rb') as pkl_file:
    wmt18 = pickle.load(pkl_file)
with open('../wmt16-19-metrics-shared-task/wmt_metadata/pickles/wmt19_sys_metadata.pkl', 'rb') as pkl_file:
    wmt19 = pickle.load(pkl_file)

In [87]:
# Harmonise the system-identifier column across years so every frame exposes
# it as 'system' (the later groupby calls key on 'lp', 'system', 'sid').
# The frames are rebound instead of mutated in place, and the drop tolerates a
# missing column, so re-running this cell is a no-op rather than a KeyError.
# NOTE(review): wmt19 is grouped by 'system' later without a rename, so it
# presumably already carries a 'system' column next to 'sys_id' -- confirm.
wmt19 = wmt19.drop(columns='sys_id', errors='ignore')
wmt18 = wmt18.rename(columns={'sys_id': 'system'})
wmt17 = wmt17.rename(columns={'sys_id': 'system'})
wmt16 = wmt16.rename(columns={'sys_id': 'system'})

In [88]:
# Row counts per annotation type in the WMT16 frame.
wmt16['type'].value_counts()

SYSTEM     147000
REPEAT      21000
REF         21000
BAD_REF     21000
Name: type, dtype: int64

In [89]:
# Row counts per annotation type in the WMT17 frame.
wmt17['type'].value_counts()

SYSTEM     273822
REPEAT      33885
REF         33835
BAD_REF     33783
Name: type, dtype: int64

In [90]:
# Row counts per annotation type in the WMT18 frame.
wmt18['type'].value_counts()

SYSTEM     265387
BAD_REF     36924
REPEAT      26489
REF         26003
Name: type, dtype: int64

In [91]:
# Row counts per annotation type in the WMT19 frame.
wmt19['type'].value_counts()

SYSTEM     139963
REPEAT      13266
REF         13177
BAD_REF     13113
Name: type, dtype: int64

### Pooled variances

In [92]:
def pooled_variance(groups, variable='score'):
    """Return the pooled (within-group) sample variance of one column.

    The result is ``sum((n_g - 1) * s_g**2) / sum(n_g - 1)``, where ``n_g`` is
    the number of non-missing observations in group ``g`` and ``s_g**2`` is its
    unbiased sample variance (``ddof=1``). Groups with fewer than two
    non-missing observations carry no variance information and are skipped.

    Parameters
    ----------
    groups : iterable of (key, DataFrame)
        Anything that yields ``(key, frame)`` pairs, e.g. ``df.groupby(...)``.
    variable : str, default 'score'
        Name of the column whose variance is pooled.

    Returns
    -------
    float
        The pooled variance.

    Raises
    ------
    ValueError
        If no group has at least two non-missing observations, so the pooled
        variance is undefined.
    """
    weighted_sum = 0.0
    degrees_of_freedom = 0

    for _, group in groups:
        # Select the column first: reducing the whole frame would also try to
        # reduce non-numeric columns (which raises on pandas >= 2) and wastes
        # work on columns that are never used.
        values = group[variable].dropna()
        n = len(values)
        if n < 2:
            continue
        degrees_of_freedom += n - 1
        weighted_sum += (n - 1) * values.var()

    if degrees_of_freedom == 0:
        raise ValueError(
            'pooled variance is undefined: no group has at least two '
            'non-missing %r observations' % variable
        )

    return weighted_sum / degrees_of_freedom

In [96]:
# Pooled variance of the score per WMT year, where each group is one
# (language pair, system, segment id) combination. Only SYSTEM and REPEAT
# rows are kept.
pooled_vars = {}
for year, frame in zip(('wmt16', 'wmt17', 'wmt18', 'wmt19'),
                       (wmt16, wmt17, wmt18, wmt19)):
    keep = frame['type'].isin(['SYSTEM', 'REPEAT'])
    by_item = frame[keep].groupby(['lp', 'system', 'sid'])
    pooled_vars[year] = pooled_variance(by_item)

In [40]:
# Display the per-year pooled variances.
# NOTE(review): the saved output of this cell is a NameError and its execution
# count (40) is lower than that of the cell defining pooled_vars (96), so the
# output is stale; re-run the notebook top to bottom to refresh it.
pooled_vars

NameError: name 'pooled_vars' is not defined

In [106]:
# Pooled variance of the score per WMT year, restricted to SYSTEM/REPEAT rows
# whose language-pair code ends in 'en'. Groups are
# (language pair, system, segment id) combinations.
pooled_vars_toen = {}
for year, frame in zip(('wmt16', 'wmt17', 'wmt18', 'wmt19'),
                       (wmt16, wmt17, wmt18, wmt19)):
    is_scored = frame['type'].isin(['SYSTEM', 'REPEAT'])
    is_to_en = frame['lp'].str.endswith('en')
    by_item = frame[is_scored & is_to_en].groupby(['lp', 'system', 'sid'])
    pooled_vars_toen[year] = pooled_variance(by_item)

In [107]:
# Display the per-year pooled variances for language pairs ending in 'en'.
pooled_vars_toen

{'wmt16': 307.17626731473774,
 'wmt17': 527.2973972404222,
 'wmt18': 382.8504227475613,
 'wmt19': 458.6292985991144}

### Total variances

In [98]:
# Total sample variance of the score per WMT year over SYSTEM/REPEAT rows.
total_vars = {}
for year, frame in [('wmt16', wmt16), ('wmt17', wmt17), ('wmt18', wmt18), ('wmt19', wmt19)]:
    scored = frame[frame['type'].isin(['SYSTEM', 'REPEAT'])]
    # Select the column before reducing: DataFrame.var() would also try to
    # reduce the string columns (lp, system, type), which raises on pandas >= 2
    # and computes variances that are thrown away.
    total_vars[year] = scored['score'].var()

In [99]:
# Display the per-year total score variances.
total_vars

{'wmt16': 929.3833283786197,
 'wmt17': 941.6081748841809,
 'wmt18': 860.2982230631335,
 'wmt19': 888.5818433621715}

In [108]:
# Total sample variance of the score per WMT year, restricted to SYSTEM/REPEAT
# rows whose language-pair code ends in 'en'.
total_vars_toen = {}
for year, frame in [('wmt16', wmt16), ('wmt17', wmt17), ('wmt18', wmt18), ('wmt19', wmt19)]:
    is_scored = frame['type'].isin(['SYSTEM', 'REPEAT'])
    is_to_en = frame['lp'].str.endswith('en')
    # Select the column before reducing: DataFrame.var() would also try to
    # reduce the string columns (lp, system, type), which raises on pandas >= 2
    # and computes variances that are thrown away.
    total_vars_toen[year] = frame.loc[is_scored & is_to_en, 'score'].var()

In [109]:
# Display the per-year total score variances for language pairs ending in 'en'.
total_vars_toen

{'wmt16': 900.345673312272,
 'wmt17': 879.1690392977753,
 'wmt18': 795.9652837203586,
 'wmt19': 829.7852689294594}

### Export

In [105]:
# Persist the all-language-pair variances. The context managers close each
# handle right after the dump, so the JSON is fully flushed to disk.
with open('./vars/pooled_vars.json', 'wt') as out_file:
    json.dump(pooled_vars, out_file)
with open('./vars/total_vars.json', 'wt') as out_file:
    json.dump(total_vars, out_file)

In [110]:
# Persist the to-English variances. The context managers close each handle
# right after the dump, so the JSON is fully flushed to disk.
with open('./vars/pooled_vars_toen.json', 'wt') as out_file:
    json.dump(pooled_vars_toen, out_file)
with open('./vars/total_vars_toen.json', 'wt') as out_file:
    json.dump(total_vars_toen, out_file)

# Results

In [6]:
# Reload the exported to-English variances so this section can run without
# recomputing them; the context managers close the file handles promptly.
with open('./vars/total_vars_toen.json') as in_file:
    total_vars_toen = json.load(in_file)
with open('./vars/pooled_vars_toen.json') as in_file:
    pooled_vars_toen = json.load(in_file)

years = ['wmt16', 'wmt17', 'wmt18', 'wmt19']


def _latex_row(label, values):
    """Format one LaTeX table row: the label, then each value with two decimals."""
    cells = [label] + ['%.2f' % value for value in values]
    return ' & '.join(cells) + ' \\\\'


# The labels are raw strings so the LaTeX backslashes stay literal without
# relying on invalid escape sequences (\s, \V, \E), which Python warns about.

# Square root of the total variance.
print(_latex_row(r'$\sqrt{\Var(H(x))}$',
                 [np.sqrt(total_vars_toen[year]) for year in years]))

# Square root of the pooled (within-item) variance.
print(_latex_row(r'$\sqrt{\Expect[\Var(H(x) | x)]}$',
                 [np.sqrt(pooled_vars_toen[year]) for year in years]))

# Remaining variance: total minus pooled.
p_vars = {year: total_vars_toen[year] - pooled_vars_toen[year] for year in years}
print(_latex_row(r'$\sqrt{\Var(P(x))}$',
                 [np.sqrt(p_vars[year]) for year in years]))

# Ratio of the total variance to the remaining variance.
print(_latex_row(r'$\Var(H(x)) / \Var(P(x))$',
                 [total_vars_toen[year] / p_vars[year] for year in years]))

$\sqrt{\Var(H(x))}$ & 30.01 & 29.65 & 28.21 & 28.81 \\
$\sqrt{\Expect[\Var(H(x) | x)]}$ & 17.53 & 22.96 & 19.57 & 21.42 \\
$\sqrt{\Var(P(x))}$ & 24.36 & 18.76 & 20.33 & 19.27 \\
$\Var(H(x)) / \Var(P(x))$ & 1.52 & 2.50 & 1.93 & 2.24 \\
