<a href="https://colab.research.google.com/github/johntzwei/metric-statistical-advantage/blob/main/variance_analysis_wmt.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [None]:
import pickle
import pandas as pd
from tqdm import tqdm
import numpy as np
import json

# Data

In [None]:
# Clone the repository that holds the pickled WMT16-19 system metadata loaded in the next cells.
!git clone https://github.com/johntzwei/wmt16-19-metrics-shared-task.git

Cloning into 'wmt16-19-metrics-shared-task'...
remote: Enumerating objects: 168, done.[K
remote: Counting objects: 100% (48/48), done.[K
remote: Compressing objects: 100% (31/31), done.[K
remote: Total 168 (delta 26), reused 39 (delta 17), pack-reused 120[K
Receiving objects: 100% (168/168), 115.17 MiB | 18.18 MiB/s, done.
Resolving deltas: 100% (95/95), done.
Checking out files: 100% (30/30), done.


In [None]:
# NOTE(review): pickle.load can execute arbitrary code while deserialising.
# These files come from the repository cloned above; only load them if that
# source is trusted.
def _load_pickle(path):
    """Unpickle the object stored at `path`, closing the file handle afterwards."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


wmt16 = _load_pickle('wmt16-19-metrics-shared-task/wmt_metadata/pickles/wmt16_sys_metadata.pkl')
wmt17 = _load_pickle('wmt16-19-metrics-shared-task/wmt_metadata/pickles/wmt17_sys_metadata.pkl')
wmt18 = _load_pickle('wmt16-19-metrics-shared-task/wmt_metadata/pickles/wmt18_sys_metadata.pkl')
wmt19 = _load_pickle('wmt16-19-metrics-shared-task/wmt_metadata/pickles/wmt19_sys_metadata.pkl')

In [None]:
# Harmonise the system-identifier column across years: wmt19 drops its
# 'sys_id' column, while wmt16-18 rename 'sys_id' to 'system'.
# Both operations are written so the cell can be re-run safely: a second run
# finds no 'sys_id' column and is a no-op instead of raising KeyError.
wmt19 = wmt19.drop(columns='sys_id', errors='ignore')
wmt18 = wmt18.rename(columns={'sys_id': 'system'})
wmt17 = wmt17.rename(columns={'sys_id': 'system'})
wmt16 = wmt16.rename(columns={'sys_id': 'system'})

In [None]:
# Row counts per value of the `type` column, one table per year,
# each followed by a blank line.
for yearly_frame in (wmt16, wmt17, wmt18, wmt19):
    type_counts = yearly_frame['type'].value_counts()
    print(type_counts)
    print()

SYSTEM     147000
BAD_REF     21000
REPEAT      21000
REF         21000
Name: type, dtype: int64

SYSTEM     273822
REPEAT      33885
REF         33835
BAD_REF     33783
Name: type, dtype: int64

SYSTEM     265387
BAD_REF     36924
REPEAT      26489
REF         26003
Name: type, dtype: int64

SYSTEM     139963
REPEAT      13266
REF         13177
BAD_REF     13113
Name: type, dtype: int64



# Variance analysis

### Pooled variances

In [None]:
def pooled_variance(groups, variable='score'):
    """Pooled (within-group) sample variance of one column.

    Computes ``sum((n_g - 1) * s_g**2) / sum(n_g - 1)`` over all groups,
    where ``n_g`` is the number of rows in group ``g`` and ``s_g**2`` is the
    sample variance (ddof=1) of ``variable`` within that group.

    Parameters
    ----------
    groups : iterable of (key, DataFrame) pairs
        For example a pandas ``DataFrame.groupby(...)`` object.
    variable : str, default 'score'
        Name of the column whose variance is pooled.

    Returns
    -------
    float
        The pooled variance, or NaN when no group has at least two rows
        (there are then no degrees of freedom to estimate from).
    """
    weighted_var_sum = 0.0
    dof_total = 0

    for _, group in groups:
        n = len(group)
        # A single observation carries no within-group variance information.
        if n < 2:
            continue
        dof = n - 1
        # Only the requested column is reduced. Reducing the whole frame is
        # wasteful and fails on recent pandas when it has non-numeric columns.
        weighted_var_sum += dof * group[variable].var()
        dof_total += dof

    if dof_total == 0:
        return float('nan')
    return weighted_var_sum / dof_total

In [None]:
# Pooled variance of `score` per WMT year, restricted to SYSTEM/REPEAT rows
# whose language pair ends in 'en', grouping rows by (lp, system, sid).
pooled_vars_toen = {}
for year, frame in (('wmt16', wmt16), ('wmt17', wmt17), ('wmt18', wmt18), ('wmt19', wmt19)):
    wanted_type = frame.type.isin(['SYSTEM', 'REPEAT'])
    into_english = frame.lp.str.endswith('en')
    to_en_rows = frame[wanted_type & into_english]
    per_item_groups = to_en_rows.groupby(['lp', 'system', 'sid'])
    pooled_vars_toen[year] = pooled_variance(per_item_groups)

In [None]:
# Display the pooled variance computed for each WMT year.
pooled_vars_toen

{'wmt16': 307.17626731473774,
 'wmt17': 527.2973972404222,
 'wmt18': 382.8504227475613,
 'wmt19': 458.6292985991144}

### Total variances

In [None]:
# Total sample variance of `score` per WMT year, over SYSTEM/REPEAT rows whose
# language pair ends in 'en'.
total_vars_toen = {}
for i, dataset in [('wmt16', wmt16), ('wmt17', wmt17), ('wmt18', wmt18), ('wmt19', wmt19)]:
    dataset = dataset[dataset.type.isin(['SYSTEM', 'REPEAT']) &
                      dataset.lp.str.endswith('en')]
    # Reduce only the `score` column. Calling DataFrame.var() on the whole
    # frame also tries the string columns (lp, system, type), which raises
    # TypeError on recent pandas.
    total_vars_toen[i] = dataset['score'].var()

In [None]:
# Display the total variance computed for each WMT year.
total_vars_toen

{'wmt16': 900.3456733124744,
 'wmt17': 879.1690392974084,
 'wmt18': 795.9652837204678,
 'wmt19': 829.785268929387}

# Results

### JSON

In [None]:
# Serialise the pooled variances to a JSON string; as the cell's last
# expression it is shown in the output, not written to disk.
json.dumps(pooled_vars_toen)

'{"wmt16": 307.17626731473774, "wmt17": 527.2973972404222, "wmt18": 382.8504227475613, "wmt19": 458.6292985991144}'

In [None]:
# Serialise the total variances to a JSON string; as the cell's last
# expression it is shown in the output, not written to disk.
json.dumps(total_vars_toen)

'{"wmt16": 900.3456733124744, "wmt17": 879.1690392974084, "wmt18": 795.9652837204678, "wmt19": 829.785268929387}'

### Table

In [None]:
def _print_latex_row(label, values):
    """Print one LaTeX table row: `label`, then each value formatted to two
    decimals, cells separated by ' & ' and terminated by a LaTeX line break."""
    print(label, end=' & ')
    print(' & '.join('%.2f' % value for value in values) + r' \\')


# LaTeX snippets are raw strings so backslashes such as \sqrt and \Var are
# taken literally and do not trigger invalid-escape-sequence warnings.
print(
r'''
\begin{table}[!h]
\small
\centering
\begin{tabular}{l|cccc}
& 2016 & 2017 & 2018 & 2019 \\
\midrule
'''
)

cols = ['wmt16', 'wmt17', 'wmt18', 'wmt19']

# general human std dev
_print_latex_row(
    r'$\sqrt{\Var(H(x))}$',
    [np.sqrt(total_vars_toen[year]) for year in cols],
)

# expected human std dev for a given input (pooled variance)
_print_latex_row(
    r'$\sqrt{\Expect[\Var(H(x) | x)]}$',
    [np.sqrt(pooled_vars_toen[year]) for year in cols],
)

# std dev of true segment scores (or perfect annotator):
# total variance minus pooled variance
p_vars = {year: total_vars_toen[year] - pooled_vars_toen[year] for year in cols}
_print_latex_row(
    r'$\sqrt{\Var(P(x))}$',
    [np.sqrt(p_vars[year]) for year in cols],
)

# ratio of total variance to the variance of the true segment scores
_print_latex_row(
    r'$\Var(H(x)) / \Var(P(x))$',
    [total_vars_toen[year] / p_vars[year] for year in cols],
)

print(
r'''
\end{tabular}
\caption{Step-by-step derivation for the efficiency ratio $r$ (fourth row) of the perfect annotator estimator for WMT16-19 as defined in \S 4.1. Square roots are taken so that values are in terms of the original units (standard deviations, judgments range from 0-100). These were calculated on to-English data only. } \label{perfect_annotator_derivation_wmt}
\end{table}
'''
)


\begin{table}[!h]
\small
\centering
\begin{tabular}{l|cccc}
& 2016 & 2017 & 2018 & 2019 \\
\midrule

$\sqrt{\Var(H(x))}$ & 30.01 & 29.65 & 28.21 & 28.81 \\
$\sqrt{\Expect[\Var(H(x) | x)]}$ & 17.53 & 22.96 & 19.57 & 21.42 \\
$\sqrt{\Var(P(x))}$ & 24.36 & 18.76 & 20.33 & 19.27 \\
$\Var(H(x)) / \Var(P(x))$ & 1.52 & 2.50 & 1.93 & 2.24 \\

\end{tabular}
\caption{Step-by-step derivation for the efficiency ratio $r$ (fourth row) of the perfect annotator estimator for WMT16-19 as defined in \S 4.1. Square roots are taken so that values are in terms of the original units (standard deviations, judgments range from 0-100). These were calculated on to-English data only. } \label{perfect_annotator_derivation_wmt}
\end{table}

