<a href="https://colab.research.google.com/github/johntzwei/metric-statistical-advantage/blob/main/bvnd_summeval.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [None]:
import pickle
import itertools
import json
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from multiprocessing import Pool
import pdb

# Data

In [None]:
!pip install gdown
!gdown --id 1bv6zJdXbvUxMCmWXSHW3-GicPPWN9f1F

Downloading...
From: https://drive.google.com/uc?id=1bv6zJdXbvUxMCmWXSHW3-GicPPWN9f1F
To: /home/johnny/summeval_flat.pkl
100%|██████████████████████████████████████| 2.06M/2.06M [00:00<00:00, 15.0MB/s]


In [None]:
# Load the flattened SummEval table. The context manager guarantees the file
# handle is closed once the pickle has been read.
# NOTE: pickle.load can execute arbitrary code -- only load files from a trusted source.
with open('summeval_flat.pkl', 'rb') as pkl_file:
    df = pickle.load(pkl_file)
df.head(1)

Unnamed: 0,id,output,reference_1,system,r11_rouge_4_f_score,r11_meteor,r11_rouge_we_1_f,r11_bert_score,r1_supert,type,score
0,dm-test-8764fb95bfad8ee849274873a92fb8d6b400eee2,paul merson was brought on with only seven min...,Andros Townsend an 83rd minute sub in Tottenha...,M11,0.00185,0.220548,0.306888,0.399867,0.508232,expert,2.25


# Bias variance noise decomposition

In [None]:
# adapted from https://github.com/rasbt/mlxtend/blob/master/mlxtend/evaluate/bias_variance_decomp.py
def _majority_vote(results):
    """Return the most frequent label in each column of ``results``.

    ``results`` must contain non-negative integers (a requirement of
    ``np.bincount``). Ties go to the smallest label, because ``np.argmax``
    returns the first maximum.
    """
    return np.apply_along_axis(lambda column: np.argmax(np.bincount(column)),
                               axis=0,
                               arr=results)


def bias_var_noise_decomposition(metric_results, human_results, no_bias=False):
    """Decompose the 0-1 disagreement between two sets of repeated predictions.

    Both inputs are reduced along axis 0, so each row is one repetition
    (e.g. one bootstrap trial) and each column is one item being predicted.

    Args:
        metric_results: array-like of non-negative integer labels, same shape
            as ``human_results``.
        human_results: array-like of non-negative integer labels.
        no_bias: when True, the main prediction is set equal to the optimal
            prediction, which makes the bias term exactly zero.

    Returns:
        A tuple ``(avg_expected_loss, avg_bias_contrib, avg_var_contrib,
        avg_noise_contrib)`` where

        * ``avg_expected_loss`` is the fraction of entries in which the two
          inputs differ;
        * ``avg_bias_contrib`` is the fraction of columns whose main
          prediction (majority vote over ``metric_results``) differs from the
          optimal prediction (majority vote over ``human_results``);
        * ``avg_var_contrib`` is the mean over columns of the rate at which
          ``metric_results`` deviates from the main prediction, counted
          positively where main == optimal and negatively elsewhere;
        * ``avg_noise_contrib`` is the mean over columns of
          ``(2 * P(metric == optimal) - 1) * P(human != optimal)``.
    """
    # Accept nested lists as well as arrays; elementwise comparison needs ndarrays.
    metric_results = np.asarray(metric_results)
    human_results = np.asarray(human_results)

    optimal_predictions = _majority_vote(human_results)

    # Only compute the metric's own majority vote when it is actually used.
    if no_bias:
        main_predictions = optimal_predictions
    else:
        main_predictions = _majority_vote(metric_results)

    avg_expected_loss = (human_results != metric_results).mean()

    # Per-column noise rate and per-column agreement rate with the optimal prediction.
    noises = (human_results != optimal_predictions).mean(axis=0)
    probs = (metric_results == optimal_predictions).mean(axis=0)

    avg_noise_contrib = ((2 * probs - 1) * noises).mean()

    avg_bias_contrib = (main_predictions != optimal_predictions).mean()

    # +1 on unbiased columns, -1 on biased ones: variance adds to the error in
    # the first case and subtracts from it in the second.
    signs = (main_predictions == optimal_predictions).astype(np.int64) * 2 - 1
    variances = (metric_results != main_predictions).mean(axis=0)
    avg_var_contrib = (signs * variances).mean()

    return avg_expected_loss, avg_bias_contrib, avg_var_contrib, avg_noise_contrib

In [None]:
def pairs(x):
    """Yield every 2-combination of the distinct values in ``x.system``.

    Values are taken in the order returned by ``x.system.unique()`` and
    combined with ``itertools.combinations``, so each unordered pair is
    produced exactly once.
    """
    yield from itertools.combinations(x.system.unique(), 2)
        
# Materialise every system pair once; the list is reused by get_preds in the cells below.
all_pairs = list(pairs(df))
# Preview the first ten pairs.
all_pairs[:10]

[('M11', 'M13'),
 ('M11', 'M1'),
 ('M11', 'M14'),
 ('M11', 'M15'),
 ('M11', 'M12'),
 ('M11', 'M5'),
 ('M11', 'M17'),
 ('M11', 'M20'),
 ('M11', 'M23'),
 ('M11', 'M2')]

In [None]:
# get pairwise predictions from the scoring of each system
# in the form of a (num_pairs,) length vector, where 1 or 0 denotes the ordering of that pair
def get_preds(all_pairs, scores):
    """Turn per-system scores into a 0/1 vector of pairwise orderings.

    Args:
        all_pairs: sequence of ``(i, j)`` label pairs.
        scores: either a ``dict`` (or any dict subclass) keyed by label, or
            an object that supports label lookup through ``.loc``
            (e.g. a pandas Series).

    Returns:
        A float ``np.ndarray`` of shape ``(len(all_pairs),)``. Entry ``k`` is
        1 when ``scores[i] - scores[j] > 0`` for the k-th pair and 0
        otherwise (so ties map to 0).
    """
    # isinstance (rather than an exact type comparison) lets dict subclasses
    # such as OrderedDict take the plain-indexing path.
    lookup = scores if isinstance(scores, dict) else scores.loc

    preds = np.zeros(len(all_pairs))
    for idx, (i, j) in enumerate(all_pairs):
        preds[idx] = 1 if lookup[i] - lookup[j] > 0 else 0

    return preds

In [None]:
# Split the table by system once so every bootstrap trial can reuse the groups.
# Group by the scalar key 'system' rather than the list ['system']: with a
# length-1 list, pandas >= 2.0 yields 1-tuple keys such as ('M11',), which would
# not match the plain labels in `all_pairs` that are used to index the
# per-system score dicts.
groupby_cached = [ (label, pd.DataFrame(group)) for label, group in df.groupby('system') ]
groupby_labels = [ label for label, _ in groupby_cached ]

### Experimental parameters

In [None]:
# Size of the multiprocessing Pool used for the bootstrap runs.
NUM_WORKERS = 12
# Number of bootstrap trials (one seed is drawn per trial).
NUM_TRIALS = 10000
# chunksize passed to Pool.imap_unordered.
CHUNKSIZE = 10
# Estimators to evaluate. 'true_preds' and 'human' are handled specially by the
# bootstrap worker; every other entry is read as a column of `df`.
METRICS = ['true_preds', 'human', 'r11_rouge_4_f_score', 'r11_meteor', 'r11_rouge_we_1_f', 'r11_bert_score', 'r1_supert']

In [None]:
# Master seed; `random_state` is the shared generator from which the
# per-trial seeds are drawn in the bootstrap cells below.
SEED = 0
random_state = np.random.RandomState(SEED)

### Optimal predictions (human main predictions)

In [None]:
def f(seed):
    """Run one bootstrap trial over the human scores.

    Every system's rows are resampled with replacement (same number of rows
    as the original group) using a RandomState built from ``seed``; the mean
    of the ``score`` column is taken per system and turned into pairwise
    predictions over ``all_pairs``.
    """
    rng = np.random.RandomState(seed)

    # resample each system in turn and record its bootstrap mean score
    bootstrap_means = {}
    for label, (_, group) in zip(groupby_labels, groupby_cached):
        resampled = group.sample(frac=1, replace=True, random_state=rng)
        bootstrap_means[label] = resampled['score'].mean()

    # pairwise orderings implied by this bootstrap sample
    return get_preds(all_pairs, bootstrap_means)

# One independent seed per trial, drawn from the shared generator.
trial_seeds = [ random_state.randint(0, 2**32 - 1) for _ in range(NUM_TRIALS) ]

with Pool(NUM_WORKERS) as p:
    trial_preds = p.imap_unordered(f, trial_seeds, chunksize=CHUNKSIZE)
    output = list(tqdm(trial_preds, total=NUM_TRIALS))
    # one row per trial, one column per system pair
    human_results = np.array(output, dtype=np.int64)

  0%|          | 0/10000 [00:00<?, ?it/s]

In [None]:
# optimal predictions: for every pair, the label that occurs most often
# across the human bootstrap trials (column-wise majority vote)
optimal_predictions = np.apply_along_axis(
    lambda votes: np.bincount(votes).argmax(),
    0,
    human_results,
)

### Bootstrap experiments

In [None]:
def f(x):
    """Run one bootstrap trial for a single estimator.

    Args:
        x: a ``(metric, seed)`` tuple. ``metric`` is ``'true_preds'``,
            ``'human'``, or the name of a score column in the cached groups.

    Returns:
        ``(human_preds, metric_preds)``: pairwise predictions from a
        bootstrap resample of the ``score`` column, and the estimator's
        predictions. For ``'true_preds'`` the latter is the constant
        ``optimal_predictions``; otherwise it comes from a second,
        separately drawn resample.
    """
    metric, seed = x
    rng = np.random.RandomState(seed)

    def bootstrap_means(column):
        # resample every system with replacement, then average `column` per system
        resampled = [ g.sample(frac=1, replace=True, random_state=rng) for _, g in groupby_cached ]
        return { label: g[column].mean() for label, g in zip(groupby_labels, resampled) }

    # human scores always come from the first resample
    human_preds = get_preds(all_pairs, bootstrap_means('score'))

    if metric == 'true_preds':
        # constant, computed from the optimal predictions above
        return human_preds, optimal_predictions

    # 'human' re-reads the human score column; anything else is a metric column
    metric_column = 'score' if metric == 'human' else metric
    metric_preds = get_preds(all_pairs, bootstrap_means(metric_column))

    return human_preds, metric_preds

bootstrap_results = {}
with Pool(NUM_WORKERS) as p:
    for metric in tqdm(METRICS):
        # a fresh seed per trial, drawn from the shared generator
        jobs = [ (metric, random_state.randint(0, 2**32 - 1)) for _ in range(NUM_TRIALS) ]
        output = list(tqdm(p.imap_unordered(f, jobs, chunksize=CHUNKSIZE), total=NUM_TRIALS))

        # each worker result is a (human_preds, metric_preds) pair; stack them per trial
        human_results = np.array([ human for human, _ in output ], dtype=np.int64)
        metric_results = np.array([ preds for _, preds in output ], dtype=np.int64)
        bootstrap_results[metric] = (human_results, metric_results)

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

### Decomposition

In [None]:
# Decompose each estimator's pairwise error into bias / variance / noise terms.
# NOTE(review): bootstrap_results[metric] is stored as (human_results, metric_results),
# but it is unpacked below in the opposite order, so the decomposition's `metric_results`
# argument receives the human bootstrap predictions and `human_results` receives the
# estimator's. The outputs recorded in this notebook were produced with this ordering --
# confirm it is intended before changing it.
decomp_results = {}
for metric in METRICS:
  # 'human' and 'true_preds' are treated as unbiased, which zeroes their bias term
  no_bias = metric in ['human', 'true_preds']
  metric_results, human_results = bootstrap_results[metric]
  decomp_results[metric] = bias_var_noise_decomposition(metric_results, human_results, no_bias=no_bias)

# Results

### Jsons

In [None]:
# Serialise the decomposition; each value is a 4-tuple in the order
# (expected loss, bias, variance, noise) returned by bias_var_noise_decomposition.
json.dumps(decomp_results)

'{"true_preds": [0.0453639705882353, 0.0, 0.0453639705882353, 0.0], "human": [0.06744411764705882, 0.0, 0.04573970588235294, 0.021641097500000005], "r11_rouge_4_f_score": [0.29573235294117645, 0.29411764705882354, 0.007893382352941175, -0.006388005882352942], "r11_meteor": [0.2958823529411765, 0.2867647058823529, 0.005280147058823531, 0.0038608427941176473], "r11_rouge_we_1_f": [0.31671176470588236, 0.3014705882352941, 0.008197794117647058, 0.007194242941176469], "r11_bert_score": [0.32982058823529414, 0.3382352941176471, -0.004234558823529412, -0.004350778676470588], "r1_supert": [0.3902316176470588, 0.38235294117647056, 0.007596323529411766, 0.00028507529411764715]}'

### Table

In [None]:
# Print the LaTeX preamble of the results table: table*/tabular openers and the
# column headings (observed error, then the noise, bias and variance components).
print(
r'''
\begin{table*}[!t]
    \small
    \begin{tabular}{r c | c c c }
    \toprule
    & & \multicolumn{3}{c}{Error components} \\
    & $\text{Err}_{\text{obs}}(\cdot)$ & $c_0\text{Noise}$ & $\text{Bias}$ & $c_1\text{Var}$  \\
    \midrule
'''
)

# Display name (LaTeX) for every estimator key used in METRICS / decomp_results.
# Raw strings keep the backslashes literal; in ordinary string literals
# sequences such as "\D", "\w" and "\s" are invalid escapes and trigger a
# warning on recent Python versions.
names = {
    'true_preds' : r"Optimal ($\Delta^{H*}_{S,S'}$)",
    'human' : r"Human ($\widehat{\Delta^{H}_{S,S'}}$)",
    'r11_rouge_4_f_score' : r"\sc{ROUGE}",
    'r11_meteor' : r"\sc{METEOR}",
    'r11_rouge_we_1_f' : r"\sc{ROUGE-WE}",
    'r11_bert_score' : r"\sc{BERTscore}",
    'r1_supert' : r"\sc{SUPERT}***"
}

# Print one LaTeX row per estimator, ordered by observed error (ascending).
sorted_results = sorted(decomp_results.items(), key=lambda x: x[1][0])
for metric, (avg_expected_loss, avg_bias_contrib, avg_var_contrib, avg_noise_contrib) in sorted_results:
    # The noise column shows the residual of the observed error after removing
    # bias and variance, not the separately estimated avg_noise_contrib.
    residual_noise = avg_expected_loss - avg_bias_contrib - avg_var_contrib

    # Pick the bold entry from the values that are actually printed, so the
    # highlighted cell is always the largest one in the row.
    max_idx = np.argmax([residual_noise, avg_bias_contrib, avg_var_contrib])
    if max_idx == 0:
        s = '%s & %.3f & \\bf{%.3f} & %.3f & %.3f \\\\'
    elif max_idx == 1:
        s = '%s & %.3f & %.3f & \\bf{%.3f}& %.3f \\\\'
    else:
        s = '%s & %.3f & %.3f & %.3f & \\bf{%.3f} \\\\'
    print(s % (names[metric], avg_expected_loss, residual_noise, avg_bias_contrib, avg_var_contrib))

# Print the closing part of the LaTeX table (environment closers, caption, label).
# NOTE(review): this emits \end{minipage}, but the header printed by this cell opens
# no minipage, and the label/caption refer to WMT -- presumably the output is pasted
# into a larger two-panel table; confirm. The caption also spells "boostrap".
print(
r'''
    \end{tabular}
    \end{minipage}
    \caption{Decomposition of the pairwise error of different metrics (left: WMT, right: SummEval). Highlighted in bold is the largest error component. 10K boostrap trials are conducted for estimation of the expectations (estimation error $<10^{-3}$). *Denotes an estimator assumed to be unbiased in the simulation. **{\sc BLEURT} is evaluated only on WMT2019. ***{\sc SUPERT} is a reference-less metric.}
    \label{table:bvd_wmt}
\end{table*}
'''
)


\begin{table*}[!t]
    \small
    \begin{tabular}{r c | c c c }
    \toprule
    & & \multicolumn{3}{c}{Error components} \\
    & $\text{Err}_{\text{obs}}(\cdot)$ & $c_0\text{Noise}$ & $\text{Bias}$ & $c_1\text{Var}$  \\
    \midrule

Optimal ($\Delta^{H*}_{S,S'}$) & 0.045 & 0.000 & 0.000 & \bf{0.045} \\
Human ($\widehat{\Delta^{H}_{S,S'}}$) & 0.067 & 0.022 & 0.000 & \bf{0.046} \\
\sc{ROUGE} & 0.296 & -0.006 & \bf{0.294}& 0.008 \\
\sc{METEOR} & 0.296 & 0.004 & \bf{0.287}& 0.005 \\
\sc{ROUGE-WE} & 0.317 & 0.007 & \bf{0.301}& 0.008 \\
\sc{BERTscore} & 0.330 & -0.004 & \bf{0.338}& -0.004 \\
\sc{SUPERT}*** & 0.390 & 0.000 & \bf{0.382}& 0.008 \\

    \end{tabular}
    \end{minipage}
    \caption{Decomposition of the pairwise error of different metrics (left: WMT, right: SummEval). Highlighted in bold is the largest error component. 10K boostrap trials are conducted for estimation of the expectations (estimation error $<10^{-3}$). *Denotes an estimator assumed to be unbiased in the simulat