<a href="https://colab.research.google.com/github/johntzwei/metric-statistical-advantage/blob/main/variance_analysis_summeval.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [1]:
import pickle
import pandas as pd
from tqdm import tqdm
import numpy as np
import json

# Data

In [2]:
!pip install gdown
!gdown --id 148b2OkfYkSSVgjsYLl5wKKwrmdIkT2Eo

Downloading...
From: https://drive.google.com/uc?id=148b2OkfYkSSVgjsYLl5wKKwrmdIkT2Eo
To: /content/model_annotations.aligned.scored.jsonl.txt
22.2MB [00:00, 135MB/s]


In [3]:
# Read the JSONL dump (one JSON object per line). The context manager closes the
# file handle deterministically, and the explicit encoding avoids depending on
# the platform default.
with open('./model_annotations.aligned.scored.jsonl.txt', 'rt', encoding='utf-8') as f:
    lines = list(f)
# Skip blank lines (e.g. a trailing newline) so json.loads does not raise on them.
objs = [json.loads(line) for line in lines if line.strip()]

In [4]:
# Display the first parsed record to inspect the fields each entry carries.
objs[0]

{'decoded': "paul merson was brought on with only seven minutes remaining in his team 's 0-0 draw with burnley . andros townsend scored the tottenham midfielder in the 89th minute . paul merson had another dig at andros townsend after his appearance . the midfielder had been brought on to the england squad last week . click here for all the latest arsenal news news .",
 'expert_annotations': [{'coherence': 2,
   'consistency': 1,
   'fluency': 4,
   'relevance': 2},
  {'coherence': 1, 'consistency': 1, 'fluency': 2, 'relevance': 1},
  {'coherence': 1, 'consistency': 1, 'fluency': 3, 'relevance': 2}],
 'filepath': 'cnndm/dailymail/stories/8764fb95bfad8ee849274873a92fb8d6b400eee2.story',
 'id': 'dm-test-8764fb95bfad8ee849274873a92fb8d6b400eee2',
 'metric_scores_1': {'bert_score_f1': 0.39986729621887207,
  'bert_score_precision': 0.35211145877838135,
  'bert_score_recall': 0.44890454411506653,
  'blanc': 0.1702127659574468,
  'bleu': 6.083760628516704,
  'chrf': 0.3665588486313395,
  'cid

# Variance analysis

In [5]:
def get_annotations(x, agg=lambda x: np.mean(list(x.values())), annotations='expert_annotations'):
    """Return one aggregated score per annotator for the record ``x``.

    ``x[annotations]`` is iterated and ``agg`` is applied to each entry; the
    default ``agg`` averages the values of the entry (treated as a dict).
    When ``x`` has no ``annotations`` key an empty list is returned.
    """
    if annotations in x:
        return [agg(rating) for rating in x[annotations]]
    return []

### Pooled variances

In [6]:
def pooled_variance(objs, annotations, ddof=0):
    """Pool the per-item score variances across ``objs``.

    Each item's variance is weighted by its degrees of freedom ``n - 1``,
    where ``n`` is the number of scores returned for that item::

        pooled = sum((n_i - 1) * var_i) / sum(n_i - 1)

    Parameters
    ----------
    objs : iterable
        Items to pool over.
    annotations : callable
        Maps an item to a sequence of scores. Items yielding fewer than two
        scores carry zero weight and are skipped.
    ddof : int, optional
        Delta degrees of freedom forwarded to ``np.var`` for each item. The
        default of 0 keeps the original behaviour of this function. Note that
        the ``n - 1`` weights are those of the textbook pooled variance, which
        pairs them with unbiased per-item variances (``ddof=1``); with
        ``ddof=0`` and ``n`` scores per item the result is smaller by a factor
        of ``(n - 1) / n``.

    Returns
    -------
    float
        The pooled variance, or NaN when no item has at least two scores
        (there are then no degrees of freedom to pool over).
    """
    num = 0.0
    denom = 0

    for obj in objs:
        scores = annotations(obj)
        n = len(scores)
        if n < 2:
            # Weight (n - 1) would be zero; skipping also keeps np.var from
            # producing NaN for a single score when ddof=1.
            continue
        num += (n - 1) * np.var(scores, ddof=ddof)
        denom += n - 1

    if denom == 0:
        return float('nan')
    return num / denom

In [7]:
# Pooled per-item variance of the scores under get_annotations' default key
# ('expert_annotations').
pooled_var_expert = pooled_variance(objs, get_annotations)
pooled_var_expert

0.0859722222222225

In [8]:
# Same pooling, but reading the 'turker_annotations' key of each record.
pooled_var_turkers = pooled_variance(objs, lambda x: get_annotations(x, annotations='turker_annotations'))
pooled_var_turkers

0.22594999999999998

In [9]:
# Collect both pooled variances under the column keys used by the table cell.
pooled_vars = {'expert' : pooled_var_expert, 'turker' : pooled_var_turkers}
pooled_vars

{'expert': 0.0859722222222225, 'turker': 0.22594999999999998}

### Total variances

In [10]:
# Flatten every record's expert scores into one list. The nested comprehension
# is linear in the number of scores, whereas sum(lists, []) re-copies the
# accumulated list on each addition.
all_scores = [score for obj in objs for score in get_annotations(obj)]
# Variance over all expert scores, ignoring which record they belong to.
total_var_expert = np.var(all_scores)
total_var_expert

0.5144329560745868

In [11]:
# Flatten every record's turker scores into one list. The nested comprehension
# is linear in the number of scores, whereas sum(lists, []) re-copies the
# accumulated list on each addition.
all_scores = [
    score
    for obj in objs
    for score in get_annotations(obj, annotations='turker_annotations')
]
# Variance over all turker scores, ignoring which record they belong to.
total_var_turker = np.var(all_scores)
total_var_turker

0.5551246093750001

In [12]:
# Collect both total variances under the column keys used by the table cell.
total_vars = {'expert' : total_var_expert, 'turker' : total_var_turker}
total_vars

{'expert': 0.5144329560745868, 'turker': 0.5551246093750001}

# Results

### JSON

In [13]:
# Serialize the pooled variances to a JSON string.
json.dumps(pooled_vars)

'{"expert": 0.0859722222222225, "turker": 0.22594999999999998}'

In [14]:
# Serialize the total variances to a JSON string.
json.dumps(total_vars)

'{"expert": 0.5144329560745868, "turker": 0.5551246093750001}'

### Table

In [15]:
# Print a LaTeX table that derives the efficiency ratio from the total and
# pooled variances. Every LaTeX snippet is a raw string: sequences such as
# "\s", "\V" and "\E" are invalid escapes in ordinary string literals and
# trigger a warning on current Python versions.
print(
r'''
\begin{table}[!h]
    \small
    \centering
    \begin{tabular}{l|cc}
         & Expert & Turker \\
         \midrule
'''
)

cols = ['expert', 'turker']

# variance of true segment scores (or perfect annotator): total minus pooled
p_vars = {c: total_vars[c] - pooled_vars[c] for c in cols}

# Each row: (LaTeX label, number format, one value per column in `cols`).
rows = [
    # general human std dev
    (r'$\sqrt{\Var(H(x))}$', '%.2f',
     [np.sqrt(total_vars[c]) for c in cols]),
    # expected human std dev for a given input (pooled variance)
    (r'$\sqrt{\Expect[\Var(H(x) | x)]}$', '%.2f',
     [np.sqrt(pooled_vars[c]) for c in cols]),
    # std dev of true segment scores (or perfect annotator)
    (r'$\sqrt{\Var(P(x))}$', '%.3f',
     [np.sqrt(p_vars[c]) for c in cols]),
    # ratio of total variance to the variance of the true segment scores
    (r'$\Var(H(x)) / \Var(P(x))$', '%.3f',
     [total_vars[c] / p_vars[c] for c in cols]),
]

for label, fmt, values in rows:
    # Cells are joined with " & " and the row is closed with a LaTeX "\\".
    print(label + ' & ' + ' & '.join(fmt % v for v in values) + r' \\')

print(
r'''
    \end{tabular}
    \caption{Step-by-step derivation for the efficiency ratio $r$ (fourth row) of the perfect annotator estimator for SummEval as defined in \S 4.1. Square roots are taken so that values are in terms of the original units (standard deviations, judgments range from 1-5). Note that there is little agreement between experts and turkers at the system level.} \label{perfect_annotator_derivation_summeval}
\end{table}
'''
)


\begin{table}[!h]
    \small
    \centering
    \begin{tabular}{l|cc}
         & Expert & Turker \\
         \midrule

$\sqrt{\Var(H(x))}$ & 0.72 & 0.75 \\
$\sqrt{\Expect[\Var(H(x) | x)]}$ & 0.29 & 0.48 \\
$\sqrt{\Var(P(x))}$ & 0.655 & 0.574 \\
$\Var(H(x)) / \Var(P(x))$ & 1.201 & 1.686 \\

    \end{tabular}
    \caption{Step-by-step derivation for the efficiency ratio $r$ (fourth row) of the perfect annotator estimator for SummEval as defined in \S 4.1. Square roots are taken so that values are in terms of the original units (standard deviations, judgments range from 1-5). Note that there is little agreement between experts and turkers at the system level.} \label{perfect_annotator_derivation_summeval}
\end{table}

