In [13]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from pprint import pprint
from collections import OrderedDict
import scipy.stats


# Load human evaluation results

In [14]:
# Numbers reported in the paper
# mauve_scores_raw = {
#       "('gpt2', 'p0.9')": 0.476346667,
#       "('gpt2', 'p1.0')": 0.468094444,
#       "('gpt2-large', 'p0.95')": 0.480351111,
#       "('gpt2-large', 'p1.0')": 0.472142222,
#       "('gpt2-medium', 'p0.9')": 0.477806667,
#       "('gpt2-medium', 'p1.0')": 0.462046667,
#       "('gpt2-xl', 'p0.95')":0.481145556,
#       "('gpt2-xl', 'p1.0')": 0.472241111}
# mauve_scores_raw = {
#       "('gpt2', 'p0.9')": 0.816555203,
#       "('gpt2', 'p1.0')": 0.815964738,
#       "('gpt2-large', 'p0.95')": 0.818678255,
#       "('gpt2-large', 'p1.0')": 0.813630059,
#       "('gpt2-medium', 'p0.9')": 0.815414144,
#       "('gpt2-medium', 'p1.0')": 0.812838286,
#       "('gpt2-xl', 'p0.95')": 0.821078288,
#       "('gpt2-xl', 'p1.0')": 0.806571319}
# mauve_scores_raw = {
#       "('gpt2', 'p0.9')": 0.193757143,
#       "('gpt2', 'p1.0')": 0.19451222,
#       "('gpt2-large', 'p0.95')": 0.192980018,
#       "('gpt2-large', 'p1.0')": 0.195804673,
#       "('gpt2-medium', 'p0.9')": 0.194424819,
#       "('gpt2-medium', 'p1.0')": 0.196552829,
#       "('gpt2-xl', 'p0.95')": 0.191460869,
#       "('gpt2-xl', 'p1.0')": 0.199148149}

# Active set of scores, keyed by the string form of each (model, decoding) pair.
# NOTE(review): the saved output of this cell shows values equal to the third
# commented-out dict above (0.1937...), not this active dict -- the outputs look
# stale; re-run the notebook to refresh them.
mauve_scores_raw = dict([
    ("('gpt2', 'p0.9')", 0.359315027),
    ("('gpt2', 'p1.0')", 0.318698191),
    ("('gpt2-large', 'p0.95')", 0.355567282),
    ("('gpt2-large', 'p1.0')", 0.337723148),
    ("('gpt2-medium', 'p0.9')", 0.357929565),
    ("('gpt2-medium', 'p1.0')", 0.309786716),
    ("('gpt2-xl', 'p0.95')", 0.359468025),
    ("('gpt2-xl', 'p1.0')", 0.331293712),
])

# One-column Series named "mauve", indexed by the player strings above
mauve_scores = pd.Series(data=mauve_scores_raw, name="mauve")

mauve_scores.to_frame()

Unnamed: 0,mauve
"('gpt2', 'p0.9')",0.193757
"('gpt2', 'p1.0')",0.194512
"('gpt2-large', 'p0.95')",0.19298
"('gpt2-large', 'p1.0')",0.195805
"('gpt2-medium', 'p0.9')",0.194425
"('gpt2-medium', 'p1.0')",0.196553
"('gpt2-xl', 'p0.95')",0.191461
"('gpt2-xl', 'p1.0')",0.199148


Download the raw output file `mauve-human-eval-anon.csv` from [here](https://github.com/krishnap25/mauve-experiments/blob/main/human_evaluation.md) and place it in the same folder as this notebook. 

In [15]:
# File name of the raw human-evaluation CSV; it is loaded with pd.read_csv in the next cell.
results_fn = 'mauve-human-eval-anon.csv' 


In [16]:
# Raw annotations; the first CSV column is used as the row index.
df0 = pd.read_csv(filepath_or_buffer=results_fn, index_col=0)

# Preview the first two rows
df0.head(n=2)

Unnamed: 0,HITId,WorkerId,WorkTimeInSeconds,Input.idx,Input.model_b,Input.model_a,Input.ctx,Input.completionb,Input.completiona,Input.len_b,Input.len_a,Answer.q1,Answer.q2,Answer.q3,Answer.te
0,0,W196,2040,4848,"('gpt2', 'p0.9')","('gpt2-xl', 'p0.95')",<p><strong>Cost segregation benefits multifami...,<p>increasing the cost of property when proper...,<p>decreasing personal property taxes for cert...,1024,545,1a,1a,1b,28.109
1,1,W132,1606,3352,"('gpt2', 'p1.0')","('gpt2-large', 'p1.0')",<p><strong>Endpoint for getting Luas (Dublin l...,<p>(Dublin light rail).</p><p>Added &quot;subs...,<p>from Dublin and Dublin City using the Verti...,1022,1021,1a,1a,2a,105.285



The columns in the CSV file are: 
- `HITId`: Integer indexing the row 
- `WorkerId`: Unique identifier of the crowd-worker 
- `WorkTimeInSeconds`: Amount of time the HIT was open on AMT
- `Input.idx`: Index of the prompt 
- `Input.ctx`: Context/prompt that each completion is based upon
- `Input.model_a`: Name of player A
- `Input.completiona`: Completion generated by player A
- `Input.len_a`: Total length (prompt + generation) of player A's text
- `Input.model_b`: Name of player B
- `Input.completionb`: Completion generated by player B
- `Input.len_b`: Total length (prompt + generation) of player B's text
- `Answer.q1`:  Answer of crowd-worker to the question: "Which continuation is more interesting or creative, given the context?"
- `Answer.q2`: Answer of crowd-worker to the question: "Which continuation makes more sense, given the context?"
- `Answer.q3`: Answer of crowd-worker to the question: "Which continuation is more likely to be written by a human?" 
- `Answer.te`: Our (pessimistic) estimate of the amount of time the crowd-worker took to answer the question.


Key to `Answer.q*` fields: The responses of the crowd-workers to each question are stored using the following key: 
- Definitely A: 2a
- Slightly A: 1a
- Tie: 1a 
- Slightly B: 1b
- Definitely B: 2b

Note that both "Tie" and "Slightly A" are recorded as `1a`. Since for each pair, the choice of A versus B is randomized, this amounts to randomly assigning each tie as a win to one of the two players. 


In [17]:
# All players: every model/decoding key of `mauve_scores_raw`, followed by "human"
player_names = np.array([*mauve_scores_raw, "human"])
print(player_names)

["('gpt2', 'p0.9')" "('gpt2', 'p1.0')" "('gpt2-large', 'p0.95')"
 "('gpt2-large', 'p1.0')" "('gpt2-medium', 'p0.9')"
 "('gpt2-medium', 'p1.0')" "('gpt2-xl', 'p0.95')" "('gpt2-xl', 'p1.0')"
 'human']


In [18]:
def process_field_name(field_name):
    """Translate an answer-column name into its human-readable label.

    The first of 'q1', 'q2', 'q3' (checked in that order) that occurs in
    `field_name` decides the label: 'Interesting', 'Sensible' or 'Human-like'.

    Raises:
        ValueError: if none of the three tags occurs in `field_name`.
    """
    # Ordered pairs so that the lookup precedence is q1, then q2, then q3
    tag_to_label = (
        ('q1', 'Interesting'),
        ('q2', 'Sensible'),
        ('q3', 'Human-like'),
    )
    for tag, label in tag_to_label:
        if tag in field_name:
            return label
    raise ValueError(f'Unknown name: {field_name}')

# Bradley-Terry Scores: Implementation

In [19]:
# NOTE: despite its name, this maps integer index -> player name (keys are the
# positions in `player_names`, values are the names).
player_name_to_idx = OrderedDict(zip(range(len(player_names)), player_names))

In [20]:
def get_model1_v_model2(results, model1, model2):
    """Total the head-to-head wins between two players, in either row orientation.

    `results` is a DataFrame with columns 'model1', 'model2', 'm1 better' and
    'm2 better'. Rows where the pair appears as (model1, model2) and rows where
    it appears swapped are both counted.

    Returns:
        (wins of `model1` over `model2`, wins of `model2` over `model1`)
    """
    # Rows listing the pair in the requested order, and in the swapped order
    same_order = (results['model1'] == model1) & (results['model2'] == model2)
    swapped_order = (results['model2'] == model1) & (results['model1'] == model2)
    forward, backward = results[same_order], results[swapped_order]

    # In swapped rows, the 'm2 better' column holds the wins of `model1`
    wins_model1 = forward['m1 better'].sum() + backward['m2 better'].sum()
    wins_model2 = forward['m2 better'].sum() + backward['m1 better'].sum()
    return wins_model1, wins_model2

In [21]:
def get_head2head_and_BT_rank(field_name='Answer.q3', threshold_time=25, max_iterations=1000):
    """Compute Bradley-Terry (BT) scores of all players from the pairwise annotations.

    Reads the module-level `df0` (raw annotations), `player_names` and
    `player_name_to_idx` (which, despite its name, maps index -> player name).

    Args:
        field_name: Answer column to score, e.g. 'Answer.q1', 'Answer.q2' or 'Answer.q3'.
        threshold_time: Only annotations whose `Answer.te` is strictly greater
            than this value are kept.
        max_iterations: Upper bound on the number of iterations of the
            fixed-point update.

    Returns:
        A pandas Series named 'BT/<label>' indexed by player name and sorted in
        descending order. Values are the log BT strengths, centered to zero
        mean and multiplied by 100.

    Raises:
        ValueError: from `process_field_name` when `field_name` contains none
            of 'q1', 'q2', 'q3'.
    """
    # Keep only the responses whose estimated answering time exceeds `threshold_time`.
    # Boolean indexing already returns a new frame, so no explicit copy is needed.
    df = df0[df0['Answer.te'] > threshold_time]

    # Collect head2head numbers from the results dataframe
    # Account for randomization of model_a versus model_b for the human eval
    results = []
    for i, m1 in enumerate(player_names):
        for j, m2 in enumerate(player_names):
            if i <= j:
                # Visit each unordered pair exactly once
                continue
            df1 = df[(df['Input.model_a'] == m1) & (df['Input.model_b'] == m2)]
            df2 = df[(df['Input.model_b'] == m1) & (df['Input.model_a'] == m2)]
            total = df1.shape[0] + df2.shape[0]
            if total == 0:
                continue
            # m1 wins when it is player A and A was preferred, or player B and B was preferred
            m1_better = df1[df1[field_name].isin(['1a', '2a'])].shape[0] + df2[df2[field_name].isin(['1b', '2b'])].shape[0]
            m2_better = df2[df2[field_name].isin(['1a', '2a'])].shape[0] + df1[df1[field_name].isin(['1b', '2b'])].shape[0]
            res = OrderedDict([('model1', m1), ('model2', m2), ('m1 better', m1_better), ('m2 better', m2_better),
                              ('m1 frac', m1_better/total), ('m2 frac', m2_better/total)
                              ])
            results.append(res)
    results = pd.DataFrame(results)

    # Compute B-T preprocessing: collect the head-to-head
    # (builtin `int` replaces the `np.int` alias, deprecated in NumPy 1.20 and removed in 1.24)
    n_players = player_names.shape[0]
    all_results = np.zeros((n_players, n_players), dtype=int)  # head-to-head
    wins_per_model = np.zeros(n_players, dtype=int)  # total #wins per model

    for i, m1 in player_name_to_idx.items():
        total = 0
        for j, m2 in player_name_to_idx.items():
            if m1 != m2:
                t = get_model1_v_model2(results, m1, m2)[0]  # m1 better than m2
                all_results[i, j] = t
                total += t
        wins_per_model[i] = total

    # Compute B-T probs, starting from a random point on the simplex
    ps = np.random.rand(n_players)
    ps /= ps.sum()
    qs = np.zeros_like(ps)

    # Run iterations of Zermelo's algorithm. See e.g. for details:
    # https://en.wikipedia.org/wiki/Bradley%E2%80%93Terry_model
    for iteration in range(max_iterations):
        for i in range(n_players):
            # Sum over opponents of (#games between i and j) / (p_i + p_j)
            denom = sum((all_results[i, j] + all_results[j, i]) / (ps[i] + ps[j])
                        for j in range(n_players) if i != j)
            qs[i] = wins_per_model[i] / denom
        ps_new = qs / qs.sum()
        if np.linalg.norm(ps_new - ps, 1) < 1e-16:
            # Algorithm converged; `ps` keeps the previous iterate, which is
            # within the tolerance of `ps_new`
            break
        ps = ps_new

    # Convert `ps` into logspace and scale them as described in Appendix E.2 of
    # the [paper](https://arxiv.org/pdf/2102.01454.pdf).
    ps = np.log(ps)
    ps -= ps.mean()
    ps *= 100

    # Clean up the output
    final_name = process_field_name(field_name)
    out = pd.Series(dict(zip(player_name_to_idx.values(), ps)), name=f'BT/{final_name}')
    return out.sort_values(ascending=False)

# Obtain the Bradley-Terry scores 

We discard all annotations made under `THRESHOLD_TIME=25` seconds for quality control using our pessimistic estimates obtained from the field `Answer.te`. 
See p. 29 of the paper under the heading "Quality Control" for details. 

The correlations are reported in Table 5 (and Table 14) of the paper. 
The raw Bradley-Terry scores are reported in Table 13. 

In [22]:
# Cut-off on `Answer.te` (seconds), passed as `threshold_time` to
# get_head2head_and_BT_rank, which keeps only annotations above it.
THRESHOLD_TIME = 25

In [23]:
# Answer.q3 -- crowd-worker's answer to:
# "Which continuation is more likely to be written by a human?"

h3 = get_head2head_and_BT_rank(
    field_name='Answer.q3',
    threshold_time=THRESHOLD_TIME,
)

# Rank correlation between BT scores (human excluded) and the MAUVE scores;
# both series are sorted by player name so that entries pair up per model.
correlation = scipy.stats.spearmanr(
    h3.drop(index="human").sort_index(),
    mauve_scores.sort_index(),
)
print("Correlation =", correlation)
h3.to_frame()

Correlation = SpearmanrResult(correlation=-0.523809523809524, pvalue=0.18272075053971484)


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations


Unnamed: 0,BT/Human-like
human,47.251038
"('gpt2-xl', 'p0.95')",15.663819
"('gpt2-large', 'p0.95')",12.552649
"('gpt2-xl', 'p1.0')",8.966299
"('gpt2-medium', 'p0.9')",-3.429421
"('gpt2-large', 'p1.0')",-6.934852
"('gpt2', 'p0.9')",-15.78329
"('gpt2', 'p1.0')",-27.517603
"('gpt2-medium', 'p1.0')",-30.76864


In [24]:
# Answer.q2 -- crowd-worker's answer to:
# "Which continuation makes more sense, given the context?"

h2 = get_head2head_and_BT_rank(
    field_name='Answer.q2',
    threshold_time=THRESHOLD_TIME,
)

# Rank correlation between BT scores (human excluded) and the MAUVE scores;
# both series are sorted by player name so that entries pair up per model.
correlation = scipy.stats.spearmanr(
    h2.drop(index="human").sort_index(),
    mauve_scores.sort_index(),
)
print("Correlation =", correlation)
h2.to_frame()

Correlation = SpearmanrResult(correlation=-0.4285714285714286, pvalue=0.2894032248467901)


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations


Unnamed: 0,BT/Sensible
human,43.229275
"('gpt2-xl', 'p0.95')",31.887734
"('gpt2-large', 'p0.95')",8.781433
"('gpt2-xl', 'p1.0')",7.752505
"('gpt2-large', 'p1.0')",-7.10611
"('gpt2-medium', 'p0.9')",-7.29327
"('gpt2', 'p0.9')",-7.441769
"('gpt2-medium', 'p1.0')",-32.004313
"('gpt2', 'p1.0')",-37.805484


In [25]:
# Answer.q1 -- crowd-worker's answer to:
# "Which continuation is more interesting or creative, given the context?"

h1 = get_head2head_and_BT_rank(
    field_name='Answer.q1',
    threshold_time=THRESHOLD_TIME,
)

# Rank correlation between BT scores (human excluded) and the MAUVE scores;
# both series are sorted by player name so that entries pair up per model.
correlation = scipy.stats.spearmanr(
    h1.drop(index="human").sort_index(),
    mauve_scores.sort_index(),
)
print("Correlation =", correlation)
h1.to_frame()

Correlation = SpearmanrResult(correlation=-0.4285714285714286, pvalue=0.2894032248467901)


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations


Unnamed: 0,BT/Interesting
human,25.503156
"('gpt2-xl', 'p0.95')",23.045606
"('gpt2-xl', 'p1.0')",9.529022
"('gpt2-large', 'p0.95')",6.785066
"('gpt2', 'p0.9')",-0.696617
"('gpt2-large', 'p1.0')",-1.532425
"('gpt2-medium', 'p0.9')",-12.823619
"('gpt2', 'p1.0')",-15.487289
"('gpt2-medium', 'p1.0')",-34.322899
