<a href="https://colab.research.google.com/github/jeanlucjackson/w266_final_project/blob/main/code/sandboxes/RR/rr_triviaqa_experiment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Setup

In [1]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem; the evaluation files are read
# from paths underneath this mount point.
drive.mount(mountpoint="/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

import seaborn as sns
from matplotlib import pyplot as plt

import scipy.stats as st

from pprint import pprint

# Render floats with 5 digits after the decimal point in displayed frames.
# The fully qualified key is required: option names are matched as patterns,
# and on recent pandas releases the bare "precision" also matches
# "styler.format.precision", which makes set_option raise OptionError.
pd.set_option("display.precision", 5)
# Apply seaborn's default plotting theme.
sns.set_theme()

In [3]:
# Directory on the mounted Drive that holds the evaluation result files.
evaluation_root = "/content/drive/MyDrive/w266 NLP Final Project/Evaluation/"

# Full path of the main evaluation database inside that directory.
filename = f"{evaluation_root}evaluation_database.json"

In [4]:
# Columns removed from the evaluation frames before analysis, grouped by
# family. Order is preserved: overlap scores first, then BERTScore parts.
overlap_scores = ["bleu", "rouge1", "rouge2", "rougeL", "rougeLsum"]
bertscore_parts = ["bertscore-precision", "bertscore-recall"]

evaluation_exclusions = overlap_scores + bertscore_parts

### TriviaQA Sliced

In [5]:
# Two JSON result files live under the evaluation directory: the main
# evaluation database and the one for the sliced TriviaQA runs.
filename_1, filename_2 = (
    evaluation_root + basename
    for basename in (
        "evaluation_database.json",
        "evaluation_database_triviaqa_sliced.json",
    )
)

# Read each file into its own untouched "source" frame.
df_1_source, df_2_source = (
    pd.read_json(path) for path in (filename_1, filename_2)
)

In [6]:

# Main database: drop the excluded metric columns and keep only the rows
# whose test set is the full "triviaqa" set.
df_1 = (
    df_1_source
    .drop(columns=evaluation_exclusions)
    .loc[lambda frame: frame['tested_on'] == "triviaqa"]
)

# Sliced database: only the excluded metric columns are dropped.
df_2 = df_2_source.drop(columns=evaluation_exclusions)

# Stack both sources, then keep rows with base_model 'bart' that were trained
# on either 'amalgam' or 'triviaqa'.
combined = pd.concat([df_1, df_2])
is_bart = combined['base_model'] == 'bart'
trained_on_target = combined['trained_on'].isin(['amalgam', 'triviaqa'])
df = combined[is_bart & trained_on_target]

In [7]:
# Metric to rank by, and the other metric columns to hide from the table.
focus = 'meteor'
drop = ['bertscore-f1', 'use']

# Mean score per (tested_on, base_model, trained_on) group, scaled by 100 and
# ranked from best to worst on the focus metric.
(
    df.groupby(['tested_on', 'base_model', 'trained_on'])
    .mean()
    .sort_values(by=[focus], ascending=False)
    .multiply(100)
    .reset_index()
    .drop(columns=drop)
)

Unnamed: 0,tested_on,base_model,trained_on,meteor
0,triviaqa_v1,bart,triviaqa,38.50562
1,triviaqa_v2,bart,amalgam,38.43945
2,triviaqa_d1,bart,amalgam,38.39928
3,triviaqa_v1,bart,amalgam,38.35918
4,triviaqa_d1,bart,triviaqa,38.203
5,triviaqa_v2,bart,triviaqa,37.89976
6,triviaqa,bart,amalgam,35.45594
7,triviaqa,bart,triviaqa,35.09811


In [8]:
### Author's reading of the meteor table:
### - All of the shorter (sliced) test sets score better than the full set.
### - The difference between any of the six sliced-set rows and either of the
###   two full-set rows is significant.
### - There are no statistically significant differences among the six
###   sliced-set rows.
### This cell computes the p-value for one such pair only; edit x_val / y_val
### to check another pair.

# Each pair is [tested_on, trained_on].
x_val = ['triviaqa_v1', 'triviaqa']
y_val = ['triviaqa', 'amalgam']

x_tested, x_trained = x_val
y_tested, y_trained = y_val

x = df.loc[(df['tested_on'] == x_tested) & (df['trained_on'] == x_trained)]
y = df.loc[(df['tested_on'] == y_tested) & (df['trained_on'] == y_trained)]

# equal_var=False selects Welch's t-test (no equal-variance assumption);
# the displayed value is the p-value.
st.ttest_ind(x[focus], y[focus], equal_var=False).pvalue

0.004879176609981347

In [9]:
# Rank by BERTScore F1 this time; hide the other two metric columns.
focus = 'bertscore-f1'
drop = ['meteor', 'use']

# Group means scaled by 100, sorted in descending order of the focus metric.
group_keys = ['tested_on', 'base_model', 'trained_on']
(
    df.groupby(group_keys)
    .mean()
    .sort_values(by=[focus], ascending=False)
    .multiply(100)
    .reset_index()
    .drop(columns=drop)
)

Unnamed: 0,tested_on,base_model,trained_on,bertscore-f1
0,triviaqa_v1,bart,triviaqa,84.01969
1,triviaqa_v1,bart,amalgam,83.94157
2,triviaqa_d1,bart,amalgam,83.89692
3,triviaqa_d1,bart,triviaqa,83.88058
4,triviaqa_v2,bart,amalgam,83.85218
5,triviaqa_v2,bart,triviaqa,83.74118
6,triviaqa,bart,amalgam,82.64203
7,triviaqa,bart,triviaqa,82.59038


In [10]:
### Author's reading of the bertscore-f1 table:
### - All of the shorter (sliced) test sets score better than the full set.
### - The difference between any of the six sliced-set rows and either of the
###   two full-set rows is significant.
### - There are no statistically significant differences among the six
###   sliced-set rows.
### Only one pair is tested in this cell; edit x_val / y_val to check another.

# Each pair is [tested_on, trained_on].
x_val = ['triviaqa_v1', 'triviaqa']
y_val = ['triviaqa', 'amalgam']

x_mask = df['tested_on'].eq(x_val[0]) & df['trained_on'].eq(x_val[1])
y_mask = df['tested_on'].eq(y_val[0]) & df['trained_on'].eq(y_val[1])

x = df[x_mask]
y = df[y_mask]

# Welch's t-test (equal_var=False); the displayed value is the p-value.
st.ttest_ind(x[focus], y[focus], equal_var=False).pvalue

8.204646752547784e-06

In [11]:
# Rank by the 'use' metric; hide the other two metric columns.
focus = 'use'
drop = ['bertscore-f1', 'meteor']

# Per-group means, scaled by 100, best focus score first.
(
    df.groupby(['tested_on', 'base_model', 'trained_on'])
    .mean()
    .sort_values(by=[focus], ascending=False)
    .multiply(100)
    .reset_index()
    .drop(columns=drop)
)

Unnamed: 0,tested_on,base_model,trained_on,use
0,triviaqa_v2,bart,triviaqa,61.63178
1,triviaqa_v2,bart,amalgam,61.41134
2,triviaqa_d1,bart,triviaqa,60.99276
3,triviaqa,bart,amalgam,60.91905
4,triviaqa_d1,bart,amalgam,60.89107
5,triviaqa,bart,triviaqa,60.77681
6,triviaqa_v1,bart,amalgam,60.37185
7,triviaqa_v1,bart,triviaqa,60.35503


In [12]:
### Author's reading of the 'use' table: there are no statistically
### significant differences among any of these rows.
### This cell tests one pair only: the two triviaqa-trained rows shown
### in x_val / y_val below.

# Each pair is [tested_on, trained_on].
x_val = ['triviaqa_v2', 'triviaqa']
y_val = ['triviaqa_v1', 'triviaqa']

x_tested, x_trained = x_val
y_tested, y_trained = y_val

x = df.loc[(df['tested_on'] == x_tested) & (df['trained_on'] == x_trained)]
y = df.loc[(df['tested_on'] == y_tested) & (df['trained_on'] == y_trained)]

# Welch's t-test (equal_var=False); the displayed value is the p-value.
st.ttest_ind(x[focus], y[focus], equal_var=False).pvalue

0.26862591869636643