<a href="https://colab.research.google.com/github/jeanlucjackson/w266_final_project/blob/main/code/sandboxes/BB/analysis/metrics_text_evaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Metrics: Text Evaluation

### Imports/Setup

In [51]:
from os import listdir
from os.path import isfile, join

import csv
import json
import pprint

import pandas as pd
from tqdm import tqdm
from sklearn.utils import shuffle

In [32]:
# This cell will authenticate you and mount your Drive in the Colab.
from google.colab import drive

drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [33]:
evaluation_root = "/content/drive/MyDrive/w266 NLP Final Project/Evaluation/"
filename = join(evaluation_root, "evaluation_database.json")

# Read JSON into dataframe
evaluation_df = pd.read_json(filename)

### Clean up evaluation dataframe to only include columns needed for text analysis

In [34]:
# Clean up evaluation db df to only include columns needed for text analysis
clean_eval_df = evaluation_df[
    [
        "nickname",
        "base_model",
        "trained_on",
        "tested_on",
        "hyperparameter",
        "target",
        "prediction",
        "bleu",
        "rougeL",
        "meteor",
        "bertscore-f1",
        "use",
    ]
]

clean_eval_df

Unnamed: 0,nickname,base_model,trained_on,tested_on,hyperparameter,target,prediction,bleu,rougeL,meteor,bertscore-f1,use
0,bart_nq_nq,bart,nq,nq,{'defaults': True},what was the real name of saudi arabia,what is the name of the kingdom of saudi arabia,0.000000,0.666667,0.623306,0.915830,0.794086
1,bart_nq_nq,bart,nq,nq,{'defaults': True},whats the most liked picture on instagram 2018,what is the most liked picture on instagram,0.680375,0.750000,0.864796,0.932259,0.884909
2,bart_nq_nq,bart,nq,nq,{'defaults': True},where does the movie proof of life take place,where does alice go in the new movie,0.000000,0.470588,0.354635,0.835192,0.517866
3,bart_nq_nq,bart,nq,nq,{'defaults': True},where is net profit on the balance sheet,where does net profit come from in a financial...,0.000000,0.333333,0.311653,0.856130,0.728012
4,bart_nq_nq,bart,nq,nq,{'defaults': True},when was fingerprinting first used by the police,when was fingerprint technology first used in ...,0.000000,0.588235,0.694444,0.876353,0.704206
...,...,...,...,...,...,...,...,...,...,...,...,...
286285,T5_amalgam_triviaqa,T5,amalgam,triviaqa,{'defaults': True},Which ex British daily newspaper was first pub...,What is the name of the city of Manchester?,0.000000,0.093023,0.058309,0.679618,0.322748
286286,T5_amalgam_triviaqa,T5,amalgam,triviaqa,{'defaults': True},Which cartoon character lived in Bunkerton Cas...,What was the name of Lord Marmaduke of Bunkerton?,0.000000,0.125000,0.121951,0.794428,0.515345
286287,T5_amalgam_triviaqa,T5,amalgam,triviaqa,{'defaults': True},In which prison was 'Amy' born in the novel 'L...,What is the name of the prison in Charles Dick...,0.000000,0.416667,0.248227,0.821790,0.593840
286288,T5_amalgam_triviaqa,T5,amalgam,triviaqa,{'defaults': True},The Sign Of Four was a detective story written...,Who wrote the novels 'The Sign of Four' and 'A...,0.000000,0.400000,0.381426,0.796076,0.585255


In [None]:
clean_eval_df["bertscore-f1"].describe()

count    286290.000000
mean          0.789801
std           0.082735
min           0.521592
25%           0.729552
50%           0.784537
75%           0.845546
max           1.000000
Name: bertscore-f1, dtype: float64

In [None]:
clean_eval_df["use"].describe()

count    286290.000000
mean          0.483113
std           0.246386
min          -0.147977
25%           0.291765
50%           0.487792
75%           0.671069
max           1.000000
Name: use, dtype: float64

In [None]:
clean_eval_df.describe()

Unnamed: 0,bleu,rougeL,meteor,bertscore-f1,use
count,286290.0,286290.0,286290.0,286290.0,286290.0
mean,0.048842,0.295583,0.279015,0.789801,0.483113
std,0.158423,0.233494,0.234177,0.082735,0.246386
min,0.0,0.0,0.0,0.521592,-0.147977
25%,0.0,0.125,0.097087,0.729552,0.291765
50%,0.0,0.25,0.2,0.784537,0.487792
75%,0.0,0.428571,0.408791,0.845546,0.671069
max,1.0,1.0,0.999937,1.0,1.0


### Find the overall averages for each prediction set and metric

In [9]:
# Find the overall averages for each prediction set and metric
eval_means_df = clean_eval_df.groupby(["nickname"]).mean()

# Reset index so able to groupby and sort below to find top scoring prediction sets for each metric
eval_means_df = eval_means_df.reset_index("nickname")

eval_means_df = eval_means_df[["nickname", "bleu", "rougeL", "meteor", "bertscore-f1", "use"]]
eval_means_df

Unnamed: 0,nickname,bleu,rougeL,meteor,bertscore-f1,use
0,T5_amalgam_nq,0.169746,0.527758,0.497871,0.858036,0.719863
1,T5_amalgam_quac,0.053671,0.249039,0.262385,0.77806,0.378001
2,T5_amalgam_squad,0.142885,0.470772,0.469798,0.867048,0.656399
3,T5_amalgam_triviaqa,0.06639,0.35913,0.337926,0.822766,0.601577
4,T5_nq_nq,0.172979,0.527675,0.497261,0.859476,0.718928
5,T5_nq_quac,0.001471,0.1519,0.111625,0.696885,0.287737
6,T5_nq_squad,0.017446,0.346182,0.27346,0.791693,0.554671
7,T5_nq_triviaqa,0.00745,0.278408,0.189823,0.766649,0.518441
8,T5_quac_nq,0.0,0.068525,0.040712,0.662784,0.180669
9,T5_quac_quac,0.023456,0.173453,0.189914,0.756046,0.303081


### Create BART and T5 dataframes to use if want to focus on one

In [10]:
# Create BART dataframe
bart_df = clean_eval_df[clean_eval_df["base_model"] == "bart"]

In [11]:
# Create T5 dataframe
t5_df = clean_eval_df[clean_eval_df["base_model"] == "T5"]

## Model Evaluation

## BERTScore vs. USE Scores

### BERTScore > USE

All Models: BERTScore > USE

In [52]:
# Find predictions with semantic metric score differences

sem_diff = clean_eval_df[
    (clean_eval_df["bertscore-f1"] >= 0.85)  # top 25th percentile bertscore
    & (clean_eval_df["use"] <= 0.3)   # bottom 25th percentile use
]

sem_diff = sem_diff.sort_values(by="use", ascending=False)

sem_diff

Unnamed: 0,nickname,base_model,trained_on,tested_on,hyperparameter,target,prediction,bleu,rougeL,meteor,bertscore-f1,use
244410,T5_triviaqa_squad,T5,triviaqa,squad,{'defaults': True},What are Plastoglobuli made of?,The plastoglobules are spherical bubbles of what?,0.000000,0.333333,0.322581,0.899336,0.299999
240950,T5_triviaqa_squad,T5,triviaqa,squad,{'defaults': True},A molecular phylogeny analysis confirmed that ...,The cydippids are not what?,0.220834,0.400000,0.474286,0.857079,0.299485
97805,bart_triviaqa_squad,bart,triviaqa,squad,{'defaults': True},A molecular phylogeny analysis confirmed that ...,The cydippids are not what?,0.220834,0.400000,0.474286,0.857079,0.299485
75697,bart_squad_squad,bart,squad,squad,{'defaults': True},What is one work by Olivier Messiaen?,What was the name of Messiaen's 1935 work?,0.000000,0.250000,0.243902,0.859928,0.299191
91534,bart_triviaqa_quac,bart,triviaqa,quac,{'defaults': True},What was Azarath like?,What was the name of the alternate dimension c...,0.000000,0.428571,0.563616,0.871783,0.298804
...,...,...,...,...,...,...,...,...,...,...,...,...
162552,T5_nq_triviaqa,T5,nq,triviaqa,{'defaults': True},Hedonophobia is the irrational fear of what?,what is the meaning of hedonophobia,0.000000,0.461538,0.476923,0.859920,0.113248
247493,T5_triviaqa_squad,T5,triviaqa,squad,{'defaults': True},What are the small tentacles on Cydippids called?,The cydippids have what?,0.000000,0.333333,0.232558,0.873061,0.102150
45397,bart_quac_squad,bart,quac,squad,{'defaults': True},Along with private individuals and organizatio...,What is Ergänzungsschulen?,0.000000,0.375000,0.211203,0.871793,0.083942
238618,T5_triviaqa_squad,T5,triviaqa,squad,{'defaults': True},What do Plastoglobuli exchange contents with?,The plastoglobulus is anchored to which network?,0.000000,0.000000,0.070423,0.862789,0.066797


A fairly examples when looking at bottom two...

In [53]:
# Convert to Latex

# Take bottom two scoring ones
sem_diff = sem_diff.tail(2)

# Printe entire text
with pd.option_context("max_colwidth", 1000):
    print (sem_diff[["target", "prediction"]].to_latex(index=False))

\begin{tabular}{ll}
\toprule
                                                                                         target &                                       prediction \\
\midrule
                                                  What do Plastoglobuli exchange contents with? & The plastoglobulus is anchored to which network? \\
Along with private individuals and organizations, what groups sometimes runs ergänzungsschulen? &               Ergänzungsschulen are German what? \\
\bottomrule
\end{tabular}



Not great examples when random shuffle...

In [54]:
# Convert to Latex

# Take random two scoring ones
sem_diff = shuffle(sem_diff).tail(2)

# Printe entire text
with pd.option_context("max_colwidth", 1000):
    print (sem_diff[["target", "prediction"]].to_latex(index=False))

\begin{tabular}{ll}
\toprule
                                                                                         target &                                       prediction \\
\midrule
                                                  What do Plastoglobuli exchange contents with? & The plastoglobulus is anchored to which network? \\
Along with private individuals and organizations, what groups sometimes runs ergänzungsschulen? &               Ergänzungsschulen are German what? \\
\bottomrule
\end{tabular}



BART Model: BERScore > USE

In [65]:
# Find BART predictions with semantic metric score differences
# Specifically chose trained on = amalgam because high performing model

bart_sem_diff = bart_df[
    (bart_df["trained_on"] == "amalgam")
    & (bart_df["bertscore-f1"] >= 0.85)  # top 25th percentile bertscore
    & (bart_df["use"] <= 0.3)   # bottom 25th percentile use
]

bart_sem_diff = bart_sem_diff.sort_values(by="use", ascending=False)

bart_sem_diff

Unnamed: 0,nickname,base_model,trained_on,tested_on,hyperparameter,target,prediction,bleu,rougeL,meteor,bertscore-f1,use
128985,bart_amalgam_squad,bart,amalgam,squad,{'defaults': True},What type of support does co-teaching provide?,What does co-teaching provide to students to r...,0.0,0.47619,0.524706,0.864651,0.298659
119148,bart_amalgam_quac,bart,amalgam,quac,{'defaults': True},What was Dana's character like?,What was Scully's faith?,0.0,0.545455,0.457428,0.863713,0.297248
125258,bart_amalgam_squad,bart,amalgam,squad,{'defaults': True},Where is the Tyneside Bar located?,What extension does the Tyneside Cinema have?,0.0,0.307692,0.359937,0.853858,0.296461
118099,bart_amalgam_quac,bart,amalgam,quac,{'defaults': True},What was Brando's lifestyle like?,What was Brando's reputation as a bad boy?,0.0,0.533333,0.663014,0.89021,0.295167
130667,bart_amalgam_squad,bart,amalgam,squad,{'defaults': True},How are ergänzungsschulen funded?,What does Ergungsschulen charge their students?,0.0,0.0,0.096154,0.868782,0.287752
129804,bart_amalgam_squad,bart,amalgam,squad,{'defaults': True},What do current ctenophores have that fossils ...,What are ctenophores thought to have?,0.0,0.352941,0.274457,0.886462,0.282767
123000,bart_amalgam_squad,bart,amalgam,squad,{'defaults': True},What were the fossils that were found to repre...,What are ctenophores thought to have?,0.0,0.285714,0.209023,0.86305,0.280719
127483,bart_amalgam_squad,bart,amalgam,squad,{'defaults': True},Who were two of Kublai's Chinese advisers?,Who gave strong influence to Kublai's early co...,0.0,0.352941,0.346841,0.872934,0.278422
120975,bart_amalgam_quac,bart,amalgam,quac,{'defaults': True},did he break any other records with speed boat...,Did he win any other awards?,0.0,0.533333,0.459794,0.863179,0.273609
122949,bart_amalgam_squad,bart,amalgam,squad,{'defaults': True},What do pyrenoids store?,What is accumulated as the pyrenoids mature?,0.0,0.363636,0.283019,0.892096,0.272564


In [66]:
# Convert to Latex

# Take bottom two scoring ones
bart_sem_diff = bart_sem_diff.tail(2)

# Printe entire text
with pd.option_context("max_colwidth", 1000):
    print (bart_sem_diff[["target", "prediction"]].to_latex(index=False))

\begin{tabular}{ll}
\toprule
                                       target &                                                 prediction \\
\midrule
What was the reaction to the 2009 revelation? &             What was Graham's response to the Nixon tapes? \\
          What does phycoerytherin appear in? & Phycoerytherin is one of the pigments that makes what red? \\
\bottomrule
\end{tabular}



In [67]:
# Convert to Latex

# Take bottom two random ones
bart_sem_diff = shuffle(bart_sem_diff).tail(2)

# Printe entire text
with pd.option_context("max_colwidth", 1000):
    print (bart_sem_diff[["target", "prediction"]].to_latex(index=False))

\begin{tabular}{ll}
\toprule
                                       target &                                                 prediction \\
\midrule
          What does phycoerytherin appear in? & Phycoerytherin is one of the pigments that makes what red? \\
What was the reaction to the 2009 revelation? &             What was Graham's response to the Nixon tapes? \\
\bottomrule
\end{tabular}



T5 Model: BERScore > USE

In [15]:
# Find T5 predictions with semantic metric score differences

t5_sem_diff = t5_df[
    (t5_df["trained_on"] == "amalgam")
    & (t5_df["bertscore-f1"] >= 0.85)  # top 25th percentile bertscore
    & (t5_df["use"] <= 0.3)   # bottom 25th percentile use
]

t5_sem_diff = t5_sem_diff.sort_values(by="use", ascending=False)

t5_sem_diff

Unnamed: 0,nickname,base_model,trained_on,tested_on,hyperparameter,target,prediction,bleu,rougeL,meteor,bertscore-f1,use
260520,T5_amalgam_quac,T5,amalgam,quac,{'defaults': True},Who's mistress was Lillie Langtry?,Who was Albert Edward Langtry's husband?,0.0,0.461538,0.352113,0.861289,0.297351
273812,T5_amalgam_squad,T5,amalgam,squad,{'defaults': True},How are ergänzungsschulen funded?,What do Ergänzungsschulen charge their students?,0.0,0.333333,0.192308,0.928205,0.295489
272728,T5_amalgam_squad,T5,amalgam,squad,{'defaults': True},What are phycobilisomes?,What do phycobilins often organize into?,0.0,0.222222,0.232558,0.878591,0.292014
272909,T5_amalgam_squad,T5,amalgam,squad,{'defaults': True},What plateau has groups of clay pits?,What plateau has only a few natural and artifi...,0.0,0.352941,0.451807,0.865636,0.29055
285236,T5_amalgam_triviaqa,T5,amalgam,triviaqa,{'defaults': True},What is an oblation in relation to a god? Offe...,What is an oblation?,0.042999,0.444444,0.275,0.864237,0.286224
270069,T5_amalgam_squad,T5,amalgam,squad,{'defaults': True},What does isobaric mean?,What isobaric process in the Rankine cycle?,0.0,0.363636,0.283019,0.855307,0.28375
271355,T5_amalgam_squad,T5,amalgam,squad,{'defaults': True},What is inequality associated with higher leve...,What is associated with lower levels of growth...,0.0,0.545455,0.661139,0.869704,0.283681
284105,T5_amalgam_triviaqa,T5,amalgam,triviaqa,{'defaults': True},What does a costermonger sell?,The term costermonger refers to a street selle...,0.0,0.133333,0.307692,0.851562,0.282168
270628,T5_amalgam_squad,T5,amalgam,squad,{'defaults': True},Who were two of Kublai's Chinese advisers?,Who gave strong influence to Kublai's early co...,0.0,0.352941,0.346841,0.872934,0.278422
276384,T5_amalgam_squad,T5,amalgam,squad,{'defaults': True},What proclamation officially ended limited Hug...,What gave Protestant rule to the Huguenots?,0.0,0.142857,0.1875,0.872132,0.274933


### BERTScore < USE

All Models: BERScore < USE

In [49]:
# Find predictions with semantic metric score differences

sem_diff = clean_eval_df[
    (clean_eval_df["bertscore-f1"] <= 0.73)  # top 25th percentile bertscore
    & (clean_eval_df["use"] >= 0.68)   # bottom 25th percentile use 
]

sem_diff = sem_diff.sort_values(by="bertscore-f1", ascending=False)

sem_diff

Unnamed: 0,nickname,base_model,trained_on,tested_on,hyperparameter,target,prediction,bleu,rougeL,meteor,bertscore-f1,use
10484,bart_nq_squad,bart,nq,squad,{'defaults': True},How many pounds of steam per kilowatt hour doe...,what is the weight of a steam engine,0.0,0.272727,0.139860,0.729795,0.681846
166068,T5_nq_triviaqa,T5,nq,triviaqa,{'defaults': True},What pope resigned in 2013?,who was the last pope to leave the pontificate,0.0,0.142857,0.079365,0.729795,0.690938
258358,T5_amalgam_nq,T5,amalgam,nq,{'defaults': True},what is the number of total presidential elect...,how many electors are there in the united states,0.0,0.111111,0.111111,0.729511,0.697076
123714,bart_amalgam_squad,bart,amalgam,squad,{'defaults': True},Which running back did the Panthers waive?,who played for the panthers last year,0.0,0.285714,0.323488,0.729367,0.689400
200551,T5_squad_nq,T5,squad,nq,{'defaults': True},when did lynyrd skynyrd's plane crash happen,When did the Convair CV-240 crash?,0.0,0.400000,0.323488,0.729049,0.683831
...,...,...,...,...,...,...,...,...,...,...,...,...
229486,T5_triviaqa_nq,T5,triviaqa,nq,{'defaults': True},when did lionel messi play his first game for ...,The Messi family moved to Camp Nou in which year?,0.0,0.100000,0.049505,0.694685,0.692701
158827,T5_nq_squad,T5,nq,squad,{'defaults': True},Who did Emma Marry?,who was the king of england who married emma i...,0.0,0.266667,0.267857,0.694451,0.756505
23063,bart_nq_triviaqa,bart,nq,triviaqa,{'defaults': True},Which of England's 1966 World Cup winning foot...,who played for fulham in the premier league,0.0,0.347826,0.246711,0.694070,0.720818
154927,T5_nq_squad,T5,nq,squad,{'defaults': True},What were the win/loss game stats for the Denv...,how many times did the broncos win the super bowl,0.0,0.230769,0.097403,0.690972,0.680442


In [50]:
# Convert to Latex


# Take random two scoring ones
sem_diff = shuffle(sem_diff).head(2)

# Printe entire text
with pd.option_context("max_colwidth", 1000):
    print (sem_diff[["target", "prediction"]].to_latex(index=False))

\begin{tabular}{ll}
\toprule
                                                                          target &                                         prediction \\
\midrule
                                     when was the last time giants won superbowl & , in which year did the Giants win the Super Bowl? \\
How many pounds of steam per kilowatt hour does the Energiprojekt AB engine use? &               what is the weight of a steam engine \\
\bottomrule
\end{tabular}



BART Model: BERScore < USE

In [16]:
# Find BART predictions with semantic metric score differences

bart_sem_diff = bart_df[
    (bart_df["trained_on"] == "amalgam")
    & (bart_df["bertscore-f1"] <= 0.73)  # top 25th percentile bertscore
    & (bart_df["use"] >= 0.68)   # bottom 25th percentile use 
]

bart_sem_diff = bart_sem_diff.sort_values(by="use", ascending=False)

bart_sem_diff

Unnamed: 0,nickname,base_model,trained_on,tested_on,hyperparameter,target,prediction,bleu,rougeL,meteor,bertscore-f1,use
125783,bart_amalgam_squad,bart,amalgam,squad,{'defaults': True},What was the number of times the Denver Bronco...,how many times have the broncos been to the su...,0.0,0.25,0.092166,0.718619,0.827251
115782,bart_amalgam_nq,bart,amalgam,nq,{'defaults': True},where does us highway 1 start and end,where is the longest highway in the united states,0.0,0.235294,0.123457,0.710291,0.708976
115573,bart_amalgam_nq,bart,amalgam,nq,{'defaults': True},when does season 7 game of thrones dvd release,when does the new season of the walking dead c...,0.0,0.4,0.343071,0.721894,0.70751
132996,bart_amalgam_squad,bart,amalgam,squad,{'defaults': True},Who won Super Bowl XLIX?,who did the broncos play in the superbowl,0.0,0.153846,0.080645,0.709131,0.704499
128982,bart_amalgam_squad,bart,amalgam,squad,{'defaults': True},Who won Super Bowl XLIX?,who did the broncos play in the superbowl,0.0,0.153846,0.080645,0.709131,0.704499
123714,bart_amalgam_squad,bart,amalgam,squad,{'defaults': True},Which running back did the Panthers waive?,who played for the panthers last year,0.0,0.285714,0.323488,0.729367,0.6894


T5 Model: BERScore < USE

In [17]:
# Find T5 predictions with semantic metric score differences

t5_sem_diff = t5_df[
    (t5_df["trained_on"] == "amalgam")
    & (t5_df["bertscore-f1"] <= 0.73)  # top 25th percentile bertscore
    & (t5_df["use"] >= 0.68)   # bottom 25th percentile use
]

t5_sem_diff = t5_sem_diff.sort_values(by="use", ascending=False)

t5_sem_diff

Unnamed: 0,nickname,base_model,trained_on,tested_on,hyperparameter,target,prediction,bleu,rougeL,meteor,bertscore-f1,use
258767,T5_amalgam_nq,T5,amalgam,nq,{'defaults': True},who will take the throne after the queen dies,who is the heir apparent of queen elizabeth II,0.0,0.333333,0.166667,0.728162,0.701911
258358,T5_amalgam_nq,T5,amalgam,nq,{'defaults': True},what is the number of total presidential elect...,how many electors are there in the united states,0.0,0.111111,0.111111,0.729511,0.697076
259720,T5_amalgam_nq,T5,amalgam,nq,{'defaults': True},when is the first a revealed in pretty little ...,when does mona vanderwaal become a,0.0,0.25,0.104167,0.714545,0.691913
259792,T5_amalgam_nq,T5,amalgam,nq,{'defaults': True},who sings the rap in baby by justin bieber,who sings baby baby baby oh i thought you woul...,0.0,0.272727,0.271868,0.704157,0.684645
258817,T5_amalgam_nq,T5,amalgam,nq,{'defaults': True},what is the role of hcl in the stomach,What does hydrochloric acid do?,0.0,0.142857,0.057471,0.727882,0.680623


## All Perfect Scores

In [18]:
# Find BART predictions with all metric scores of 100

bart_df_perf = bart_df[
    (bart_df["bleu"] >= 0.99)
    & (bart_df["rougeL"] >= 0.99)
    & (bart_df["meteor"] >= 0.99)
    & (bart_df["bertscore-f1"] >= 0.99)
    & (bart_df["use"] >= 0.99)
]

bart_df_perf = bart_df_perf.sort_values(by="use", ascending=False)

bart_df_perf

Unnamed: 0,nickname,base_model,trained_on,tested_on,hyperparameter,target,prediction,bleu,rougeL,meteor,bertscore-f1,use
126408,bart_amalgam_squad,bart,amalgam,squad,{'defaults': True},When was the 8-4-4 system launched?,When was the 8-4-4 system launched?,1.0,1.0,0.998542,1.0,1.000000
69887,bart_squad_squad,bart,squad,squad,{'defaults': True},How often do Parliament elections take place?,How often do Parliament elections take place?,1.0,1.0,0.999023,1.0,1.000000
69150,bart_squad_squad,bart,squad,squad,{'defaults': True},When was the 8-4-4 system launched?,When was the 8-4-4 system launched?,1.0,1.0,0.998542,1.0,1.000000
125247,bart_amalgam_squad,bart,amalgam,squad,{'defaults': True},When did Ronald Robinson die?,When did Ronald Robinson die?,1.0,1.0,0.997685,1.0,1.000000
74609,bart_squad_squad,bart,squad,squad,{'defaults': True},How often do Parliament elections take place?,How often do Parliament elections take place?,1.0,1.0,0.999023,1.0,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...
119041,bart_amalgam_quac,bart,amalgam,quac,{'defaults': True},How did the album do on the charts?,How did the album do on the charts?,1.0,1.0,0.999314,1.0,0.999999
66602,bart_squad_squad,bart,squad,squad,{'defaults': True},When was the University Library founded?,When was the University Library founded?,1.0,1.0,0.998542,1.0,0.999999
123860,bart_amalgam_squad,bart,amalgam,squad,{'defaults': True},When was the University Library founded?,When was the University Library founded?,1.0,1.0,0.998542,1.0,0.999999
132600,bart_amalgam_squad,bart,amalgam,squad,{'defaults': True},What is the main gap to continued urbanization?,What is the main gap to continued urbanization?,1.0,1.0,0.999314,1.0,0.999999


In [19]:
# Find T5 predictions with all metric scores of 100

t5_df_perf = t5_df[
    (t5_df["bleu"] >= 0.99)
    & (t5_df["rougeL"] >= 0.99)
    & (t5_df["meteor"] >= 0.99)
    & (t5_df["bertscore-f1"] >= 0.99)
    & (t5_df["use"] >= 0.99)
]

t5_df_perf = t5_df_perf.sort_values(by="use", ascending=False)

t5_df_perf

Unnamed: 0,nickname,base_model,trained_on,tested_on,hyperparameter,target,prediction,bleu,rougeL,meteor,bertscore-f1,use
274775,T5_amalgam_squad,T5,amalgam,squad,{'defaults': True},Who did Philip I want to marry?,Who did Philip I want to marry?,1.0,1.0,0.999023,1.0,1.000000
217754,T5_squad_squad,T5,squad,squad,{'defaults': True},How often do Parliament elections take place?,How often do Parliament elections take place?,1.0,1.0,0.999023,1.0,1.000000
270290,T5_amalgam_squad,T5,amalgam,squad,{'defaults': True},How often do Parliament elections take place?,How often do Parliament elections take place?,1.0,1.0,0.999023,1.0,1.000000
269553,T5_amalgam_squad,T5,amalgam,squad,{'defaults': True},When was the 8-4-4 system launched?,When was the 8-4-4 system launched?,1.0,1.0,0.998542,1.0,1.000000
213032,T5_squad_squad,T5,squad,squad,{'defaults': True},How often do Parliament elections take place?,How often do Parliament elections take place?,1.0,1.0,0.999023,1.0,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...
272177,T5_amalgam_squad,T5,amalgam,squad,{'defaults': True},Who organized the trees of the Amazon into fou...,Who organized the trees of the Amazon into fou...,1.0,1.0,0.999624,1.0,1.000000
271652,T5_amalgam_squad,T5,amalgam,squad,{'defaults': True},What was Joseph Haas arrested for?,What was Joseph Haas arrested for?,1.0,1.0,0.998542,1.0,1.000000
275745,T5_amalgam_squad,T5,amalgam,squad,{'defaults': True},What is the main gap to continued urbanization?,What is the main gap to continued urbanization?,1.0,1.0,0.999314,1.0,0.999999
209747,T5_squad_squad,T5,squad,squad,{'defaults': True},When was the University Library founded?,When was the University Library founded?,1.0,1.0,0.998542,1.0,0.999999


### Examples of Perfect Semantic but NOT Lexical Scores

Note: There were no examples of perfect lexical but not semantic scores.

In [20]:
# Find BART predictions with all metric scores of 100

bart_df_perf = bart_df[
    (bart_df["bleu"] < 0.99)
    & (bart_df["rougeL"] < 0.99)
    & (bart_df["meteor"] < 0.99)
    & (bart_df["bertscore-f1"] >= 0.99)
    & (bart_df["use"] >= 0.99)
]

bart_df_perf = bart_df_perf.sort_values(by="use", ascending=False)

bart_df_perf

Unnamed: 0,nickname,base_model,trained_on,tested_on,hyperparameter,target,prediction,bleu,rougeL,meteor,bertscore-f1,use
74375,bart_squad_squad,bart,squad,squad,{'defaults': True},The European Court of Justice cannot uphold me...,The European Court of Justice cannot uphold me...,0.782542,0.923077,0.931973,0.996113,0.991793
131633,bart_amalgam_squad,bart,amalgam,squad,{'defaults': True},The European Court of Justice cannot uphold me...,The European Court of Justice cannot uphold me...,0.782542,0.923077,0.931973,0.996113,0.991793
73834,bart_squad_squad,bart,squad,squad,{'defaults': True},What kind of protists are Euglenophytes?,What type of protists are Euglenophytes?,0.643459,0.833333,0.84127,0.990525,0.991606
115517,bart_amalgam_nq,bart,amalgam,nq,{'defaults': True},how many lines of symmetry are there in a equi...,how many lines of symmetry are there in an equ...,0.741945,0.909091,0.905455,0.997177,0.991213


In [21]:
# Find T5 predictions with all metric scores of 100

t5_df_perf = t5_df[
    (t5_df["bleu"] < 0.99)
    & (t5_df["rougeL"] < 0.99)
    & (t5_df["meteor"] < 0.99)
    & (t5_df["bertscore-f1"] >= 0.99)
    & (t5_df["use"] >= 0.99)
]

t5_df_perf = t5_df_perf.sort_values(by="use", ascending=False)

t5_df_perf

Unnamed: 0,nickname,base_model,trained_on,tested_on,hyperparameter,target,prediction,bleu,rougeL,meteor,bertscore-f1,use
267661,T5_amalgam_squad,T5,amalgam,squad,{'defaults': True},What must the adoption of laws which will have...,What must the adoption of laws that will have ...,0.815355,0.933333,0.936389,0.995413,0.992263
216979,T5_squad_squad,T5,squad,squad,{'defaults': True},What kind of protists are Euglenophytes?,What type of protists are Euglenophytes?,0.643459,0.833333,0.84127,0.990525,0.991606
274237,T5_amalgam_squad,T5,amalgam,squad,{'defaults': True},What kind of protists are Euglenophytes?,What type of protists are euglenophytes?,0.0,0.833333,0.84127,0.990525,0.991606


### Specific example of perfect semantic but not lexical

In [22]:
spec_ex = clean_eval_df[
    (clean_eval_df["target"] == "What kind of protists are Euglenophytes?")
    & (clean_eval_df["bertscore-f1"] >= 0.99)
    & (clean_eval_df["use"] >= 0.99)
]

spec_ex = spec_ex[["nickname", "target", "prediction", "bleu", "rougeL", "meteor", "bertscore-f1", "use"]]

spec_ex

Unnamed: 0,nickname,target,prediction,bleu,rougeL,meteor,bertscore-f1,use
73834,bart_squad_squad,What kind of protists are Euglenophytes?,What type of protists are Euglenophytes?,0.643459,0.833333,0.84127,0.990525,0.991606
216979,T5_squad_squad,What kind of protists are Euglenophytes?,What type of protists are Euglenophytes?,0.643459,0.833333,0.84127,0.990525,0.991606
274237,T5_amalgam_squad,What kind of protists are Euglenophytes?,What type of protists are euglenophytes?,0.0,0.833333,0.84127,0.990525,0.991606


In [23]:
# Convert to Latex
print(spec_ex.to_latex(index=False))

\begin{tabular}{lllrrrrr}
\toprule
        nickname &                                   target &                               prediction &     bleu &   rougeL &  meteor &  bertscore-f1 &      use \\
\midrule
bart\_squad\_squad & What kind of protists are Euglenophytes? & What type of protists are Euglenophytes? & 0.643459 & 0.833333 & 0.84127 &      0.990525 & 0.991606 \\
  T5\_squad\_squad & What kind of protists are Euglenophytes? & What type of protists are Euglenophytes? & 0.643459 & 0.833333 & 0.84127 &      0.990525 & 0.991606 \\
T5\_amalgam\_squad & What kind of protists are Euglenophytes? & What type of protists are euglenophytes? & 0.000000 & 0.833333 & 0.84127 &      0.990525 & 0.991606 \\
\bottomrule
\end{tabular}



### Examples of High Semantic but Low Lexical

In [24]:
# Find predictions with high semantic and low lexical scores

hi_sem_low_lex = clean_eval_df[
    (clean_eval_df["bleu"] < 0.1)  # bottom 25th percentile bleu
    & (clean_eval_df["rougeL"] < 0.13)  # bottom 25th percentile rougeL
    & (clean_eval_df["meteor"] < 0.10)  # bottom 25th percentile meteor
    & (clean_eval_df["bertscore-f1"] >= 0.85)  # top 25th percentile bertscore
    & (clean_eval_df["use"] >= 0.67)   # top 25th percentile use
]

hi_sem_low_lex = hi_sem_low_lex.sort_values(by="use", ascending=False)

hi_sem_low_lex

Unnamed: 0,nickname,base_model,trained_on,tested_on,hyperparameter,target,prediction,bleu,rougeL,meteor,bertscore-f1,use
35171,bart_quac_quac,bart,quac,quac,{'defaults': True},How did she die?,What was her cause of death?,0.0,0.0,0.096154,0.85532,0.754843
121058,bart_amalgam_quac,bart,amalgam,quac,{'defaults': True},How did she die?,What was the cause of her death?,0.0,0.0,0.09434,0.851208,0.705527
264203,T5_amalgam_quac,T5,amalgam,quac,{'defaults': True},How did she die?,What was the cause of her death?,0.0,0.0,0.09434,0.851208,0.705527
37325,bart_quac_squad,bart,quac,squad,{'defaults': True},What was the total number of patents that Tesl...,How many patents did he have?,0.0,0.125,0.09434,0.862127,0.687061


In [25]:
# Convert to Latex
print(hi_sem_low_lex[["base_model", "target", "prediction", "bleu", "rougeL", "meteor", "bertscore-f1", "use"]].to_latex(index=False))

\begin{tabular}{lllrrrrr}
\toprule
base\_model &                                             target &                       prediction &  bleu &  rougeL &   meteor &  bertscore-f1 &      use \\
\midrule
      bart &                                   How did she die? &     What was her cause of death? &   0.0 &   0.000 & 0.096154 &      0.855320 & 0.754843 \\
      bart &                                   How did she die? & What was the cause of her death? &   0.0 &   0.000 & 0.094340 &      0.851208 & 0.705527 \\
        T5 &                                   How did she die? & What was the cause of her death? &   0.0 &   0.000 & 0.094340 &      0.851208 & 0.705527 \\
      bart & What was the total number of patents that Tesla... &    How many patents did he have? &   0.0 &   0.125 & 0.094340 &      0.862127 & 0.687061 \\
\bottomrule
\end{tabular}



### Examples of High Lexical but Low Semantic

In [26]:
# Find predictions with high semantic and low lexical scores

hi_lex_low_sem = clean_eval_df[
    (clean_eval_df["bleu"] >= 0.0)  # bottom 25th percentile bleu
    & (clean_eval_df["rougeL"] >= 0.43)  # bottom 25th percentile rougeL
    & (clean_eval_df["meteor"] >= 0.41)  # bottom 25th percentile meteor
    & (clean_eval_df["bertscore-f1"] < 0.73)  # top 25th percentile bertscore
    & (clean_eval_df["use"] < 0.3)   # top 25th percentile use
    # & (clean_eval_df["bertscore-f1"] < 0.73)  # top 25th percentile bertscore
    # & (clean_eval_df["use"] < 0.3)   # top 25th percentile use
]

hi_lex_low_sem = hi_lex_low_sem.sort_values(by="use", ascending=False)

hi_lex_low_sem

Unnamed: 0,nickname,base_model,trained_on,tested_on,hyperparameter,target,prediction,bleu,rougeL,meteor,bertscore-f1,use
152223,T5_nq_squad,T5,nq,squad,{'defaults': True},What was the forerunner of the modern-day IEEE?,who was the vice president of the eie,0.0,0.470588,0.421348,0.699801,0.28443
5397,bart_nq_quac,bart,nq,quac,{'defaults': True},What happened after the mass?,what happened to the president of the philippines,0.0,0.461538,0.412186,0.72161,0.277746
149879,T5_nq_quac,T5,nq,quac,{'defaults': True},What happened to the poor?,what was pepys's response to the plague,0.0,0.461538,0.412186,0.710317,0.266592
147979,T5_nq_quac,T5,nq,quac,{'defaults': True},What was the highlight of his career?,what was the goal of dare in salt lake city,0.0,0.470588,0.457317,0.725878,0.26085
149538,T5_nq_quac,T5,nq,quac,{'defaults': True},Who was at the conference?,who was the prime minister of australia at the...,0.0,0.444444,0.471082,0.675146,0.25361
159881,T5_nq_squad,T5,nq,squad,{'defaults': True},What is the exam at the end of Form Four?,what is the end of primary education in kenya,0.0,0.526316,0.448148,0.729596,0.251426
4834,bart_nq_quac,bart,nq,quac,{'defaults': True},What was the highlight of his career?,what was the purpose of salt lake city's dare ...,0.0,0.444444,0.451807,0.728641,0.24677
235050,T5_triviaqa_quac,T5,triviaqa,quac,{'defaults': True},What is the most interesting statistic in the ...,What is the common belief in Hong Kong that wo...,0.0,0.454545,0.491453,0.71486,0.229665
94938,bart_triviaqa_squad,bart,triviaqa,squad,{'defaults': True},What building is the most interesting of the l...,"What is the name of the university in Warsaw, ...",0.0,0.454545,0.425926,0.722306,0.22184
148449,T5_nq_quac,T5,nq,quac,{'defaults': True},What was the reaction to the event?,what was the name of the man in tiananmen square,0.0,0.470588,0.457317,0.724706,0.208394


In [None]:
spec_ex = clean_eval_df[
    (clean_eval_df["target"] == "Who was at the conference?")
    & (clean_eval_df["nickname"] == "T5_nq_quac")
]

spec_ex = spec_ex[["nickname", "target", "prediction", "bleu", "rougeL", "meteor", "bertscore-f1", "use"]]

spec_ex

Unnamed: 0,nickname,target,prediction,bleu,rougeL,meteor,bertscore-f1,use
149538,T5_nq_quac,Who was at the conference?,who was the prime minister of australia at the...,0.0,0.444444,0.471082,0.675146,0.25361


In [None]:
# Convert to Latex
print(spec_ex[["target", "prediction", "bleu", "rougeL", "meteor", "bertscore-f1", "use"]].to_latex(index=False))

\begin{tabular}{llrrrrr}
\toprule
                    target &                                         prediction &  bleu &   rougeL &   meteor &  bertscore-f1 &     use \\
\midrule
Who was at the conference? & who was the prime minister of australia at the ... &   0.0 & 0.444444 & 0.471082 &      0.675146 & 0.25361 \\
\bottomrule
\end{tabular}



In [None]:
spec_ex = clean_eval_df[
    (clean_eval_df["target"] == "What is the exam at the end of Form Four?")
    & (clean_eval_df["nickname"] == "T5_nq_squad")
]

spec_ex = spec_ex[["nickname", "target", "prediction", "bleu", "rougeL", "meteor", "bertscore-f1", "use"]]

spec_ex

Unnamed: 0,nickname,target,prediction,bleu,rougeL,meteor,bertscore-f1,use
159881,T5_nq_squad,What is the exam at the end of Form Four?,what is the end of primary education in kenya,0.0,0.526316,0.448148,0.729596,0.251426


In [None]:
# Convert to Latex
print(spec_ex[["target", "prediction", "bleu", "rougeL", "meteor", "bertscore-f1", "use"]].to_latex(index=False))

\begin{tabular}{llrrrrr}
\toprule
                                   target &                                    prediction &  bleu &   rougeL &   meteor &  bertscore-f1 &      use \\
\midrule
What is the exam at the end of Form Four? & what is the end of primary education in kenya &   0.0 & 0.526316 & 0.448148 &      0.729596 & 0.251426 \\
\bottomrule
\end{tabular}



Check BLEU scores because looks like they are all 0..?

In [27]:
bleu_zero = clean_eval_df[clean_eval_df["bleu"] >= 0.0001]

bleu_zero

Unnamed: 0,nickname,base_model,trained_on,tested_on,hyperparameter,target,prediction,bleu,rougeL,meteor,bertscore-f1,use
1,bart_nq_nq,bart,nq,nq,{'defaults': True},whats the most liked picture on instagram 2018,what is the most liked picture on instagram,0.680375,0.750000,0.864796,0.932259,0.884909
5,bart_nq_nq,bart,nq,nq,{'defaults': True},when was the last time the los angeles lakers ...,when was the last time the lakers won a champi...,0.643219,0.909091,0.844068,0.962039,0.961726
6,bart_nq_nq,bart,nq,nq,{'defaults': True},where are trigger points located in the body,where are trigger points located in the body,1.000000,1.000000,0.999023,1.000000,1.000000
12,bart_nq_nq,bart,nq,nq,{'defaults': True},when was the last time anyone was on the moon,when was the last time the us went to the moon,0.403528,0.666667,0.566781,0.926440,0.904742
14,bart_nq_nq,bart,nq,nq,{'defaults': True},who wrote papa got a brand new bag,who sang papa got a brand new bag,0.707107,0.875000,0.864796,0.965819,0.852755
...,...,...,...,...,...,...,...,...,...,...,...,...
286259,T5_amalgam_triviaqa,T5,amalgam,triviaqa,{'defaults': True},What is the common name for the perennial-flow...,What is the common name of the African violet?,0.349833,0.600000,0.529487,0.844998,0.599724
286273,T5_amalgam_triviaqa,T5,amalgam,triviaqa,{'defaults': True},"Feb 15, 1564 saw the birth of what famed Itali...",Which Italian astronomer discovered the four l...,0.058371,0.424242,0.302515,0.836991,0.697034
286278,T5_amalgam_triviaqa,T5,amalgam,triviaqa,{'defaults': True},Who played the title role in the 1951 film ‘Ca...,Who played Captain Horatio Hornblower in the 1...,0.249186,0.480000,0.612658,0.889873,0.863451
286281,T5_amalgam_triviaqa,T5,amalgam,triviaqa,{'defaults': True},"Which novelist, born in Bombay in 1865, was aw...",Who won the Nobel Prize for Literature in 1907?,0.183689,0.560000,0.441557,0.889193,0.795078


## Format code to PEP 8 Standards

### Steps

*   Install:

In [None]:
!pip install black[jupyter]


*   To format your code run:

In [None]:
!black /content/drive/MyDrive/'Colab Notebooks'/results_text_evaluation.ipynb


*   Don't save your notebook, hit F5 (Command + r) to refresh the page
*   Voila!
*   Now save!