<a href="https://colab.research.google.com/github/jeanlucjackson/w266_final_project/blob/main/code/evaluation/evaluation_probe.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports & Google Drive Mounting

In [15]:
import json
import pprint

import numpy as np
import pandas as pd

# Hugging Face Evaluation module
!pip install -q evaluate
import evaluate

# For loading the Universal Sentence Encoder
import tensorflow_hub as hub

# Pandas display setting
pd.options.display.max_colwidth = 500

In [3]:
# pip installs

!pip install -q rouge_score
!pip install bert_score

  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting bert_score
  Downloading bert_score-0.3.12-py3-none-any.whl (60 kB)
[K     |████████████████████████████████| 60 kB 3.6 MB/s 
Collecting transformers>=3.0.0
  Downloading transformers-4.24.0-py3-none-any.whl (5.5 MB)
[K     |████████████████████████████████| 5.5 MB 14.8 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 46.7 MB/s 
Installing collected packages: tokenizers, transformers, bert-score
Successfully installed bert-score-0.3.12 tokenizers-0.13.2 transformers-4.24.0


In [4]:
# Authenticate with Google and attach Drive to the Colab filesystem.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


___
# `Evaluation_Probe` Class

In [26]:
class Evaluation_Probe:
  """
  Scores a predicted sentence against a target sentence on several
  text-similarity metrics and keeps a running history of the results.

  Metric names (used as result-dict keys and history columns):
  - 'bleu'      : BLEU, from `evaluate.load("bleu")`
  - 'rougeL'    : ROUGE-L, from `evaluate.load('rouge')`
  - 'meteor'    : METEOR, from `evaluate.load('meteor')`
  - 'bertscore' : BERTScore F1, from `evaluate.load('bertscore')`
  - 'use'       : inner product of Universal Sentence Encoder embeddings

  Typical use:
    probe = Evaluation_Probe()
    probe.evaluate(prediction, target)   # dict of scores for this pair
    probe.get_history()                  # DataFrame, one row per evaluate()
  """

  # TF Hub handle of the sentence encoder behind the 'use' metric
  USE_MODULE_URL = "https://tfhub.dev/google/universal-sentence-encoder/4"

  # Model handed to BERTScore when the caller does not pick one
  DEFAULT_BERTSCORE_MODEL = 'distilbert-base-uncased'

  def __init__(self):
    """
    Create the (initially empty) registries and history, then load every
    metric module so the probe is ready to score immediately.
    """

    # metric name -> loaded metric module / model
    self.metric_computes = {}
    # metric name -> bound method taking (prediction, target), returning a score
    self.internal_computes = {}

    # DataFrame to store history of computations (one row per evaluate() call)
    self.eval_history_df = pd.DataFrame()

    # Load all metric modules upon initializing
    self._load_metrics()


  def _load_metrics(self):
    """
    Load evaluation modules to run metrics and register them.

    Metrics loaded:
    - BLEU
    - ROUGE-L
    - METEOR
    - BERTScore
    - Universal Sentence Encoder

    Fills `self.metric_computes` with the loaded modules and
    `self.internal_computes` with the matching `*_eval` wrapper methods,
    both keyed by metric name. Loading may fetch remote resources.
    """

    # Loaded modules, keyed by the metric name used throughout the class.
    # Note that 'rougeL' maps to the whole ROUGE module; rougeL_eval picks
    # the 'rougeL' entry out of its results.
    self.metric_computes.update(
        {
            'bleu': evaluate.load("bleu"),
            'rougeL': evaluate.load('rouge'),
            'meteor': evaluate.load('meteor'),
            'bertscore': evaluate.load('bertscore'),
            'use': hub.load(self.USE_MODULE_URL)
        }
    )

    # Wrappers that reduce each module's output to a single score
    self.internal_computes.update(
        {
            'bleu': self.bleu_eval,
            'rougeL': self.rougeL_eval,
            'meteor': self.meteor_eval,
            'bertscore': self.bertscore_eval,
            'use': self.use_eval
        }
    )


  def get_metric_compute(self, metric_name):
    """
    Return the loaded metric module registered under `metric_name`.

    Raises KeyError if no module is registered under that name.
    """
    return self.metric_computes[metric_name]

  def get_internal_compute(self, metric_name):
    """
    Return the `(prediction, target) -> score` wrapper for `metric_name`.

    Raises KeyError if no wrapper is registered under that name.
    """
    return self.internal_computes[metric_name]


  def bleu_eval(self, prediction, target):
    """
    BLEU score of `prediction` against the single reference `target`.

    Returns 0.0 for an empty or whitespace-only prediction.
    """
    # An empty hypothesis shares no n-grams with the reference, so its BLEU
    # is 0; answer directly instead of passing zero-length input downstream.
    if not prediction.split():
      return 0.0

    bleu = self.get_metric_compute('bleu')
    return bleu.compute(predictions=[prediction],
                        references=[target])['bleu']

  def use_eval(self, prediction, target):
    """
    Similarity of the Universal Sentence Encoder embeddings of the inputs.

    Returns the inner product of the two embedding vectors as a float.
    NOTE(review): this equals cosine similarity only if the encoder returns
    unit-length embeddings -- confirm against the model documentation.
    """
    use_model = self.get_metric_compute('use')

    # Encode both sentences in one call: row 0 is target, row 1 is prediction
    embeds = np.asarray(use_model([target, prediction]))

    # Only the target/prediction product is needed, not the full 2x2 matrix
    return float(np.inner(embeds[0], embeds[1]))


  def rougeL_eval(self, prediction, target):
    """ROUGE-L score of `prediction` against the single reference `target`."""
    rouge = self.get_metric_compute('rougeL')
    return rouge.compute(predictions=[prediction],
                         references=[target])['rougeL']

  def meteor_eval(self, prediction, target):
    """METEOR score of `prediction` against the single reference `target`."""
    meteor = self.get_metric_compute('meteor')
    return meteor.compute(predictions=[prediction],
                          references=[target])['meteor']

  def bertscore_eval(self, prediction, target, model_type=None):
    """
    BERTScore F1 of `prediction` against the single reference `target`.

    model_type: name of the model BERTScore should use; defaults to
    `DEFAULT_BERTSCORE_MODEL` when omitted.
    """
    if model_type is None:
      model_type = self.DEFAULT_BERTSCORE_MODEL

    bertscore = self.get_metric_compute('bertscore')
    scores = bertscore.compute(predictions=[prediction],
                               references=[target],
                               model_type=model_type)

    # One prediction was submitted, so the F1 list holds exactly one entry
    return scores['f1'][0]


  def evaluate(self, prediction, target):
    """
    Evaluates a prediction based on the target across all metrics registered
    in the internal metric dictionary (`self.internal_computes`).

    Returns a dict with 'target', 'prediction' and one entry per metric name.
    The same dict is appended to the history as a new row.
    """

    results = {
        'target': target,
        'prediction': prediction
    }

    # Walk the wrapper registry itself, so a module that was registered
    # without a wrapper cannot trigger a KeyError here
    for metric, compute in self.internal_computes.items():
      results[metric] = compute(prediction, target)

    # Add to history DF
    self.update_history(results)

    return results


  def update_history(self, new_row):
    """
    Append `new_row` (a dict of scalar values, one per column) to the
    history DataFrame.
    """
    # index=[0] is needed to build a one-row frame from a dict of scalars
    new_row_df = pd.DataFrame(new_row, index=[0])

    self.eval_history_df = pd.concat([self.eval_history_df, new_row_df],
                                     ignore_index=True)


  def get_history(self):
    """Return the DataFrame holding one row per `evaluate` call so far."""
    return self.eval_history_df


# How to Use `Evaluation_Probe`

In [27]:
# Build the probe; its constructor loads every metric module up front.
# NOTE(review): the name `eval` shadows Python's builtin `eval` for the rest of
# the notebook. Renaming it (e.g. to `probe`) requires updating every later cell.
eval = Evaluation_Probe()

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [28]:
# Example pair scored in the next cell: the prediction is a shorter question
# that shares its opening words with the target.
target, prediction = (
    "who has the power to approve or veto legislation constitution?",
    "who has the power to veto a bill?",
)

In [29]:
# Score the pair on every metric. The returned dict is displayed as the cell
# output and is also appended to the probe's history as a new row.
eval.evaluate(prediction, target)

{'target': 'who has the power to approve or veto legislation constitution?',
 'prediction': 'who has the power to veto a bill?',
 'bleu': 0.3887514204144021,
 'rougeL': 0.6666666666666665,
 'meteor': 0.6226379440665155,
 'bertscore': 0.9280303120613098,
 'use': 0.7975350618362427}

In [30]:
# Fetch the accumulated results: one row per evaluate() call made so far.
df = eval.get_history()
df

Unnamed: 0,target,prediction,bleu,rougeL,meteor,bertscore,use
0,who has the power to approve or veto legislation constitution?,who has the power to veto a bill?,0.388751,0.666667,0.622638,0.92803,0.797535


In [31]:
# Example: the prediction asks about the same event with mostly different words.
target, prediction = (
    "who scored the game winning run in the last inning of last night's game?",
    "what player scored the final run in yesterday's match to win it?",
)

eval.evaluate(prediction=prediction, target=target)

{'target': "who scored the game winning run in the last inning of last night's game?",
 'prediction': "what player scored the final run in yesterday's match to win it?",
 'bleu': 0.0,
 'rougeL': 0.3571428571428571,
 'meteor': 0.3367286392405063,
 'bertscore': 0.8611516356468201,
 'use': 0.714165210723877}

In [32]:
# The history now also contains the pair evaluated in the previous cell.
df = eval.get_history()
df

Unnamed: 0,target,prediction,bleu,rougeL,meteor,bertscore,use
0,who has the power to approve or veto legislation constitution?,who has the power to veto a bill?,0.388751,0.666667,0.622638,0.92803,0.797535
1,who scored the game winning run in the last inning of last night's game?,what player scored the final run in yesterday's match to win it?,0.0,0.357143,0.336729,0.861152,0.714165


In [33]:
# Example: same general topic (the Super Bowl) but a different question.
target, prediction = (
    "Who won Super Bowl XLIX?",
    "who did the broncos play in the superbowl",
)

eval.evaluate(prediction=prediction, target=target)
eval.get_history()

Unnamed: 0,target,prediction,bleu,rougeL,meteor,bertscore,use
0,who has the power to approve or veto legislation constitution?,who has the power to veto a bill?,0.388751,0.666667,0.622638,0.92803,0.797535
1,who scored the game winning run in the last inning of last night's game?,what player scored the final run in yesterday's match to win it?,0.0,0.357143,0.336729,0.861152,0.714165
2,Who won Super Bowl XLIX?,who did the broncos play in the superbowl,0.0,0.153846,0.080645,0.709131,0.7045


In [34]:
# Example: the target uses the abbreviation "hcl", the prediction spells it out.
target, prediction = (
    "what is the role of hcl in the stomach",
    "What does hydrochloric acid do?",
)

eval.evaluate(prediction=prediction, target=target)
eval.get_history()

Unnamed: 0,target,prediction,bleu,rougeL,meteor,bertscore,use
0,who has the power to approve or veto legislation constitution?,who has the power to veto a bill?,0.388751,0.666667,0.622638,0.92803,0.797535
1,who scored the game winning run in the last inning of last night's game?,what player scored the final run in yesterday's match to win it?,0.0,0.357143,0.336729,0.861152,0.714165
2,Who won Super Bowl XLIX?,who did the broncos play in the superbowl,0.0,0.153846,0.080645,0.709131,0.7045
3,what is the role of hcl in the stomach,What does hydrochloric acid do?,0.0,0.142857,0.057471,0.727882,0.680623


In [35]:
# Display the history again; no new pair has been evaluated since the previous cell.
eval.get_history()

Unnamed: 0,target,prediction,bleu,rougeL,meteor,bertscore,use
0,who has the power to approve or veto legislation constitution?,who has the power to veto a bill?,0.388751,0.666667,0.622638,0.92803,0.797535
1,who scored the game winning run in the last inning of last night's game?,what player scored the final run in yesterday's match to win it?,0.0,0.357143,0.336729,0.861152,0.714165
2,Who won Super Bowl XLIX?,who did the broncos play in the superbowl,0.0,0.153846,0.080645,0.709131,0.7045
3,what is the role of hcl in the stomach,What does hydrochloric acid do?,0.0,0.142857,0.057471,0.727882,0.680623


In [36]:
# Example: a single-word swap ("anything" -> "nothing") in an otherwise
# identical sentence.
target, prediction = (
    "who has the power to do anything",
    "who has the power to do nothing",
)

eval.evaluate(prediction=prediction, target=target)
eval.get_history()

Unnamed: 0,target,prediction,bleu,rougeL,meteor,bertscore,use
0,who has the power to approve or veto legislation constitution?,who has the power to veto a bill?,0.388751,0.666667,0.622638,0.92803,0.797535
1,who scored the game winning run in the last inning of last night's game?,what player scored the final run in yesterday's match to win it?,0.0,0.357143,0.336729,0.861152,0.714165
2,Who won Super Bowl XLIX?,who did the broncos play in the superbowl,0.0,0.153846,0.080645,0.709131,0.7045
3,what is the role of hcl in the stomach,What does hydrochloric acid do?,0.0,0.142857,0.057471,0.727882,0.680623
4,who has the power to do anything,who has the power to do nothing,0.809107,0.857143,0.855159,0.976179,0.904624


In [37]:
# Example: same prediction as the previous pair, scored against a target that
# carries two extra trailing words.
target, prediction = (
    "who has the power to do anything they want",
    "who has the power to do nothing",
)

eval.evaluate(prediction=prediction, target=target)
eval.get_history()

Unnamed: 0,target,prediction,bleu,rougeL,meteor,bertscore,use
0,who has the power to approve or veto legislation constitution?,who has the power to veto a bill?,0.388751,0.666667,0.622638,0.92803,0.797535
1,who scored the game winning run in the last inning of last night's game?,what player scored the final run in yesterday's match to win it?,0.0,0.357143,0.336729,0.861152,0.714165
2,Who won Super Bowl XLIX?,who did the broncos play in the superbowl,0.0,0.153846,0.080645,0.709131,0.7045
3,what is the role of hcl in the stomach,What does hydrochloric acid do?,0.0,0.142857,0.057471,0.727882,0.680623
4,who has the power to do anything,who has the power to do nothing,0.809107,0.857143,0.855159,0.976179,0.904624
5,who has the power to do anything they want,who has the power to do nothing,0.608025,0.75,0.68024,0.912429,0.794857


In [38]:
# Example: the prediction runs on past the end of the target with extra words.
target, prediction = (
    "who has the power to do anything",
    "who has the power to do nothing except for what they must",
)

eval.evaluate(prediction=prediction, target=target)
eval.get_history()

Unnamed: 0,target,prediction,bleu,rougeL,meteor,bertscore,use
0,who has the power to approve or veto legislation constitution?,who has the power to veto a bill?,0.388751,0.666667,0.622638,0.92803,0.797535
1,who scored the game winning run in the last inning of last night's game?,what player scored the final run in yesterday's match to win it?,0.0,0.357143,0.336729,0.861152,0.714165
2,Who won Super Bowl XLIX?,who did the broncos play in the superbowl,0.0,0.153846,0.080645,0.709131,0.7045
3,what is the role of hcl in the stomach,What does hydrochloric acid do?,0.0,0.142857,0.057471,0.727882,0.680623
4,who has the power to do anything,who has the power to do nothing,0.809107,0.857143,0.855159,0.976179,0.904624
5,who has the power to do anything they want,who has the power to do nothing,0.608025,0.75,0.68024,0.912429,0.794857
6,who has the power to do anything,who has the power to do nothing except for what they must,0.417226,0.631579,0.798148,0.900351,0.787268


In [39]:
# Example: the prediction inserts words near the start and drops the tail of
# the target.
target, prediction = (
    "who has the power to do anything",
    "who among us has the power",
)

eval.evaluate(prediction=prediction, target=target)
eval.get_history()

Unnamed: 0,target,prediction,bleu,rougeL,meteor,bertscore,use
0,who has the power to approve or veto legislation constitution?,who has the power to veto a bill?,0.388751,0.666667,0.622638,0.92803,0.797535
1,who scored the game winning run in the last inning of last night's game?,what player scored the final run in yesterday's match to win it?,0.0,0.357143,0.336729,0.861152,0.714165
2,Who won Super Bowl XLIX?,who did the broncos play in the superbowl,0.0,0.153846,0.080645,0.709131,0.7045
3,what is the role of hcl in the stomach,What does hydrochloric acid do?,0.0,0.142857,0.057471,0.727882,0.680623
4,who has the power to do anything,who has the power to do nothing,0.809107,0.857143,0.855159,0.976179,0.904624
5,who has the power to do anything they want,who has the power to do nothing,0.608025,0.75,0.68024,0.912429,0.794857
6,who has the power to do anything,who has the power to do nothing except for what they must,0.417226,0.631579,0.798148,0.900351,0.787268
7,who has the power to do anything,who among us has the power,0.0,0.615385,0.543478,0.826104,0.621501


In [40]:
# Example: four-word sentences that differ only in the third word.
target, prediction = (
    "this is small test",
    "this is a test",
)

eval.evaluate(prediction=prediction, target=target)
eval.get_history()

Unnamed: 0,target,prediction,bleu,rougeL,meteor,bertscore,use
0,who has the power to approve or veto legislation constitution?,who has the power to veto a bill?,0.388751,0.666667,0.622638,0.92803,0.797535
1,who scored the game winning run in the last inning of last night's game?,what player scored the final run in yesterday's match to win it?,0.0,0.357143,0.336729,0.861152,0.714165
2,Who won Super Bowl XLIX?,who did the broncos play in the superbowl,0.0,0.153846,0.080645,0.709131,0.7045
3,what is the role of hcl in the stomach,What does hydrochloric acid do?,0.0,0.142857,0.057471,0.727882,0.680623
4,who has the power to do anything,who has the power to do nothing,0.809107,0.857143,0.855159,0.976179,0.904624
5,who has the power to do anything they want,who has the power to do nothing,0.608025,0.75,0.68024,0.912429,0.794857
6,who has the power to do anything,who has the power to do nothing except for what they must,0.417226,0.631579,0.798148,0.900351,0.787268
7,who has the power to do anything,who among us has the power,0.0,0.615385,0.543478,0.826104,0.621501
8,this is small test,this is a test,0.0,0.75,0.638889,0.89743,0.708941


In [41]:
# Example: five-word sequences where only the last word differs.
target, prediction = (
    "one two three four hiya",
    "one two three four cheese",
)

eval.evaluate(prediction=prediction, target=target)
eval.get_history()

Unnamed: 0,target,prediction,bleu,rougeL,meteor,bertscore,use
0,who has the power to approve or veto legislation constitution?,who has the power to veto a bill?,0.388751,0.666667,0.622638,0.92803,0.797535
1,who scored the game winning run in the last inning of last night's game?,what player scored the final run in yesterday's match to win it?,0.0,0.357143,0.336729,0.861152,0.714165
2,Who won Super Bowl XLIX?,who did the broncos play in the superbowl,0.0,0.153846,0.080645,0.709131,0.7045
3,what is the role of hcl in the stomach,What does hydrochloric acid do?,0.0,0.142857,0.057471,0.727882,0.680623
4,who has the power to do anything,who has the power to do nothing,0.809107,0.857143,0.855159,0.976179,0.904624
5,who has the power to do anything they want,who has the power to do nothing,0.608025,0.75,0.68024,0.912429,0.794857
6,who has the power to do anything,who has the power to do nothing except for what they must,0.417226,0.631579,0.798148,0.900351,0.787268
7,who has the power to do anything,who among us has the power,0.0,0.615385,0.543478,0.826104,0.621501
8,this is small test,this is a test,0.0,0.75,0.638889,0.89743,0.708941
9,one two three four hiya,one two three four cheese,0.66874,0.8,0.79375,0.915571,0.64395


In [42]:
# Example: four-word sequences where only the last word differs.
target, prediction = (
    "one two three go",
    "one two three no",
)

eval.evaluate(prediction=prediction, target=target)
eval.get_history()

Unnamed: 0,target,prediction,bleu,rougeL,meteor,bertscore,use
0,who has the power to approve or veto legislation constitution?,who has the power to veto a bill?,0.388751,0.666667,0.622638,0.92803,0.797535
1,who scored the game winning run in the last inning of last night's game?,what player scored the final run in yesterday's match to win it?,0.0,0.357143,0.336729,0.861152,0.714165
2,Who won Super Bowl XLIX?,who did the broncos play in the superbowl,0.0,0.153846,0.080645,0.709131,0.7045
3,what is the role of hcl in the stomach,What does hydrochloric acid do?,0.0,0.142857,0.057471,0.727882,0.680623
4,who has the power to do anything,who has the power to do nothing,0.809107,0.857143,0.855159,0.976179,0.904624
5,who has the power to do anything they want,who has the power to do nothing,0.608025,0.75,0.68024,0.912429,0.794857
6,who has the power to do anything,who has the power to do nothing except for what they must,0.417226,0.631579,0.798148,0.900351,0.787268
7,who has the power to do anything,who among us has the power,0.0,0.615385,0.543478,0.826104,0.621501
8,this is small test,this is a test,0.0,0.75,0.638889,0.89743,0.708941
9,one two three four hiya,one two three four cheese,0.66874,0.8,0.79375,0.915571,0.64395


In [43]:
# Example: shared four-word prefix followed by endings of different lengths.
target, prediction = (
    "one two three four okay woah",
    "one two three four no",
)

eval.evaluate(prediction=prediction, target=target)
eval.get_history()

Unnamed: 0,target,prediction,bleu,rougeL,meteor,bertscore,use
0,who has the power to approve or veto legislation constitution?,who has the power to veto a bill?,0.388751,0.666667,0.622638,0.92803,0.797535
1,who scored the game winning run in the last inning of last night's game?,what player scored the final run in yesterday's match to win it?,0.0,0.357143,0.336729,0.861152,0.714165
2,Who won Super Bowl XLIX?,who did the broncos play in the superbowl,0.0,0.153846,0.080645,0.709131,0.7045
3,what is the role of hcl in the stomach,What does hydrochloric acid do?,0.0,0.142857,0.057471,0.727882,0.680623
4,who has the power to do anything,who has the power to do nothing,0.809107,0.857143,0.855159,0.976179,0.904624
5,who has the power to do anything they want,who has the power to do nothing,0.608025,0.75,0.68024,0.912429,0.794857
6,who has the power to do anything,who has the power to do nothing except for what they must,0.417226,0.631579,0.798148,0.900351,0.787268
7,who has the power to do anything,who among us has the power,0.0,0.615385,0.543478,0.826104,0.621501
8,this is small test,this is a test,0.0,0.75,0.638889,0.89743,0.708941
9,one two three four hiya,one two three four cheese,0.66874,0.8,0.79375,0.915571,0.64395


In [44]:
# Example: identical words with the first and third swapped (word order only).
target, prediction = (
    "three two one go",
    "one two three go",
)

eval.evaluate(prediction=prediction, target=target)
eval.get_history()

Unnamed: 0,target,prediction,bleu,rougeL,meteor,bertscore,use
0,who has the power to approve or veto legislation constitution?,who has the power to veto a bill?,0.388751,0.666667,0.622638,0.92803,0.797535
1,who scored the game winning run in the last inning of last night's game?,what player scored the final run in yesterday's match to win it?,0.0,0.357143,0.336729,0.861152,0.714165
2,Who won Super Bowl XLIX?,who did the broncos play in the superbowl,0.0,0.153846,0.080645,0.709131,0.7045
3,what is the role of hcl in the stomach,What does hydrochloric acid do?,0.0,0.142857,0.057471,0.727882,0.680623
4,who has the power to do anything,who has the power to do nothing,0.809107,0.857143,0.855159,0.976179,0.904624
5,who has the power to do anything they want,who has the power to do nothing,0.608025,0.75,0.68024,0.912429,0.794857
6,who has the power to do anything,who has the power to do nothing except for what they must,0.417226,0.631579,0.798148,0.900351,0.787268
7,who has the power to do anything,who among us has the power,0.0,0.615385,0.543478,0.826104,0.621501
8,this is small test,this is a test,0.0,0.75,0.638889,0.89743,0.708941
9,one two three four hiya,one two three four cheese,0.66874,0.8,0.79375,0.915571,0.64395
