<a href="https://colab.research.google.com/github/jeanlucjackson/w266_final_project/blob/main/code/evaluation/evaluation_probe.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports & Google Drive Mounting

In [1]:
import json
import pprint

import numpy as np
import pandas as pd

# Hugging Face Evaluation module
!pip install -q evaluate
import evaluate

# For loading the Universal Sentence Encoder
import tensorflow_hub as hub

In [2]:
# pip Installs

!pip install -q rouge_score
!pip install bert_score

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
# Authenticate and mount Google Drive at /content/drive.
# NOTE: `google.colab` is imported here, so this cell is expected to run
# inside Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


___
# `Evaluation_Probe` Class

In [4]:
class Evaluation_Probe:
  """
  Scores a prediction string against a target string on several text
  similarity metrics and keeps a history of every evaluation.

  Metric names (used as result keys and as history columns):
  - 'bleu'      : BLEU
  - 'rouge'     : ROUGE-L
  - 'meteor'    : METEOR
  - 'bertscore' : BERTScore F1
  - 'use'       : cosine similarity of Universal Sentence Encoder embeddings

  Args:
    bertscore_model_type: value passed to BERTScore as `model_type`.
      Defaults to 'distilbert-base-uncased'.
  """

  # Class-level default for the BERTScore model; __init__ overrides it
  # per instance.
  bertscore_model_type = 'distilbert-base-uncased'

  def __init__(self, bertscore_model_type='distilbert-base-uncased'):

    # Dictionaries keyed by metric name:
    # - metric_computes: the loaded metric modules / models
    # - internal_computes: methods of this class that return one score each
    self.metric_computes = {}
    self.internal_computes = {}

    # Model used by bertscore_eval
    self.bertscore_model_type = bertscore_model_type

    # History of computations, one dict per evaluate() call. Kept as a list
    # and turned into a DataFrame on demand, so appending a row does not
    # copy the whole history.
    self._history_records = []

    # Load all metric modules upon initializing
    self._load_metrics()


  @property
  def eval_history_df(self):
    """DataFrame with one row per recorded evaluation."""
    return pd.DataFrame(self._history_records)

  @eval_history_df.setter
  def eval_history_df(self, history_df):
    # Allows replacing/resetting the history by assigning a DataFrame,
    # e.g. `probe.eval_history_df = pd.DataFrame()`.
    self._history_records = history_df.to_dict('records')


  def _load_metrics(self):
    """
    Load evaluation modules to run metrics.

    Metrics loaded:
    - BLEU
    - ROUGE
    - METEOR
    - BERTScore
    - Universal Sentence Encoder

    Fills `metric_computes` with the loaded objects and `internal_computes`
    with the matching scoring methods, under the same keys.
    """

    # Hugging Face `evaluate` modules
    bleu = evaluate.load("bleu")
    rouge = evaluate.load('rouge')
    meteor = evaluate.load('meteor')
    bertscore = evaluate.load('bertscore')

    # Universal Sentence Encoder, loaded from TensorFlow Hub
    module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
    use_model = hub.load(module_url)

    # Add loaded modules to metric compute dictionary
    self.metric_computes.update(
        {
            'bleu': bleu,
            'rouge': rouge,
            'meteor': meteor,
            'bertscore': bertscore,
            'use': use_model
        }
    )
    # Add the per-metric scoring methods under the same keys
    self.internal_computes.update(
        {
            'bleu': self.bleu_eval,
            'rouge': self.rougeL_eval,
            'meteor': self.meteor_eval,
            'bertscore': self.bertscore_eval,
            'use': self.use_eval
        }
    )


  def get_metric_compute(self, metric_name):
    """Return the loaded metric module/model stored under `metric_name`."""
    return self.metric_computes[metric_name]

  def get_internal_compute(self, metric_name):
    """Return the scoring method of this class stored under `metric_name`."""
    return self.internal_computes[metric_name]


  def bleu_eval(self, prediction, target):
    """Return the 'bleu' value for one prediction/target pair."""
    bleu = self.get_metric_compute('bleu')
    return bleu.compute(predictions=[prediction],
                        references=[target])['bleu']

  def use_eval(self, prediction, target):
    """
    Calculate cosine similarity between USE embeds of inputs.

    The embeddings are explicitly normalized, so the result is a true
    cosine similarity even if the encoder's outputs are not exactly
    unit-length. Returns 0.0 if either embedding is the zero vector.
    """
    use_model = self.get_metric_compute('use')
    embeds = np.asarray(use_model([target, prediction]))
    target_vec, prediction_vec = embeds[0], embeds[1]

    norm_product = np.linalg.norm(target_vec) * np.linalg.norm(prediction_vec)
    if norm_product == 0:
      # Cosine similarity is undefined for a zero vector; avoid dividing by 0
      return 0.0

    return float(np.dot(target_vec, prediction_vec) / norm_product)


  def rougeL_eval(self, prediction, target):
    """Return the 'rougeL' value for one prediction/target pair."""
    rouge = self.get_metric_compute('rouge')
    return rouge.compute(predictions=[prediction],
                         references=[target])['rougeL']

  def meteor_eval(self, prediction, target):
    """Return the 'meteor' value for one prediction/target pair."""
    meteor = self.get_metric_compute('meteor')
    return meteor.compute(predictions=[prediction],
                          references=[target])['meteor']

  def bertscore_eval(self, prediction, target):
    """
    Return the BERTScore F1 for one prediction/target pair, computed with
    the model named by `self.bertscore_model_type`.
    """
    bertscore = self.get_metric_compute('bertscore')
    scores = bertscore.compute(predictions=[prediction],
                               references=[target],
                               model_type=self.bertscore_model_type)
    # 'f1' holds one value per pair; only one pair was passed
    return scores['f1'][0]


  def evaluate(self, prediction, target):
    """
    Evaluates a prediction based on the target across all loaded metrics
    in the internal metric dictionary.

    Returns a dict with 'target', 'prediction' and one entry per metric
    name. The same values are appended to the evaluation history.
    """

    results = {'target': target, 'prediction': prediction}

    # Evaluate on each metric. Iterating internal_computes directly means a
    # metric without a scoring method can never be looked up.
    for metric, compute in self.internal_computes.items():
      results[metric] = compute(prediction, target)

    # Add to history
    self.update_history(results)

    # Return
    return results


  def update_history(self, new_row):
    """Append one row (a dict of column name -> value) to the history."""
    # Store a copy so later changes to the caller's dict do not alter history
    self._history_records.append(dict(new_row))


  def get_history(self):
    """Return the evaluation history as a DataFrame (one row per call)."""
    return self.eval_history_df


# How to Use `Evaluation_Probe`

In [5]:
# Instantiating the probe loads every metric module up front
# (Evaluation_Probe.__init__ calls _load_metrics).
# NOTE(review): the name `eval` shadows Python's built-in eval(); the cells
# below use this name, so a rename has to be applied to all of them together.
eval = Evaluation_Probe()

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [6]:
# Example pair: the target text and the prediction to score against it
target = "who has the power to approve or veto legislation constitution?"
prediction = "who has the power to veto a bill?"

In [7]:
# Score the prediction against the target on every metric; the returned dict
# is also recorded in the probe's history.
eval.evaluate(prediction, target)

{'target': 'who has the power to approve or veto legislation constitution?',
 'prediction': 'who has the power to veto a bill?',
 'bleu': 0.3887514204144021,
 'rouge': 0.6666666666666665,
 'meteor': 0.6226379440665155,
 'bertscore': 0.9280303120613098,
 'use': 0.7975350618362427}

In [8]:
# Fetch the evaluation history recorded so far and display it
df = eval.get_history()
df

Unnamed: 0,target,prediction,bleu,rouge,meteor,bertscore,use
0,who has the power to approve or veto legislati...,who has the power to veto a bill?,0.388751,0.666667,0.622638,0.92803,0.797535


In [9]:
# Second example pair, scored in the same cell
target = "who scored the game winning run in the last inning of last night's game?"
prediction = "what player scored the final run in yesterday's match to win it?"

eval.evaluate(prediction, target)

{'target': "who scored the game winning run in the last inning of last night's game?",
 'prediction': "what player scored the final run in yesterday's match to win it?",
 'bleu': 0.0,
 'rouge': 0.3571428571428571,
 'meteor': 0.3367286392405063,
 'bertscore': 0.8611516356468201,
 'use': 0.714165210723877}

In [10]:
# Fetch the history again; it now also contains the second evaluation
df = eval.get_history()
df

Unnamed: 0,target,prediction,bleu,rouge,meteor,bertscore,use
0,who has the power to approve or veto legislati...,who has the power to veto a bill?,0.388751,0.666667,0.622638,0.92803,0.797535
1,who scored the game winning run in the last in...,what player scored the final run in yesterday'...,0.0,0.357143,0.336729,0.861152,0.714165
