<a href="https://colab.research.google.com/github/jeanlucjackson/w266_final_project/blob/main/code/inference/evaluate_inferences.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports & Google Drive Mounting

In [54]:
from os import listdir
from os.path import isfile, join

import csv
import pprint

import pandas as pd

In [2]:
# Mount Google Drive at /content/drive so the prediction CSVs stored there
# can be read below. Running this cell will ask you to authenticate.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Drive folder whose entries are listed and loaded as prediction CSVs below
inference_root = "/content/drive/MyDrive/w266 NLP Final Project/Predictions/"

In [11]:
# Names of every entry in the predictions folder; entries that are not
# regular files are filtered out later, when the CSVs are loaded.
inference_files = listdir(inference_root)
print(inference_files)

['t5_simple_transformers_preds.csv', 'predictions.T5_base_pt.squad.quac.csv', 'predictions.T5_base_pt.squad.squad.csv', 'predictions.T5_base_pt.quac.squad.csv']


# Evaluations
## Load Data

Inferences are saved in the nested dictionary `inference_dict`, whose format is:
- keys: CSV filenames
- values:
  - `target`: pandas Series of target values
  - `prediction`: pandas Series of prediction values

In [33]:
# Maps each CSV filename to {'target': Series, 'prediction': Series}
inference_dict = {}

# Columns every predictions CSV must provide
required_columns = ('target', 'prediction')

# `file_num` is 1-based for display (and avoids shadowing the builtin `id`)
for file_num, inf_file in enumerate(inference_files, start=1):

  # Full path to this entry of the predictions folder
  filename = join(inference_root, inf_file)

  # Only regular files are loaded; any other entry is ignored
  if not isfile(filename):
    continue

  print(f"Opening file {file_num} of {len(inference_files)}: {inf_file}\n")

  df = pd.read_csv(filename)

  # Skip the file if EITHER required column is missing: indexing the frame
  # below would raise KeyError when only one of the two is absent.
  missing_columns = [col for col in required_columns if col not in df.columns]
  if missing_columns:
    print("WARNING: Columns `target` and `prediction` not found in CSV. Skipping CSV.")
    print(f"Missing column(s): {missing_columns}")
    print(f"Check file: {filename}")

  # Both columns exist, so keep them
  else:
    targets = df['target']
    predictions = df['prediction']

    print('CSV loaded.')
    print(f"Length of targets:      {len(targets)}")
    print(f"Length of predictions:  {len(predictions)}")

    # Save both columns into the dictionary under the file's name
    inference_dict[inf_file] = {'target': targets,
                                'prediction': predictions}
    print('\nTargets and predictions saved.')

  print('________________________________________\n')


# Summary of the datasets that were successfully loaded
print(f"\nTotal of {len(inference_dict)} datasets loaded:")
for dataset in inference_dict:
  print('    ' + dataset)

Opening file 1 of 4: t5_simple_transformers_preds.csv

Check file: /content/drive/MyDrive/w266 NLP Final Project/Predictions/t5_simple_transformers_preds.csv
________________________________________

Opening file 2 of 4: predictions.T5_base_pt.squad.quac.csv

CSV loaded.
Length of targets:      5868
Length of predictions:  5868

Targets and predictions saved.
________________________________________

Opening file 3 of 4: predictions.T5_base_pt.squad.squad.csv

CSV loaded.
Length of targets:      10570
Length of predictions:  10570

Targets and predictions saved.
________________________________________

Opening file 4 of 4: predictions.T5_base_pt.quac.squad.csv

CSV loaded.
Length of targets:      10570
Length of predictions:  10570

Targets and predictions saved.
________________________________________


Total of 3 datasets loaded:
    predictions.T5_base_pt.squad.quac.csv
    predictions.T5_base_pt.squad.squad.csv
    predictions.T5_base_pt.quac.squad.csv


## Evaluate Predictions

We'll be using:
- ROUGE
- BLEURT
- BERTScore

Evaluations are stored in `evaluation_dict`, formatted as:
- keys: CSV filenames
- values:
  - metric_name: metric_value

### Load Evaluation Metrics

In [34]:
# Install the `evaluate` package (-q keeps pip output quiet); the metrics
# in the following cells are loaded through it.
!pip install -q evaluate
import evaluate

[K     |████████████████████████████████| 72 kB 945 kB/s 
[K     |████████████████████████████████| 163 kB 27.0 MB/s 
[K     |████████████████████████████████| 212 kB 51.0 MB/s 
[K     |████████████████████████████████| 441 kB 54.8 MB/s 
[K     |████████████████████████████████| 115 kB 50.3 MB/s 
[K     |████████████████████████████████| 95 kB 4.1 MB/s 
[K     |████████████████████████████████| 127 kB 53.5 MB/s 
[K     |████████████████████████████████| 115 kB 52.8 MB/s 
[?25h

#### ROUGE

In [38]:
# `rouge_score` is installed before loading the metric -- presumably the
# backend that `evaluate`'s ROUGE implementation needs (TODO confirm).
!pip install -q rouge_score

# ROUGE metric object; scores are computed later with rouge.compute(...)
rouge = evaluate.load('rouge')

#### BLEURT

In [39]:
# Install BLEURT directly from its GitHub repository
!pip install git+https://github.com/google-research/bleurt.git

# BLEURT metric object using the BLEURT-20 checkpoint; scores are computed
# later with bleurt.compute(...)
bleurt = evaluate.load('bleurt', module_type='metric', checkpoint='BLEURT-20')

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/google-research/bleurt.git
  Cloning https://github.com/google-research/bleurt.git to /tmp/pip-req-build-71yenjit
  Running command git clone -q https://github.com/google-research/bleurt.git /tmp/pip-req-build-71yenjit
Collecting tf-slim>=1.1
  Downloading tf_slim-1.1.0-py2.py3-none-any.whl (352 kB)
[K     |████████████████████████████████| 352 kB 6.6 MB/s 
[?25hCollecting sentencepiece
  Downloading sentencepiece-0.1.97-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 51.1 MB/s 
Building wheels for collected packages: BLEURT
  Building wheel for BLEURT (setup.py) ... [?25l[?25hdone
  Created wheel for BLEURT: filename=BLEURT-0.0.2-py3-none-any.whl size=16456783 sha256=919d0a50b92e090d7f5554a2e9f197f3ebf8f648bfbf0af649825f844ccbae81
  Stored in directory: /tmp/pip-ephem-wheel-cache-8j

Downloading builder script:   0%|          | 0.00/5.20k [00:00<?, ?B/s]



Downloading data:   0%|          | 0.00/405M [00:00<?, ?B/s]

#### BERTScore

In [40]:
# `bert_score` is installed before loading the metric -- presumably the
# backend that `evaluate`'s BERTScore implementation needs (TODO confirm).
!pip install bert_score

# BERTScore metric object; scores are computed later with bertscore.compute(...)
bertscore = evaluate.load('bertscore')

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting bert_score
  Downloading bert_score-0.3.12-py3-none-any.whl (60 kB)
[K     |████████████████████████████████| 60 kB 280 kB/s 
Collecting transformers>=3.0.0
  Downloading transformers-4.24.0-py3-none-any.whl (5.5 MB)
[K     |████████████████████████████████| 5.5 MB 15.1 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 27.7 MB/s 
Installing collected packages: tokenizers, transformers, bert-score
Successfully installed bert-score-0.3.12 tokenizers-0.13.1 transformers-4.24.0


Downloading builder script:   0%|          | 0.00/7.95k [00:00<?, ?B/s]

### Calculate Metrics on Each Dataset

In [59]:
# test out on one of the datasets
dataset = list(inference_dict.keys())[0]
print(dataset)

targets = inference_dict[dataset]['target'].tolist()
predictions = inference_dict[dataset]['prediction'].tolist()
print(len(targets), len(predictions))

print('ROUGE')
rouge_results = rouge.compute(predictions=predictions,
                              references=targets,
                              use_aggregator=False)

predictions.T5_base_pt.squad.quac.csv
5868 5868
ROUGE


In [64]:
# Inspect the structure of the ROUGE output from the previous cell:
# its type, number of entries, key names, and per-example score count.
print(type(rouge_results))
print(len(rouge_results))
print(rouge_results.keys())
print(len(rouge_results['rouge1']))

# Mean of the per-example scores for each ROUGE variant
print("Averages:")
for metric_name, scores in rouge_results.items():
  mean_score = sum(scores) / len(scores)
  print(metric_name + ': ' + str(mean_score))


<class 'dict'>
4
dict_keys(['rouge1', 'rouge2', 'rougeL', 'rougeLsum'])
5868
Averages:
rouge1: 0.18249635750457507
rouge2: 0.03966958090769743
rougeL: 0.17632388612567854
rougeLsum: 0.17632388612567854
Maxs:
rouge1: 1.0
rouge2: 1.0
rougeL: 1.0
rougeLsum: 1.0


##### WORKING ON THIS CELL

⚠️ This cell takes some time. ⚠️

In [49]:
# Maps each CSV filename to {'rouge': ..., 'bleurt': ..., 'bertscore': ...}
evaluation_dict = {}

for dataset, inference in inference_dict.items():

  # Get this dataset's `target` and `prediction` values as plain lists
  targets = inference['target'].tolist()
  predictions = inference['prediction'].tolist()


  # Evaluations

  # ROUGE (use_aggregator=False keeps one score per example)
  rouge_results = rouge.compute(predictions=predictions,
                                references=targets,
                                use_aggregator=False)

  # BLEURT
  bleurt_results = bleurt.compute(predictions=predictions,
                                  references=targets)

  # BERTScore
  bertscore_results = bertscore.compute(predictions=predictions,
                                        references=targets,
                                        model_type='distilbert-base-uncased')

  # Store all three metrics in ONE assignment. Calling
  # evaluation_dict.update({dataset: {...}}) once per metric would replace
  # the whole per-dataset dict each time and keep only the last metric.
  evaluation_dict[dataset] = {
      'rouge': rouge_results,
      'bleurt': bleurt_results,
      'bertscore': bertscore_results,
  }

  print(f"Dataset {dataset} evaluated.")

KeyboardInterrupt: ignored