<a href="https://colab.research.google.com/github/jeanlucjackson/w266_final_project/blob/main/code/sandboxes/RR/awesome_T5_pt_inference_triviaqa_sliced.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Generate Predictions From An Awesome Validation Dataset

This notebook assumes a T5 PyTorch model.

Setting the constants in the next cell should be all that is necessary to run the validation set.

In [1]:
# Per-run settings: adjust these for each model / validation dataset combination.

# Name of the model folder (under the models root) and the validation splits to run
model_name = "T5_base_pt_long.quac"
validation_dataset_names = ["v1", "v2", "d1"]

# Optional sample window over each validation set
start_sample = None  # None means start at row 0
end_sample = None  # None means run through the end of the set

# Tokenization and batching
max_input_length = 1024
batch_size = 25

# Generation settings handed to the model's generate() call
num_beams = 4
no_repeat_ngram_size = 3
min_target_length = 1
max_target_length = 50
early_stopping = True

# Output handling
save_predictions = True
save_mode = 'w'  # 'w' for write, 'a' for append

### Generate Predictions

In [2]:
!pip install -q transformers

In [3]:
!pip install -q sentencepiece

In [4]:
# Mount Google Drive at /content/drive; the project paths used by this
# notebook live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
import os
import numpy as np
import pandas as pd

import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration

import warnings
# Ignore FutureWarning messages so they do not clutter the cell outputs
warnings.simplefilter(action='ignore', category=FutureWarning)

# Turn on Colab's data table formatter for DataFrames
from google.colab import data_table
data_table.enable_dataframe_formatter()

In [6]:
# File locations on the mounted Drive, plus the tokenizer checkpoint name

project_root = "/content/drive/MyDrive/w266 NLP Final Project/"
# Alternative root for a local (non-Colab) run:
#project_root = "/home/localadmin/Documents/w266_NLP_Final_Project/"

# Model to load: <project_root>/Models/<model_name>
model_root = f"{project_root}Models/"
model_folder = f"{model_root}{model_name}"
tokenizer = "google/t5-v1_1-base"

# Validation CSVs are named <dataset_prefix><dataset_name>.csv
dataset_prefix = f"{project_root}Data/triviaqa/valid_pairs_"
prediction_folder = f"{project_root}Predictions_3/checkpoint/"

In [7]:
# Load the tokenizer and the model weights, then move the model to the first
# GPU when one is available (falling back to the CPU instead of crashing).

T5_tokenizer = T5Tokenizer.from_pretrained(tokenizer)
T5_model = T5ForConditionalGeneration.from_pretrained(model_folder)

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
# Assigning the result keeps the notebook from echoing the whole module repr
T5_model = T5_model.to(device)
print(f"Model loaded on {device}")

In [8]:
# Generate predictions for every validation dataset and optionally save them.
#
# For each dataset this cell reads the CSV of validation pairs, runs batched
# generation over the 'orig' column, and builds a frame holding the parsed
# context and answer, the reference target and the model prediction.

# Text that separates the answer part from the context part of each 'orig' entry
context_marker = 'context: '
# Number of leading characters dropped from the text before the marker to get
# the answer. NOTE(review): presumably the length of the task prompt at the
# start of 'orig' -- confirm against the data preparation code.
answer_prefix_length = 26

for dataset_name in validation_dataset_names:
  validation_data_file = f"{dataset_prefix}{dataset_name}.csv"
  print(validation_data_file)
  validation_df = pd.read_csv(validation_data_file)
  prediction_file = f"{prediction_folder}predictions.{model_name}.triviaqa_{dataset_name}.csv"

  # Resolve the sample window for THIS dataset without overwriting the
  # configured start_sample / end_sample, so the same settings apply to every
  # dataset in the list. The end is clamped to the number of available rows.
  row_count = validation_df.shape[0]
  first_sample = 0 if start_sample is None else start_sample
  last_sample = row_count if end_sample is None else min(end_sample, row_count)

  # Positional slices, so inputs and targets always refer to the same rows
  source_texts = validation_df['orig'].iloc[first_sample:last_sample].to_list()
  targets = validation_df['target'].iloc[first_sample:last_sample].to_list()

  predictions = []

  print(f"Generating predictions using {dataset_name} from {first_sample} to {last_sample}:")
  for batch_start in range(0, len(source_texts), batch_size):
    batch_texts = source_texts[batch_start:batch_start + batch_size]
    inputs = T5_tokenizer(batch_texts, return_tensors='pt', max_length=max_input_length, truncation=True, padding=True)
    # Pass the attention mask explicitly so the padded positions of shorter
    # inputs are masked, and keep the tensors on the same device as the model.
    output_ids = T5_model.generate(inputs['input_ids'].to(T5_model.device),
                                   attention_mask=inputs['attention_mask'].to(T5_model.device),
                                   min_length=min_target_length,
                                   max_length=max_target_length,
                                   num_beams=num_beams,
                                   no_repeat_ngram_size=no_repeat_ngram_size,
                                   early_stopping=early_stopping)
    prediction_batch = T5_tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    predictions.extend(prediction_batch)

    # Progress: index (within the dataset) of the last row processed so far
    processed = first_sample + batch_start + len(batch_texts)
    print(f"{processed} ", end="")
    if processed % 1000 == 0: print()
  print("\nPredictions generated.")

  # Split only on the first marker so a context that itself contains
  # 'context: ' is kept whole.
  df = pd.DataFrame({
      'context': [text.split(context_marker, 1)[1] for text in source_texts],
      'answer': [text.split(context_marker, 1)[0][answer_prefix_length:] for text in source_texts],
      'target': targets,
      'prediction': predictions,
  })

  if save_predictions:
    # When appending to a file that already exists, skip the header row so the
    # CSV does not end up with column names repeated in the middle of the data.
    write_header = not (save_mode == 'a' and os.path.exists(prediction_file))
    df.to_csv(prediction_file, mode=save_mode, header=write_header)
    print(f"Write: {prediction_file}")

/content/drive/MyDrive/w266 NLP Final Project/Data/triviaqa/valid_pairs_v1.csv
Generating predictions using v1 from 0 to 493:
25 50 75 100 125 150 175 200 225 250 275 300 325 350 375 400 425 450 475 493 
Predictions generated.
Write: /content/drive/MyDrive/w266 NLP Final Project/Predictions_3/checkpoint/predictions.T5_base_pt_long.quac.triviaqa_v1.csv
/content/drive/MyDrive/w266 NLP Final Project/Data/triviaqa/valid_pairs_v2.csv
Generating predictions using v2 from 0 to 492:
25 50 75 100 125 150 175 200 225 250 275 300 325 350 375 400 425 450 475 492 
Predictions generated.
Write: /content/drive/MyDrive/w266 NLP Final Project/Predictions_3/checkpoint/predictions.T5_base_pt_long.quac.triviaqa_v2.csv
/content/drive/MyDrive/w266 NLP Final Project/Data/triviaqa/valid_pairs_d1.csv
Generating predictions using d1 from 0 to 985:
25 50 75 100 125 150 175 200 225 250 275 300 325 350 375 400 425 450 475 500 525 550 575 600 625 650 675 700 725 750 775 800 825 850 875 900 925 950 975 985 
Predicti