<a href="https://colab.research.google.com/github/jeanlucjackson/w266_final_project/blob/main/code/inference/awesome_T5_pt_inference.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Generate Predictions From An Awesome Validation Dataset

This notebook assumes a T5 PyTorch model.

Setting the constants in the next call should be all that is necessary to run the validation set.

In [1]:
# Set these constants for each model and validation dataset combination

model_name = "T5_base_pt_long.quac"
validation_dataset_name = "quac"

save_predictions = True
save_mode = 'w' # w for write, a for append

max_length = 1024 # 1024 for long model and 512 otherwise
batch_size = 200

start_sample = None  # If None, then 0 will be used
end_sample = None # If None, then the end of the set will be used

### Generate Predictions

In [None]:
!pip install -q transformers

In [None]:
!pip install -q sentencepiece

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os
import numpy as np
import pandas as pd

import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

from google.colab import data_table
data_table.enable_dataframe_formatter()

In [None]:
# Some important file locations and constants

project_root = "/content/drive/MyDrive/w266 NLP Final Project/"
dataset_root = project_root + "Data/"
model_root = project_root + "Models/"
prediction_folder = project_root + "Predictions/"

tokenizer = "google/t5-v1_1-base"

model_folder = model_root + model_name

validation_data_file = f"{dataset_root}squad.hf/valid_pairs.csv"
if validation_dataset_name != "squad":
  validation_data_file = f"{dataset_root}{validation_dataset_name}/valid_pairs.csv"

prediction_file = f"{prediction_folder}predictions.{model_name}.{validation_dataset_name}.csv"

In [None]:
validation_df = pd.read_csv(validation_data_file)
validation_df[['orig', 'target']][:2]

Unnamed: 0,orig,target
0,generate question: answer: one context: Goliat...,"When David killed Goliath, how many of his fiv..."
1,generate question: answer: Apaches context: Ge...,Of which tribe of Red Indians was Geronimo a c...


In [None]:
validation_df.shape[0]

9835

In [None]:
# Download tokenizer and model, associate the model with the GPU

t5_tokenizer = T5Tokenizer.from_pretrained(tokenizer)
t5_model = T5ForConditionalGeneration.from_pretrained(model_folder)
t5_model.to(torch.device('cuda:0'))
pass

In [None]:
predictions = []

if start_sample is None:
  start_sample = 0

if end_sample is None:
  end_sample = validation_df.shape[0]

print(f"Generating predictions from {start_sample} to {end_sample}:")
for start in range (start_sample, end_sample, batch_size):
  to = min([end_sample, start + batch_size])
  inputs = t5_tokenizer(validation_df['orig'][start:to].to_list(), return_tensors='pt', max_length=max_length, truncation=True, padding=True)
  output_ids = t5_model.generate(inputs['input_ids'].cuda(), max_length=max_length)
  prediction_batch = t5_tokenizer.batch_decode(output_ids, skip_special_tokens=True)
  predictions.extend(prediction_batch)
  print (f"{to} ", end="")
  if to%1000 == 0: print()
print("Predictions generated.")

Generating predictions from 5000 to 9835:
5050 5100 5150 5200 5250 5300 5350 5400 5450 5500 5550 5600 5650 5700 5750 5800 5850 5900 5950 6000 
6050 6100 6150 6200 6250 6300 6350 6400 6450 6500 6550 6600 6650 6700 6750 6800 6850 6900 6950 7000 
7050 7100 7150 7200 7250 7300 7350 7400 7450 7500 7550 7600 7650 7700 7750 7800 7850 7900 7950 8000 
8050 8100 8150 8200 8250 8300 8350 8400 8450 8500 8550 8600 8650 8700 8750 8800 8850 8900 8950 9000 
9050 9100 9150 9200 9250 9300 9350 9400 9450 9500 9550 9600 9650 9700 9750 9800 9835 Predictions generated.


In [None]:
df=pd.DataFrame()
df['context'] = [str.split('context: ')[1] for str in validation_df['orig'][start_sample:end_sample]]
df['answer'] =  [str.split('context: ')[0][26: ] for str in validation_df['orig'][start_sample:end_sample]]
df['target'] = validation_df['target']
df['prediction'] = predictions

In [None]:
df[:10]

Unnamed: 0,context,answer,target,prediction
0,The Moon Is a Harsh Mistress is a 1966 science...,Robert A. Heinlein,"When David killed Goliath, how many of his fiv...",What was the most interesting aspects about th...
1,"John Davison Rockefeller Sr. ( July 8 , 1839 –...",Standard Oil,Of which tribe of Red Indians was Geronimo a c...,What was his first job?
2,Tallahassee is the capital of the U.S. state o...,Tallahassee,"According to Jewish tradition, whose chair is ...",What was his first position?
3,Sweeney Todd is a fictional character who firs...,Fleet Street,What island is shared by Haiti and the Dominic...,What was his first name?
4,The Battle of Salamanca ( in French and Spanis...,Duke of Wellington,"In humans, the medical condition Hyposmia affe...",What was the military?
5,"Florence Nightingale , ( ; 12 May 1820 – 13 Au...",The Lady with the Lamp,Which hit for 'Wet Wet Wet' was the biggest-se...,What was her first name?
6,"H.M.S . Pinafore ; or , The Lass That Loved a ...",Gilbert and Sullivan,What American industrialist is credited as the...,What was her first career?
7,The United States presidential election of 195...,Eisenhower,What became the last city on earth to experien...,What was his first position?
8,is the luxury vehicle marque of Japanese autom...,Honda Clio,What is the more common name for the order Col...,What was the car?
9,"Hair loss , also known as alopecia or baldness...",Hair,"When introduced into Britain in 1752, what cau...",What was the disease?


In [None]:
if save_predictions:
  df.to_csv(prediction_file, mode=save_mode)