<a href="https://colab.research.google.com/github/jeanlucjackson/w266_final_project/blob/main/code/inference/awesome_T5_pt_inference.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Generate Predictions From An Awesome Validation Dataset

This notebook assumes a T5 PyTorch model.

Setting the constants in the next cell should be all that is necessary to generate predictions for a validation set.

In [1]:
# Per-run configuration: edit these for each (model, validation dataset) pair.

# Which saved model to load; also embedded in the prediction file name.
model_name = "T5_base_pt_long.quac"

# Which dataset's valid_pairs.csv to run; also embedded in the prediction file name.
validation_dataset_name = "triviaqa"

# Token limit handed to both the tokenizer (truncation) and generate().
# 1024 for long model and 512 otherwise
max_length = 1024

# Number of validation rows per generation batch.
# 150 is the norm, but dial back when needed
batch_size = 75

# When True, the final cell writes the predictions frame to CSV.
save_predictions = True

### Generate Predictions

In [2]:
# Install the package that provides the T5Tokenizer / T5ForConditionalGeneration
# classes imported below (-q keeps pip's output short).
!pip install -q transformers

[K     |████████████████████████████████| 5.5 MB 19.1 MB/s 
[K     |████████████████████████████████| 7.6 MB 52.7 MB/s 
[K     |████████████████████████████████| 163 kB 60.7 MB/s 
[?25h

In [3]:
# NOTE(review): presumably needed as the backend for T5Tokenizer — confirm.
!pip install -q sentencepiece

[K     |████████████████████████████████| 1.3 MB 36.6 MB/s 
[?25h

In [4]:
import os
import numpy as np
import pandas as pd

import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

from google.colab import data_table
data_table.enable_dataframe_formatter()

In [5]:
from google.colab import drive

# Mount Google Drive; the project paths used later all live under this mount point.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


In [6]:
# Project folder layout on the mounted Drive, plus the files derived from the
# model / dataset names chosen in the configuration cell.

project_root = "/content/drive/MyDrive/w266 NLP Final Project/"
dataset_root = f"{project_root}Data/"
model_root = f"{project_root}Models/"
prediction_folder = f"{project_root}Predictions/"

# Identifier passed to T5Tokenizer.from_pretrained.
tokenizer = "google/t5-v1_1-base"

# Folder the model is loaded from.
model_folder = f"{model_root}{model_name}"

# The "squad" dataset is stored under "squad.hf"; every other dataset uses a
# folder named exactly like the dataset.
validation_subfolder = "squad.hf" if validation_dataset_name == "squad" else validation_dataset_name
validation_data_file = f"{dataset_root}{validation_subfolder}/valid_pairs.csv"

# One prediction CSV per (model, validation dataset) combination.
prediction_file = f"{prediction_folder}predictions.{model_name}.{validation_dataset_name}.csv"

In [7]:
# Read the validation pairs and preview the model input ('orig') next to the
# reference output ('target') for the first two rows.
validation_df = pd.read_csv(filepath_or_buffer=validation_data_file)
validation_df[['orig', 'target']].head(2)

Unnamed: 0,orig,target
0,generate question: answer: one context: Goliat...,"When David killed Goliath, how many of his fiv..."
1,generate question: answer: Apaches context: Ge...,Of which tribe of Red Indians was Geronimo a c...


In [8]:
# Fetch the tokenizer, load the model stored in model_folder, and place the
# model on the first CUDA device.

t5_tokenizer = T5Tokenizer.from_pretrained(tokenizer)
t5_model = T5ForConditionalGeneration.from_pretrained(model_folder)

gpu_device = torch.device('cuda:0')
t5_model.to(gpu_device);  # trailing ';' keeps the model repr out of the cell output

Downloading:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.86k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/605 [00:00<?, ?B/s]

In [9]:
# Run batched generation over the whole validation set, collecting one decoded
# prediction per row of validation_df, in row order.
#
# NOTE(review): the saved output of this cell ends in a RuntimeError whose
# message is not visible here. If it is a CUDA out-of-memory error, lower
# batch_size in the configuration cell — confirm against the full traceback.
predictions = []  # re-created on every run so re-executing the cell never appends to stale results
count = len(validation_df)

# Follow whatever device the model was placed on instead of assuming .cuda().
device = t5_model.device

# Convert once to a plain list so batching is purely positional and the
# Series is not re-sliced and re-listed on every iteration.
source_texts = validation_df['orig'].to_list()

print("Generating predictions:")
for start in range(0, count, batch_size):
  to = min(count, start + batch_size)
  inputs = t5_tokenizer(
      source_texts[start:to],
      return_tensors='pt',
      max_length=max_length,
      truncation=True,
      padding=True,
  ).to(device)
  # The batch is padded, so hand generate() the attention mask explicitly
  # rather than depending on it to work out which positions are padding.
  output_ids = t5_model.generate(
      input_ids=inputs['input_ids'],
      attention_mask=inputs['attention_mask'],
      max_length=max_length,
  )
  predictions.extend(t5_tokenizer.batch_decode(output_ids, skip_special_tokens=True))
  print(f"{to} ", end="")
  # Break the progress line each time a multiple of 1000 rows is reached or
  # passed. The previous `to % 1000 == 0` check only fired when a batch ended
  # exactly on a multiple of 1000 (every 3000 rows for batch sizes 75 or 150).
  if to // 1000 > start // 1000: print()
print("Predictions generated")

# Every validation row must have exactly one prediction before building the results frame.
assert len(predictions) == count, f"expected {count} predictions, got {len(predictions)}"

Generating predictions:


RuntimeError: ignored

In [None]:
# Assemble the results frame: the context and answer recovered from each
# prompt, the reference target, and the model's prediction.
#
# Prompts look like "generate question: answer: <answer> context: <context>".

# Number of leading prompt characters dropped from the answer part; equals
# len("generate question: answer:").
# NOTE(review): the space after "answer:" and the one before "context: " are
# kept, so answers carry surrounding whitespace. Left as-is so saved CSVs stay
# comparable with earlier runs — confirm whether downstream code strips it.
answer_offset = 26

# Split each prompt once, and only on the FIRST 'context: ' marker, so a
# context that itself contains the text 'context: ' is kept whole instead of
# being cut at the second occurrence. A prompt without the marker still
# raises IndexError below, as before.
prompt_parts = [prompt.split('context: ', 1) for prompt in validation_df['orig']]

df = pd.DataFrame()
df['context'] = [parts[1] for parts in prompt_parts]
df['answer'] = [parts[0][answer_offset:] for parts in prompt_parts]
df['target'] = validation_df['target']
df['prediction'] = predictions

In [None]:
# Preview the first ten rows of the results frame.
df.head(10)

In [None]:
# Write the results frame to the per-(model, dataset) CSV, but only when
# saving was requested in the configuration cell.
if save_predictions:
  df.to_csv(path_or_buf=prediction_file)