<a href="https://colab.research.google.com/github/jeanlucjackson/w266_final_project/blob/main/code/awesome_T5_pt_inference.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Generate Predictions From An Awesome Validation Dataset

This notebook assumes a T5 PyTorch model.

Setting the constants in the next cell should be all that is necessary to generate predictions for the validation set.

In [1]:
# Set these constants for each model and validation dataset combination

# Name of the fine-tuned model: used as the sub-folder under the models directory
# and as part of the prediction file name.
model_name = "T5_base_pt.squad"
# Name of the validation dataset: selects which valid_pairs.csv is read
# ("squad" is special-cased when the data path is built) and is part of the prediction file name.
validation_dataset_name = "squad"
# When True, the last cell writes the predictions DataFrame to a CSV file.
save_predictions = True

### Generate Predictions

In [2]:
!pip install -q transformers

[K     |████████████████████████████████| 5.5 MB 14.9 MB/s 
[K     |████████████████████████████████| 163 kB 70.3 MB/s 
[K     |████████████████████████████████| 7.6 MB 51.9 MB/s 
[?25h

In [3]:
!pip install -q sentencepiece

[?25l[K     |▎                               | 10 kB 32.8 MB/s eta 0:00:01[K     |▌                               | 20 kB 11.6 MB/s eta 0:00:01[K     |▊                               | 30 kB 15.8 MB/s eta 0:00:01[K     |█                               | 40 kB 13.6 MB/s eta 0:00:01[K     |█▎                              | 51 kB 12.2 MB/s eta 0:00:01[K     |█▌                              | 61 kB 14.2 MB/s eta 0:00:01[K     |█▉                              | 71 kB 15.0 MB/s eta 0:00:01[K     |██                              | 81 kB 16.5 MB/s eta 0:00:01[K     |██▎                             | 92 kB 14.1 MB/s eta 0:00:01[K     |██▋                             | 102 kB 15.3 MB/s eta 0:00:01[K     |██▉                             | 112 kB 15.3 MB/s eta 0:00:01[K     |███                             | 122 kB 15.3 MB/s eta 0:00:01[K     |███▍                            | 133 kB 15.3 MB/s eta 0:00:01[K     |███▋                            | 143 kB 15.3 MB/s eta 0:

In [4]:
import os
import numpy as np
import pandas as pd

import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration

import warnings
# Silence FutureWarning messages so they do not clutter the cell outputs.
warnings.simplefilter(action='ignore', category=FutureWarning)

from google.colab import data_table
# Enable Colab's data-table formatter for DataFrames displayed by later cells.
data_table.enable_dataframe_formatter()

In [5]:
from google.colab import drive
# Mount Google Drive at /content/drive; the project root used by later cells lives under this path.
drive.mount('/content/drive')

Mounted at /content/drive


In [9]:
# Project folders on the mounted Drive, derived file paths, and generation settings

project_root = "/content/drive/MyDrive/w266 NLP Final Project/"
dataset_root = f"{project_root}Data/"
model_root = f"{project_root}Models/"
prediction_folder = f"{project_root}Predictions/"

# Identifier handed to T5Tokenizer.from_pretrained
tokenizer = "google/t5-v1_1-base"

# The fine-tuned model is read from a folder named after the model
model_folder = model_root + model_name

# SQuAD is stored under "squad.hf"; every other dataset is stored under its own name
if validation_dataset_name == "squad":
  validation_data_file = f"{dataset_root}squad.hf/valid_pairs.csv"
else:
  validation_data_file = f"{dataset_root}{validation_dataset_name}/valid_pairs.csv"

prediction_file = f"{prediction_folder}predictions.{model_name}.{validation_dataset_name}.csv"

max_length = 512  # token limit used for both tokenizer truncation and generation
batch_size = 125  # number of prompts sent through the model per step

In [10]:
# Read the validation pairs and preview the first two prompt/target rows
validation_df = pd.read_csv(validation_data_file)
validation_df[['orig', 'target']].head(2)

Unnamed: 0,orig,target
0,generate question: answer: four context: Princ...,How many levels of galleries do the façades su...
1,generate question: answer: ink context: When s...,What are the secretions commonly called?


In [11]:
# Fetch the tokenizer by name, load the fine-tuned model from its folder, and place the model on the first GPU

t5_tokenizer = T5Tokenizer.from_pretrained(tokenizer)
# Chaining .to() into the assignment keeps the cell from echoing the model repr
t5_model = T5ForConditionalGeneration.from_pretrained(model_folder).to(torch.device('cuda:0'))

Downloading:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.86k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/605 [00:00<?, ?B/s]

In [12]:
# Generate one prediction per row of validation_df['orig'], batch_size prompts at a time.
# The decoded strings are collected, in row order, in `predictions`.
predictions = []
count = validation_df.shape[0]

# Convert the prompt column to a plain list once so batches are taken by position.
prompts = validation_df['orig'].to_list()
# Send each batch to whichever device the model lives on instead of assuming the default CUDA device.
device = t5_model.device

print("Generating predictions:")
for start in range(0, count, batch_size):
  to = min(count, start + batch_size)
  inputs = t5_tokenizer(prompts[start:to], return_tensors='pt', max_length=max_length, truncation=True, padding=True).to(device)
  # The batch is padded to its longest prompt, so hand generate() the attention mask
  # explicitly rather than relying on it to infer which positions are padding.
  output_ids = t5_model.generate(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'], max_length=max_length)
  prediction_batch = t5_tokenizer.batch_decode(output_ids, skip_special_tokens=True)
  predictions.extend(prediction_batch)
  # Progress indicator: running row count, with a line break after every 1000 rows.
  print (f"{to} ", end="")
  if to%1000 == 0: print()
print("Predictions generated")

Generating predictions:
125 250 375 500 625 750 875 1000 
1125 1250 1375 1500 1625 1750 1875 2000 
2125 2250 2375 2500 2625 2750 2875 3000 
3125 3250 3375 3500 3625 3750 3875 4000 
4125 4250 4375 4500 4625 4750 4875 5000 
5125 5250 5375 5500 5625 5750 5875 6000 
6125 6250 6375 6500 6625 6750 6875 7000 
7125 7250 7375 7500 7625 7750 7875 8000 
8125 8250 8375 8500 8625 8750 8875 9000 
9125 9250 9375 9500 9625 9750 9875 10000 
10125 10250 10375 10500 10570 Predictions generated


In [13]:
# Assemble the results table: context and answer are recovered from the prompt text,
# target comes from the validation data, prediction from the generation loop.
# Prompts have the form "generate question: answer: <answer> context: <context>".
answer_prefix = "generate question: answer:"
context_marker = "context: "

# Split on the first marker only, so a context that itself contains "context: " is kept whole.
# The loop variable is named `prompt` to avoid shadowing the builtin `str`.
prompt_parts = [prompt.split(context_marker, 1) for prompt in validation_df['orig']]

df = pd.DataFrame()
df['context'] = [parts[1] for parts in prompt_parts]
# Drop the fixed prefix, then strip the spaces that surround the answer inside the prompt.
df['answer'] = [parts[0][len(answer_prefix):].strip() for parts in prompt_parts]
df['target'] = validation_df['target']
df['prediction'] = predictions

In [14]:
# Preview the first two assembled rows
df.head(2)

Unnamed: 0,context,answer,target,prediction
0,Prince Albert appears within the main arch abo...,four,How many levels of galleries do the façades su...,What levels of galleries are in the galleries?
1,"When some species, including Bathyctena chuni,...",ink,What are the secretions commonly called?,What is the name of the secretions produced by...


In [15]:
# Write the results table to prediction_file when saving is enabled.
if save_predictions:
  # Create the destination folder first so a missing directory cannot fail the write
  # after the long generation run has already completed.
  output_dir = os.path.dirname(prediction_file)
  if output_dir:
    os.makedirs(output_dir, exist_ok=True)
  # The DataFrame index is written as the first (unnamed) CSV column (pandas default).
  df.to_csv(prediction_file)