<a href="https://colab.research.google.com/github/jeanlucjackson/w266_final_project/blob/main/code/inference/awesome_T5_pt_inference.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Generate Predictions From An Awesome Validation Dataset

This notebook assumes a T5 PyTorch model.

Setting the constants in the next cell should be all that is necessary to run the validation set.

In [9]:
# Per-run settings: adjust these for each model / validation dataset combination.

# Model to load and the validation sets to run it on
model_name = "T5_base_pt_long.amalgam"
validation_dataset_names = [
    "nq",
    "quac",
    "squad",
    "triviaqa",
]

# Length limit and batch size
max_length = 1024  # use 1024 for the long model, 512 for every other model
batch_size = 50

# Optional sample window
start_sample = None  # None -> begin at sample 0
end_sample = None  # None -> run through the end of the set

# Output handling
save_predictions = True
save_mode = 'w'  # 'w' to write, 'a' to append

### Generate Predictions

In [2]:
!pip install -q transformers

[K     |████████████████████████████████| 5.5 MB 27.7 MB/s 
[K     |████████████████████████████████| 163 kB 81.9 MB/s 
[K     |████████████████████████████████| 7.6 MB 70.6 MB/s 
[?25h

In [3]:
!pip install -q sentencepiece

[?25l[K     |▎                               | 10 kB 36.3 MB/s eta 0:00:01[K     |▌                               | 20 kB 25.5 MB/s eta 0:00:01[K     |▊                               | 30 kB 32.5 MB/s eta 0:00:01[K     |█                               | 40 kB 23.9 MB/s eta 0:00:01[K     |█▎                              | 51 kB 27.7 MB/s eta 0:00:01[K     |█▌                              | 61 kB 31.6 MB/s eta 0:00:01[K     |█▉                              | 71 kB 24.6 MB/s eta 0:00:01[K     |██                              | 81 kB 26.0 MB/s eta 0:00:01[K     |██▎                             | 92 kB 28.2 MB/s eta 0:00:01[K     |██▋                             | 102 kB 30.4 MB/s eta 0:00:01[K     |██▉                             | 112 kB 30.4 MB/s eta 0:00:01[K     |███                             | 122 kB 30.4 MB/s eta 0:00:01[K     |███▍                            | 133 kB 30.4 MB/s eta 0:00:01[K     |███▋                            | 143 kB 30.4 MB/s eta 0:

In [4]:
# Attach Google Drive; the project paths used by this notebook live under this mount point.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [5]:
# Standard library
import os
import warnings

# Numerical / tabular data
import numpy as np
import pandas as pd

# Model stack
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Hide FutureWarning messages from here on
warnings.simplefilter('ignore', category=FutureWarning)

# Colab's DataFrame formatter for displayed frames
from google.colab import data_table

data_table.enable_dataframe_formatter()

In [6]:
# Drive locations for this project, plus the tokenizer / model identifiers.

project_root = "/content/drive/MyDrive/w266 NLP Final Project/"

# Sub-folders of the project root; each one keeps its trailing slash so that
# later code can append file names directly.
dataset_root = f"{project_root}Data/"
model_root = f"{project_root}Models/"
prediction_folder = f"{project_root}Predictions/checkpoint/"

# The model is loaded from its own folder under model_root; the tokenizer
# identifier is what gets passed to T5Tokenizer.from_pretrained.
model_folder = f"{model_root}{model_name}"
tokenizer = "google/t5-v1_1-base"

In [7]:
# Load the tokenizer and the T5 model stored in model_folder, then put the model on the first GPU.

T5_tokenizer = T5Tokenizer.from_pretrained(tokenizer)

T5_model = T5ForConditionalGeneration.from_pretrained(model_folder)
# .to() returns the module itself; re-binding it keeps the cell from echoing the model repr.
T5_model = T5_model.to(torch.device('cuda:0'))

Downloading:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.86k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/605 [00:00<?, ?B/s]

In [10]:
# Text that separates the answer part of a prompt from its context part.
CONTEXT_MARKER = 'context: '
# Number of leading prompt characters dropped before the answer text. Carried over
# from the original slice; presumably the length of the task/answer prefix -- TODO confirm.
ANSWER_PREFIX_LENGTH = 26


def split_prompt(prompt):
  """Split one 'orig' prompt into its (answer, context) parts.

  The text before the first CONTEXT_MARKER, minus its first
  ANSWER_PREFIX_LENGTH characters, is the answer; everything after the first
  marker is the context. Using partition (instead of split(...)[1]) keeps a
  context that itself contains the marker intact, and yields an empty context
  rather than raising when the marker is missing.
  """
  head, _, context = prompt.partition(CONTEXT_MARKER)
  return head[ANSWER_PREFIX_LENGTH:], context


for dataset_name in validation_dataset_names:
  # The squad validation pairs live in a folder named "squad.hf".
  dataset_folder = "squad.hf" if dataset_name == "squad" else dataset_name
  validation_data_file = f"{dataset_root}{dataset_folder}/valid_pairs.csv"
  print(validation_data_file)
  validation_df = pd.read_csv(validation_data_file)
  prediction_file = f"{prediction_folder}predictions.{model_name}.{dataset_name}.csv"

  # Resolve the sample window for THIS dataset from the configured constants.
  # Local names are used so the configured start_sample / end_sample are honoured
  # and never overwritten with one dataset's size before the next dataset runs.
  n_rows = validation_df.shape[0]
  first_sample = 0 if start_sample is None else start_sample
  last_sample = n_rows if end_sample is None else min(end_sample, n_rows)

  # Take prompts and targets from the same positional window so they stay aligned
  # even when the window does not start at row 0.
  prompts = validation_df['orig'].iloc[first_sample:last_sample].to_list()
  targets = validation_df['target'].iloc[first_sample:last_sample].to_list()

  predictions = []

  print(f"Generating predictions using {dataset_name} from {first_sample} to {last_sample}:")
  for offset in range(0, len(prompts), batch_size):
    batch = prompts[offset:offset + batch_size]
    inputs = T5_tokenizer(batch, return_tensors='pt', max_length=max_length, truncation=True, padding=True)
    # Send the inputs to whichever device the model is on, and pass the attention
    # mask explicitly so padded positions are masked out during generation.
    output_ids = T5_model.generate(
        inputs['input_ids'].to(T5_model.device),
        attention_mask=inputs['attention_mask'].to(T5_model.device),
        max_length=max_length,
    )
    predictions.extend(T5_tokenizer.batch_decode(output_ids, skip_special_tokens=True))

    # Progress: absolute index of the last sample processed, with a line break every 1000.
    done = first_sample + offset + len(batch)
    print(f"{done} ", end="")
    if done % 1000 == 0: print()
  print("\nPredictions generated.")

  answers = []
  contexts = []
  for prompt in prompts:
    answer, context = split_prompt(prompt)
    answers.append(answer)
    contexts.append(context)

  df = pd.DataFrame({
      'context': contexts,
      'answer': answers,
      'target': targets,
      'prediction': predictions,
  })

  if save_predictions:
    # Create the output folder if needed.
    os.makedirs(prediction_folder, exist_ok=True)
    # When appending to an existing file, skip the header so it is not repeated mid-file.
    write_header = save_mode != 'a' or not os.path.exists(prediction_file)
    df.to_csv(prediction_file, mode=save_mode, header=write_header)
    print(f"Write: {prediction_file}")

/content/drive/MyDrive/w266 NLP Final Project/Data/nq/valid_pairs.csv
Generating predictions using nq from 0 to 2356:
50 100 150 200 250 300 350 400 450 500 550 600 650 700 750 800 850 900 950 1000 
1050 1100 1150 1200 1250 1300 1350 1400 1450 1500 1550 1600 1650 1700 1750 1800 1850 1900 1950 2000 
2050 2100 2150 2200 2250 2300 2350 2356 
Predictions generated.


ValueError: ignored