<a href="https://colab.research.google.com/github/jeanlucjackson/w266_final_project/blob/main/code/inference/awesome_BART_pt_inference.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Generate Predictions From An Awesome Validation Dataset

This notebook assumes a BART PyTorch model.

Setting the constants in the next cell should be all that is necessary to generate predictions for the validation sets.

In [1]:
# Run configuration: edit the values below for every model / validation dataset combination.

# --- What to run ---
model_name = "bart_base_pt_long.amalgam"
validation_dataset_names = ["nq", "quac", "squad", "triviaqa"]

# --- Output handling ---
save_predictions = True
save_mode = "w"  # "w" = write, "a" = append

# --- Tokenization ---
max_input_length = 1024

# --- Generation settings ---
num_beams = 4
no_repeat_ngram_size = 3
min_target_length = 1
max_target_length = 50
early_stopping = True

# --- Batching ---
batch_size = 50

# --- Sample window ---
# start_sample: None means start at sample 0.
# end_sample: None means run through the end of the set.
start_sample, end_sample = None, None

### Generate Predictions

In [2]:
!pip install -q transformers

In [3]:
!pip install -q sentencepiece

In [4]:
# Uncomment the two lines below when running on Google Colab to mount Google Drive.
#from google.colab import drive
#drive.mount('/content/drive')

In [5]:
import os
import numpy as np
import pandas as pd

import torch
from transformers import BartTokenizer, BartForConditionalGeneration

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

#from google.colab import data_table
#data_table.enable_dataframe_formatter()

2022-11-22 20:54:09.348610: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.1


In [6]:
# Some important file locations and constants

# Root folder that holds the Data/, Models/ and Predictions/ subfolders.
# Set the W266_PROJECT_ROOT environment variable to point somewhere else without
# editing this cell; when it is unset the original local path below is used.
#project_root = "/content/drive/MyDrive/w266 NLP Final Project/"
project_root = os.environ.get("W266_PROJECT_ROOT",
                              "/home/localadmin/Documents/w266_NLP_Final_Project/")
# The paths below are built by plain concatenation, so make sure the root ends
# with a path separator (a no-op for the default value).
project_root = os.path.join(project_root, "")

dataset_root = project_root + "Data/"
model_root = project_root + "Models/"
prediction_folder = project_root + "Predictions/checkpoint/"

# Name of the pretrained tokenizer to load, and the folder of the model to load.
tokenizer = "facebook/bart-base"
model_folder = model_root + model_name

In [7]:
# Get the model and tokenizer

bart_tokenizer = BartTokenizer.from_pretrained(tokenizer)
bart_model = BartForConditionalGeneration.from_pretrained(model_folder)

# Prefer the first GPU, but fall back to the CPU so this cell does not crash
# on a machine without CUDA.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
# Assigning the result keeps the (very long) model repr out of the cell output.
bart_model = bart_model.to(device)

In [8]:
# For every validation dataset: read the source/target pairs, generate a
# prediction for each source in batches, and optionally write the results to a CSV.

# Send inputs to whichever device the model is on instead of assuming CUDA.
model_device = bart_model.device

for dataset_name in validation_dataset_names:
  # SQuAD is stored in a "squad.hf" folder; every other dataset uses its own name.
  dataset_folder = "squad.hf" if dataset_name == "squad" else dataset_name
  validation_data_file = f"{dataset_root}{dataset_folder}/bart_valid_pairs.csv"
  print(validation_data_file)
  validation_df = pd.read_csv(validation_data_file)
  prediction_file = f"{prediction_folder}predictions.{model_name}.{dataset_name}.csv"

  # Resolve the sample window for THIS dataset into local names. The configured
  # start_sample / end_sample are left untouched so that a None ("use the whole
  # set") is re-resolved against each dataset's own length.
  num_rows = validation_df.shape[0]
  first_sample = 0 if start_sample is None else start_sample
  last_sample = num_rows if end_sample is None else min(end_sample, num_rows)

  # Take sources and targets from the same positional window so they stay aligned
  # with the predictions even when the window does not start at row 0.
  sources = validation_df['orig'].iloc[first_sample:last_sample].to_list()
  targets = validation_df['target'].iloc[first_sample:last_sample].to_list()

  predictions = []

  print(f"Generating predictions using {dataset_name} from {first_sample} to {last_sample}:")
  for start in range(first_sample, last_sample, batch_size):
    to = min(last_sample, start + batch_size)
    # `sources` is already offset by first_sample, so shift the batch bounds.
    batch = sources[start - first_sample:to - first_sample]
    inputs = bart_tokenizer(batch, return_tensors='pt', max_length=max_input_length, truncation=True, padding=True)
    # Batches are padded, so pass the attention mask explicitly rather than
    # relying on generate() to infer it from the pad tokens.
    output_ids = bart_model.generate(inputs['input_ids'].to(model_device),
                                     attention_mask=inputs['attention_mask'].to(model_device),
                                     min_length=min_target_length,
                                     max_length=max_target_length,
                                     num_beams=num_beams,
                                     no_repeat_ngram_size=no_repeat_ngram_size,
                                     early_stopping=early_stopping)
    prediction_batch = bart_tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    predictions.extend(prediction_batch)
    # Progress counter: running sample count, with a line break every 1000 samples.
    print (f"{to} ", end="")
    if to%1000 == 0: print()
  print("\nPredictions generated.")

  # Each source string is split on '</s>': the part before it is stored as the
  # answer and the part after it as the context.
  df = pd.DataFrame()
  df['context'] = [source.split('</s>')[1] for source in sources]
  df['answer'] = [source.split('</s>')[0] for source in sources]
  df['target'] = targets
  df['prediction'] = predictions

  if save_predictions:
    # Create the output folder on first use so the write cannot fail on a missing directory.
    os.makedirs(prediction_folder, exist_ok=True)
    df.to_csv(prediction_file, mode=save_mode)
    print(f"Write: {prediction_file}")

/home/localadmin/Documents/w266_NLP_Final_Project/Data/nq/bart_valid_pairs.csv
Generating predictions using nq from 0 to 2356:
50 100 150 200 250 300 350 400 450 500 550 600 650 700 750 800 850 900 950 1000 
1050 1100 1150 1200 1250 1300 1350 1400 1450 1500 1550 1600 1650 1700 1750 1800 1850 1900 1950 2000 
2050 2100 2150 2200 2250 2300 2350 2356 
Predictions generated.
Write: /home/localadmin/Documents/w266_NLP_Final_Project/Predictions/checkpoint/predictions.bart_base_pt_long.amalgam.nq.csv
/home/localadmin/Documents/w266_NLP_Final_Project/Data/quac/bart_valid_pairs.csv
Generating predictions using quac from 0 to 5868:
50 100 150 200 250 300 350 400 450 500 550 600 650 700 750 800 850 900 950 1000 
1050 1100 1150 1200 1250 1300 1350 1400 1450 1500 1550 1600 1650 1700 1750 1800 1850 1900 1950 2000 
2050 2100 2150 2200 2250 2300 2350 2400 2450 2500 2550 2600 2650 2700 2750 2800 2850 2900 2950 3000 
3050 3100 3150 3200 3250 3300 3350 3400 3450 3500 3550 3600 3650 3700 3750 3800 3850 390