# Finetuning GPT-2 on custom dataset with Data Science knowledge base

In this notebook, we first investigate how GPT-2, a 124M parameter generative model by OpenAI, performs on a curated dataset for a question and answer task. 

As Phase 1, we load the dataset and the GPT-2 inference pipeline, and prompt the model with a few questions whose reference answers are provided in the dataset.

As Phase 2, we instruction fine-tune the GPT-2 model, updating its weights so that it learns the specific knowledge in the dataset, and then re-test it to check the quality of the responses.

In [None]:
import torch 
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# Phase 1: Test whether the pre-trained GPT-2 is good enough

In [None]:
# Load the inference pipeline pertaining to GPT-2
# This wraps the pre-trained (not yet fine-tuned) checkpoint in a
# text-generation pipeline and places it on the `device` chosen in the first cell.
from transformers import pipeline, set_seed
generate = pipeline("text-generation", model="openai-community/gpt2",
                    clean_up_tokenization_spaces=True,
                    device=device)

In [None]:
# Load the dataset
# Only the 'train' split is requested; the next cell carves a held-out test
# split out of it.
from datasets import load_dataset
dataset = load_dataset("team-bay/data-science-qa",split='train')

In [None]:
# Hold out 30% of the examples for evaluation. The seed pins the shuffle so the
# split (and therefore every downstream number) is the same on each fresh run.
# NOTE: this rebinds `dataset`; re-run the loading cell before re-running this one.
dataset=dataset.train_test_split(test_size=0.3, seed=42)

In [None]:
# Bare last expression: the notebook renders the repr of the split result.
dataset

In [None]:
# Inspect the first five question/answer pairs in the dataset
# Slice once instead of re-reading both full columns on every iteration; the
# slice is a dict mapping column name -> list of (at most) five values.
preview = dataset['train'][:5]
for question, answer in zip(preview['question'], preview['answer']):
    print(f"Q:{question} \nA:{answer}")

In [None]:
# Seed the RNGs so the sampled outputs are repeatable, then ask the pre-trained
# pipeline for 5 continuations of the prompt (max_length=30 for each).
set_seed(42)
generate("what is Spectral Analysis?", max_length=30, num_return_sequences=5) 

Hmmm! The responses are pretty unsatisfactory. Let's try educating our model with Supervised Fine-Tuning, specifically Instruction Fine-Tuning.

In [None]:
import nltk
import evaluate
import numpy as np
from datasets import Dataset, DatasetDict, load_dataset
from transformers import Trainer, TrainingArguments
from transformers import AutoTokenizer,AutoModelForCausalLM
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import pandas as pd

In [None]:
import warnings
from transformers import logging

# Quieten the notebook output: transformers only reports error-level messages,
# and Python warnings are ignored from here on.
logging.set_verbosity_error()
warnings.filterwarnings('ignore')

## Load model

In [None]:
# Load the pre-trained GPT-2 checkpoint with its language-modelling head.
# `model_name` is reused when loading the matching tokenizer.
model_name="openai-community/gpt2"
model = GPT2LMHeadModel.from_pretrained(model_name)

## Create a subset of our dataset
To facilitate some quick prototyping, we take a few sample pairs from the dataset and attempt to fine-tune our model to see if it works. If so, we will then courageously fine-tune it on the full dataset.

## Constructing tokenizer
In this section, we will load the tokenizer specific to GPT-2. 
We will then tokenize our sub-dataset and split it into training and testing datasets.

In [None]:
# Load the tokenizer
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
# GPT-2 ships without a padding token. Register EOS as the pad token at load
# time so padding, collation and metric decoding work regardless of whether
# the tokenization step actually runs or is served from the datasets cache.
tokenizer.pad_token = tokenizer.eos_token


In [None]:
# Define the preprocessing function

def tokenize_function(examples,max_length=512):
    """Turn a batch of question/answer pairs into causal-LM training features.

    Every pair is rendered as ONE sequence: the text "Question: <question>",
    a newline, "Answer: <answer>", followed by the EOS token. A causal LM
    predicts token t+1 from tokens up to t, so the labels must be the same
    token sequence as the inputs; putting the question in `input_ids` and the
    answer in `labels` would pair unrelated positions.

    Args:
        examples: batch dict from `datasets.map(batched=True)` with
            "question" and "answer" lists of strings (any batch size).
        max_length: sequences are truncated / padded to exactly this many tokens.

    Returns:
        The tokenizer output ("input_ids", "attention_mask") plus "labels",
        a copy of "input_ids" in which padding positions are set to -100 so
        they are ignored by the loss.
    """
    # GPT-2 has no dedicated pad token; reuse EOS for padding.
    tokenizer.pad_token = tokenizer.eos_token

    texts = [
        "Question: " + question.strip() + "\nAnswer: " + answer.strip() + tokenizer.eos_token
        for question, answer in zip(examples["question"], examples["answer"])
    ]
    tokenized_inputs = tokenizer(texts,
                                 padding='max_length',truncation=True,
                                 max_length=max_length
                                 )
    # Mask by attention_mask rather than by token id: pad and EOS share an id,
    # and the genuine end-of-answer EOS must still contribute to the loss.
    tokenized_inputs['labels'] = [
        [token if attended else -100 for token, attended in zip(ids, mask)]
        for ids, mask in zip(tokenized_inputs['input_ids'],
                             tokenized_inputs['attention_mask'])
    ]
    return tokenized_inputs

In [None]:
# Tokenize every split of `dataset`. `max_length=1024` overrides the function's
# default of 512, and the function is invoked on one example at a time
# (batched=True with batch_size=1).
tokenized_dataset=dataset.map(tokenize_function,fn_kwargs={'max_length':1024},
    batched=True,batch_size=1)

In [None]:
# Drop the raw text columns so only model features remain. `remove_columns`
# returns a new object instead of modifying in place, so the result has to be
# assigned back. Filtering on the existing columns keeps the cell re-runnable.
text_columns = ['type', 'question', 'answer']
tokenized_dataset = tokenized_dataset.remove_columns(
    [col for col in text_columns if col in tokenized_dataset['train'].column_names]
)
tokenized_dataset

In [None]:
# Set up Rouge score for evaluation
# nltk.sent_tokenize needs the Punkt sentence-tokenizer data. Older NLTK
# releases look up the "punkt" resource and newer ones "punkt_tab", so fetch
# both to stay compatible with either.
for resource in ("punkt", "punkt_tab"):
    nltk.download(resource, quiet=True)
metric = evaluate.load("rouge")

In [None]:
def compute_metrics(eval_preds):
    """Compute ROUGE between decoded predictions and decoded labels.

    Args:
        eval_preds: `(predictions, labels)` as handed over by `Trainer`.
            `predictions` may be token ids, raw logits with a trailing
            vocabulary axis, or a tuple whose first element is one of those.
            `labels` are token ids with -100 marking ignored positions.

    Returns:
        The dict produced by the ROUGE metric's `compute`.
    """
    preds, labels = eval_preds

    # Some models return a tuple of outputs; the predictions come first.
    if isinstance(preds, tuple):
        preds = preds[0]
    preds = np.asarray(preds)
    labels = np.asarray(labels)

    # Raw logits carry an extra vocabulary axis: reduce them to token ids.
    if preds.ndim == labels.ndim + 1:
        preds = preds.argmax(axis=-1)

    # For a causal LM the output at position t is the guess for token t+1, so
    # line predictions up with the tokens they are actually predicting.
    if preds.shape == labels.shape and labels.shape[-1] > 1:
        preds = preds[..., :-1]
        labels = labels[..., 1:]

    # The pad id can be unset on a GPT-2 tokenizer; EOS is the stand-in.
    pad_id = tokenizer.pad_token_id
    if pad_id is None:
        pad_id = tokenizer.eos_token_id

    # decode preds and labels
    # Negative ids (-100 padding) cannot be decoded, and positions the loss
    # ignores should not contribute text to either side of the comparison.
    preds = np.where(preds < 0, pad_id, preds)
    if preds.shape == labels.shape:
        preds = np.where(labels != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # rougeLSum expects newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    return result

In [None]:
# Global Parameters
# These constants are consumed by the TrainingArguments cell further down.
L_RATE = 3e-4                # learning_rate
BATCH_SIZE = 4               # per_device_train_batch_size
PER_DEVICE_EVAL_BATCH = 1    # per_device_eval_batch_size
WEIGHT_DECAY = 0.01          # weight_decay
SAVE_TOTAL_LIM = 2           # save_total_limit
NUM_EPOCHS = 5               # num_train_epochs

In [None]:
# Collator that assembles tokenized examples into batches using `tokenizer`
# and `model`. It only takes effect when handed to the Trainer through its
# `data_collator` argument.
from transformers import DataCollatorForSeq2Seq
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [None]:
# Set up training arguments
# Values come from the "Global Parameters" cell. No save strategy is given, so
# the library default for checkpoint frequency applies.
training_args = TrainingArguments(
    output_dir="./results",             # checkpoints are written under this folder
    eval_strategy="epoch",              # evaluate once per epoch
    learning_rate=L_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH,
    weight_decay=WEIGHT_DECAY,
    save_total_limit=SAVE_TOTAL_LIM,    # cap on the number of checkpoints kept
    num_train_epochs=NUM_EPOCHS,
    push_to_hub=False                   # keep everything local
)

In [None]:

# Set up trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
    # Hand over the collator built earlier; without this argument the Trainer
    # falls back to its own default and `data_collator` is never used.
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [None]:
# One optimizer step consumes one batch, and the last (possibly partial) batch
# of an epoch still counts as a step, so the per-epoch count is rounded up.
# Assumes a single device and no gradient accumulation.
steps_per_epoch = (len(tokenized_dataset['train']) + BATCH_SIZE - 1) // BATCH_SIZE
print(f"Number of training iterations: {NUM_EPOCHS * steps_per_epoch}")

In [None]:
# Trigger the model training
# Runs the fine-tuning loop configured by `training_args`; checkpoints are
# written under its output_dir ("./results").
trainer.train()

In [None]:
# Load the fine-tuned weights from the most recent checkpoint under the
# training output directory. The step number in a checkpoint folder name
# depends on dataset size, batch size and epoch count, so it is looked up
# rather than hard-coded.
from transformers.trainer_utils import get_last_checkpoint

last_checkpoint = get_last_checkpoint(training_args.output_dir)
if last_checkpoint is None:
    raise FileNotFoundError(
        f"No checkpoint found in {training_args.output_dir!r}; run the training cell first."
    )
ft_model=GPT2LMHeadModel.from_pretrained(last_checkpoint)

In [None]:
def inference(text, model, tokenizer, max_input_tokens=1000, max_output_tokens=100):
    """Generate a continuation of `text` and return only the newly generated part.

    Args:
        text: prompt string.
        model: causal LM exposing `.device` and `.generate(...)`.
        tokenizer: tokenizer matching `model`.
        max_input_tokens: the prompt is truncated to at most this many tokens.
        max_output_tokens: upper bound on the number of NEW tokens generated
            (the prompt length does not count against it).

    Returns:
        The decoded continuation, without the prompt and without special tokens.
    """
    # Tokenize. Calling the tokenizer (instead of `.encode`) also yields the
    # attention mask that `generate` needs.
    encoded = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=max_input_tokens
    )

    # Generate
    device = model.device
    input_ids = encoded["input_ids"].to(device)
    attention_mask = encoded["attention_mask"].to(device)
    generated_tokens_with_prompt = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        # `max_length` would include the prompt tokens; `max_new_tokens` bounds
        # only the generated part, which is what the parameter name promises.
        max_new_tokens=max_output_tokens,
        # GPT-2 has no pad token; EOS is the conventional stand-in.
        pad_token_id=tokenizer.eos_token_id
    )

    # Strip the prompt at the token level. Slicing the decoded string by
    # len(text) breaks whenever the prompt was truncated or decodes to a
    # slightly different string than the one passed in.
    prompt_length = input_ids.shape[1]
    answer_tokens = generated_tokens_with_prompt[0][prompt_length:]

    # Decode
    generated_text_answer = tokenizer.decode(answer_tokens, skip_special_tokens=True)

    return generated_text_answer

In [None]:
# Ask the fine-tuned model the same question that was put to the base model.
inference(
    text="Question: what is Spectral Analysis?",
    model=ft_model,
    tokenizer=tokenizer,
)