<a href="https://colab.research.google.com/github/kumudharam/Transformers/blob/main/QuestionAnsweringUsingT5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# check for the GPU provided in the runtime
!nvidia-smi

In [None]:
# using quiet method for controlling the log
# for suppressing the colored errors and warning in the terminal
!pip install --quiet transformers==4.1.1
# pytorch lightning for smoother model training and data loading
!pip install --quiet https://github.com/PyTorchLightning/pytorch-lightning/releases/download/1.2.6/pytorch-lightning-1.2.6.tar.gz 
# using HuggingFace tokenizers
!pip install --quiet tokenizers==0.9.4
# Google's sentencepiece
!pip install --quiet sentencepiece==0.1.94

In [None]:
# mostly pl is used while doing complex model training
import pytorch_lightning as pl
print(pl.__version__)

In [None]:
# argparse makes it easier to write user friendly command line interfaces
import argparse
# package for faster file name matching
import glob
# making directories for data
import os
# reading json files as the data is present in json files
import json
# time module for calculating the model runtime
import time
# Allows writing status messages to a file
import logging
# generate random float numbers uniformly
import random
# regex module for text 
import re
# module provides various functions that work on
# iterators to produce complex iterators
from itertools import chain
from string import punctuation

# pandas for data manipulation
import pandas as pd
# numpy for array operations
import numpy as np
# PyTorch
import torch
# provides various classes representing file system paths
# with appropriate semantics
from pathlib import Path
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl

# splitting the data 
from sklearn.model_selection import train_test_split
# ANSI color formatting for output in terminal
from termcolor import colored
# wrapping paragraphs into string
import textwrap

# model checkpoints in pretrained model
from pytorch_lightning.callbacks import ModelCheckpoint

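'''
AdamW - the optimizer
T5ForConditionalGeneration - T5 with a language-modeling head, used to generate the answer text
T5Tokenizer - the SentencePiece-based tokenizer that matches the T5 checkpoints
get_linear_schedule_with_warmup - learning-rate schedule with a linear warmup followed by a linear decay
'''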
from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5Tokenizer,
    get_linear_schedule_with_warmup
)

In [None]:
# Seeds all the processes including numpy torch and other imported modules.
pl.seed_everything(0)

In [None]:
# check the version provided by Lightning
import pytorch_lightning as pl
print(pl.__version__)

In [None]:
# QA dataset from https://github.com/dmis-lab/bioasq-biobert
# which is in Zip format
!gdown --id 1mxVUywvKzvA9bvrUc11RYuOTy7MYcXHF

In [None]:
# Unzipping the folder
!unzip -q bio-QA.zip

In [None]:
# let's have a look at one of the files
with Path("BioASQ/BioASQ-train-factoid-4b.json").open() as json_file:
  data = json.load(json_file)                                            

In [None]:
# Data is a dictionary
data.keys()

In [None]:
data['version']

In [None]:
# len of each file
len(data['data'])

In [None]:
# We have a list of dictionaries in the "data". We can explore the 0th element
data['data'][0].keys()

In [None]:
data['data'][0]['title']

In [None]:
len(data['data'][0]['paragraphs'])

In [None]:
questions = data['data'][0]['paragraphs']

In [None]:
# datapoint sample
questions[0]

# Function to create a pandas DataFrame of questions and answers

In [None]:
def extract_questions_and_answers(factoid_path: Path):
  """Flatten a SQuAD-style question-answering JSON file into one row per answer.

  The file is expected to hold a top-level ``data`` list. Every entry has a
  ``paragraphs`` list; every paragraph has a ``context`` string and a ``qas``
  list whose items carry a ``question`` and a list of ``answers`` (each with
  ``text`` and ``answer_start``).

  Args:
    factoid_path: Path of the JSON file to read.

  Returns:
    A ``pd.DataFrame`` with the columns ``question``, ``context``,
    ``answer_text``, ``answer_start`` and ``answer_end``. ``answer_end`` is the
    exclusive end offset of the answer inside the context. The columns are
    present even when the file contains no answers.
  """
  columns = ["question", "context", "answer_text", "answer_start", "answer_end"]

  with factoid_path.open(encoding="utf-8") as json_file:
    data = json.load(json_file)

  data_rows = []
  # Walk every entry of data['data'], not only the first one, so that files
  # holding several entries are not silently truncated.
  for entry in data['data']:
    for paragraph in entry['paragraphs']:
      context = paragraph['context']
      for question_and_answers in paragraph['qas']:
        question_text = question_and_answers['question']
        for answer in question_and_answers['answers']:
          answer_text = answer['text']
          answer_start = answer['answer_start']
          data_rows.append({
              "question": question_text,
              "context": context,
              "answer_text": answer_text,
              "answer_start": answer_start,
              # exclusive end index of the answer inside the paragraph
              "answer_end": answer_start + len(answer_text),
          })

  return pd.DataFrame(data_rows, columns=columns)

In [None]:
factoid_path = Path("BioASQ/BioASQ-train-factoid-4b.json")
extract_questions_and_answers(factoid_path).head()      

In [None]:
factoid_paths = sorted(list(Path('BioASQ/').glob('BioASQ-train-*')))
factoid_paths

In [None]:
dfs = []

for factoid_path in factoid_paths:
  df = extract_questions_and_answers(factoid_path)
  dfs.append(df)

df = pd.concat(dfs)

dfs = []

In [None]:
df.head()

In [None]:
df.shape

In [None]:
# Keep only the first row for every unique context (duplicates are detected on the "context" column alone).

df = df.drop_duplicates(subset=["context"]).reset_index(drop=True)

In [None]:
df.shape

In [None]:
len(df.question.unique())

In [None]:
len(df.context.unique())

In [None]:
sample_question = df.iloc[243]
sample_question

In [None]:
# Using termcolor to visualize the answer within the context

def color_answer(question):
  """Return the context of a row with its answer span highlighted in green.

  Args:
    question: Mapping (for example a DataFrame row) with the keys ``context``,
      ``answer_start`` and ``answer_end``. ``answer_end`` is treated as an
      exclusive offset, which is how ``extract_questions_and_answers``
      computes it (``answer_start + len(answer_text)``).

  Returns:
    The context as one string: text before the answer in white, the answer in
    green, and the text after the answer in white.
  """
  answer_start, answer_end = question["answer_start"], question["answer_end"]
  context = question['context']

  # answer_end is already exclusive, so slicing up to it (not answer_end + 1)
  # colors exactly the answer and nothing after it.
  return (
      colored(context[:answer_start], "white")
      + colored(context[answer_start:answer_end], "green")
      + colored(context[answer_end:], "white")
  )


In [None]:
print(sample_question['question'])
print()
print("Answer: ")
for wrap in textwrap.wrap(color_answer(sample_question), width = 100):
  print(wrap)

# Tokenization

In [None]:
# using the base T5 model having 222M params
MODEL_NAME ='t5-base'

In [None]:
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)

In [None]:
sample_encoding = tokenizer('is the glass half empty or half full?', 'It depends on the initial state of the glass. If the glass starts out empty and liquid is added until it is half full, it is half full. If the glass starts out full and liquid is removed until it is half empty, it is half empty.')

In [None]:
sample_encoding.keys()

In [None]:
print(sample_encoding["input_ids"])

In [None]:
print(sample_encoding["attention_mask"])

In [None]:
print(len(sample_encoding['input_ids']), len(sample_encoding['attention_mask']))

In [None]:
# Checking the decoding of the input ids

preds = [
         tokenizer.decode(input_id, skip_special_tokens=True, clean_up_tokenization_spaces=True)
         for input_id in sample_encoding['input_ids']
]

In [None]:
preds= " ".join(preds)
for wrap in textwrap.wrap(preds, width = 80):
  print(wrap)

There exists a special separator token between the question and its answers.

Checking the encoding on the sample question

In [None]:
encoding = tokenizer(
    sample_question['question'],
    sample_question['context'],
    max_length=396,
    padding='max_length',
    truncation="only_second",
    return_attention_mask=True,
    add_special_tokens=True,
    return_tensors="pt"
)

In [None]:
encoding.keys()

In [None]:
tokenizer.special_tokens_map

In [None]:
tokenizer.eos_token, tokenizer.eos_token_id
# Input id of 1 represents end of sequence token.

In [None]:
# Text representation of the input ids

tokenizer.decode(encoding['input_ids'].squeeze())

## Creating the labels for the answers

In [None]:
answer_encoding = tokenizer(
    sample_question['answer_text'],
    max_length=32,
    padding='max_length',
    truncation=True,
    return_attention_mask=True,
    add_special_tokens=True,
    return_tensors="pt"
)

In [None]:
tokenizer.decode(answer_encoding['input_ids'].squeeze())

In [None]:
labels = answer_encoding["input_ids"]
labels

The padding token ids (0) that follow the end-of-sequence token in the answer encoding have to be replaced with -100 so that the padded positions are ignored when the model computes its loss.

In [None]:
labels[labels == 0] = -100

In [None]:
labels

## To create dataset

In [None]:
class BioQADataset(Dataset):
  """Torch dataset that tokenizes question/context/answer rows on access.

  Args:
    data: DataFrame with the columns ``question``, ``context`` and
      ``answer_text``.
    tokenizer: Tokenizer used for both the source (question + context) and the
      target (answer). It must be callable like a Hugging Face tokenizer and
      expose ``pad_token_id``.
    source_max_token_len: Length the source sequence is padded/truncated to.
    target_max_token_len: Length the answer sequence is padded/truncated to.
  """

  def __init__(
      self,
      data: pd.DataFrame,
      tokenizer: "T5Tokenizer",
      source_max_token_len: int = 396,
      target_max_token_len: int = 32,
      ):
    self.data = data
    self.tokenizer = tokenizer
    self.source_max_token_len = source_max_token_len
    self.target_max_token_len = target_max_token_len

  def __len__(self):
    """Number of rows in the wrapped DataFrame."""
    return len(self.data)

  def __getitem__(self, index: int):
    """Tokenize the row at position ``index``.

    Returns:
      A dict with the raw ``question``, ``context`` and ``answer_text`` strings
      plus the flattened ``input_ids``, ``attention_mask`` and ``labels``
      tensors. Padding positions in ``labels`` are set to -100.
    """
    data_row = self.data.iloc[index]

    # Use the tokenizer handed to the constructor, not a notebook-level global,
    # so the dataset keeps working with any tokenizer instance.
    source_encoding = self.tokenizer(
      data_row['question'],
      data_row['context'],
      max_length=self.source_max_token_len,
      padding='max_length',
      truncation="only_second",  # never cut the question, only the context
      return_attention_mask=True,
      add_special_tokens=True,
      return_tensors="pt"
      )

    target_encoding = self.tokenizer(
      data_row['answer_text'],
      max_length=self.target_max_token_len,
      padding='max_length',
      truncation=True,
      return_attention_mask=True,
      add_special_tokens=True,
      return_tensors="pt"
      )

    labels = target_encoding['input_ids']
    # Mask padding in the labels; take the pad id from the tokenizer instead of
    # assuming it is 0.
    labels[labels == self.tokenizer.pad_token_id] = -100

    return dict(
        question=data_row['question'],
        context=data_row['context'],
        answer_text=data_row['answer_text'],
        input_ids=source_encoding["input_ids"].flatten(),
        attention_mask=source_encoding['attention_mask'].flatten(),
        labels=labels.flatten()
    )




In [None]:
sample_dataset = BioQADataset(df, tokenizer)

In [None]:
for data in sample_dataset:
  print("Question: ", data['question'])
  print("Answer text: ", data['answer_text'])
  print("Input_ids: ", data['input_ids'][:10])
  print("Labels: ", data['labels'][:10])
  break

## Splitting into train and validation sets

In [None]:
train_df, val_df = train_test_split(df, test_size=0.05)

In [None]:
train_df.shape,  val_df.shape

# Create pytorch lightning datamodule

In [None]:
class BioDataModule(pl.LightningDataModule):
  """Lightning data module that wraps the train and held-out DataFrames.

  Args:
    train_df: Rows used for training.
    test_df: Held-out rows; served by both the validation and the test loader.
    tokenizer: Tokenizer forwarded to ``BioQADataset``.
    batch_size: Batch size of the training and validation loaders.
    source_max_token_len: Forwarded to ``BioQADataset``.
    target_max_token_len: Forwarded to ``BioQADataset``.
    num_workers: Worker processes used by every ``DataLoader``.
  """

  def __init__(
      self,
      train_df: pd.DataFrame,
      test_df: pd.DataFrame,
      tokenizer:T5Tokenizer,
      batch_size: int = 8,
      source_max_token_len: int = 396,
      target_max_token_len: int = 32,
      num_workers: int = 4,
      ):
    super().__init__()
    self.train_df = train_df
    self.test_df = test_df
    self.tokenizer = tokenizer
    self.batch_size = batch_size
    self.source_max_token_len = source_max_token_len
    self.target_max_token_len = target_max_token_len
    self.num_workers = num_workers

  def setup(self, stage=None):
    """Build the train and held-out datasets.

    ``stage`` is accepted because Lightning passes it when it invokes this
    hook itself; both datasets are built regardless of its value, so calling
    ``setup()`` manually with no argument still works.
    """
    self.train_dataset = BioQADataset(
        self.train_df,
        self.tokenizer,
        self.source_max_token_len,
        self.target_max_token_len
        )

    self.test_dataset = BioQADataset(
        self.test_df,
        self.tokenizer,
        self.source_max_token_len,
        self.target_max_token_len
        )

  def train_dataloader(self):
    """Shuffled loader over the training dataset."""
    return DataLoader(
        self.train_dataset,
        batch_size=self.batch_size,
        shuffle=True,
        num_workers=self.num_workers
        )

  def val_dataloader(self):
    """Loader over the held-out dataset, batched like the training loader."""
    return DataLoader(
        self.test_dataset,
        batch_size=self.batch_size,
        num_workers=self.num_workers
        )

  def test_dataloader(self):
    """Loader over the held-out dataset, one example per batch."""
    return DataLoader(
        self.test_dataset,
        batch_size=1,
        num_workers=self.num_workers
        )

In [None]:
BATCH_SIZE = 4
N_EPOCHS = 6

data_module = BioDataModule(train_df, val_df, tokenizer, batch_size=BATCH_SIZE)
data_module.setup()

## Loading and finetuning the T5-base model

In [None]:
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME, return_dict = True)

In [None]:
model.config

In [None]:
# To check the translation from English to German built-in task 

input_ids_translated = tokenizer(
    "translate English to German : Oppertunity did not knock until I built a door",
    return_tensors = 'pt'
).input_ids

generated_ids = model.generate(input_ids = input_ids_translated)
generated_ids

In [None]:
pred_translated = [
         tokenizer.decode(gen_id, skip_special_tokens=True, clean_up_tokenization_spaces=True)
         for gen_id in generated_ids
]

In [None]:
pred_translated

In [None]:
"".join(pred_translated)

In [None]:
# To check the summarization built-in task

text = """summarize : Musk was born to a Canadian mother and South African father and raised in Pretoria, South Africa. 
He briefly attended the University of Pretoria before moving to Canada aged 17 to attend Queen's University. 
He transferred to the University of Pennsylvania two years later, where he received bachelor's degrees in economics and physics. 
He moved to California in 1995 to attend Stanford University but decided instead to pursue a business career, 
co-founding the web software company Zip2 with his brother Kimbal. The startup was acquired by Compaq for $307 million in 1999. 
Musk co-founded online bank X.com that same year, which merged with Confinity in 2000 to form PayPal. 
The company was bought by eBay in 2002 for $1.5 billion. In 2002, Musk founded SpaceX, an aerospace manufacturer and space transport 
services company, of which he is CEO and CTO. In 2004, he joined electric vehicle manufacturer Tesla Motors, Inc. (now Tesla, Inc.) 
as chairman and product architect, becoming its CEO in 2008. In 2006, he helped create SolarCity, a solar energy services company that 
was later acquired by Tesla and became Tesla Energy. In 2015, he co-founded OpenAI, a nonprofit research company that promotes friendly 
artificial intelligence. In 2016, he co-founded Neuralink, a neurotechnology company focused on developing brain–computer interfaces, 
and founded The Boring Company, a tunnel construction company. Musk has proposed the Hyperloop, a high-speed vactrain transportation system."""

In [None]:
input_ids_summary = tokenizer(
    text,
    return_tensors = 'pt'
).input_ids

generated_ids_summary = model.generate(input_ids = input_ids_summary)
generated_ids_summary

In [None]:
# Decode the ids generated for the summarization prompt. The original looped
# over `generated_ids`, which holds the earlier translation output.
pred_summary = [
         tokenizer.decode(gen_id, skip_special_tokens=True, clean_up_tokenization_spaces=True)
         for gen_id in generated_ids_summary
]

In [None]:
" ".join(pred_summary)

In [None]:
# Model config

model.config

In [None]:
output = model(
    input_ids = encoding['input_ids'],
    attention_mask = encoding['attention_mask'],
    labels = labels
)

In [None]:
output.logits.shape

In [None]:
output.loss

## Building the PyTorch lightning module using T5ForConditionalGeneration model

In [None]:
class BioQAModel(pl.LightningModule):
  """Lightning wrapper around ``T5ForConditionalGeneration`` for QA fine-tuning."""

  def __init__(self):
    super().__init__()
    self.model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME, return_dict=True)

  def forward(self, input_ids, attention_mask, labels=None):
    """Run the wrapped model and return ``(loss, logits)``."""
    result = self.model(input_ids, attention_mask=attention_mask, labels=labels)
    return result.loss, result.logits

  def _loss_and_logits(self, batch, metric_name):
    """Forward one batch, log its loss under ``metric_name`` and return
    ``(loss, logits, labels)``."""
    labels = batch['labels']
    loss, logits = self(batch['input_ids'], batch['attention_mask'], labels)
    self.log(metric_name, loss, prog_bar=True, logger=True)
    return loss, logits, labels

  def training_step(self, batch, batch_idx):
    """Log ``train_loss`` and return the loss together with logits and labels."""
    loss, logits, labels = self._loss_and_logits(batch, "train_loss")
    return {"loss": loss, "predictions": logits, "labels": labels}

  def validation_step(self, batch, batch_idx):
    """Log ``val_loss`` and return the loss."""
    loss, _, _ = self._loss_and_logits(batch, "val_loss")
    return loss

  def test_step(self, batch, batch_idx):
    """Log ``test_loss`` and return the loss."""
    loss, _, _ = self._loss_and_logits(batch, "test_loss")
    return loss

  def configure_optimizers(self):
    """Optimize all parameters with AdamW at a fixed learning rate of 1e-4."""
    return AdamW(self.parameters(), lr=0.0001)

In [None]:
model = BioQAModel() 

## Using trainer from pytorch lightning to finetune model using our dataset

In [None]:
model = BioQAModel()

In [None]:
# To record the best performing model using checkpoint

checkpoint_callback = ModelCheckpoint(
    dirpath="checkpoints",
    filename="best-checkpoint",
    save_top_k=1,
    verbose=True,
    monitor="val_loss",
    mode="min"
)

In [None]:
#logger = TensorBoardLogger("training-logs", name="bio-qa")

In [None]:
#logger = TensorBoardLogger("training-logs", name="bio-qa")
# Register the ModelCheckpoint through `callbacks`; passing a callback instance
# via `checkpoint_callback=` is deprecated in Lightning.
trainer = pl.Trainer(
    #logger = logger,
    callbacks=[checkpoint_callback],
    max_epochs=N_EPOCHS,
    gpus=1,
    progress_bar_refresh_rate = 30
)

## Loading Tensorboard

In [None]:
%load_ext tensorboard

In [None]:
%tensorboard --logdir ./lightning_logs

In [None]:
#!rm --rf lightning_logs

In [None]:
trainer.fit(model, data_module)

In [None]:
trainer.test()  # evaluate on the test dataloader; NOTE(review): with no arguments Lightning 1.2.x defaults to ckpt_path='best' (the best checkpoint, not the last one) — confirm for the installed version

# Predictions

In [None]:
trained_model = BioQAModel.load_from_checkpoint("checkpoints/best-checkpoint.ckpt")
trained_model.freeze() # 

## Generate answers for the questions in the validation set

In [None]:
def generate_answer(question):
  """Generate the answer text for one question/context row.

  Uses the notebook-level ``tokenizer`` to encode the row and
  ``trained_model.model`` to generate the answer.

  Args:
    question: Mapping (for example a DataFrame row) with the keys ``question``
      and ``context``.

  Returns:
    The decoded text of every generated sequence, concatenated without a
    separator.
  """
  encoded = tokenizer(
      question["question"],
      question['context'],
      max_length=396,
      padding="max_length",
      truncation="only_second",
      return_attention_mask=True,
      add_special_tokens=True,
      return_tensors="pt",
  )

  # num_beams=1 means greedy search
  generation_options = dict(
      num_beams=1,
      max_length=80,
      repetition_penalty=2.5,
      early_stopping=True,
      use_cache=True,
  )
  output_ids = trained_model.model.generate(
      input_ids=encoded["input_ids"],
      attention_mask=encoded["attention_mask"],
      **generation_options,
  )

  answer = ""
  for sequence in output_ids:
    answer += tokenizer.decode(
        sequence,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )
  return answer

In [None]:
sample_question = val_df.iloc[20]

In [None]:
sample_question["question"]

In [None]:
sample_question["answer_text"]  # Label Answer

In [None]:
generate_answer(sample_question)  # Predicted answer

In [None]:
sample_question = val_df.iloc[66]

In [None]:
sample_question["answer_text"]

In [None]:
generate_answer(sample_question)    

In [None]:
sample_question = val_df.iloc[114]

In [None]:
sample_question["answer_text"]

In [None]:
generate_answer(sample_question)

In [None]:
#mkdir zip

In [None]:
!zip -r /content.zip /content

In [None]:
from google.colab import files
files.download("/content.zip")