<a href="https://colab.research.google.com/github/joshuaalpuerto/ML-guide/blob/main/Fine_tune_T5_model_QnA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers
!pip install datasets
!pip install torch
# T5Tokenizer requires SentencePiece because SentencePiece is the underlying tokenization library used by T5.
# SentencePiece is a library for unsupervised text segmentation and provides a unified solution for both subword-level and character-level tokenization.
!pip install sentencepiece

In [None]:
from google.colab import drive
# Mount Google Drive under /content/drive; the dataset files read below live under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd

# The three question/answer files are tab-separated; read_table uses a tab delimiter by default.
df1 = pd.read_table('/content/drive/MyDrive/datasets/S08_question_answer_pairs.txt')
df2 = pd.read_table('/content/drive/MyDrive/datasets/S09_question_answer_pairs.txt')
# The S10 file is decoded as ISO-8859-1 instead of the default encoding.
df3 = pd.read_table(
    '/content/drive/MyDrive/datasets/S10_question_answer_pairs.txt',
    encoding='ISO-8859-1',
)

In [None]:
# Stack the three yearly frames into one.
# DataFrame.append is deprecated (and removed in pandas 2.0); pd.concat is the
# supported equivalent. Each frame keeps its original row labels, as before.
data = pd.concat([df1, df2, df3])
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3998 entries, 0 to 1457
Data columns (total 6 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   ArticleTitle              3998 non-null   object
 1   Question                  3961 non-null   object
 2   Answer                    3422 non-null   object
 3   DifficultyFromQuestioner  3043 non-null   object
 4   DifficultyFromAnswerer    3418 non-null   object
 5   ArticleFile               3996 non-null   object
dtypes: object(6)
memory usage: 218.6+ KB


  data = df1.append([df2,df3])


In [None]:
# Data clean-up, first pass: discard every row that has a missing value in any
# column, then keep only the first row seen for each distinct question.
# Later cells lowercase the answers, trim surrounding periods from them, trim
# whitespace from the questions, and keep an untouched copy of each question
# so results can be printed next to the original wording.
data = data.dropna().drop_duplicates(subset='Question')
data.shape

(1525, 6)

In [None]:
# Normalise answers: lowercase them and trim leading/trailing periods.
# The vectorised .str accessor replaces the row-wise .apply(lambda ...).
data['Answer'] = data['Answer'].str.lower().str.strip(".")
# Trim surrounding whitespace from the questions.
data['Question'] = data['Question'].str.strip()
data.head()

Unnamed: 0,ArticleTitle,Question,Answer,DifficultyFromQuestioner,DifficultyFromAnswerer,ArticleFile
0,Abraham_Lincoln,Was Abraham Lincoln the sixteenth President of...,yes,easy,easy,S08_set3_a4
2,Abraham_Lincoln,Did Lincoln sign the National Banking Act of 1...,yes,easy,medium,S08_set3_a4
4,Abraham_Lincoln,Did his mother die of pneumonia?,no,easy,medium,S08_set3_a4
6,Abraham_Lincoln,How many long was Lincoln's formal education?,18 months,medium,easy,S08_set3_a4
8,Abraham_Lincoln,When did Lincoln begin his political career?,1832,medium,easy,S08_set3_a4


In [None]:
# Article titles use underscores in place of spaces; restore the spaces.
data['ArticleTitle'] = data['ArticleTitle'].str.replace('_', ' ', regex=False)
# Keep the question exactly as asked so results can be shown against it later.
data['QuestionOriginal'] = data['Question']
# Prefix the question with its article title to give the model some topical context.
# The answer is deliberately NOT appended here: it is the training target, and
# placing it in the model input would leak the label into the features.
data['Question'] = data['ArticleTitle'] + " " + data['Question']
data = data[['QuestionOriginal', 'Question', 'Answer']]
data.head()

Unnamed: 0,QuestionOriginal,Question,Answer
0,Was Abraham Lincoln the sixteenth President of...,Abraham Lincoln Was Abraham Lincoln the sixtee...,yes
2,Did Lincoln sign the National Banking Act of 1...,Abraham Lincoln Did Lincoln sign the National ...,yes
4,Did his mother die of pneumonia?,Abraham Lincoln Did his mother die of pneumoni...,no
6,How many long was Lincoln's formal education?,Abraham Lincoln How many long was Lincoln's fo...,18 months
8,When did Lincoln begin his political career?,Abraham Lincoln When did Lincoln begin his pol...,1832


In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load the tokenizer and the seq2seq model from the same pretrained checkpoint.
checkpoint = 'google/flan-t5-base'
tokenizer = T5Tokenizer.from_pretrained(checkpoint)
model = T5ForConditionalGeneration.from_pretrained(checkpoint)

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

# Load a DistilBERT extractive-QA checkpoint under its own names.
# It must not rebind `tokenizer` / `model`: the rest of the notebook feeds
# decoder inputs to the model and calls `model.generate(...)`, which needs the
# T5 sequence-to-sequence model loaded earlier, not an extractive QA head.
extractive_qa_checkpoint = "distilbert-base-cased-distilled-squad"

# Tokenizer matching the extractive-QA checkpoint
extractive_qa_tokenizer = AutoTokenizer.from_pretrained(extractive_qa_checkpoint)

# Extractive question-answering model
extractive_qa_model = AutoModelForQuestionAnswering.from_pretrained(extractive_qa_checkpoint)


Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/473 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/261M [00:00<?, ?B/s]

In [None]:
# Move the model to the CUDA device; this call requires a GPU runtime.
model = model.to('cuda')

In [None]:
import torch
class QADataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes (question, answer) pairs for T5-style seq2seq training.

    Every item is a dict of 1-D tensors, each of length ``max_len``, with the keys
    ``input_ids``, ``attention_mask``, ``decoder_input_ids``,
    ``decoder_attention_mask`` and ``labels``.

    Args:
        questions: Sequence of question strings, indexable by integer position.
        answers: Sequence of answer strings aligned with ``questions``.
        tokenizer: Callable Hugging Face-style tokenizer that returns
            ``input_ids`` and ``attention_mask`` and exposes ``pad_token_id``.
        max_len: Length every encoded sequence is padded or truncated to.
    """

    # Label value skipped by the cross-entropy loss, so padding does not contribute to it.
    IGNORE_INDEX = -100

    def __init__(self, questions, answers, tokenizer, max_len):
        self.questions = questions
        self.answers = answers
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of (question, answer) pairs."""
        return len(self.questions)

    def _encode(self, text):
        """Tokenize ``text`` and pad/truncate it to ``max_len``; tensors have shape (1, max_len)."""
        return self.tokenizer(text,
                              max_length=self.max_len,
                              padding="max_length",
                              truncation=True,
                              return_tensors="pt")

    def __getitem__(self, idx):
        """Return the encoded model inputs and training targets for pair ``idx``."""
        question = self.questions[idx]
        answer = self.answers[idx]

        # Use the same prompt as inference ("question: ... answer: ").
        # The answer itself must not appear in the encoder input: it is the
        # target, and the model would otherwise only learn to copy it.
        input_text = "question: " + question + " answer: "
        source = self._encode(input_text)
        target = self._encode(answer)

        # Drop the leading batch dimension of size 1 added by return_tensors="pt".
        input_ids = source["input_ids"].squeeze(0)
        attention_mask = source["attention_mask"].squeeze(0)
        target_ids = target["input_ids"].squeeze(0)
        target_mask = target["attention_mask"].squeeze(0)

        # Teacher forcing: the decoder is fed the target shifted one step to the
        # right, beginning with the decoder start token (T5 uses its pad token
        # for this), so that position i is trained to predict target token i.
        # e.g. target [4, 138, 87, 1, 0] -> decoder input [0, 4, 138, 87, 1]
        start_token = target_ids.new_full((1,), self.tokenizer.pad_token_id)
        decoder_input_ids = torch.cat([start_token, target_ids[:-1]])

        # Labels are the unshifted target; padded positions are replaced with
        # IGNORE_INDEX so the loss is computed on real answer tokens only.
        labels = target_ids.masked_fill(target_mask == 0, self.IGNORE_INDEX)

        return {"input_ids": input_ids,
                # Masks come straight from the tokenizer instead of being
                # re-derived by comparing token ids with 0.
                "attention_mask": attention_mask,
                "decoder_input_ids": decoder_input_ids,
                # After the right shift, the first n decoder positions are the
                # meaningful ones (n = number of real target tokens), which is
                # exactly the target's own attention mask.
                "decoder_attention_mask": target_mask,
                "labels": labels}

In [None]:
from sklearn.model_selection import train_test_split

# Pull the model inputs and their targets out of the frame as plain Python lists.
input_sequences = data['Question'].tolist()
target_sequences = data['Answer'].tolist()

# Hold out 20% of the pairs for validation; the fixed seed keeps the split reproducible.
(train_questions, val_questions,
 train_answers, val_answers) = train_test_split(input_sequences,
                                                target_sequences,
                                                test_size=0.2,
                                                random_state=42)

# Wrap each split in a QADataset that pads/truncates every sequence to 512 tokens.
train_dataset = QADataset(train_questions, train_answers, tokenizer, max_len=512)
val_dataset = QADataset(val_questions, val_answers, tokenizer, max_len=512)

In [None]:
from transformers import Trainer, TrainingArguments

# Hyper-parameters and bookkeeping options for the training run.
training_args = TrainingArguments(
    output_dir='./results',          # where checkpoints and the trained model are written
    num_train_epochs=8,              # full passes over the training set
    per_device_train_batch_size=8,   # training examples per batch on each device
    per_device_eval_batch_size=8,    # evaluation examples per batch on each device
    warmup_steps=500,                # steps over which the learning rate ramps up linearly from 0
    weight_decay=0.01,               # weight-decay coefficient applied by the optimizer
    logging_dir='./logs',            # where training logs are written
    logging_steps=100,               # log the training loss every 100 steps
    evaluation_strategy="steps",     # evaluate every `eval_steps` steps ("epoch" would evaluate once per epoch)
    save_strategy="steps",           # checkpoint every `save_steps` steps ("epoch" would save once per epoch)
    save_steps=500,                  # steps between checkpoints
    eval_steps=100,                  # steps between evaluations on the validation set
    learning_rate=3e-4,              # optimizer learning rate
    fp16=True                        # half-precision mixed-precision training for speed and lower memory use
)


def _stack_batch(batch):
    """Collate a list of per-example tensor dicts into one dict of batched tensors.

    torch.stack joins the tensors along a NEW dimension (dim=0 by default):
      torch.stack([tensor([1, 2]), tensor([3, 4]), tensor([5, 6])], dim=0)
        -> tensor([[1, 2], [3, 4], [5, 6]])
      torch.stack([tensor([1, 2]), tensor([3, 4]), tensor([5, 6])], dim=1)
        -> tensor([[1, 3, 5], [2, 4, 6]])
    """
    fields = ('input_ids', 'attention_mask', 'decoder_input_ids',
              'decoder_attention_mask', 'labels')
    return {field: torch.stack([example[field] for example in batch])
            for field in fields}


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # The collator turns a list of dataset items into the keyword tensors
    # passed to the model's forward() during training and evaluation.
    data_collator=_stack_batch
)

In [None]:
# Run the fine-tuning loop configured on `trainer`; the call's return value is shown as the cell output.
trainer.train()



KeyError: ignored

In [None]:
def generate_answers(question, max_length=1024, num_beams=5):
    """Generate an answer string for ``question`` with the notebook's global model.

    Args:
        question: Question text; it is wrapped in the "question: ... answer: " prompt.
        max_length: Upper bound on the generated sequence length passed to ``generate``.
        num_beams: Beam-search width passed to ``generate``.

    Returns:
        The decoded text of the top generated sequence, with special tokens removed.
    """
    input_text = "question: " + question + " answer: "
    # Place the inputs on whichever device the model currently lives on instead
    # of assuming CUDA, so the function also works after the model has been moved.
    input_ids = tokenizer.encode(input_text, return_tensors="pt").to(model.device)
    output = model.generate(input_ids=input_ids,
                            max_length=max_length,
                            num_beams=num_beams,
                            early_stopping=True)
    # output[0] is the first returned sequence for the single prompt.
    generated_answer = tokenizer.decode(output[0], skip_special_tokens=True)
    return generated_answer

In [None]:
# Smoke-test generation on an arbitrary string and print the decoded output.
generated_answer = generate_answers("Not good results at all")
print(generated_answer)

duplicate
