<a href="https://colab.research.google.com/github/kaledai069/Answer-Validity-Checker-with-Word-Vectorizer-Neural-Nets/blob/master/Alternate_Solution_Ranker_Clue_Answer_pair_ranker_with_T5_Small_(Training).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount("/content/gdrive")

Mounted at /content/gdrive


In [2]:
!pip install -q sentencepiece

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
import torch
import pandas as pd
import numpy as np

from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from transformers import T5Tokenizer, T5ForConditionalGeneration, AutoTokenizer
from torch.optim import AdamW

In [4]:
DATASET_PATH = "/content/gdrive/MyDrive/Clue-Answer Dataset/CLUE_ANSWER_DATA_VERSION_1.csv"

chunk_size = 3000000
clue_answer_df = pd.read_csv(DATASET_PATH)

In [5]:
chunked_df = clue_answer_df.sample(n = chunk_size, random_state = 69)
chunked_df = chunked_df[['clue', 'answer']].reset_index(drop = True)
chunked_df.head(5)

Unnamed: 0,clue,answer
0,"swarm, another way",warms
1,"actor b. d. of ""law & order: s.v.u.""",wong
2,data storage prefix,tera
3,most challenging,hardest
4,cause of nose-pinching,odor


In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

Using device: cuda


In [7]:
class ClueAnswerDataset(Dataset):
    def __init__(self, dataframe, tokenizer, clue_max_length = 32, answer_max_length = 32):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.clue_max_length = clue_max_length
        self.answer_max_length = answer_max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        clue = str(row['clue'])
        answer = str(row['answer'])

        inputs = self.tokenizer(
            f"clue: {clue}",
            return_tensors="pt",
            max_length = self.clue_max_length,
            truncation = True,
            padding = 'max_length'
        )

        labels = self.tokenizer(answer, return_tensors="pt", max_length = self.answer_max_length, truncation = True, padding = 'max_length')

        return {
            "input_ids": inputs["input_ids"].squeeze(),
            "attention_mask": inputs["attention_mask"].squeeze(),
            "labels": labels["input_ids"].squeeze(),
        }

BATCH_SIZE = 64

tokenizer = AutoTokenizer.from_pretrained("t5-small")
dataset = ClueAnswerDataset(chunked_df, tokenizer)
dataloader = DataLoader(dataset, batch_size = BATCH_SIZE, shuffle=True)

model = T5ForConditionalGeneration.from_pretrained('t5-small').to(device)

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
epochs = 1
learning_rate = 1e-4
optimizer = AdamW(model.parameters(), lr = learning_rate)

iterations_per_epoch = len(dataloader)
print_every = 500
total_iterations = epochs * iterations_per_epoch

for epoch in range(epochs):
    model.train()
    total_loss = 0.0

    for iteration, batch in enumerate(tqdm(dataloader, desc=f"Epoch {epoch + 1}/{epochs}", ncols = 130)):
        inputs = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**inputs)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        # print average loss every [train_evaluation] iterations
        if (iteration + 1) % print_every == 0 or iteration == iterations_per_epoch - 1:
            average_loss = total_loss / (iteration + 1)
            print(f"\nIteration {iteration + 1}/{total_iterations} - Average Loss: {average_loss:.4f}")

    # average loss over an epoch
    average_loss_epoch = total_loss / iterations_per_epoch
    print(f"Epoch {epoch + 1}/{epochs} - Average Loss for Epoch: {average_loss_epoch:.4f}")


Epoch 1/1:   1%|▊                                                                           | 500/46875 [02:16<3:31:43,  3.65it/s]


Iteration 500/46875 - Average Loss: 0.4398


Epoch 1/1:   2%|█▌                                                                         | 1000/46875 [04:32<3:27:35,  3.68it/s]


Iteration 1000/46875 - Average Loss: 0.4339


Epoch 1/1:   3%|██▍                                                                        | 1500/46875 [06:48<3:25:49,  3.67it/s]


Iteration 1500/46875 - Average Loss: 0.4298


Epoch 1/1:   4%|███▏                                                                       | 2000/46875 [09:04<3:23:02,  3.68it/s]


Iteration 2000/46875 - Average Loss: 0.4265


Epoch 1/1:   5%|████                                                                       | 2500/46875 [11:20<3:20:54,  3.68it/s]


Iteration 2500/46875 - Average Loss: 0.4238


Epoch 1/1:   6%|████▊                                                                      | 3000/46875 [13:36<3:19:04,  3.67it/s]


Iteration 3000/46875 - Average Loss: 0.4216


Epoch 1/1:   7%|█████▌                                                                     | 3500/46875 [15:52<3:16:13,  3.68it/s]


Iteration 3500/46875 - Average Loss: 0.4196


Epoch 1/1:   9%|██████▍                                                                    | 4000/46875 [18:07<3:14:15,  3.68it/s]


Iteration 4000/46875 - Average Loss: 0.4180


Epoch 1/1:  10%|███████▏                                                                   | 4500/46875 [20:23<3:12:12,  3.67it/s]


Iteration 4500/46875 - Average Loss: 0.4166


Epoch 1/1:  11%|████████                                                                   | 5000/46875 [22:39<3:10:05,  3.67it/s]


Iteration 5000/46875 - Average Loss: 0.4151


Epoch 1/1:  12%|████████▊                                                                  | 5500/46875 [24:55<3:08:04,  3.67it/s]


Iteration 5500/46875 - Average Loss: 0.4136


Epoch 1/1:  13%|█████████▌                                                                 | 6000/46875 [27:11<3:04:47,  3.69it/s]


Iteration 6000/46875 - Average Loss: 0.4123


Epoch 1/1:  14%|██████████▍                                                                | 6500/46875 [29:27<3:02:47,  3.68it/s]


Iteration 6500/46875 - Average Loss: 0.4112


Epoch 1/1:  15%|███████████▏                                                               | 7000/46875 [31:43<3:01:05,  3.67it/s]


Iteration 7000/46875 - Average Loss: 0.4101


Epoch 1/1:  16%|███████████▉                                                               | 7489/46875 [33:57<2:59:13,  3.66it/s]

In [None]:
row = clue_answer_df.iloc[0]
clue = row['clue']
answer = row['answer']
answer = 'taxpro'

input_tokens = tokenizer(clue, return_tensors="pt")['input_ids']
decode_tokens = tokenizer(answer, return_tensors = 'pt')['input_ids']
model.eval().to("cpu")
output = model(input_tokens, labels = decode_tokens)
print(answer, output.loss.item())
print(-output.loss.item() * decode_tokens.shape[1])

tax pro 6.145267486572266
-18.435802459716797


In [None]:
# the fine-tuned model
model.save_pretrained("fine_tuned_t5_small")

def score_clue_answer_pair(clue, answer, model, tokenizer):
    input_text = f"question: {clue}"
    input_ids = tokenizer.encode(input_text, return_tensors="pt")

    with torch.no_grad():
        output = model.generate(input_ids)

    decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)
    return decoded_output

clue = "Your input clue"
answer = "The expected answer"
score = score_clue_answer_pair(clue, answer, model, tokenizer)
print(f"Score: {score}")


In [None]:
# the actual log_prob for scoring the clue-answer pair
-output[0].item() * decode_tokens.shape[1]

-69.94067573547363