<a href="https://colab.research.google.com/github/kaledai069/Answer-Validity-Checker-with-Word-Vectorizer-Neural-Nets/blob/master/Alternate_Solution_Ranker_Clue_Answer_pair_ranker_with_T5_Small_(Training).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount("/content/gdrive")

Mounted at /content/gdrive


In [2]:
!pip install -q sentencepiece

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.3 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.3 MB[0m [31m841.5 kB/s[0m eta [36m0:00:02[0m[2K     [91m━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.1/1.3 MB[0m [31m1.3 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.2/1.3 MB[0m [31m2.5 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━[0m [32m0.6/1.3 MB[0m [31m4.3 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.3/1.3 MB[0m [31m7.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [9]:
import torch
import pandas as pd
import numpy as np

from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from transformers import T5Tokenizer, T5ForConditionalGeneration, AutoTokenizer
from torch.optim import AdamW

In [4]:
DATASET_PATH = "/content/gdrive/MyDrive/Clue-Answer Dataset/CLUE_ANSWER_DATA_VERSION_1.csv"

chunk_size = 50000
clue_answer_df = pd.read_csv(DATASET_PATH)

In [5]:
chunked_df = clue_answer_df.sample(n = chunk_size, random_state = 69)
chunked_df = chunked_df[['clue', 'answer']].reset_index(drop = True)
chunked_df.head(5)

Unnamed: 0,clue,answer
0,"swarm, another way",warms
1,"actor b. d. of ""law & order: s.v.u.""",wong
2,data storage prefix,tera
3,most challenging,hardest
4,cause of nose-pinching,odor


In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

Using device: cuda


In [10]:
class ClueAnswerDataset(Dataset):
    def __init__(self, dataframe, tokenizer, clue_max_length = 64, answer_max_length = 32):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.clue_max_length = clue_max_length
        self.answer_max_length = answer_max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        clue = str(row['clue'])
        answer = str(row['answer'])

        inputs = self.tokenizer(
            f"question: {clue}",
            return_tensors="pt",
            max_length = self.clue_max_length,
            truncation = True,
            padding = 'max_length'
        )

        labels = self.tokenizer(answer, return_tensors="pt", max_length = self.answer_max_length, truncation = True, padding = 'max_length')

        return {
            "input_ids": inputs["input_ids"].squeeze(),
            "attention_mask": inputs["attention_mask"].squeeze(),
            "labels": labels["input_ids"].squeeze(),
        }

BATCH_SIZE = 64

tokenizer = AutoTokenizer.from_pretrained("t5-small")
dataset = ClueAnswerDataset(chunked_df, tokenizer)
dataloader = DataLoader(dataset, batch_size = BATCH_SIZE, shuffle=True)

model = T5ForConditionalGeneration.from_pretrained('t5-small').to(device)

epochs = 4
learning_rate = 1e-4
optimizer = AdamW(model.parameters(), lr=learning_rate)

for epoch in range(epochs):
    model.train()
    total_loss = 0.0

    for batch in tqdm(dataloader, desc = f"Epoch {epoch + 1}/{epochs}", ncols = 130):
        inputs = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**inputs)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    average_loss = total_loss / len(dataloader)
    print(f"Epoch {epoch + 1}/{epochs} - Average Loss: {average_loss:.4f}")

Epoch 1/4: 100%|████████████████████████████████████████████████████████████████████████████████| 782/782 [04:37<00:00,  2.82it/s]


Epoch 1/4 - Average Loss: 0.6745


Epoch 2/4: 100%|████████████████████████████████████████████████████████████████████████████████| 782/782 [04:37<00:00,  2.82it/s]


Epoch 2/4 - Average Loss: 0.4456


Epoch 3/4: 100%|████████████████████████████████████████████████████████████████████████████████| 782/782 [04:37<00:00,  2.82it/s]


Epoch 3/4 - Average Loss: 0.4309


Epoch 4/4: 100%|████████████████████████████████████████████████████████████████████████████████| 782/782 [04:37<00:00,  2.82it/s]

Epoch 4/4 - Average Loss: 0.4217





In [22]:
row = clue_answer_df.iloc[0]
clue = row['clue']
answer = row['answer']
answer = 'taxpro'

input_tokens = tokenizer(clue, return_tensors="pt")['input_ids']
decode_tokens = tokenizer(answer, return_tensors = 'pt')['input_ids']
model.eval().to("cpu")
output = model(input_tokens, labels = decode_tokens)
print(answer, output.loss.item())
print(-output.loss.item() * decode_tokens.shape[1])

tax pro 6.145267486572266
-18.435802459716797


In [None]:
# the fine-tuned model
model.save_pretrained("fine_tuned_t5_small")

def score_clue_answer_pair(clue, answer, model, tokenizer):
    input_text = f"question: {clue}"
    input_ids = tokenizer.encode(input_text, return_tensors="pt")

    with torch.no_grad():
        output = model.generate(input_ids)

    decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)
    return decoded_output

clue = "Your input clue"
answer = "The expected answer"
score = score_clue_answer_pair(clue, answer, model, tokenizer)
print(f"Score: {score}")


In [12]:
# the actual log_prob for scoring the clue-answer pair
-output[0].item() * decode_tokens.shape[1]

-69.94067573547363