# Importing Libraries

In [4]:
import os
import pickle
from tqdm import tqdm
import warnings
import torch
import matplotlib.pyplot as plt

from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from transformers import T5ForConditionalGeneration, T5Tokenizer
from transformers import AdamW


# Silence every warning category from this point on. Be aware that this also
# hides deprecation notices emitted by the libraries imported above.
warnings.filterwarnings("ignore")

# Optionally, you can reset the warning filters later if needed
# warnings.resetwarnings()

In [6]:
# Folder that holds the pickled train/test datasets read further down.
# NOTE(review): this is a raw string, so every `\\` stays as two backslash
# characters in the resulting path — presumably tolerated on Windows, but verify.
# The location is absolute and user-specific; consider making it configurable.
root = r'C:\\Users\\likhi\\Documents\\02 Pycharm Datasets\\01 Master Thesis\\07 QnA\\training_test_data\\'

## Data Preprocessing

In [6]:
class CustomDataset(Dataset):
    """Question-answering dataset that yields tokenized T5 inputs and labels.

    Each item joins one context and one question into the prompt
    ``"context: <context> question: <question>"`` and uses the matching answer
    as the target sequence.

    Args:
        contexts: Sequence of context strings.
        questions: Sequence of question strings, aligned with ``contexts``.
        answers: Sequence of answer strings, aligned with ``contexts``.
        tokenizer: Callable tokenizer accepting ``max_length``, ``padding``,
            ``truncation`` and ``return_tensors`` and exposing
            ``pad_token_id`` (e.g. a Hugging Face ``T5Tokenizer``).
        max_length: Length every input and target sequence is padded or
            truncated to.
    """

    def __init__(self, contexts, questions, answers, tokenizer, max_length=2048):
        self.contexts = contexts
        self.questions = questions
        self.answers = answers
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of (context, question, answer) examples."""
        return len(self.contexts)

    def _tokenize(self, text):
        """Tokenize ``text`` to exactly ``max_length`` positions (pad or truncate)."""
        # `padding="max_length"` is the replacement for the deprecated
        # `pad_to_max_length=True` keyword.
        return self.tokenizer(
            text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

    def __getitem__(self, idx):
        """Return the tokenized example at position ``idx``.

        Returns:
            dict with 1-D tensors of length ``max_length``:
                ``input_ids``      – token ids of the prompt,
                ``attention_mask`` – mask returned by the tokenizer for the prompt,
                ``target_ids``     – token ids of the answer, with padding
                                     positions replaced by -100.
        """
        input_text = f"context: {self.contexts[idx]} question: {self.questions[idx]}"
        target_text = self.answers[idx]

        encoded_input = self._tokenize(input_text)
        target_ids = self._tokenize(target_text)["input_ids"]

        # Padding must not contribute to the training loss; -100 is the label
        # value that Hugging Face seq2seq models skip when computing it.
        target_ids = target_ids.masked_fill(
            target_ids == self.tokenizer.pad_token_id, -100
        )

        # Both encodings are already padded/truncated to `max_length`, so no
        # further length alignment is needed; only the batch dimension of size
        # one added by `return_tensors="pt"` is dropped.
        return {
            "input_ids": encoded_input["input_ids"].squeeze(0),
            "attention_mask": encoded_input["attention_mask"].squeeze(0),
            "target_ids": target_ids.squeeze(0),
        }


In [7]:

# Load custom dataset
# NOTE(review): pickle.load can run arbitrary code while deserializing, so only
# open files that come from a trusted source. Presumably these pickles hold
# CustomDataset instances (which would require the class defined above to exist
# before loading) — verify.
with open(os.path.join(root, "QnA_train_dataset.pkl"), "rb") as f:
    train_dataset = pickle.load(f)

with open(os.path.join(root, "QnA_test_dataset.pkl"), "rb") as f:
    test_dataset = pickle.load(f)

In [15]:
# Wrap the training set in a DataLoader: shuffled mini-batches of four examples
train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=4,
    shuffle=True,
)

## Loading the model

In [11]:
# Checkpoint identifier handed to both from_pretrained calls below, so the
# tokenizer and the model weights are loaded for the same checkpoint.
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


## Inference before training

In [None]:

# Sample product context and question used to query the model before training
context = "Product title is Perler Beads Assorted Multicolor Fuse Beads for Kids Crafts, 11000 pcs. Product overview is Color is Multicolor Material is Plastic Size is 11,000 Count Brand is Perler Shape is Round Item Weight is 1.69 Pounds Number of Pieces is 11000. Product description is Includes (11000) assorted Perler fuse beads and reusable ironing paper in plastic storage jar. This mega set of 11,000 Perler fuse beads comes with 30 different colors, including toothpaste, pastel lavender, butterscotch, and neon pink. Use your assorted Perler fuse beads with pre-made Perler bead design or get creative and make your own. These multicolor Perler beads are great arts and crafts activity for children. Use Perler pegboards, ironing paper, and an iron to complete your craft. Multicolor Perler beads set suitable for ages and up."
question = "What is the Color of the product?"

# Build the prompt in the "context: ... question: ..." layout
input_text = "context: {} question: {}".format(context, question)

# Turn the prompt into a batch of token ids (PyTorch tensor)
input_ids = tokenizer.encode(input_text, return_tensors="pt")

# Generate with gradient tracking switched off
with torch.no_grad():
    output = model.generate(input_ids, max_length=1024)  # Adjust max_length as needed

# Decode the first generated sequence back to text
answer = tokenizer.decode(output[0], skip_special_tokens=True)

print(f"Input Text: {input_text}")
print(f"Generated Answer: {answer}")

## Model training

In [None]:
# Prefer the GPU whenever CUDA is usable; otherwise fall back to the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [None]:
# Define optimizer
# NOTE(review): the AdamW class exported by transformers is deprecated upstream
# in favour of torch.optim.AdamW — consider switching together with the import.
optimizer = AdamW(model.parameters(), lr=5e-5)

model.to(device)

# Training loop
model.train()  # put the model in training mode
num_epochs = 10

# Id the tokenizer uses for padding; needed to mask padded positions per batch.
pad_token_id = tokenizer.pad_token_id

losses = []  # loss of every optimisation step (batch) across all epochs
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    epoch_losses = []

    for batch in tqdm(train_dataloader):
        input_ids = batch["input_ids"].to(device)
        target_ids = batch["target_ids"].to(device)

        # Mark which input positions are real tokens rather than padding so the
        # encoder does not attend to the padded tail.
        attention_mask = (input_ids != pad_token_id).long()
        # Padded target positions must not count towards the loss; -100 is the
        # label value the model's loss ignores. Already-masked labels are left
        # untouched, so this is safe to apply more than once.
        labels = target_ids.masked_fill(target_ids == pad_token_id, -100)

        optimizer.zero_grad()

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
        )

        loss = outputs.loss
        step_loss = loss.item()
        epoch_losses.append(step_loss)
        losses.append(step_loss)

        loss.backward()
        optimizer.step()

    avg_epoch_loss = sum(epoch_losses) / len(epoch_losses)

    # A plain f-string: the former mix of an f-string with empty `{}` fields and
    # `.format(...)` is a syntax error.
    print(f"Average Loss for epoch {epoch + 1} : {avg_epoch_loss}")
    

In [None]:
# Plot the loss curve
# `losses` holds one value per optimisation step (batch), not one per epoch, so
# the x-axis is labelled as training steps.
plt.plot(losses, label='Training Loss')
plt.xlabel('Training step (batch)')
plt.ylabel('Loss')
plt.title('Training Loss Curve')
plt.legend()
plt.show()

## Inference after training

In [None]:
# Example input text
context = "Product title is Perler Beads Assorted Multicolor Fuse Beads for Kids Crafts, 11000 pcs. Product overview is Color is Multicolor Material is Plastic Size is 11,000 Count Brand is Perler Shape is Round Item Weight is 1.69 Pounds Number of Pieces is 11000. Product description is Includes (11000) assorted Perler fuse beads and reusable ironing paper in plastic storage jar. This mega set of 11,000 Perler fuse beads comes with 30 different colors, including toothpaste, pastel lavender, butterscotch, and neon pink. Use your assorted Perler fuse beads with pre-made Perler bead design or get creative and make your own. These multicolor Perler beads are great arts and crafts activity for children. Use Perler pegboards, ironing paper, and an iron to complete your craft. Multicolor Perler beads set suitable for ages and up."
question = "What is the Color of the product?"

# Concatenate context and question using a separator token
input_text = f"context: {context} question: {question}"

# Tokenize input text and place it on the same device as the model: after the
# training cell the model may live on the GPU, while new tensors start on CPU.
input_ids = tokenizer.encode(input_text, return_tensors="pt").to(model.device)

# The training cell leaves the model in train mode (model.train()); switch to
# evaluation mode so dropout is disabled while generating.
model.eval()

# Generate answer
with torch.no_grad():
    output = model.generate(input_ids, max_length=64)  # Adjust max_length as needed
    answer = tokenizer.decode(output[0], skip_special_tokens=True)

print("Input Text:", input_text)
print("Generated Answer:", answer)

## Saving model and tokenizer

In [13]:
# Folder the fine-tuned model and tokenizer are loaded from further down.
# NOTE(review): raw string, so every `\\` stays as two backslash characters in
# the path — presumably tolerated on Windows, but verify. The location is
# absolute and user-specific; consider making it configurable.
output_dir = r'C:\\Users\\likhi\\Documents\\02 Pycharm Datasets\\01 Master Thesis\\07 QnA\\Saved Models\\'

In [24]:
# Save the trained model and tokenizer under output_dir, so that the loading
# cell below — which reads from output_dir — finds them in the same place.
save_path = os.path.join(output_dir, "trained_t5_model_10_percent_data_epoch_10")
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

## Loading model and tokenizer

In [14]:
# Load the fine-tuned model and tokenizer back from output_dir
model = T5ForConditionalGeneration.from_pretrained(os.path.join(output_dir, 
                                                     "trained_t5_model_10_percent_data_epoch_10"))
tokenizer = T5Tokenizer.from_pretrained(os.path.join(output_dir, 
                                                     "trained_t5_model_10_percent_data_epoch_10"))

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# Sample product context and question used to query the reloaded model
context = "Product title is Perler Beads Assorted Multicolor Fuse Beads for Kids Crafts, 11000 pcs. Product overview is Color is Multicolor Material is Plastic Size is 11,000 Count Brand is Perler Shape is Round Item Weight is 1.69 Pounds Number of Pieces is 11000. Product description is Includes (11000) assorted Perler fuse beads and reusable ironing paper in plastic storage jar. This mega set of 11,000 Perler fuse beads comes with 30 different colors, including toothpaste, pastel lavender, butterscotch, and neon pink. Use your assorted Perler fuse beads with pre-made Perler bead design or get creative and make your own. These multicolor Perler beads are great arts and crafts activity for children. Use Perler pegboards, ironing paper, and an iron to complete your craft. Multicolor Perler beads set suitable for ages and up."
question = "What is the Color of the product?"

# Assemble the prompt in the "context: ... question: ..." layout
input_text = "context: " + context + " question: " + question

# Encode the prompt as a batch of token ids (PyTorch tensor)
input_ids = tokenizer.encode(input_text, return_tensors="pt")

# Generate without tracking gradients
with torch.no_grad():
    output = model.generate(input_ids, max_length=64)  # Adjust max_length as needed

# Decode the first generated sequence back to text
answer = tokenizer.decode(output[0], skip_special_tokens=True)

# Prompt, a 50-character rule, then the answer — one item per line
print(
    f"Input Text: {input_text}",
    50 * "=",
    f"Generated Answer: {answer}",
    sep="\n",
)