# T5 Model Fine-tuning

This notebook is used for fine-tuning the T5 base model. Please refer to the `README.md` within the parent `forge/` directory for more details.

## Step 1: Read, Tokenize and Encode Data

The block below reads the training data into memory, then tokenizes and encodes every input/output (IO) pair for the fine-tuning process.

- TRAINING_FILE: The name of the JSONL file, from the `data/training` directory
- MAXIMUM_SIZE: The maximum amount of data to read into memory, where `0` reads all of the data

In [None]:
import asyncio
import nest_asyncio
from loguru import logger
from scripts.utils.file_utils import jsonl_read
from scripts.prepare_training import tokenize_and_encode


async def main():
    """Prompt for a training file, read it and tokenize/encode its IO pairs.

    Asks (via ``input``) for the JSONL filename inside ``../data/training/``
    and for the maximum read size, loads the data with ``jsonl_read`` and
    hands it to ``tokenize_and_encode``.

    Returns:
        Whatever ``tokenize_and_encode`` returns for the loaded data.

    Raises:
        ValueError: If the maximum size entered is not a valid integer.
        SystemExit: If no data could be read from the file.
    """
    # Read in the training data JSONL file
    TRAINING_FILE = input("Enter the JSONL filename: ")
    MAXIMUM_SIZE = input("Enter a maximum data read size: ")
    training_file_path = f"../data/training/{TRAINING_FILE}"
    data = await jsonl_read(training_file_path, int(MAXIMUM_SIZE))

    # Treat any empty/falsy result (``[]`` or ``None``) as a failed read.
    if not data:
        logger.error(
            f"An error occurred during the reading of JSONL file: {training_file_path}"
        )
        # Raise SystemExit explicitly instead of calling ``exit(1)``: ``exit``
        # is an interactive helper (and is replaced by IPython inside a
        # notebook kernel), so it is not guaranteed to stop execution here.
        # NOTE(review): confirm the kernel's ``exit`` behaviour if reverting.
        raise SystemExit(1)

    # Tokenize and encode each IO pair
    return await tokenize_and_encode(data)


# Run the async events
def run_asyncio_loop():
    """Run ``main()`` to completion on the current event loop.

    Returns:
        The value returned by the ``main()`` coroutine.
    """
    return asyncio.get_event_loop().run_until_complete(main())


# Enable nested event loops so `run_until_complete` can be called from within
# the notebook's own event loop
nest_asyncio.apply()
prepared_data = run_asyncio_loop()

# Report what was loaded; only the first two messages interpolate values, so
# the remaining ones are plain strings rather than f-strings.
logger.info(f"Sample of the tokenized and encoded data: {prepared_data[0]}")
logger.info(f"Total count of tokenized and encoded data: {len(prepared_data)}")
logger.success("The data has been tokenized and encoded into memory!")
logger.warning(
    "This tokenized and encoded data is only temporarily stored in the Jupyter Notebook instance."
)
logger.warning(
    "Failing to save the data to file will result in loss during restart or clearing of outputs."
)

In [None]:
import torch
from torch.utils.data import DataLoader, TensorDataset
from torch.optim import AdamW
from transformers import (
    T5ForConditionalGeneration,
    get_linear_schedule_with_warmup,
)
from loguru import logger

# Load the pre-trained T5 model. No tokenizer is loaded in this cell: the
# items in `prepared_data` already carry encoded "input_ids" and "labels".
model = T5ForConditionalGeneration.from_pretrained("t5-base")

# Configure the training parameters
epochs = 3
learning_rate = 2e-5
warmup_steps = 1000
batch_size = 16

# Prepare the input tensors
# NOTE(review): torch.stack needs every item to have the same shape, so this
# assumes the encoded sequences were padded/truncated to one length - confirm.
input_ids = torch.stack([item["input_ids"] for item in prepared_data])
labels = torch.stack([item["labels"] for item in prepared_data])

# Create a dataset and data loader
dataset = TensorDataset(input_ids, labels)
dataloader = DataLoader(dataset, batch_size=batch_size)

# The scheduler is stepped once per batch, so the schedule length must be the
# number of batches per epoch times the number of epochs - not the number of
# samples, which would stretch the linear decay by roughly `batch_size`.
total_steps = len(dataloader) * epochs
if warmup_steps >= total_steps:
    logger.warning(
        f"warmup_steps ({warmup_steps}) >= total_steps ({total_steps}): "
        "the learning rate will never reach its configured peak."
    )

# Define the optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=learning_rate)
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps
)

# Start the training loop on the GPU when one is available, otherwise the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

try:
    for epoch in range(epochs):
        model.train()
        total_loss = 0

        # Batch-local names are used on purpose: assigning to `input_ids` /
        # `labels` here would overwrite the notebook-level tensors holding the
        # full dataset with the last (device-resident) batch.
        for batch_input_ids, batch_labels in dataloader:
            batch_input_ids = batch_input_ids.to(device)
            batch_labels = batch_labels.to(device)

            optimizer.zero_grad()

            # NOTE(review): no attention_mask is passed; if the inputs are
            # padded, confirm whether a mask should be supplied as well.
            outputs = model(input_ids=batch_input_ids, labels=batch_labels)

            loss = outputs.loss
            total_loss += loss.item()

            loss.backward()
            optimizer.step()
            # Advance the learning-rate schedule once per optimizer step
            scheduler.step()

        average_loss = total_loss / len(dataloader)
        logger.info(f"Epoch {epoch + 1} - Average Loss: {average_loss:.4f}")

except Exception as e:
    # Log with traceback, then re-raise so the cell still fails visibly
    logger.exception(f"An error occurred during training: {e}")
    raise

# Persist the fine-tuned model weights and config (only the model is written
# here; no tokenizer is saved by this cell)
trained_model_dir = "../models/t5/trained/model-100"
try:
    model.save_pretrained(trained_model_dir)
    logger.success("Successfully saved t5-base model!")
except Exception as save_error:
    # Record the failure with its traceback and let it propagate
    logger.exception(f"An error occurred while saving the model: {save_error}")
    raise