# Bullet Forge Model Fine Tuning


## Step 1: Instantiate Global Parameters

In [None]:
# Token budget for source (input) text: used as the tokenizer's
# model_max_length and as model_params["MAX_SOURCE_TEXT_LENGTH"] in Step 2.
MAX_INPUT_TOKEN_LENGTH = 512
# Token budget for target (output) text: used as
# model_params["MAX_TARGET_TEXT_LENGTH"] in Step 2 and as the generation
# max_length in the manual test of Step 3.
MAX_OUTPUT_TOKEN_LENGTH = 30

## Step 2: Fine Tune Checkpoint Model

In [None]:
import os
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader
import os
from transformers import T5Tokenizer, T5ForConditionalGeneration
from rich.table import Column, Table
from rich import box
from rich.console import Console
from rouge_score import rouge_scorer

from scripts.utils.files import load_jsonl_data
from classes.DataSet import DataSet

# Ask interactively which checkpoint to fine-tune; the answer becomes
# model_params["MODEL"] and part of the output directory name below.
model_checkpoint = input("Input the t5 model checkpoint name to be fine tuned")
# Free-form suffix appended to the output directory name.
directory_suffix = input("Input extra suffix to name of model output")

# Load the JSONL training/validation data. It is later column-indexed and
# sampled inside T5Trainer, so isDataFrame=True presumably yields a pandas
# DataFrame -- confirm against scripts.utils.files.load_jsonl_data.
data = load_jsonl_data(
    "../data/training/training_validation_set.jsonl", isDataFrame=True
)

# Rich console used for all logging in this cell; record=True keeps the output
# so T5Trainer can dump it to logs.txt with console.save_text().
console = Console(record=True)


# Setup the data frame display
def display_df(df):
    """Print the first two columns of ``df`` as an ASCII "Sample Data" table.

    Each row of ``df.values`` contributes one table row: its first element
    goes under ``source_text`` and its second under ``target_text``.

    The table is printed through a fresh ``Console()`` created here, not
    through the module-level console.
    """
    printer = Console()
    headers = ("source_text", "target_text")
    sample_table = Table(
        *(Column(header, justify="center") for header in headers),
        title="Sample Data",
        pad_edge=False,
        box=box.ASCII,
    )

    for record in df.values.tolist():
        source_cell, target_cell = record[0], record[1]
        sample_table.add_row(source_cell, target_cell)

    printer.print(sample_table)


# Table that accumulates one row per logged training step (epoch, step, loss);
# train() appends to it and re-prints it as training progresses.
training_logger = Table(
    *(Column(heading, justify="center") for heading in ("Epoch", "Steps", "Loss")),
    title="Training Status",
    pad_edge=False,
    box=box.ASCII,
)

# Choose the compute device: use the GPU when torch reports CUDA as available,
# otherwise fall back to the CPU
from torch import cuda

if cuda.is_available():
    device = "cuda"
else:
    device = "cpu"


# Function to be called for training with the parameters passed from main function
def train(epoch, tokenizer, model, device, loader, optimizer):
    """Run one training epoch over ``loader``, updating ``model`` in place.

    Args:
        epoch: Zero-based epoch index; shown as ``epoch + 1`` in the log table.
        tokenizer: Tokenizer whose ``pad_token_id`` identifies padding in the
            target sequences.
        model: Model called with ``input_ids``, ``attention_mask``,
            ``decoder_input_ids`` and ``labels``; its first output is taken as
            the loss.
        device: Device the batch tensors are moved to.
        loader: Iterable of batches, each a mapping with ``"source_ids"``,
            ``"source_mask"`` and ``"target_ids"`` tensors (assumed 2-D,
            ``(batch, sequence)`` -- confirm against the dataset class).
        optimizer: Optimizer stepped once per batch.

    Side effects:
        On every 10th step (including step 0) a row is appended to the
        module-level ``training_logger`` table and the whole table is printed
        through the module-level ``console``.
    """
    model.train()
    for step, batch in enumerate(loader):
        target_ids = batch["target_ids"].to(device, dtype=torch.long)
        # Shift the targets by one position: the decoder is fed every token
        # except the last and is trained to predict every token except the first.
        decoder_input_ids = target_ids[:, :-1].contiguous()
        labels = target_ids[:, 1:].clone().detach()
        # Replace padding positions with -100, the label value the T5 loss
        # ignores, so padding does not contribute to the loss.
        labels[labels == tokenizer.pad_token_id] = -100
        input_ids = batch["source_ids"].to(device, dtype=torch.long)
        attention_mask = batch["source_mask"].to(device, dtype=torch.long)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            labels=labels,
        )
        loss = outputs[0]

        if step % 10 == 0:
            # Log the scalar loss value rather than the tensor repr, which
            # would include device and grad_fn noise.
            training_logger.add_row(str(epoch + 1), str(step), str(loss.item()))
            console.print(training_logger)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()


# Function to evaluate model for predictions and compute ROUGE scores
def validate(tokenizer, model, device, loader):
    """Generate predictions for every batch in ``loader`` and report ROUGE.

    Args:
        tokenizer: Tokenizer used to decode generated and target token ids.
        model: Model exposing ``eval()`` and ``generate()``.
        device: Device the batch tensors are moved to.
        loader: Iterable of batches, each a mapping with ``"source_ids"``,
            ``"source_mask"`` and ``"target_ids"`` tensors.

    Returns:
        A ``(predictions, actuals)`` pair of lists of decoded strings, in the
        order the loader produced them.

    Side effects:
        Prints progress every 10th batch and the average ROUGE-1/2/L
        F-measures through the module-level ``console``.
    """
    model.eval()
    predictions = []
    actuals = []

    # Initialize the rouge scorer
    metrics = ("rouge1", "rouge2", "rougeL")
    scorer = rouge_scorer.RougeScorer(list(metrics), use_stemmer=True)
    scores = []

    def decode(token_ids):
        # Same decoding settings for predictions and references so that the
        # ROUGE comparison is like-for-like.
        return tokenizer.decode(
            token_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True
        )

    with torch.no_grad():
        for step, batch in enumerate(loader):
            target_ids = batch["target_ids"].to(device, dtype=torch.long)
            input_ids = batch["source_ids"].to(device, dtype=torch.long)
            attention_mask = batch["source_mask"].to(device, dtype=torch.long)

            generated_ids = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                # Cap generation at the target length the model is trained
                # on (and that manual testing uses), not the input length.
                max_length=MAX_OUTPUT_TOKEN_LENGTH,
                num_beams=2,
                repetition_penalty=2.5,
                length_penalty=1.0,
                early_stopping=True,
            )
            preds = [decode(generated) for generated in generated_ids]
            targets = [decode(target) for target in target_ids]

            # Calculate rouge scores for each prediction and corresponding target
            scores.extend(
                scorer.score(target, pred) for pred, target in zip(preds, targets)
            )

            if step % 10 == 0:
                console.print(f"Completed {step}")

            predictions.extend(preds)
            actuals.extend(targets)

    # Compute the average ROUGE F-measure for the entire validation set
    avg_scores = {
        metric: np.mean([score[metric].fmeasure for score in scores])
        for metric in metrics
    }

    console.print(f"Average ROUGE scores: {avg_scores}")

    return predictions, actuals


# T5 training main function
def T5Trainer(dataframe, source_text, target_text, model_params, output_dir):
    """Fine-tune a T5 checkpoint on ``dataframe`` and validate the result.

    Steps, in order: seed the RNGs, load tokenizer and model from
    ``model_params["MODEL"]``, split the data 80/20 into train/validation,
    train for ``TRAIN_EPOCHS`` epochs, save model and tokenizer to
    ``output_dir``, run validation ``VAL_EPOCHS`` times (each run rewrites
    ``predictions.csv``), then save the recorded console output to
    ``logs.txt``.

    Args:
        dataframe: Data containing at least the two named columns.
        source_text: Name of the column holding the input text.
        target_text: Name of the column holding the expected output text.
        model_params: Mapping with the keys ``MODEL``, ``SEED``,
            ``MAX_SOURCE_TEXT_LENGTH``, ``MAX_TARGET_TEXT_LENGTH``,
            ``TRAIN_BATCH_SIZE``, ``VALID_BATCH_SIZE``, ``LEARNING_RATE``,
            ``TRAIN_EPOCHS`` and ``VAL_EPOCHS``.
        output_dir: Directory receiving the model, tokenizer, predictions and
            logs.
    """
    seed = model_params["SEED"]
    checkpoint = model_params["MODEL"]
    max_source_len = model_params["MAX_SOURCE_TEXT_LENGTH"]
    max_target_len = model_params["MAX_TARGET_TEXT_LENGTH"]

    # Seed torch and numpy and force deterministic cuDNN for reproducibility
    torch.manual_seed(seed)
    np.random.seed(seed)
    torch.backends.cudnn.deterministic = True
    console.log(f"""[Model]: Loading {checkpoint}...\n""")

    # Tokenizer for encoding the text
    tokenizer = T5Tokenizer.from_pretrained(checkpoint, model_max_length=max_source_len)

    # Load the conditional-generation model and move it to the selected device
    model = T5ForConditionalGeneration.from_pretrained(checkpoint).to(device)
    console.log("[Data]: Reading data...\n")

    # Keep only the source/target columns and show a two-row preview
    frame = dataframe[[source_text, target_text]]
    display_df(frame.head(2))

    # 80% of the rows are sampled for training; the remainder is validation
    train_frame = frame.sample(frac=0.8, random_state=seed)
    val_frame = frame.drop(train_frame.index).reset_index(drop=True)
    train_frame = train_frame.reset_index(drop=True)

    console.print(f"FULL Dataset: {frame.shape}")
    console.print(f"TRAIN Dataset: {train_frame.shape}")
    console.print(f"VALIDATION Dataset: {val_frame.shape}\n")

    def build_set(split):
        # Wrap one split in the project's DataSet with the shared settings
        return DataSet(
            split,
            tokenizer,
            max_source_len,
            max_target_len,
            source_text,
            target_text,
        )

    training_set = build_set(train_frame)
    val_set = build_set(val_frame)

    # Data loaders: training batches are shuffled, validation batches are not
    training_loader = DataLoader(
        training_set,
        batch_size=model_params["TRAIN_BATCH_SIZE"],
        shuffle=True,
        num_workers=0,
    )
    val_loader = DataLoader(
        val_set,
        batch_size=model_params["VALID_BATCH_SIZE"],
        shuffle=False,
        num_workers=0,
    )

    # Optimizer used to tune the network weights during training
    optimizer = torch.optim.Adam(
        params=model.parameters(), lr=model_params["LEARNING_RATE"]
    )

    # Training loop
    console.log("[Initiating Fine Tuning]...\n")
    for epoch in range(model_params["TRAIN_EPOCHS"]):
        train(epoch, tokenizer, model, device, training_loader, optimizer)

    # Persist the fine-tuned model together with its tokenizer
    console.log("[Saving Model]...\n")
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

    predictions_path = os.path.join(output_dir, "predictions.csv")
    logs_path = os.path.join(output_dir, "logs.txt")

    # Evaluate on the validation split; each pass overwrites predictions.csv
    console.log("[Initiating Validation]...\n")
    for _ in range(model_params["VAL_EPOCHS"]):
        predictions, actuals = validate(tokenizer, model, device, val_loader)
        results = pd.DataFrame({"Generated Text": predictions, "Actual Text": actuals})
        results.to_csv(predictions_path)

    # Dump everything the recording console has captured so far
    console.save_text(logs_path)

    console.log("[Validation Completed.]\n")
    console.print(f"""[Model] Model saved @ {output_dir}\n""")
    console.print(
        f"""[Validation] Generation on Validation data saved @ {predictions_path}\n"""
    )
    console.print(f"""[Logs] Logs saved @ {logs_path}\n""")


# Hyper-parameters and checkpoint selection consumed by T5Trainer
model_params = dict(
    MODEL=model_checkpoint,  # model_type: t5-x
    TRAIN_BATCH_SIZE=4,  # training batch size
    VALID_BATCH_SIZE=4,  # validation batch size
    TRAIN_EPOCHS=4,  # number of training epochs
    VAL_EPOCHS=1,  # number of validation epochs
    LEARNING_RATE=1e-5,  # learning rate
    MAX_SOURCE_TEXT_LENGTH=MAX_INPUT_TOKEN_LENGTH,  # max length of source text
    MAX_TARGET_TEXT_LENGTH=MAX_OUTPUT_TOKEN_LENGTH,  # max length of target text
    SEED=42,  # set seed for reproducibility
)

# Output folder name combines the checkpoint name and the user-supplied suffix
output_directory = "../models/pytorch-cpu-" + model_checkpoint + directory_suffix

# Run training function on the T5 model using data set and training parameters
T5Trainer(
    dataframe=data,
    source_text="input",
    target_text="output",
    model_params=model_params,
    output_dir=output_directory,
)

## Step 3: Fine Tuned Model Manual Testing

In [None]:
from transformers import T5ForConditionalGeneration, T5Tokenizer
from scripts.utils.files import load_jsonl_data

# Manual smoke test: run the fine-tuned model over a small hand-written test
# set and print each generated summary next to the expected one.
data = load_jsonl_data("../data/training/manual_test_set.jsonl", isDataFrame=False)

# Load the T5 model
model = T5ForConditionalGeneration.from_pretrained(
    "../models/" + input("Input the folder of the t5 model to be used")
)

# Load the tokenizer. model_max_length bounds the *input* side, so it uses the
# input token budget (as in training), not the output budget.
tokenizer = T5Tokenizer.from_pretrained("t5-base", model_max_length=MAX_INPUT_TOKEN_LENGTH)

for line in data:
    # Preprocess input
    input_text = "summarize: " + line["input"]
    expected_summary = line["output"]

    # Truncate to the same input budget used during fine-tuning
    inputs = tokenizer.encode_plus(
        input_text,
        return_tensors="pt",
        truncation=True,
        max_length=MAX_INPUT_TOKEN_LENGTH,
    )

    # Generate summary
    summary_ids = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        # Adjust the max length according to your desired summary length
        max_length=MAX_OUTPUT_TOKEN_LENGTH,
        # Adjust the number of beams for beam search
        num_beams=2,
        early_stopping=True,
    )

    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    # Print results
    print(f"> INPUT TEXT: {input_text}")
    print(f"\t> EXPECTED SUMMARY: {expected_summary}")
    print(f"\t> GENERATE SUMMARY: {summary}\n")