# Bullet Forge Model Fine Tuning


## Step 1: Instantiate Global Parameters

In [1]:
# Upper bound on the number of tokens accepted as input text for summarization
max_input_token_length = 512
# Upper bound on the number of tokens the model may emit for a summary
max_output_token_length = 35

## Step 2: Fine Tune Checkpoint Model

In [6]:
import os
import re
from loguru import logger
import numpy as np
import pandas as pd
import torch
from torch import cuda
from torch.utils.data import DataLoader, Dataset
import os
from rouge_score import rouge_scorer
from transformers import (
    T5Tokenizer,
    T5ForConditionalGeneration,
    get_linear_schedule_with_warmup,
)

from scripts.files import load_jsonl_data
from scripts.bullet_patterns import *
from scripts.rich_logger import *

# Checkpoint to start from: a local directory or a Hugging Face model name
input_model = input(
    "What is the target model's directory path or checkpoint name on Hugging Face?"
)

# The fine-tuned model is written beneath ../models/ under a user-chosen name
fine_tuned_model_name = input(
    "What name would you like to give the fine-tuned model?"
)
model_output_directory = "../models/" + fine_tuned_model_name

# Load the JSONL training/validation records as a DataFrame
data = load_jsonl_data(
    "../data/training/training_validation_set.jsonl",
    BULLET_PROMPT_PREFIX,
    isDataFrame=True,
)

# Prefer the GPU whenever CUDA reports one as available
if cuda.is_available():
    device = "cuda"
else:
    device = "cpu"


# Custom dataset that tokenizes each source/target text pair so a DataLoader
# can batch the examples and feed them to the model during fine-tuning
class CustomDataset(Dataset):
    """Tokenized (source, target) text pairs read from a DataFrame.

    Every item is a dict of fixed-length ``torch.long`` tensors, so a
    ``DataLoader`` can stack items into batches.

    Args:
        dataframe: Table holding the text columns.
        tokenizer: Tokenizer exposing ``batch_encode_plus``.
        source_len: Length every source sequence is padded/truncated to.
        target_len: Length every target sequence is padded/truncated to.
        source_text: Name of the column containing the model input.
        target_text: Name of the column containing the expected output.
    """

    def __init__(
        self, dataframe, tokenizer, source_len, target_len, source_text, target_text
    ):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.source_len = source_len
        self.summ_len = target_len
        self.target_text = self.data[target_text]
        self.source_text = self.data[source_text]

    def __len__(self):
        """Return the number of (source, target) pairs."""
        return len(self.target_text)

    def _encode(self, text, max_length):
        """Tokenize one text, padded/truncated to exactly ``max_length`` tokens."""
        # Coerce to str and collapse every whitespace run to a single space
        cleaned = " ".join(str(text).split())
        # ``padding="max_length"`` alone pads to ``max_length``; the deprecated
        # ``pad_to_max_length`` flag is intentionally not passed.
        return self.tokenizer.batch_encode_plus(
            [cleaned],
            max_length=max_length,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )

    def __getitem__(self, index):
        """Return the tokenized pair at positional ``index``.

        Returns:
            Dict with ``source_ids``, ``source_mask``, ``target_ids`` and
            ``target_ids_y`` (same values as ``target_ids``), all ``torch.long``.
        """
        # Positional lookup: works even when the frame's index was not reset
        source = self._encode(self.source_text.iloc[index], self.source_len)
        target = self._encode(self.target_text.iloc[index], self.summ_len)

        # Drop only the leading batch dimension added by batch_encode_plus;
        # a bare squeeze() would also collapse a length-1 sequence.
        source_ids = source["input_ids"].squeeze(0)
        source_mask = source["attention_mask"].squeeze(0)
        target_ids = target["input_ids"].squeeze(0)

        return {
            "source_ids": source_ids.to(dtype=torch.long),
            "source_mask": source_mask.to(dtype=torch.long),
            "target_ids": target_ids.to(dtype=torch.long),
            "target_ids_y": target_ids.to(dtype=torch.long),
        }


# Generates a penalty for not complying to bullet formatting
def format_penalty(outputs, tokenizer, format_pattern):
    """Count the sequences in a batch whose greedy decoding breaks the format.

    The highest-scoring token id at every position of ``outputs.logits`` is
    decoded to text; each decoded sequence that ``re.fullmatch`` rejects
    against ``format_pattern`` adds 1.0 to the result.

    The result is built with ``torch.tensor`` from a Python float, so it is
    not connected to the autograd graph: adding it to a loss changes the
    reported value but contributes no gradient.

    Args:
        outputs: Model output exposing a ``logits`` tensor.
        tokenizer: Tokenizer whose ``decode`` turns token ids into text.
        format_pattern: Regular expression a compliant output must fully match.

    Returns:
        Scalar tensor, on the same device as the logits, holding the number
        of non-matching sequences.
    """
    logits = outputs.logits
    # Greedy pick: most likely token id at each position of each sequence
    best_ids = logits.argmax(dim=-1)

    mismatches = sum(
        1
        for row in best_ids
        if re.fullmatch(
            format_pattern, tokenizer.decode(row, skip_special_tokens=True)
        )
        is None
    )

    return torch.tensor(float(mismatches), device=logits.device)



# Function to be called for training with the parameters passed from main function
def train(epoch, tokenizer, model, device, loader, optimizer, scheduler):
    """Run one pass over ``loader``, updating the model after every batch.

    For each batch the target ids are shifted to build decoder inputs and
    labels, the model loss is combined with the bullet-format penalty, and
    the optimizer and scheduler are each stepped once. Because the scheduler
    is advanced per batch here, callers should not step it again per epoch.

    Args:
        epoch: Zero-based epoch number; only used for the progress table.
        tokenizer: Tokenizer supplying ``pad_token_id`` and decoding.
        model: Model being fine-tuned.
        device: Device the batch tensors are moved to.
        loader: Iterable of batches with ``source_ids``, ``source_mask`` and
            ``target_ids`` entries.
        optimizer: Optimizer stepped after every batch.
        scheduler: Learning-rate scheduler stepped after every batch.
    """
    model.train()
    for step, batch in enumerate(loader):
        target_ids = batch["target_ids"].to(device, dtype=torch.long)
        # Teacher forcing: the decoder sees tokens [0, n-1) and predicts [1, n)
        decoder_input_ids = target_ids[:, :-1].contiguous()
        lm_labels = target_ids[:, 1:].clone().detach()
        # -100 marks padding positions so they are ignored by the loss
        lm_labels[target_ids[:, 1:] == tokenizer.pad_token_id] = -100
        ids = batch["source_ids"].to(device, dtype=torch.long)
        mask = batch["source_mask"].to(device, dtype=torch.long)

        outputs = model(
            input_ids=ids,
            attention_mask=mask,
            decoder_input_ids=decoder_input_ids,
            labels=lm_labels,
        )
        loss = outputs[0]

        # Add a penalty to the loss for outputs that don't match the format
        format_loss = format_penalty(outputs, tokenizer, BULLET_PATTERN)
        total_loss = loss + format_loss

        if step % 10 == 0:
            # Log the scalar value rather than the tensor repr (which would
            # include device and grad_fn noise)
            training_logger.add_row(
                str(epoch + 1), str(step), f"{total_loss.item():.4f}"
            )
            general_logger.print(training_logger)

        optimizer.zero_grad()
        total_loss.backward()

        # Gradient clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()


# Function to evaluate model for predictions and compute ROUGE scores
def validate(tokenizer, model, device, loader):
    """Generate summaries for every batch in ``loader`` and log mean ROUGE.

    Generation is capped at ``max_output_token_length`` tokens, the same
    limit used when the reference targets were tokenized.

    Args:
        tokenizer: Tokenizer used to decode generated and reference ids.
        model: Model to evaluate; switched to eval mode.
        device: Device the batch tensors are moved to.
        loader: Iterable of batches with ``source_ids``, ``source_mask`` and
            ``target_ids`` entries.

    Returns:
        Tuple ``(predictions, actuals)`` of decoded strings, aligned by position.
    """
    model.eval()
    predictions = []
    actuals = []

    # Initialize the rouge scorer
    rouge_names = ["rouge1", "rouge2", "rougeL"]
    scorer = rouge_scorer.RougeScorer(rouge_names, use_stemmer=True)
    scores = []

    with torch.no_grad():
        for step, batch in enumerate(loader):
            y = batch["target_ids"].to(device, dtype=torch.long)
            ids = batch["source_ids"].to(device, dtype=torch.long)
            mask = batch["source_mask"].to(device, dtype=torch.long)

            generated_ids = model.generate(
                input_ids=ids,
                attention_mask=mask,
                # Bound the summary by the output limit, not the input limit
                max_length=max_output_token_length,
                num_beams=2,
                repetition_penalty=2.5,
                length_penalty=1.0,
                early_stopping=True,
            )
            preds = [
                tokenizer.decode(
                    g, skip_special_tokens=True, clean_up_tokenization_spaces=True
                )
                for g in generated_ids
            ]
            targets = [
                tokenizer.decode(
                    t, skip_special_tokens=True, clean_up_tokenization_spaces=True
                )
                for t in y
            ]

            # Calculate rouge scores for each prediction and corresponding target
            scores.extend(
                scorer.score(target, pred) for pred, target in zip(preds, targets)
            )

            if step % 10 == 0:
                general_logger.print(f"Completed {step}")

            predictions.extend(preds)
            actuals.extend(targets)

    if scores:
        # Compute the average ROUGE scores for the entire validation set
        avg_scores = {
            name: float(np.mean([score[name].fmeasure for score in scores]))
            for name in rouge_names
        }
        logger.info(f"Average ROUGE scores: {avg_scores}")
    else:
        # Averaging an empty list would only yield NaN plus a NumPy warning
        logger.warning("Validation set is empty; ROUGE scores were not computed.")

    return predictions, actuals


# T5 training main function
def T5Trainer(dataframe, source_text, target_text, model_params, output_dir):
    """Fine-tune a T5 checkpoint, save it, and evaluate it on a held-out split.

    The data is split 80/20 into training and validation rows. After training,
    the model and tokenizer are saved to ``output_dir``, followed by a
    ``predictions.csv`` of validation outputs and a ``logs.txt`` export.

    Args:
        dataframe: Table containing at least the two text columns.
        source_text: Name of the column holding the model input.
        target_text: Name of the column holding the expected output.
        model_params: Dict with the keys ``MODEL``, ``SEED``,
            ``MAX_SOURCE_TEXT_LENGTH``, ``MAX_TARGET_TEXT_LENGTH``,
            ``TRAIN_BATCH_SIZE``, ``VALID_BATCH_SIZE``, ``LEARNING_RATE``,
            ``TRAIN_EPOCHS`` and ``VAL_EPOCHS``.
        output_dir: Directory that receives the model and the artifacts.
    """
    # Set random seeds and deterministic pytorch for reproducibility
    torch.manual_seed(model_params["SEED"])
    np.random.seed(model_params["SEED"])
    torch.backends.cudnn.deterministic = True
    logger.info(f"Loading {model_params['MODEL']}...")

    # Tokenizer for encoding the text
    tokenizer = T5Tokenizer.from_pretrained(
        model_params["MODEL"], model_max_length=model_params["MAX_SOURCE_TEXT_LENGTH"]
    )

    # Load the checkpoint named in model_params["MODEL"] and move it to the
    # module-level ``device`` (CUDA when available, otherwise CPU)
    model = T5ForConditionalGeneration.from_pretrained(model_params["MODEL"])
    model = model.to(device)
    logger.info("Reading data...")

    # Keep only the two columns used for training
    dataframe = dataframe[[source_text, target_text]]

    # Creation of Dataset and Dataloader
    # 80% of the data will be used for training and the rest for validation
    train_size = 0.8
    train_dataset = dataframe.sample(frac=train_size, random_state=model_params["SEED"])
    val_dataset = dataframe.drop(train_dataset.index).reset_index(drop=True)
    train_dataset = train_dataset.reset_index(drop=True)

    logger.info(f"FULL Dataset: {dataframe.shape}")
    logger.info(f"TRAIN Dataset: {train_dataset.shape}")
    logger.info(f"VALIDATION Dataset: {val_dataset.shape}")

    # Creating the Training and Validation dataset for further creation of data loader
    training_set = CustomDataset(
        train_dataset,
        tokenizer,
        model_params["MAX_SOURCE_TEXT_LENGTH"],
        model_params["MAX_TARGET_TEXT_LENGTH"],
        source_text,
        target_text,
    )
    val_set = CustomDataset(
        val_dataset,
        tokenizer,
        model_params["MAX_SOURCE_TEXT_LENGTH"],
        model_params["MAX_TARGET_TEXT_LENGTH"],
        source_text,
        target_text,
    )

    # Defining the parameters for creation of data loaders
    train_params = {
        "batch_size": model_params["TRAIN_BATCH_SIZE"],
        "shuffle": True,
        "num_workers": 0,
    }
    val_params = {
        "batch_size": model_params["VALID_BATCH_SIZE"],
        "shuffle": False,
        "num_workers": 0,
    }

    # Data loaders used by the training and validation stages below
    training_loader = DataLoader(training_set, **train_params)
    val_loader = DataLoader(val_set, **val_params)

    # Defining the optimizer that will be used to tune the weights of the network in the training session
    optimizer = torch.optim.Adam(
        params=model.parameters(), lr=model_params["LEARNING_RATE"]
    )

    # Linear decay to zero over exactly one scheduler step per training batch
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=len(training_loader) * model_params["TRAIN_EPOCHS"],
    )

    # Training loop
    logger.info("Initiating fine tuning...")

    for epoch in range(model_params["TRAIN_EPOCHS"]):
        # train() already steps the scheduler once per batch; stepping it again
        # here would overrun num_training_steps and zero the learning rate
        # before the final batches.
        train(epoch, tokenizer, model, device, training_loader, optimizer, scheduler)

    logger.info("Saving model...")
    # Saving the model after training
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

    predictions_path = os.path.join(output_dir, "predictions.csv")
    logs_path = os.path.join(output_dir, "logs.txt")

    # Evaluating validation dataset
    logger.info("Initiating validation...")
    for epoch in range(model_params["VAL_EPOCHS"]):
        predictions, actuals = validate(tokenizer, model, device, val_loader)
        final_df = pd.DataFrame({"Generated Text": predictions, "Actual Text": actuals})
        # Each validation epoch overwrites the same predictions file
        final_df.to_csv(predictions_path)

    general_logger.save_text(logs_path)

    logger.success("Model validation completed!")
    logger.info(f"Fine-tuned model saved to: {output_dir}")
    logger.info(f"Validation data saved to: {predictions_path}")
    logger.info(f"Notebook logs saved to: {logs_path}")


# Hyperparameters and limits consumed by T5Trainer
model_params = {
    "MODEL": input_model,  # model_type: t5-x
    "TRAIN_BATCH_SIZE": 16,  # training batch size
    "VALID_BATCH_SIZE": 16,  # validation batch size
    "TRAIN_EPOCHS": 8,  # number of training epochs
    "VAL_EPOCHS": 1,  # number of validation epochs
    "LEARNING_RATE": 1e-4,  # learning rate
    "MAX_SOURCE_TEXT_LENGTH": max_input_token_length,  # max length of source text
    "MAX_TARGET_TEXT_LENGTH": max_output_token_length,  # max length of target text
    "SEED": 42,  # set seed for reproducibility
}

# Fine-tune on the loaded data set using the parameters above
T5Trainer(
    dataframe=data,
    source_text="input",
    target_text="output",
    model_params=model_params,
    output_dir=model_output_directory,
)

[32m2023-07-15 15:10:31.966[0m | [1mINFO    [0m | [36m__main__[0m:[36mT5Trainer[0m:[36m214[0m - [1mLoading t5-small...[0m
[32m2023-07-15 15:10:34.580[0m | [1mINFO    [0m | [36m__main__[0m:[36mT5Trainer[0m:[36m225[0m - [1mReading data...[0m
[32m2023-07-15 15:10:34.584[0m | [1mINFO    [0m | [36m__main__[0m:[36mT5Trainer[0m:[36m237[0m - [1mFULL Dataset: (400, 2)[0m
[32m2023-07-15 15:10:34.585[0m | [1mINFO    [0m | [36m__main__[0m:[36mT5Trainer[0m:[36m238[0m - [1mTRAIN Dataset: (320, 2)[0m
[32m2023-07-15 15:10:34.585[0m | [1mINFO    [0m | [36m__main__[0m:[36mT5Trainer[0m:[36m239[0m - [1mVALIDATION Dataset: (80, 2)[0m
[32m2023-07-15 15:10:34.587[0m | [1mINFO    [0m | [36m__main__[0m:[36mT5Trainer[0m:[36m288[0m - [1mInitiating fine tuning...[0m


TypeError: int() argument must be a string, a bytes-like object or a real number, not 'list'

## Step 3: Fine Tuned Model Manual Testing

In [None]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

from scripts.files import load_jsonl_data
from scripts.bullet_patterns import BULLET_PROMPT_PREFIX

input_model = input("What is the target model's directory path or checkpoint name on Hugging Face?")

# Load the T5 model
model = T5ForConditionalGeneration.from_pretrained(input_model)

# Load the tokenizer from the same checkpoint as the model so the vocabulary
# always matches; the tokenizer limit applies to the *input* text
tokenizer = T5Tokenizer.from_pretrained(
    input_model, model_max_length=max_input_token_length
)

# Load the data from the manual test file
data = load_jsonl_data("../data/training/manual_test_set.jsonl", BULLET_PROMPT_PREFIX, isDataFrame=False)
for line in data:
    # Preprocess input
    input_text = line["input"]
    expected_summary = line["output"]

    inputs = tokenizer.encode_plus(
        input_text,
        return_tensors="pt",
        truncation=True,
        # Same input limit as used during fine-tuning
        max_length=max_input_token_length,
    )

    # Generate summary
    summary_ids = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        # Adjust the max length according to your desired summary length
        max_length=max_output_token_length,
        # Adjust the number of beams for beam search
        num_beams=2,
        early_stopping=True,
    )

    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    # Print results
    print(f"> INPUT TEXT: {input_text}")
    print(f"\t> EXPECTED SUMMARY: {expected_summary}")
    print(f"\t> GENERATE SUMMARY: {summary}\n")