# The Forge Model Fine Tuning (v1)


## Step 1: Instantiate Global Parameters

In [None]:
# Upper bound on the number of tokens a user may enter for summarization.
# Increasing this beyond 512 may increase compute time significantly.
max_input_token_length = 512

# Upper bound on the number of tokens the model should output for the summary;
# approximately the number of tokens it may take to generate a bullet.
max_output_token_length = 128

# Number of beams used by the beam search algorithm.
# More beams means increased quality, but also increased compute time.
number_of_beams = 2

The block below allows the user to pull a pre-trained and/or raw model checkpoint into this repository's local _forge/models/_ directory. This step is optional, but it allows the user input in [Step 2](#step-2-fine-tune-t5-checkpoint-model) to be a model in your local directory, thus enabling offline usage and later fine tuning. E.g., if you first download google/flan-t5-xl to the local directory with the name 'my-test-model', you can input '../models/my-test-model' as the model to fine-tune. The script below works on any model and tokenizer, but the fine tuning script in [Step 2](#step-2-fine-tune-t5-checkpoint-model) depends on the usage of a T5 variant.

In [None]:
from transformers import AutoModel, AutoModelForSeq2SeqLM, AutoTokenizer

# Hugging Face checkpoint name to download (e.g. google/flan-t5-xl)
model_name = input(
    "What is the target model's checkpoint name on Hugging Face?"
)

# Local directory the checkpoint is written to
directory_path = f"../models/{model_name}"

# Download and load the pre-trained tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

# AutoModel loads only the bare backbone, without the language-modelling head.
# Step 2 reloads the saved copy with T5ForConditionalGeneration, which needs
# that head, so load the sequence-to-sequence class first. Checkpoints that
# are not sequence-to-sequence models fall back to the generic AutoModel so
# the cell keeps working for any architecture.
try:
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
except ValueError:
    model = AutoModel.from_pretrained(model_name)

# Save the model and tokenizer to the specified directory
tokenizer.save_pretrained(directory_path)
model.save_pretrained(directory_path)

## Step 2: Fine Tune T5 Checkpoint Model

### T5 Variants

In [None]:
# Fine tuning scripts
import signal
import re
import traceback
from loguru import logger
import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset
from torch.cuda.amp import autocast, GradScaler
from rouge_score import rouge_scorer
from transformers import (
    T5Tokenizer,
    T5ForConditionalGeneration,
    get_linear_schedule_with_warmup,
)

from scripts.file_utils import load_jsonl_data
from scripts.constants import *
from scripts.rich_logger import training_table as table, live_refresher as refresher


# Ask which checkpoint to fine-tune: a Hugging Face checkpoint name or a
# directory path relative to this notebook
input_model = input(
    "What is the target model's relative directory path or checkpoint name on Hugging Face?"
)

# Settings read by the fine tuning code below
model_params = dict(
    # Pre-trained model or checkpoint that will be fine-tuned
    MODEL=input_model,
    # Tokenizer checkpoint used to encode the text
    TOKENIZER="google/flan-t5-base",
    # Examples per training batch
    # Bigger batches need more memory, but can speed up training
    TRAIN_BATCH_SIZE=1,
    # Full passes over the training dataset
    # More passes can improve performance, but risk over-fitting
    TRAIN_EPOCHS=4,
    # Examples per validation batch
    # Bigger batches need more memory, but can speed up validation
    VALID_BATCH_SIZE=1,
    # Full passes over the validation dataset
    # One pass is typical: the validation set does not need to be repeated
    VAL_EPOCHS=1,
    # Controls how quickly or slowly the model learns
    # Too high can cause instability, too low can cause slow learning
    LEARNING_RATE=1e-7,
    # Token limits taken from the global parameters in Step 1
    MAX_SOURCE_TEXT_LENGTH=max_input_token_length,
    MAX_TARGET_TEXT_LENGTH=max_output_token_length,
    # Random seed for reproducibility
    # The same seed yields the same model given the same data and process
    SEED=166,
    NUM_BEAMS=number_of_beams,
    # Multiplier that penalizes repeated n-grams
    # Higher values discourage repetition in the generated text
    # NOTE(review): generate() documents 1.0 as "no penalty", so 0.5 would
    # presumably favour repetition - confirm this value is intended
    REPETITION_PENALTY=0.5,
    # Penalty applied for producing long sequences
    # Higher values encourage longer sequences
    LENGTH_PENALTY=0.5,
    # Steps to take before the gradient is averaged and applied
    # Helps stabilize training and requires less memory
    GRADIENT_ACCUMULATION_STEPS=1,
    # Weight decay given to the optimizer to prevent over-fitting
    # Regularizes by adding a small penalty, typically the L2 norm of the weights
    WEIGHT_DECAY=0.0,
    # Small constant that prevents division by zero in the implementation (Adam)
    ADAM_EPSILON=1e-8,
    # Steps spent in the warmup phase
    # Helps avoid very high, undesirable gradient values at the start of training
    WARMUP_STEPS=3,
    # Fraction of the data used for training; the rest is used for validation
    TRAINING_VALIDATION_SPLIT=0.85,
)

# Directory under ../models/ that receives the fine-tuned model
model_output_directory = "../models/{}".format(
    input("What name would you like to give the fine-tuned model?")
)

prompt_prefix_option = input(
    "Type the number to choose a prompt prefix type: (1) Bullet Prompt Training or (2) Data Creation Training"
)

# Option "2" selects data creation training; any other answer selects
# bullet prompt training. The prefix and the data file are chosen together.
if prompt_prefix_option == "2":
    prompt_prefix = bullet_data_creation_prefix
    data_set = "../data/training/data_creation_set.jsonl"
else:
    prompt_prefix = bullet_prompt_prefix
    data_set = "../data/training/training_validation_set.jsonl"

# Load the selected JSONL file with the chosen prefix, requesting a data frame
data = load_jsonl_data(data_set, prompt_prefix, isDataFrame=True)

# Set device to be used based on GPU availability
# Use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

logger.info(f"Loading {model_params['MODEL']}...")
# Load the checkpoint and send it to the selected device
model = T5ForConditionalGeneration.from_pretrained(model_params["MODEL"]).to(device)
# Tokenizer for encoding the text
tokenizer = T5Tokenizer.from_pretrained(
    model_params["TOKENIZER"],
    model_max_length=model_params["MAX_SOURCE_TEXT_LENGTH"],
)


# Creating a custom dataset for reading the dataset and loading it into the dataloader
# to pass it to the neural network for fine tuning the model
class CustomDataset(Dataset):
    """Dataset that tokenizes one (source, target) text pair per item.

    Args:
        dataframe: Container indexable by column name whose columns are
            indexable by integer position (the trainer passes a pandas
            data frame whose index was reset).
        tokenizer: Tokenizer exposing ``batch_encode_plus``.
        source_len: Length every source sequence is padded/truncated to.
        target_len: Length every target sequence is padded/truncated to.
        source_text: Name of the column holding the source text.
        target_text: Name of the column holding the target text.
    """

    def __init__(
        self, dataframe, tokenizer, source_len, target_len, source_text, target_text
    ):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.source_len = source_len
        self.summ_len = target_len
        self.target_text = self.data[target_text]
        self.source_text = self.data[source_text]

    def __len__(self):
        """Return the number of text pairs."""
        return len(self.target_text)

    def _encode(self, text, max_length):
        """Normalize whitespace in *text* and tokenize it to *max_length* ids."""
        # str() guards against non-string cells; split/join collapses any run
        # of whitespace (including newlines) into a single space
        cleaned = " ".join(str(text).split())
        # padding="max_length" already pads to max_length, so the deprecated
        # and redundant pad_to_max_length flag is not passed
        return self.tokenizer.batch_encode_plus(
            [cleaned],
            max_length=max_length,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )

    def __getitem__(self, index):
        """Return the tokenized pair at *index* as a dict of long tensors.

        Keys: ``source_ids``, ``source_mask``, ``target_ids`` and
        ``target_ids_y`` (the same ids as ``target_ids``).
        """
        source = self._encode(self.source_text[index], self.source_len)
        target = self._encode(self.target_text[index], self.summ_len)

        # squeeze(0) removes only the leading batch axis of size one; a bare
        # squeeze() would also collapse the sequence axis when its length is 1
        source_ids = source["input_ids"].squeeze(0).to(dtype=torch.long)
        source_mask = source["attention_mask"].squeeze(0).to(dtype=torch.long)
        target_ids = target["input_ids"].squeeze(0).to(dtype=torch.long)

        return {
            "source_ids": source_ids,
            "source_mask": source_mask,
            "target_ids": target_ids,
            "target_ids_y": target_ids,
        }


# Generates a penalty for not complying to bullet formatting
def format_penalty(outputs, tokenizer, format_pattern):
    """Count the sequences whose greedy decoding does not match the format.

    Args:
        outputs: Model output exposing ``logits``
            (assumed (batch, sequence, vocabulary) - TODO confirm).
        tokenizer: Object whose ``decode`` turns a row of token ids into text.
        format_pattern: Regular expression the whole decoded text must match.

    Returns:
        A scalar tensor on the logits' device holding 1.0 per sequence that
        fails to match. It is built from a plain Python float, so it carries
        no gradient.
    """
    logits = outputs.logits
    # Most likely token id at every position
    predicted_ids = logits.argmax(dim=-1)

    # One penalty point per row whose decoded text is not an exact match;
    # the explicit 0.0 start keeps the total a float even for an empty batch
    mismatches = sum(
        (
            1.0
            for sequence in predicted_ids
            if re.fullmatch(
                format_pattern,
                tokenizer.decode(sequence, skip_special_tokens=True),
            )
            is None
        ),
        0.0,
    )

    return torch.tensor(mismatches, device=logits.device)


# Function to be called for training with the parameters passed from main function
def train(epoch, tokenizer, model, device, loader, optimizer, scheduler):
    """Run one epoch of mixed-precision fine tuning over *loader*.

    Args:
        epoch: Epoch index, forwarded to the training table when it refreshes.
        tokenizer: Tokenizer providing ``pad_token_id``; also used by the
            format penalty.
        model: Model being trained.
        device: Device the batch tensors are moved to.
        loader: Iterable of batches holding ``source_ids``, ``source_mask``
            and ``target_ids``.
        optimizer: Optimizer updated every GRADIENT_ACCUMULATION_STEPS batches.
        scheduler: Learning-rate scheduler, advanced once per optimizer update.
    """
    accumulation_steps = model_params["GRADIENT_ACCUMULATION_STEPS"]
    # Create a GradScaler object for mixed precision training
    scaler = GradScaler()
    # Training logger refresh flag
    table.switch_epoch_refresh()

    # from_pretrained() hands the model back in evaluation mode (dropout off),
    # so switch to training mode for the epoch and restore the previous mode
    # afterwards so later saving/validation sees the model as it was
    was_training = model.training
    model.train()
    try:
        # Stepping through training batches
        for step, data in enumerate(loader):
            y = data["target_ids"].to(device, dtype=torch.long)
            y_ids = y[:, :-1].contiguous()
            lm_labels = y[:, 1:].clone().detach()
            # -100 marks padding positions so the loss ignores them
            lm_labels[y[:, 1:] == tokenizer.pad_token_id] = -100
            ids = data["source_ids"].to(device, dtype=torch.long)
            mask = data["source_mask"].to(device, dtype=torch.long)

            # Only the forward pass and loss computation run under autocast
            with autocast():
                outputs = model(
                    input_ids=ids,
                    attention_mask=mask,
                    decoder_input_ids=y_ids,
                    labels=lm_labels,
                )
                loss = outputs[0]

                # Add a penalty to the loss for outputs that don't match the format
                format_loss = format_penalty(outputs, tokenizer, bullet_pattern)
                total_loss = loss + format_loss

            if table.get_epoch_refresh():
                # Refresh table once per epoch
                table.refresh_table(epoch, loss)

            # Backward pass with mixed precision; dividing by the number of
            # accumulated batches makes the applied gradient their average
            scaler.scale(total_loss / accumulation_steps).backward()

            # Apply the accumulated gradients once enough batches were seen
            if (step + 1) % accumulation_steps == 0:
                # Unscale the gradients so clipping sees their true magnitude
                scaler.unscale_(optimizer)
                # Clip the gradients to prevent "exploding gradients" problem
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                # Update the weights (skipped by the scaler on inf/NaN gradients)
                scaler.step(optimizer)
                # Update the scaler's scale factor for the next iteration
                scaler.update()
                # Clear gradients only after they were applied; clearing on
                # every batch would discard the accumulated gradients
                optimizer.zero_grad()
                # Advance the learning rate once per optimizer update, which
                # is how the scheduler's total step count is computed
                scheduler.step()
    finally:
        model.train(was_training)


# Function to evaluate model for predictions and compute ROUGE scores
def validate(tokenizer, model, device, loader):
    """Generate predictions for *loader* and log their average ROUGE scores.

    Args:
        tokenizer: Tokenizer used to decode generated and target token ids.
        model: Model whose ``generate`` method produces the predictions.
        device: Device the batch tensors are moved to.
        loader: Iterable of batches holding ``source_ids``, ``source_mask``
            and ``target_ids``.

    Returns:
        A ``(predictions, actuals)`` tuple of decoded text lists.
    """
    predictions = []
    actuals = []

    # Initialize the rouge scorer
    rouge_types = ("rouge1", "rouge2", "rougeL")
    scorer = rouge_scorer.RougeScorer(list(rouge_types), use_stemmer=True)
    scores = []

    # Generate in evaluation mode so dropout cannot perturb the predictions;
    # the previous mode is restored afterwards
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for data in loader:
                y = data["target_ids"].to(device, dtype=torch.long)
                ids = data["source_ids"].to(device, dtype=torch.long)
                mask = data["source_mask"].to(device, dtype=torch.long)

                generated_ids = model.generate(
                    input_ids=ids,
                    attention_mask=mask,
                    # Generated summaries are bounded by the target length,
                    # not by the (longer) source length
                    max_length=model_params["MAX_TARGET_TEXT_LENGTH"],
                    num_beams=model_params["NUM_BEAMS"],
                    repetition_penalty=model_params["REPETITION_PENALTY"],
                    length_penalty=model_params["LENGTH_PENALTY"],
                    early_stopping=True,
                )
                preds = [
                    tokenizer.decode(
                        g, skip_special_tokens=True, clean_up_tokenization_spaces=True
                    )
                    for g in generated_ids
                ]
                targets = [
                    tokenizer.decode(
                        t, skip_special_tokens=True, clean_up_tokenization_spaces=True
                    )
                    for t in y
                ]

                # Calculate rouge scores for each prediction and corresponding target
                scores.extend(
                    scorer.score(target, pred) for pred, target in zip(preds, targets)
                )

                predictions.extend(preds)
                actuals.extend(targets)
    finally:
        model.train(was_training)

    # Averaging an empty list would only yield NaN values and a warning
    if not scores:
        logger.warning("Validation set is empty; skipping ROUGE score computation.")
        return predictions, actuals

    # Compute the average ROUGE scores for the entire validation set
    avg_scores = {
        name: np.mean([score[name].fmeasure for score in scores])
        for name in rouge_types
    }

    logger.info(f"Average ROUGE scores: {avg_scores}")

    return predictions, actuals


# T5 training main function
def T5Trainer(dataframe, source_text, target_text):
    """Fine-tune the loaded model on *dataframe*, save it, then validate it.

    Uses the module-level ``model``, ``tokenizer``, ``device`` and
    ``model_params``.

    Args:
        dataframe: Data frame holding the training examples.
        source_text: Name of the column holding the source text.
        target_text: Name of the column holding the target text.
    """
    seed = model_params["SEED"]

    # Set random seeds and deterministic pytorch for reproducibility
    torch.manual_seed(seed)
    np.random.seed(seed)
    torch.backends.cudnn.deterministic = True

    logger.info("Reading data...")
    # Keep only the two columns that are used
    dataframe = dataframe[[source_text, target_text]]

    # Creation of Dataset and Dataloader
    # TRAINING_VALIDATION_SPLIT is the fraction of rows sampled for training;
    # the remaining rows are used for validation
    train_dataset = dataframe.sample(
        frac=model_params["TRAINING_VALIDATION_SPLIT"], random_state=seed
    )
    val_dataset = dataframe.drop(train_dataset.index).reset_index(drop=True)
    train_dataset = train_dataset.reset_index(drop=True)

    logger.info(f"FULL Dataset: {dataframe.shape}")
    logger.info(f"TRAIN Dataset: {train_dataset.shape}")
    logger.info(f"VALIDATION Dataset: {val_dataset.shape}")

    # Creating the Training and Validation dataset for further creation of data loader
    def build_dataset(frame):
        return CustomDataset(
            frame,
            tokenizer,
            model_params["MAX_SOURCE_TEXT_LENGTH"],
            model_params["MAX_TARGET_TEXT_LENGTH"],
            source_text,
            target_text,
        )

    training_set = build_dataset(train_dataset)
    val_set = build_dataset(val_dataset)

    # Data loaders for the training and validation stages; only the training
    # data is shuffled
    training_loader = DataLoader(
        training_set,
        batch_size=model_params["TRAIN_BATCH_SIZE"],
        shuffle=True,
        num_workers=0,
    )
    val_loader = DataLoader(
        val_set,
        batch_size=model_params["VALID_BATCH_SIZE"],
        shuffle=False,
        num_workers=0,
    )

    # Defining the optimizer that will be used to tune the weights of the network in the training session
    optimizer = torch.optim.AdamW(
        params=[p for p in model.parameters() if p.requires_grad],
        lr=model_params["LEARNING_RATE"],
        eps=model_params["ADAM_EPSILON"],
        weight_decay=model_params["WEIGHT_DECAY"],
    )

    # Define the learning rate scheduler; the total step count is the number
    # of batches over all epochs divided by the accumulation steps
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=model_params["WARMUP_STEPS"],
        num_training_steps=model_params["TRAIN_EPOCHS"]
        * len(training_loader)
        // model_params["GRADIENT_ACCUMULATION_STEPS"],
    )

    # Training loop
    logger.info(f"Initiating fine tuning of {model_params['MODEL']}...")
    # Table logger for training statistics
    with refresher:
        for epoch in range(model_params["TRAIN_EPOCHS"]):
            train(
                epoch, tokenizer, model, device, training_loader, optimizer, scheduler
            )
    logger.info(f"Saving fine-tuned model to {model_output_directory} ...")
    # Saving the model after training
    save_model()

    # Evaluating validation dataset
    logger.info("Initiating validation...")
    for _ in range(model_params["VAL_EPOCHS"]):
        validate(tokenizer, model, device, val_loader)
    logger.success("Model fine tuning, saving, and validation steps completed!")


# Saves the model
def save_model():
    """Write the module-level model and tokenizer to the output directory."""
    destination = model_output_directory
    # Model first, then tokenizer, both into the same directory
    model.save_pretrained(destination)
    tokenizer.save_pretrained(destination)
    logger.info(f"Fine-tuned model successfully saved to: {model_output_directory}")
    logger.success("Model saved. Shutting down...")


# In case of interrupt, save model and exit
def save_and_exit(signal, _):
    """Signal handler: save the model, then stop the running script.

    Args:
        signal: Number of the received signal (shadows the ``signal`` module
            inside this function only).
        _: Current stack frame supplied by the signal machinery; unused.

    Raises:
        KeyboardInterrupt: Always, after saving, so the interrupted code
            actually stops instead of resuming where it left off.
    """
    logger.warning(
        f"Received interrupt signal {signal}, stopping script and saving model..."
    )
    save_model()
    # Returning normally would resume the interrupted code; raising here
    # propagates into the main thread and ends the run
    raise KeyboardInterrupt


# Attach the SIGINT signal (generated by Ctrl+C) to the handler
# Attach the SIGINT signal (generated by Ctrl+C) to the handler and remember
# the handler that was installed before
previous_sigint_handler = signal.signal(signal.SIGINT, save_and_exit)

try:
    # Run training function on the T5 model using data set and training parameters
    T5Trainer(dataframe=data, source_text="input", target_text="output")
except Exception:
    # Handle other unexpected errors
    logger.error("An unexpected error occurred during fine-tuning:")
    # format_exc() renders the traceback of the exception being handled;
    # extract_stack() would only describe the current call stack
    logger.error(traceback.format_exc())
    # Save the model and any relevant data before exiting gracefully
    save_model()
finally:
    # Restore the earlier handler so later interrupts no longer trigger a save;
    # signal.signal() returns None when that handler was not set from Python
    if previous_sigint_handler is not None:
        signal.signal(signal.SIGINT, previous_sigint_handler)

## Step 3: Instantiate Target Model for Manual Testing

In [None]:
# Manual testing target model

# Name of the locally saved model to load for manual testing
model_name = input("What model from the local models directory would you like to use?")

# Directory of that model under ../models/, with a trailing slash
input_model_directory_path = "../models/{}/".format(model_name)

## Step 4: Fine Tuned Model Manual Testing

### T5 Variants

In [None]:
# Manual test scripts
from transformers import T5ForConditionalGeneration, T5Tokenizer

from scripts.file_utils import load_jsonl_data
from scripts.constants import bullet_prompt_prefix

# Model generation parameter control object
model_params = {
    # Directory of the fine-tuned model that will be tested
    "MODEL": f"{input_model_directory_path}",
    # Maximum number of tokens from source text that model accepts
    "MAX_SOURCE_TEXT_LENGTH": max_input_token_length,
    # Maximum number of tokens from target text that model generates
    "MAX_TARGET_TEXT_LENGTH": max_output_token_length,
    # Number of alternative sequences generated at each step
    # More beams improve results, but increase computation
    "NUM_BEAMS": number_of_beams,
    # Scales logits before soft-max to control randomness
    # Lower values (~0) make output more deterministic
    "TEMPERATURE": 0.1,
    # Limits generated tokens to top K probabilities
    # Reduces chances of rare word predictions
    "TOP_K": 50,
    # Applies nucleus sampling, limiting token selection to a cumulative probability
    # Creates a balance between randomness and determinism
    "TOP_P": 0.90,
}


# Load the T5 model
model = T5ForConditionalGeneration.from_pretrained(model_params["MODEL"])

# Load the tokenizer
tokenizer = T5Tokenizer.from_pretrained(
    model_params["MODEL"], model_max_length=model_params["MAX_SOURCE_TEXT_LENGTH"]
)

# Load the data from the manual test file
data = load_jsonl_data(
    "../data/training/manual_test_set.jsonl", bullet_prompt_prefix, isDataFrame=False
)
for line in data:
    # Preprocess input
    input_text = line["input"]
    expected_summary = line["output"]

    # Truncate to the configured source length instead of a hard-coded 512 so
    # a change to the global parameter also applies here
    inputs = tokenizer.encode_plus(
        input_text,
        return_tensors="pt",
        truncation=True,
        max_length=model_params["MAX_SOURCE_TEXT_LENGTH"],
    )

    # Generate summary
    # NOTE(review): temperature/top_k/top_p are sampling settings and
    # do_sample is not passed here, so they presumably have no effect on this
    # beam search - confirm whether sampling was intended
    summary_ids = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=model_params["MAX_TARGET_TEXT_LENGTH"],
        num_beams=model_params["NUM_BEAMS"],
        temperature=model_params["TEMPERATURE"],
        top_k=model_params["TOP_K"],
        top_p=model_params["TOP_P"],
        early_stopping=True,
    )

    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    # Print results
    print(f"> INPUT TEXT: {input_text}")
    print(f"\t> EXPECTED SUMMARY: {expected_summary}")
    print(f"\t> GENERATED SUMMARY: {summary}\n")