# Self-Written Fine Tuning Experiments


## Step 1: Fine Tune Checkpoint Model

### Option 1: PyTorch with CPU


In [None]:
import os
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
import os
from transformers import T5Tokenizer, T5ForConditionalGeneration
from rich.table import Column, Table
from rich import box
from rich.console import Console

model_checkpoint = input("Input the t5 model checkpoint name to be fine tuned")

# Read in the JSONL training and validation data set
data = pd.read_json("../data/training/training_validation_set.jsonl", lines=True)
# Prepend T5's summarize task keyword to inputs
data["input"] = "summarize: " + data["input"]
# Show random sample of 10 data sets
data.sample(10)

# Define a rich console logger
console = Console(record=True)


# Setup the data frame display
def display_df(df):
    console = Console()
    table = Table(
        Column("source_text", justify="center"),
        Column("target_text", justify="center"),
        title="Sample Data",
        pad_edge=False,
        box=box.ASCII,
    )

    for _, row in enumerate(df.values.tolist()):
        table.add_row(row[0], row[1])

    console.print(table)


training_logger = Table(
    Column("Epoch", justify="center"),
    Column("Steps", justify="center"),
    Column("Loss", justify="center"),
    title="Training Status",
    pad_edge=False,
    box=box.ASCII,
)

# Setting up the device for GPU usage, if available
from torch import cuda

device = "cuda" if cuda.is_available() else "cpu"


# Creating a custom dataset for reading the dataset and loading it into the dataloader
# to pass it to the neural network for fine tuning the model
class YourDataSetClass(Dataset):
    def __init__(
        self, dataframe, tokenizer, source_len, target_len, source_text, target_text
    ):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.source_len = source_len
        self.summ_len = target_len
        self.target_text = self.data[target_text]
        self.source_text = self.data[source_text]

    def __len__(self):
        return len(self.target_text)

    def __getitem__(self, index):
        source_text = str(self.source_text[index])
        target_text = str(self.target_text[index])

        # Cleaning data so as to ensure data is in string type
        source_text = " ".join(source_text.split())
        target_text = " ".join(target_text.split())

        source = self.tokenizer.batch_encode_plus(
            [source_text],
            max_length=self.source_len,
            pad_to_max_length=True,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )
        target = self.tokenizer.batch_encode_plus(
            [target_text],
            max_length=self.summ_len,
            pad_to_max_length=True,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )

        source_ids = source["input_ids"].squeeze()
        source_mask = source["attention_mask"].squeeze()
        target_ids = target["input_ids"].squeeze()

        return {
            "source_ids": source_ids.to(dtype=torch.long),
            "source_mask": source_mask.to(dtype=torch.long),
            "target_ids": target_ids.to(dtype=torch.long),
            "target_ids_y": target_ids.to(dtype=torch.long),
        }


# Function to be called for training with the parameters passed from main function
def train(epoch, tokenizer, model, device, loader, optimizer):
    model.train()
    for _, data in enumerate(loader, 0):
        y = data["target_ids"].to(device, dtype=torch.long)
        y_ids = y[:, :-1].contiguous()
        lm_labels = y[:, 1:].clone().detach()
        lm_labels[y[:, 1:] == tokenizer.pad_token_id] = -100
        ids = data["source_ids"].to(device, dtype=torch.long)
        mask = data["source_mask"].to(device, dtype=torch.long)

        outputs = model(
            input_ids=ids,
            attention_mask=mask,
            decoder_input_ids=y_ids,
            labels=lm_labels,
        )
        loss = outputs[0]

        if _ % 10 == 0:
            training_logger.add_row(str(epoch), str(_), str(loss))
            console.print(training_logger)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()


# Function to evaluate model for predictions
def validate(tokenizer, model, device, loader):
    model.eval()
    predictions = []
    actuals = []
    with torch.no_grad():
        for _, data in enumerate(loader, 0):
            y = data["target_ids"].to(device, dtype=torch.long)
            ids = data["source_ids"].to(device, dtype=torch.long)
            mask = data["source_mask"].to(device, dtype=torch.long)

            generated_ids = model.generate(
                input_ids=ids,
                attention_mask=mask,
                max_length=150,
                num_beams=2,
                repetition_penalty=2.5,
                length_penalty=1.0,
                early_stopping=True,
            )
            preds = [
                tokenizer.decode(
                    g, skip_special_tokens=True, clean_up_tokenization_spaces=True
                )
                for g in generated_ids
            ]
            targets = [
                tokenizer.decode(
                    t, skip_special_tokens=True, clean_up_tokenization_spaces=True
                )
                for t in y
            ]
            if _ % 10 == 0:
                console.print(f"Completed {_}")

            predictions.extend(preds)
            actuals.extend(targets)
    return predictions, actuals


# T5 training main function
def T5Trainer(dataframe, source_text, target_text, model_params, output_dir):
    # Set random seeds and deterministic pytorch for reproducibility
    torch.manual_seed(model_params["SEED"])
    np.random.seed(model_params["SEED"])
    torch.backends.cudnn.deterministic = True
    console.log(f"""[Model]: Loading {model_params["MODEL"]}...\n""")

    # Tokenzier for encoding the text
    tokenizer = T5Tokenizer.from_pretrained(model_params["MODEL"], model_max_length=512)

    # Defining the model. We are using t5-base model and added a Language model layer on top for generation of summary
    # Further this model is sent to device (GPU/TPU) for using the hardware
    model = T5ForConditionalGeneration.from_pretrained(model_params["MODEL"])
    model = model.to(device)
    console.log(f"[Data]: Reading data...\n")

    # Importing the raw dataset
    dataframe = dataframe[[source_text, target_text]]
    display_df(dataframe.head(2))

    # Creation of Dataset and Dataloader
    # 80% of the data will be used for training and the rest for validation
    train_size = 0.8
    train_dataset = dataframe.sample(frac=train_size, random_state=model_params["SEED"])
    val_dataset = dataframe.drop(train_dataset.index).reset_index(drop=True)
    train_dataset = train_dataset.reset_index(drop=True)

    console.print(f"FULL Dataset: {dataframe.shape}")
    console.print(f"TRAIN Dataset: {train_dataset.shape}")
    console.print(f"VALIDATION Dataset: {val_dataset.shape}\n")

    # Creating the Training and Validation dataset for further creation of data loader
    training_set = YourDataSetClass(
        train_dataset,
        tokenizer,
        model_params["MAX_SOURCE_TEXT_LENGTH"],
        model_params["MAX_TARGET_TEXT_LENGTH"],
        source_text,
        target_text,
    )
    val_set = YourDataSetClass(
        val_dataset,
        tokenizer,
        model_params["MAX_SOURCE_TEXT_LENGTH"],
        model_params["MAX_TARGET_TEXT_LENGTH"],
        source_text,
        target_text,
    )

    # Defining the parameters for creation of data loaders
    train_params = {
        "batch_size": model_params["TRAIN_BATCH_SIZE"],
        "shuffle": True,
        "num_workers": 0,
    }

    val_params = {
        "batch_size": model_params["VALID_BATCH_SIZE"],
        "shuffle": False,
        "num_workers": 0,
    }

    # Creation of data loaders for testing and validation - this will be used down for training and validation stage for the model
    training_loader = DataLoader(training_set, **train_params)
    val_loader = DataLoader(val_set, **val_params)

    # Defining the optimizer that will be used to tune the weights of the network in the training session
    optimizer = torch.optim.Adam(
        params=model.parameters(), lr=model_params["LEARNING_RATE"]
    )

    # Training loop
    console.log(f"[Initiating Fine Tuning]...\n")

    for epoch in range(model_params["TRAIN_EPOCHS"]):
        train(epoch, tokenizer, model, device, training_loader, optimizer)

    console.log(f"[Saving Model]...\n")
    # Saving the model after training
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

    # Evaluating validation dataset
    console.log(f"[Initiating Validation]...\n")
    for epoch in range(model_params["VAL_EPOCHS"]):
        predictions, actuals = validate(tokenizer, model, device, val_loader)
        final_df = pd.DataFrame({"Generated Text": predictions, "Actual Text": actuals})
        final_df.to_csv(os.path.join(output_dir, "predictions.csv"))

    console.save_text(os.path.join(output_dir, "logs.txt"))

    console.log(f"[Validation Completed.]\n")
    console.print(
        f"""[Model] Model saved @ {output_dir}\n"""
    )
    console.print(
        f"""[Validation] Generation on Validation data saved @ {os.path.join(output_dir,'predictions.csv')}\n"""
    )
    console.print(f"""[Logs] Logs saved @ {os.path.join(output_dir,'logs.txt')}\n""")


model_params = {
    # model_type: t5-x
    "MODEL": f"{model_checkpoint}",
    # training batch size
    "TRAIN_BATCH_SIZE": 8,
    # validation batch size
    "VALID_BATCH_SIZE": 60,
    # number of training epochs
    "TRAIN_EPOCHS": 1,
    # number of validation epochs
    "VAL_EPOCHS": 1,
    # learning rate
    "LEARNING_RATE": 1e-4,
    # max length of source text
    "MAX_SOURCE_TEXT_LENGTH": 512,
    # max length of target text
    "MAX_TARGET_TEXT_LENGTH": 50,
    # set seed for reproducibility
    "SEED": 42,
}
output_directory = f"../models/pytorch-cpu-{model_checkpoint}"

# Run training function on the T5 model using data set and training parameters
T5Trainer(
    dataframe=data,
    source_text="input",
    target_text="output",
    model_params=model_params,
    output_dir=f"{output_directory}",
)

### Option 2: TensorFlow Metal with GPU 


In [7]:
import tensorflow as tf
from transformers import T5Tokenizer, TFT5ForConditionalGeneration
import pandas as pd
from rich.console import Console
from rich.table import Table

console = Console()
model_checkpoint = input("Input the t5 model checkpoint name to be fine tuned")

# Load the tokenizer and the model
tokenizer = T5Tokenizer.from_pretrained(model_checkpoint)
model = TFT5ForConditionalGeneration.from_pretrained(model_checkpoint)

# Read in the JSONL training and validation data set
data = pd.read_json("../data/training/training_validation_set.jsonl", lines=True)
# Prepend T5's summarize task keyword to inputs
data["input"] = "summarize: " + data["input"]
# Show random sample of 10 data sets
data.sample(10)

# Define the table
table = Table(show_header=True, header_style="bold magenta")
table.add_column("Epoch")
table.add_column("Train Loss")
table.add_column("Val Loss")


# Load and encode the data
def encode(input_text, output_text):
    inputs = tokenizer.encode(
        input_text, return_tensors="tf", truncation=True, padding=True
    )
    targets = tokenizer.encode(
        output_text, return_tensors="tf", truncation=True, padding=True
    )
    return inputs, targets


def tf_encode(input_text, target_text):
    input_text = tf.compat.as_text(input_text.numpy())
    target_text = tf.compat.as_text(target_text.numpy())

    inputs = tokenizer.encode(
        input_text, return_tensors="tf", truncation=True, padding=True
    )
    targets = tokenizer.encode(
        target_text, return_tensors="tf", truncation=True, padding=True
    )
    return inputs, targets


# Convert DataFrame to TensorFlow Dataset
data_tf = tf.data.Dataset.from_tensor_slices(
    (data["input"].values, data["output"].values)
)

# Apply encoding to TensorFlow Dataset
data = data_tf.map(tf_encode)


# Split the data into training and validation
tf.random.set_seed(0)
data = data.shuffle(1000)
train_data = data.take(int(0.8 * len(data)))
val_data = data.skip(int(0.8 * len(data)))

# Define the loss function
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)


# Define the train step function
@tf.function
def train_step(inp, tar):
    with tf.GradientTape() as tape:
        predictions = model(
            inp,
            attention_mask=inp != 0,
            decoder_input_ids=tar[:, :-1],
            labels=tar[:, 1:],
        )[1]
        loss = loss_object(tar[:, 1:], predictions)
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    return loss


# Define the optimizer
optimizer = tf.keras.optimizers.Adam()

# Train and validate the model
EPOCHS = 5  # Set the number of epochs
BATCH_SIZE = 8  # Lower the batch size for small datasets

for epoch in range(EPOCHS):
    total_loss = 0
    total_val_loss = 0

    for batch, (inp, tar) in enumerate(train_data.batch(BATCH_SIZE)):
        batch_loss = train_step(inp, tar)
        total_loss += batch_loss

    for batch, (inp, tar) in enumerate(val_data.batch(BATCH_SIZE)):
        predictions = model(
            inp,
            attention_mask=inp != 0,
            decoder_input_ids=tar[:, :-1],
            labels=tar[:, 1:],
        )[1]
        val_loss = loss_object(tar[:, 1:], predictions)
        total_val_loss += val_loss

    table.add_row(
        str(epoch + 1),
        f"{total_loss / (batch + 1):.4f}",
        f"{total_val_loss / (batch + 1):.4f}",
    )

console.print(table)

# Save the model
model.save_pretrained(f"../models/tensorflow-gpu-{model_checkpoint}")

All model checkpoint layers were used when initializing TFT5ForConditionalGeneration.

All the layers of TFT5ForConditionalGeneration were initialized from the model checkpoint at t5-small.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.
2023-07-15 00:55:53.249913: W tensorflow/core/framework/op_kernel.cc:1816] INVALID_ARGUMENT: ValueError: Input b'summarize: As the ESD focal point, I played a crucial role in tracking, routing, and resolving 375 Tier III and 6 high-level tickets. My efforts enabled AFNET access for 200,000 users, ensuring smooth operations and connectivity.' is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers.
Traceback (most recent call last):

  File "/Users/justin/Documents/Code/personal/smarter-bullets/.venv/lib/python3.11/site-packages/tensorflow/python/ops/script_ops.py", line 266, in __call__
    return func

InvalidArgumentError: {{function_node __wrapped__IteratorGetNext_output_types_2_device_/job:localhost/replica:0/task:0/device:CPU:0}} ValueError: Input b'summarize: As the ESD focal point, I played a crucial role in tracking, routing, and resolving 375 Tier III and 6 high-level tickets. My efforts enabled AFNET access for 200,000 users, ensuring smooth operations and connectivity.' is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers.
Traceback (most recent call last):

  File "/Users/justin/Documents/Code/personal/smarter-bullets/.venv/lib/python3.11/site-packages/tensorflow/python/ops/script_ops.py", line 266, in __call__
    return func(device, token, args)
           ^^^^^^^^^^^^^^^^^^^^^^^^^

  File "/Users/justin/Documents/Code/personal/smarter-bullets/.venv/lib/python3.11/site-packages/tensorflow/python/ops/script_ops.py", line 144, in __call__
    outputs = self._call(device, args)
              ^^^^^^^^^^^^^^^^^^^^^^^^

  File "/Users/justin/Documents/Code/personal/smarter-bullets/.venv/lib/python3.11/site-packages/tensorflow/python/ops/script_ops.py", line 151, in _call
    ret = self._func(*args)
          ^^^^^^^^^^^^^^^^^

  File "/Users/justin/Documents/Code/personal/smarter-bullets/.venv/lib/python3.11/site-packages/tensorflow/python/autograph/impl/api.py", line 643, in wrapper
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^

  File "/var/folders/f1/_vqvbs517hn1khv6hl8jvy1w0000gn/T/ipykernel_65663/2820134923.py", line 30, in encode
    inputs = tokenizer.encode(
             ^^^^^^^^^^^^^^^^^

  File "/Users/justin/Documents/Code/personal/smarter-bullets/.venv/lib/python3.11/site-packages/transformers/tokenization_utils_base.py", line 2206, in encode
    encoded_inputs = self.encode_plus(
                     ^^^^^^^^^^^^^^^^^

  File "/Users/justin/Documents/Code/personal/smarter-bullets/.venv/lib/python3.11/site-packages/transformers/tokenization_utils_base.py", line 2536, in encode_plus
    return self._encode_plus(
           ^^^^^^^^^^^^^^^^^^

  File "/Users/justin/Documents/Code/personal/smarter-bullets/.venv/lib/python3.11/site-packages/transformers/tokenization_utils.py", line 647, in _encode_plus
    first_ids = get_input_ids(text)
                ^^^^^^^^^^^^^^^^^^^

  File "/Users/justin/Documents/Code/personal/smarter-bullets/.venv/lib/python3.11/site-packages/transformers/tokenization_utils.py", line 634, in get_input_ids
    raise ValueError(

ValueError: Input b'summarize: As the ESD focal point, I played a crucial role in tracking, routing, and resolving 375 Tier III and 6 high-level tickets. My efforts enabled AFNET access for 200,000 users, ensuring smooth operations and connectivity.' is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers.


	 [[{{node EagerPyFunc}}]] [Op:IteratorGetNext] name: 

## Step 2: Fine Tuned Model Manual Testing

In [None]:
# Load the T5 model
model = T5ForConditionalGeneration.from_pretrained(
    input("Input the directory path of the t5 model to be used")
)

# Load the tokenizer
tokenizer = T5Tokenizer.from_pretrained(input("Input the t5 model name being loaded"), model_max_length=512)

# Preprocess input
input_text = input("Insert a statement to summarize")
input_expected_summary = input("Insert the expected statement summary")
prefix = "summarize: "
input_text = prefix + input_text

inputs = tokenizer.encode_plus(
    input_text, return_tensors="pt", truncation=True, max_length=512
)

# Generate summary
summary_ids = model.generate(
    inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    # Adjust the max length according to your desired summary length
    max_length=150,
    # Adjust the number of beams for beam search
    num_beams=20,
    early_stopping=True,
)

summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

print("> Expected Summary: ", input_expected_summary)
print("> Generated Summary: ", summary)