# Setup Packages

In [None]:
# Import libraries

# Standard Python libraries
import pandas as pd
#import pyreadr
from datasets import load_dataset, load_from_disk, Dataset  # For loading datasets
import datetime
import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = "expandable_segments:True"
import json
import matplotlib.pyplot as plt

# Hugging Face Transformers
import transformers
from transformers import (
    AutoTokenizer,            # For tokenizing text
    AutoModelForCausalLM,     # For loading the GPT-2 model
    Trainer,                  # For training the model
    TrainingArguments,        # For specifying training arguments
    logging,                  # For logging
    BitsAndBytesConfig,
    HfArgumentParser,
    pipeline,
    DataCollatorWithPadding )

# PyTorch
import torch  # For tensor operations and GPU support

# For PEFT
from peft import prepare_model_for_kbit_training, LoraConfig, PeftModel, get_peft_model  # For LoRA configuration and model
from trl import SFTTrainer  # For supervised fine-tuning

In [None]:
# Select the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

print(f"Using device: {device}")

In [None]:
# Authenticate with the Hugging Face Hub using a token kept in Kaggle Secrets
from kaggle_secrets import UserSecretsClient  # Access to secrets attached to this notebook
from huggingface_hub import login

# Client used to read stored secrets (reused later for the W&B key)
user_secrets = UserSecretsClient()

## Hugging Face
# Read the access token from the secret store and log in with it
Hugging_Face_token = user_secrets.get_secret("Hugging_Face_token")
login(Hugging_Face_token)

# Load Tokenizer

In [None]:
# Checkpoint identifier shared by the tokenizer and the base model
BASE_MODEL_ID = "meta-llama/Llama-3.2-3B-Instruct"

# Build the tokenizer for the base checkpoint (also used to measure text length)
tokenizer = AutoTokenizer.from_pretrained(
    BASE_MODEL_ID,
    add_bos_token=False,
    trust_remote_code=True,
    use_fast=True,
    force_download=False,
)

# Reuse the EOS token for padding (alternative kept for reference: a dedicated '[PAD]' token)
#tokenizer.add_special_tokens({'pad_token': '[PAD]'})
tokenizer.pad_token = tokenizer.eos_token

# Display the tokenizer's configured maximum sequence length
tokenizer.model_max_length

In [None]:
# Display the tokenizer's special-token mapping (pad_token was set to the EOS token above)
tokenizer.special_tokens_map

# Load Preprocessed Data

In [None]:
# Date rendered in the same "%d %b %Y" format as the header line stripped from each prompt below
today_date = datetime.datetime.now().strftime("%d %b %Y")

# Load the preprocessed JSONL dataset: one JSON object per line.
# An explicit encoding avoids platform-dependent decoding; blank lines are skipped so a
# trailing newline does not crash json.loads.
with open("/kaggle/input/training-data/preprocessed_KitMalthouse.jsonl", "r", encoding="utf-8") as f:
    raw_data = [json.loads(line) for line in f if line.strip()]

# Only these roles are forwarded to the chat template; turns with any other role are dropped
ALLOWED_ROLES = ("system", "user", "assistant")

# Convert into a dataset format that follows the guide
formatted_data = []

for convo in raw_data:
    # Keep just the role/content pair of every recognised turn, in conversation order
    messages = [
        {"role": turn["role"], "content": turn["content"]}
        for turn in convo["conversation"]
        if turn["role"] in ALLOWED_ROLES
    ]

    # Apply chat template.
    # add_generation_prompt=False: these are complete training conversations, so no empty
    # assistant header should be appended after the final turn (that header is only meant
    # to cue the model at inference time).
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)

    # Strip the date header from the templated text so examples do not depend on the run date
    prompt = prompt.replace(f"Cutting Knowledge Date: December 2023\nToday Date: {today_date}\n\n","")

    formatted_data.append({"prompt": prompt})


# Convert to Hugging Face Dataset
dataset = Dataset.from_list(formatted_data)

# Tokenize Data

In [None]:
def tokenize_function(example):
    """
    Tokenize the 'prompt' field and attach loss labels with padding masked out.

    Works for a single example (``example['prompt']`` is a string) and for a batch
    as passed by ``Dataset.map(..., batched=True)`` (``example['prompt']`` is a list
    of strings, so ``input_ids`` is a list of token-id lists).

    Labels copy ``input_ids`` and are set to -100 wherever ``attention_mask`` is 0.
    The mask is used instead of comparing against ``tokenizer.pad_token_id`` because
    the pad token is the EOS token in this notebook: comparing ids would also hide
    genuine end-of-sequence tokens from the loss.

    Returns the tokenizer output with an added 'labels' entry.
    """
    tokens = tokenizer(example['prompt'],
                       add_special_tokens=False,
                       padding="max_length",
                       truncation=True,
                       max_length=2048)

    def _mask_padding(ids, mask):
        # Keep the token id where the position is attended, -100 (ignored by the loss) otherwise
        return [tok if keep else -100 for tok, keep in zip(ids, mask)]

    input_ids = tokens['input_ids']
    attention_mask = tokens['attention_mask']

    if input_ids and isinstance(input_ids[0], (list, tuple)):
        # Batched call: one list of ids per example
        tokens['labels'] = [_mask_padding(ids, mask) for ids, mask in zip(input_ids, attention_mask)]
    else:
        # Single example: a flat list of ids
        tokens['labels'] = _mask_padding(input_ids, attention_mask)

    return tokens

# Tokenize all prompts in batches, then drop the raw-text column that is no longer needed
tokenized_dataset_KitMalthouse = (
    dataset
    .map(tokenize_function, batched=True)
    .remove_columns(['prompt'])
)

In [None]:
# Plot Distribution of Token Lengths
# Sequences are padded to a fixed length during tokenization, so len(input_ids) is the same
# for every sample. Count the attended (non-padding) positions to get the real lengths.
token_lengths = [sum(sample["attention_mask"]) for sample in tokenized_dataset_KitMalthouse]

# Padded sequence length, read from the data so it stays in sync with the tokenization step
padded_length = len(tokenized_dataset_KitMalthouse[0]["input_ids"])

# Plot histogram
fig, ax = plt.subplots(figsize=(8, 5))
ax.hist(token_lengths, bins=50, color="blue", edgecolor="black", alpha=0.7)
ax.set_xlabel("Token Length")
ax.set_ylabel("Frequency")
ax.set_title("Distribution of Tokenized Sequence Lengths")
# Mark the truncation/padding limit; this also gives the legend a labelled entry to show
ax.axvline(x=padded_length, color="red", linestyle="dashed", label=f"Max Length ({padded_length})")
ax.legend()
plt.show()

print(f"Max Length: {max(token_lengths)} | Min Length: {min(token_lengths)} | Avg Length: {sum(token_lengths)/len(token_lengths)}")

# Model Setup

In [None]:
# Quantization settings used when loading the base model
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                      # Load weights in 4-bit to reduce memory and compute requirements
    bnb_4bit_quant_type="nf4",              # Use the nf4 quantization type
    bnb_4bit_use_double_quant=True,         # Double quantization to compress the weights further
    bnb_4bit_compute_dtype=torch.bfloat16,  # Run computations in 16-bit (bfloat16)
)

# Load the base checkpoint with the quantization settings above
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_ID,
    device_map="auto",  # Let the library place the model automatically (GPU if available)
    quantization_config=bnb_config,
)

In [None]:
# Prepare the quantized model for parameter-efficient fine-tuning (PEFT: adapters, LoRA and others)
model.gradient_checkpointing_enable()               # Gradient checkpointing: keep only a subset of activations to lower memory use
model = prepare_model_for_kbit_training(model)      # Prepare the quantized model for k-bit training; rebinds `model` to the returned object

## Inspect Model Architecture

The attention mechanism in this model is implemented with **modular projections** rather than a single **combined module** (such as `query_key_value`).
The model uses distinct linear layers for the query (`q_proj`), key (`k_proj`), and value (`v_proj`) projections.

In [None]:
# Inspect Model Architecture: print the module tree to read off layer names
# (for example the attention projection modules referenced by the LoRA config)
print(model)

# Define LoRA

In [None]:
# LoRA adapter settings
lora_config = LoraConfig(
    task_type="CAUSAL_LM",                # Model type: causal language model (next-token prediction)
    target_modules=["q_proj", "v_proj"],  # Modules to adapt with LoRA (depends on the model architecture)
    r=8,                                  # Rank of the low-rank matrices; lower rank -> less compute and memory
    lora_alpha=32,                        # Scaling factor
    lora_dropout=0.1,                     # Dropout used as regularization against overfitting
    bias="none",                          # Bias setting: "none"
)

# Wrap the model with the LoRA adapters defined above
model = get_peft_model(model, lora_config)

In [None]:
def print_trainable_parameters(model):
    """
    Print how many parameters of ``model`` require gradients.

    Walks ``model.named_parameters()``, sums the element counts of all parameters
    and of those with ``requires_grad`` set, and prints both totals together with
    the trainable share as a percentage. Returns nothing.
    """
    # (element count, is trainable) for every parameter tensor
    sizes = [(weights.numel(), weights.requires_grad) for _, weights in model.named_parameters()]

    all_param = sum(count for count, _ in sizes)
    trainable_params = sum(count for count, is_trainable in sizes if is_trainable)

    print(f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}")

# Report the trainable vs. total parameter counts of the LoRA-wrapped model
print_trainable_parameters(model)

# Define Training Parameters
Define training parameters, including batch size, learning rate, and the number of training epochs.

In [None]:
training_args = TrainingArguments(
    # Output and reporting
    output_dir="outputs",
    logging_steps=10,                       # How often (in steps) training metrics are logged
    eval_strategy="no",
    #report_to="none",                       # Disable WandB integration

    # Schedule
    num_train_epochs=3,                     # Training duration (use either 'max_steps' or 'num_train_epochs')
    warmup_steps=2,                         # Short warmup to help stabilize training
    per_device_train_batch_size=3,          # Batch size per device
    gradient_accumulation_steps=4,          # Accumulate gradients over several steps before each optimizer update

    # Optimizer
    optim="paged_adamw_8bit",
    learning_rate=2e-5,
    weight_decay=0.01,

    # Memory / precision
    fp16=True,                              # Enable mixed precision training
    gradient_checkpointing=True,            # Store only a subset of activations
)

In [None]:
# Initialize the Trainer
def mask_padding_collator(features):
    """
    Stack already-padded examples into a batch and build loss labels from the attention mask.

    DataCollatorForLanguageModeling(mlm=False) rebuilds labels from input_ids and sets every
    position equal to tokenizer.pad_token_id to -100. The pad token is the EOS token in this
    notebook, so that would also remove the genuine end-of-sequence tokens from the loss.
    Masking by attention_mask hides only real padding.

    Args:
        features: list of examples, each holding equal-length 'input_ids' and
            'attention_mask' lists (the tokenization step pads to a fixed length).

    Returns:
        dict with 'input_ids', 'attention_mask' and 'labels' LongTensors of shape (batch, seq_len).
    """
    input_ids = torch.tensor([feature["input_ids"] for feature in features], dtype=torch.long)
    attention_mask = torch.tensor([feature["attention_mask"] for feature in features], dtype=torch.long)

    labels = input_ids.clone()
    labels[attention_mask == 0] = -100  # -100 positions are ignored by the loss

    return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}


trainer = Trainer(
    model=model,
    args=training_args,                                  # input Training Arguments
    train_dataset=tokenized_dataset_KitMalthouse,        # input Tokenized Dataset
    data_collator=mask_padding_collator,                 # Batch examples; mask only padding in the labels
)

# Fine-Tune the Model

In [None]:
# Log in to W&B (Weights & Biases) so the training run can be tracked
import wandb

# Read the W&B API key from Kaggle Secrets via the client created earlier in the notebook
wandb_api_key = user_secrets.get_secret("wand_API_Key")

# Authenticate this session with the retrieved key
wandb.login(key=wandb_api_key)

In [None]:
# Train the model
model.config.use_cache = False        # disable caching for the training run
trainer.train()                       # run fine-tuning with the Trainer configured above

In [None]:
# Save the Fine-Tuned Model
# Use the absolute Kaggle output directory, matching the absolute "/kaggle/input/..." path used
# for loading. The previous relative "./kaggle/working/..." resolved against the current working
# directory and produced a nested "kaggle/working" folder.
model.save_pretrained("/kaggle/working/fine-tuned-llama_KitMalthouse")
# Store the tokenizer next to the model so both can be loaded from the same directory
tokenizer.save_pretrained("/kaggle/working/fine-tuned-llama_KitMalthouse")

In [None]:
# Finish the W&B run now that training and saving are done
wandb.finish()