# Setup Packages

In [None]:
# Import libraries

# Standard Python libraries
import pandas as pd
import pyreadr
from datasets import load_dataset, Dataset  # For loading datasets
import os
import json

# Hugging Face Transformers
import transformers
from transformers import (
    AutoTokenizer,            # For tokenizing text
    AutoModelForCausalLM,     # For loading the causal language model (a Llama 3.2 checkpoint in this notebook)
    Trainer,                  # For training the model
    TrainingArguments,        # For specifying training arguments
    logging,                  # For logging
    BitsAndBytesConfig,       # For the 4-bit quantization settings
    HfArgumentParser,
    pipeline,                 # For text-generation inference
    DataCollatorWithPadding )

# PyTorch
import torch  # For tensor operations and GPU support


# For PEFT
from peft import prepare_model_for_kbit_training, LoraConfig, PeftModel, get_peft_model  # For LoRA configuration and model
from trl import SFTTrainer  # For supervised fine-tuning

In [None]:
# Pick the compute device: the GPU when CUDA is usable, otherwise the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

In [None]:
# Authenticate with the Hugging Face Hub

from kaggle_secrets import UserSecretsClient  # Kaggle secrets store (API logins)
from huggingface_hub import login

# Read the access token from the secrets attached to this Kaggle notebook
user_secrets = UserSecretsClient()
Hugging_Face_token = user_secrets.get_secret("Hugging_Face_token")

# Log in with that token
login(token=Hugging_Face_token)

# Load Data

In [None]:
# Load Data

# Read the pre-processed JSON Lines file: one JSON object per non-empty line.
# - The encoding is pinned to UTF-8 so decoding does not depend on the platform default.
# - Blank lines (e.g. an extra empty line at the end of the file) are skipped,
#   because json.loads() raises JSONDecodeError on an empty string.
with open('/kaggle/input/preprocess-data-ipynb/HoC_boris_johnson.jsonl', encoding='utf-8') as f:
    HoC_json_boris_johnson = [json.loads(line) for line in f if line.strip()]

In [None]:
# Convert to Hugging Face Dataset
# Dataset.from_list receives the list of records parsed from the JSONL file.
# NOTE(review): despite the `df_` prefix, this is a `datasets.Dataset`, not a pandas DataFrame.
df_Boris_Johnson = Dataset.from_list(HoC_json_boris_johnson)

In [None]:
# Bare expression: the Dataset's repr is rendered as the cell output
df_Boris_Johnson

# Tokenize Data

Different models may require different preprocessing steps depending on their *architecture*, *tokenizer type*, and *task*, so the tokenizer is loaded from the same checkpoint as the model.

In [None]:
# Load the tokenizer from the same checkpoint name that the model is loaded from
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B-Instruct")    # Define the Tokenizer
# Reuse the end-of-sequence token as the padding token.
# NOTE(review): padding and EOS then share one token; confirm that the data collator's
# handling of padding in the labels does not also hide genuine EOS tokens from the loss.
tokenizer.pad_token = tokenizer.eos_token                               # Set the padding token to the end-of-sequence token

In [None]:
# Tokenize dataset
def preprocess(examples):
    """Tokenize a batch of records for causal language modelling.

    Reads the "input" column of the batch, truncates or pads every example
    to exactly 512 tokens, and returns the token ids, the attention mask and
    labels that are the same token ids.
    """
    encoded = tokenizer(
        examples["input"],
        max_length=512,
        truncation=True,
        padding="max_length",
    )
    token_ids = encoded["input_ids"]
    return {
        "input_ids": token_ids,
        "attention_mask": encoded["attention_mask"],
        "labels": token_ids,
    }

# Apply the tokenization to the whole dataset, one batch of records at a time
tokenized_df_Boris_Johnson = df_Boris_Johnson.map(preprocess, batched=True)

In [None]:
# Preview tokenized dataset (the bare expression is rendered as the cell output)
tokenized_df_Boris_Johnson

# Model Setup

In [None]:
# Quantization settings: load the model with a reduced memory footprint
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                      # Load the weights in 4-bit to reduce memory requirements
    bnb_4bit_quant_type="nf4",              # Quantization type = nf4
    bnb_4bit_use_double_quant=True,         # Double quantization, to compress the weights further
    bnb_4bit_compute_dtype=torch.bfloat16,  # Run the computations in bfloat16
)

# Load the base model with the quantization settings above
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.2-3B-Instruct",
    device_map="auto",                      # Automatic device placement (GPU if available)
    quantization_config=bnb_config,
)

In [None]:
# Prepare the quantized model for parameter-efficient fine-tuning (PEFT: adapters, LoRA and others)
model.gradient_checkpointing_enable()               # Reduce memory usage: recompute activations in the backward pass instead of storing them all
model = prepare_model_for_kbit_training(model)      # PEFT helper that prepares the k-bit quantized model for training

## Inspect Model Architecture

The attention mechanism in this model is implemented with **separate projection layers** rather than a single **combined module** (such as `query_key_value`).
The model uses distinct linear layers for the query (`q_proj`), key (`k_proj`), and value (`v_proj`) projections, so the LoRA `target_modules` must refer to these layer names.

In [None]:
# Inspect Model Architecture
# The printed module tree shows the layer names (e.g. q_proj / v_proj)
# that the LoRA `target_modules` setting has to match.
print(model)

# Define LoRA

In [None]:
# LoRA hyperparameters
lora_config = LoraConfig(
    task_type="CAUSAL_LM",                # Causal language modelling: 'predicting the next word'
    target_modules=["q_proj", "v_proj"],  # Modules that get LoRA adapters (*depends on the model architecture)
    r=8,                                  # Rank of the low-rank matrices; lower rank -> less compute & memory
    lora_alpha=32,                        # Scaling factor
    lora_dropout=0.1,                     # Dropout, a regularization technique against overfitting
    bias="none",                          # No bias terms are adapted
)

# Wrap the model with the LoRA adapters
model = get_peft_model(model, lora_config)

In [None]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.

    Args:
        model: Any object exposing ``named_parameters()`` (for example a
            ``torch.nn.Module`` or a PEFT-wrapped model).

    Prints a single line with the trainable parameter count, the total
    parameter count and the trainable percentage. Returns ``None``.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        num_params = param.numel()
        # bitsandbytes 4-bit weights (``Params4bit``) pack two 4-bit values into
        # each stored byte, so ``numel()`` under-reports them. Scale back to the
        # logical count the same way PEFT's ``get_nb_trainable_parameters`` does.
        if type(param).__name__ == "Params4bit":
            num_params = num_params * 2 * param.element_size()
        all_param += num_params
        if param.requires_grad:
            trainable_params += num_params
    # A model without any parameters would otherwise raise ZeroDivisionError.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {trainable_pct}")

# Report the trainable vs. total parameter counts of the LoRA-wrapped model
print_trainable_parameters(model)

# Define Training Parameters
Define training parameters, including batch size, learning rate, and the number of training epochs.

In [None]:
# Set up Hyperparameters
training_args = TrainingArguments(
    output_dir="outputs",
    # report_to="none" would disable the WandB integration (not set here)

    # Duration, evaluation and logging
    num_train_epochs=3,                     # Control duration of training (use either 'max_steps' or 'num_train_epochs')
    eval_strategy="no",
    logging_steps=10,                       # How often training metrics are logged

    # Batching
    per_device_train_batch_size=3,          # Adjust the batch size
    gradient_accumulation_steps=4,          # Accumulate gradients over several batches before each optimizer step

    # Optimizer
    optim="paged_adamw_8bit",
    learning_rate=2e-5,
    weight_decay=0.01,
    warmup_steps=2,                         # Helps to stabilize training

    # Precision and memory
    fp16=True,                              # Enable mixed precision training
    gradient_checkpointing=True,            # Store only a subset of activations
)

In [None]:
# Initialize the Trainer
trainer = Trainer(
    model=model,
    train_dataset=tokenized_df_Boris_Johnson,   # the tokenized dataset
    args=training_args,                         # the training arguments
    # Collator that formats batches of data for training (mlm=False: no masked-language-modelling)
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
)

# Fine-Tune the Model

In [16]:
# Log in to W&B
import wandb

# Look the API key up in the environment; the lookup yields None when the variable is not set
wandb_api_key = os.environ.get("wand_API_Key")
wandb.login(key=wandb_api_key)

True

In [None]:
# Train the model
# NOTE(review): the cache is presumably switched off because it is not needed while
# training and does not combine with gradient checkpointing — confirm. A later cell sets it back to True.
model.config.use_cache = False        # disable caching
trainer.train()

In [None]:
# Run inference on the model
model.eval()  # Set model to evaluation mode

# Define the pipeline: a text-generation pipeline around the fine-tuned model and its tokenizer
text_generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Generate text: ask for 5 alternative outputs for the prompt, with max_length=100.
# NOTE(review): returning several sequences presumably relies on sampling being enabled in the
# model's generation settings — confirm, or pass do_sample=True explicitly.
text_generator("Should the UK rejoin the EU?", max_length=100, num_return_sequences=5)

In [None]:
# Finish the W&B run and turn the model cache back on (the model itself is saved in the next cell)
wandb.finish()
model.config.use_cache = True

In [None]:
# Save the Fine-Tuned Model
# Use the absolute Kaggle output directory. A relative "./kaggle/working/..." path is
# resolved against the current working directory, so a notebook that already runs from
# /kaggle/working would write into a nested /kaggle/working/kaggle/working/... folder.
FINE_TUNED_DIR = "/kaggle/working/fine-tuned-llama_hoc_Boris"

# Write the model weights/config and the tokenizer files into the same directory
model.save_pretrained(FINE_TUNED_DIR)
tokenizer.save_pretrained(FINE_TUNED_DIR)