# Setup Packages

In [1]:
!pip install transformers datasets torch huggingface_hub peft trl bitsandbytes

Collecting peft
  Downloading peft-0.13.2-py3-none-any.whl.metadata (13 kB)
Collecting trl
  Downloading trl-0.12.1-py3-none-any.whl.metadata (10 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Collecting transformers
  Downloading transformers-4.46.3-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.1/44.1 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
Downloading peft-0.13.2-py3-none-any.whl (320 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m320.7/320.7 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trl-0.12.1-py3-none-any.whl (310 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.9/310.9 kB[0m [31m17.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading transformers-4.46.3-py3-none-any.whl (10.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m80.0 MB/s[0m eta 

In [2]:
# Import libraries

# Standard Python libraries
import pandas as pd
from datasets import load_dataset, Dataset  # For loading datasets
import os
import torch

# Hugging Face Transformers
import transformers
from transformers import (
    AutoTokenizer,            # For tokenizing text
    AutoModelForCausalLM,     # For loading the GPT-2 model
    Trainer,                  # For training the model
    TrainingArguments,        # For specifying training arguments
    logging,                  # For logging
    BitsAndBytesConfig,
    HfArgumentParser,
    pipeline,
    DataCollatorWithPadding
)

# PyTorch
import torch  # For tensor operations and GPU support

# For PEFT
from peft import prepare_model_for_kbit_training, LoraConfig, PeftModel, get_peft_model  # For LoRA configuration and model
from trl import SFTTrainer  # For supervised fine-tuning

In [3]:
# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [4]:
# Login to Hugging Face
user_secrets = UserSecretsClient()
Hugging_Face_token = user_secrets.get_secret("Hugging_Face_token")

from huggingface_hub import login
login(Hugging_Face_token)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


# Load Data

In [5]:
#from google.colab import files
#uploaded = files.upload()

In [6]:
# Load Data
#df_Boris_Johnson = pd.read_csv("df_Boris_Johnson_2001-19.csv")
df_Boris_Johnson = pd.read_csv("/kaggle/input/df-boris-johnson-2001-19/df_Boris_Johnson_2001-19.csv")

In [7]:
# Convert Pandas DataFrame to Hugging Face Dataset
df_Boris_Johnson_HF = Dataset.from_pandas(df_Boris_Johnson)

# Tokenize Data

Different models may require different preprocessing steps based on their *architecture*, *tokenizer type*, and *task*

In [8]:
# Tokenize your dataset
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B")    # Define the Tokenizer
tokenizer.pad_token = tokenizer.eos_token                               # Set the padding token to the end-of-sequence token

def tokenize_function(examples):
    tokenized_output = tokenizer(examples['text'],
                                 truncation=True,
                                 padding='max_length', max_length=512)
    tokenized_output['labels'] = tokenized_output['input_ids'][:]

    return tokenized_output

# Use Hugging Face Dataset's map function to apply Tokenization
tokenized_df_Boris_Johnson = df_Boris_Johnson_HF.map(tokenize_function, batched=True)

tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

Map:   0%|          | 0/2213 [00:00<?, ? examples/s]

In [9]:
# Preview tokenized dataset
#tokenized_df_Boris_Johnson[1]

# Model Setup

In [10]:
# Optimize Performance with Configurations
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                      # Load model in 4bit, to redeuce memory and computational requirements
    bnb_4bit_use_double_quant=True,         # Double quantization, further compress the model weights
    bnb_4bit_quant_type="nf4",              # Quantization type = nf4
    bnb_4bit_compute_dtype=torch.bfloat16,  # Compute in 16bit format, to speed up computation
)

model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.2-3B",
    quantization_config=bnb_config,
    device_map="auto"  # Automatically assigns model to GPU if available
)

config.json:   0%|          | 0.00/844 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/185 [00:00<?, ?B/s]

In [11]:
# Apply PEFT (Adapter, LoRA and others)
model.gradient_checkpointing_enable()               # Reduce memory usage by saving intermediate activations
model = prepare_model_for_kbit_training(model)      # Prepare model for kbit training to reduce memory usage

## Inspect Model Architecture

The attention mechanism in this model is implemented with **modular projections**, as opposed to a **combined module**: `query_key_value` .
The model uses distinct linear layers for the query (q_proj), key (k_proj), and value (v_proj) projections

In [12]:
# Inspect Model Architecture
print(model)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 3072)
    (layers): ModuleList(
      (0-27): 28 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear4bit(in_features=3072, out_features=3072, bias=False)
          (k_proj): Linear4bit(in_features=3072, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=3072, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=3072, out_features=3072, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=3072, out_features=8192, bias=False)
          (up_proj): Linear4bit(in_features=3072, out_features=8192, bias=False)
          (down_proj): Linear4bit(in_features=8192, out_features=3072, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((3072,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((3072,), eps=1e

# Define LoRA

In [13]:
# Define LoRA configuration
lora_config = LoraConfig(
    r=8,                                  # Rank of the low-rank matrices, lower ranks -> lower computational load & memory usage
    lora_alpha=32,                        # Scaling factor
    target_modules=["q_proj", "v_proj"],  # Specifies the modules that should be adapted using LoRA (*Depends on model architecture)
    lora_dropout=0.1,                     # A Regularization technique used to prevent overfitting
    bias="none",                          # specifies that no additional bias terms should be added
    task_type="CAUSAL_LM"                 # Define the model: one that is 'predicting the next word'
)

# Apply LoRA to the model
model = get_peft_model(model, lora_config)

In [14]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}")

print_trainable_parameters(model)

trainable params: 2293760 || all params: 1805757440 || trainable%: 0.12702481236904112


# Define Training Parameters
Define training parameters, including batch size, learning rate, and the number of training epochs.

In [15]:
# Set up Hyperparameters
training_args = transformers.TrainingArguments(
    output_dir="outputs",
    optim="paged_adamw_8bit",
    eval_strategy="no",
    #report_to="none",                       # Disable WandB integration
    per_device_train_batch_size=3,          # Adjust the batch size
    gradient_accumulation_steps=4,          # Increaset gradient-steps to reduce memory usage
    warmup_steps=2,                         # Helps to stabilize training
    num_train_epochs=3,                     # Control duration of Training (use either 'max_steps' or 'num_train_epochs')
    learning_rate=2e-5,
    logging_steps=10,                       # Frequency of Training metrics logs for detailed feedback on process
    weight_decay=0.01,

    fp16=True,                              # Enable mixed precision training
    gradient_checkpointing=True,            # Storing only a subset of activations
)

In [16]:
# Initialize the Trainer
trainer = Trainer(
    model=model,
    args= training_args,                                 # input Training Arguments
    train_dataset= tokenized_df_Boris_Johnson,           # input Tokenized Dataset
    data_collator= transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),   # Format batches of data for training
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


# Fine-Tune the Model

In [17]:
# Log in to W&B
user_secrets = UserSecretsClient()
wand_API_Key = user_secrets.get_secret("wand_API_Key")

import wandb
wandb.login(key=wand_API_Key)

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [18]:
# Train the model
model.config.use_cache = False        # disable caching
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mhaoting-chan[0m ([33mhaoting-chan-gesis[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Tracking run with wandb version 0.18.3
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20241120_205017-vucowyun[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33moutputs[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/haoting-chan-gesis/huggingface[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/haoting-chan-gesis/huggingface/runs/vucowyun[0m
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss
10,2.5228
20,2.6286
30,2.5823
40,2.4973
50,2.5304
60,2.4619
70,2.4754
80,2.4711
90,2.4788
100,2.4479


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


TrainOutput(global_step=552, training_loss=2.3811079471007637, metrics={'train_runtime': 5831.279, 'train_samples_per_second': 1.139, 'train_steps_per_second': 0.095, 'total_flos': 5.740516328708506e+16, 'train_loss': 2.3811079471007637, 'epoch': 2.994579945799458})

In [19]:
# Save the Fine-Tuned Model
model.save_pretrained("./fine-tuned-llama")
tokenizer.save_pretrained("./fine-tuned-llama")

('./fine-tuned-llama/tokenizer_config.json',
 './fine-tuned-llama/special_tokens_map.json',
 './fine-tuned-llama/tokenizer.json')