# Setup Packages

In [1]:
# Import libraries

# Standard Python libraries
import pandas as pd
import pyreadr
from datasets import load_dataset, Dataset  # For loading datasets
import os
import json

# Hugging Face Transformers
import transformers
from transformers import (
    AutoTokenizer,            # For tokenizing text
    AutoModelForCausalLM,     # For loading the GPT-2 model
    Trainer,                  # For training the model
    TrainingArguments,        # For specifying training arguments
    logging,                  # For logging
    BitsAndBytesConfig,
    HfArgumentParser,
    pipeline,
    DataCollatorWithPadding )

# PyTorch
import torch  # For tensor operations and GPU support


# For PEFT
from peft import prepare_model_for_kbit_training, LoraConfig, PeftModel, get_peft_model  # For LoRA configuration and model
from trl import SFTTrainer  # For supervised fine-tuning

In [2]:
# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [3]:
# Login to Hugging Face

# Set API Keys
from kaggle_secrets import UserSecretsClient # API Loggins
user_secrets = UserSecretsClient()

## Hugging Face
Hugging_Face_token = user_secrets.get_secret("Hugging_Face_token")
from huggingface_hub import login

login(Hugging_Face_token)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


# Load Data

In [4]:
# Load Data 

with open('/kaggle/input/preprocess-data-ipynb/HoC_boris_johnson.jsonl') as f:
    HoC_json_boris_johnson = [json.loads(line) for line in f]

In [5]:
# Convert to Hugging Face Dataset
df_Boris_Johnson = Dataset.from_list(HoC_json_boris_johnson)

In [6]:
df_Boris_Johnson

Dataset({
    features: ['input'],
    num_rows: 374
})

# Tokenize Data

Different models may require different preprocessing steps based on their *architecture*, *tokenizer type*, and *task*

In [7]:
# Tokenize your dataset
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B-Instruct")    # Define the Tokenizer
tokenizer.pad_token = tokenizer.eos_token                               # Set the padding token to the end-of-sequence token

tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

In [8]:
# Tokenize dataset
def preprocess(examples):
    inputs = tokenizer(examples["input"], padding="max_length", truncation=True, max_length=512)
    return {"input_ids": inputs["input_ids"], "attention_mask": inputs["attention_mask"], "labels": inputs["input_ids"]}

tokenized_df_Boris_Johnson = df_Boris_Johnson.map(preprocess, batched=True)

Map:   0%|          | 0/374 [00:00<?, ? examples/s]

In [9]:
# Preview tokenized dataset
tokenized_df_Boris_Johnson

Dataset({
    features: ['input', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 374
})

# Model Setup

In [10]:
# Optimize Performance with Configurations
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                      # Load model in 4bit, to redeuce memory and computational requirements
    bnb_4bit_use_double_quant=True,         # Double quantization, further compress the model weights
    bnb_4bit_quant_type="nf4",              # Quantization type = nf4
    bnb_4bit_compute_dtype=torch.bfloat16,  # Compute in 16bit format, to speed up computation
)

model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.2-3B-Instruct",
    quantization_config=bnb_config,
    device_map="auto"  # Automatically assigns model to GPU if available
)

config.json:   0%|          | 0.00/878 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

In [11]:
# Apply PEFT (Adapter, LoRA and others)
model.gradient_checkpointing_enable()               # Reduce memory usage by saving intermediate activations
model = prepare_model_for_kbit_training(model)      # Prepare model for kbit training to reduce memory usage

## Inspect Model Architecture

The attention mechanism in this model is implemented with **modular projections**, as opposed to a **combined module**: `query_key_value` .
The model uses distinct linear layers for the query (q_proj), key (k_proj), and value (v_proj) projections

In [12]:
# Inspect Model Architecture
print(model)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 3072)
    (layers): ModuleList(
      (0-27): 28 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear4bit(in_features=3072, out_features=3072, bias=False)
          (k_proj): Linear4bit(in_features=3072, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=3072, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=3072, out_features=3072, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=3072, out_features=8192, bias=False)
          (up_proj): Linear4bit(in_features=3072, out_features=8192, bias=False)
          (down_proj): Linear4bit(in_features=8192, out_features=3072, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((3072,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((3072,), eps=1e

# Define LoRA

In [13]:
# Define LoRA configuration
lora_config = LoraConfig(
    r=8,                                  # Rank of the low-rank matrices, lower ranks -> lower computational load & memory usage
    lora_alpha=32,                        # Scaling factor
    target_modules=["q_proj", "v_proj"],  # Specifies the modules that should be adapted using LoRA (*Depends on model architecture)
    lora_dropout=0.1,                     # A Regularization technique used to prevent overfitting
    bias="none",                          # specifies that no additional bias terms should be added
    task_type="CAUSAL_LM"                 # Define the model: one that is 'predicting the next word'
)

# Apply LoRA to the model
model = get_peft_model(model, lora_config)

In [14]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}")

print_trainable_parameters(model)

trainable params: 2293760 || all params: 1805757440 || trainable%: 0.12702481236904112


# Define Training Parameters
Define training parameters, including batch size, learning rate, and the number of training epochs.

In [15]:
# Set up Hyperparameters
training_args = transformers.TrainingArguments(
    output_dir="outputs",
    optim="paged_adamw_8bit",
    eval_strategy="no",
    #report_to="none",                       # Disable WandB integration
    per_device_train_batch_size=3,          # Adjust the batch size
    gradient_accumulation_steps=4,          # Increaset gradient-steps to reduce memory usage
    warmup_steps=2,                         # Helps to stabilize training
    num_train_epochs=3,                     # Control duration of Training (use either 'max_steps' or 'num_train_epochs')
    learning_rate=2e-5,
    logging_steps=10,                       # Frequency of Training metrics logs for detailed feedback on process
    weight_decay=0.01,

    fp16=True,                              # Enable mixed precision training
    gradient_checkpointing=True,            # Storing only a subset of activations
)

In [16]:
# Initialize the Trainer
trainer = Trainer(
    model=model,
    args= training_args,                                 # input Training Arguments
    train_dataset= tokenized_df_Boris_Johnson,           # input Tokenized Dataset
    data_collator= transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),   # Format batches of data for training
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


# Fine-Tune the Model

In [17]:
# Log in to W&B
import wandb

#wandb_api_key = os.getenv("wand_API_Key")


wandb.login(key="e9febb58ac1779cc78d820e36fb9798142a0563b")

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [18]:
# Train the model
model.config.use_cache = False        # disable caching
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mhaoting-chan[0m ([33mhaoting-chan-gesis[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Tracking run with wandb version 0.18.3
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20241224_021810-4x5c1st4[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33moutputs[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/haoting-chan-gesis/huggingface[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/haoting-chan-gesis/huggingface/runs/4x5c1st4[0m
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss
10,2.6412
20,2.5849
30,2.5422
40,2.5508
50,2.5319
60,2.4636
70,2.4381
80,2.4561
90,2.4615


TrainOutput(global_step=93, training_loss=2.5194123842382945, metrics={'train_runtime': 901.3545, 'train_samples_per_second': 1.245, 'train_steps_per_second': 0.103, 'total_flos': 9498197307162624.0, 'train_loss': 2.5194123842382945, 'epoch': 2.928})

In [19]:
# Run inference on the model
model.eval()  # Set model to evaluation mode

# Define the pipeline
text_generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Generate text
text_generator("Should the UK rejoin the EU?", max_length=100, num_return_sequences=5)

Device set to use cuda:0
The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FalconMambaForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'Gemma2ForCausalLM', 'GitForCausalLM', 'GlmForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'GraniteForCausalLM', 'GraniteMoeForCausalLM', 'JambaForCausalLM', 'JetMoeForCausalLM', 'LlamaForCausalLM', 'MambaForCausalLM', 'Mamba2ForCausalLM', 'MarianFor

[{'generated_text': "Should the UK rejoin the EU? The question of whether the UK should rejoin the European Union is a complex and contentious issue. There are arguments both for and against rejoining, and opinions on the matter vary widely depending on one's political views, geographic location, and personal experiences.\n\n### Arguments For Rejoining the EU\n\n1.  **Economic Benefits**: Rejoining the EU could provide the UK with access to a large market, potentially stimulating economic growth and creating new job opportunities.\n"},
 {'generated_text': "Should the UK rejoin the EU? A complex question with no simple answer\nThe UK's decision to leave the EU has been a contentious issue, and opinions on whether it should rejoin are sharply divided. Here are some arguments for and against rejoining the EU, highlighting the complexity of the issue.\n\n**Arguments For Rejoining the EU:**\n\n1.  **Economic benefits**: The UK's economy has been heavily influenced by its membership in the E

In [20]:
# Save the fine-tuned model
wandb.finish()
model.config.use_cache = True

[34m[1mwandb[0m:                                                                                
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run history:
[34m[1mwandb[0m:         train/epoch ▁▂▃▄▄▅▆▇██
[34m[1mwandb[0m:   train/global_step ▁▂▃▄▄▅▆▇██
[34m[1mwandb[0m:     train/grad_norm █▅▆▃▅▁▂▂▃
[34m[1mwandb[0m: train/learning_rate █▇▆▅▄▄▃▂▁
[34m[1mwandb[0m:          train/loss █▆▅▅▄▂▁▂▂
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run summary:
[34m[1mwandb[0m:               total_flos 9498197307162624.0
[34m[1mwandb[0m:              train/epoch 2.928
[34m[1mwandb[0m:        train/global_step 93
[34m[1mwandb[0m:          train/grad_norm 0.63498
[34m[1mwandb[0m:      train/learning_rate 0.0
[34m[1mwandb[0m:               train/loss 2.4615
[34m[1mwandb[0m:               train_loss 2.51941
[34m[1mwandb[0m:            train_runtime 901.3545
[34m[1mwandb[0m: train_samples_per_second 1.245
[34m[1mwandb[0m:   train_steps_per_second 0.103
[34m[1mwandb

In [21]:
# Save the Fine-Tuned Model
model.save_pretrained("./kaggle/working/fine-tuned-llama_hoc_Boris")
tokenizer.save_pretrained("./kaggle/working/fine-tuned-llama_hoc_Boris")

('./kaggle/working/fine-tuned-llama_hoc_Boris/tokenizer_config.json',
 './kaggle/working/fine-tuned-llama_hoc_Boris/special_tokens_map.json',
 './kaggle/working/fine-tuned-llama_hoc_Boris/tokenizer.json')