<a href="https://colab.research.google.com/github/innomartcom/public_notebooks/blob/main/324ae62fe8c5a0acd15045af75b43c2b.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install -q -U "huggingface_hub[cli]" accelerate bitsandbytes peft transformers trl datasets einops flash_attn

[0m  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mBuilding wheel for tokenizers [0m[1;32m([0m[32mpyproject.toml[0m[1;32m)[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Building wheel for tokenizers (pyproject.toml) ... [?25l[?25herror
[31m  ERROR: Failed building wheel for tokenizers[0m[31m
[0m[31mERROR: ERROR: Failed to build installable wheels for some pyproject.toml based projects (tokenizers)[0m[31m
[0m

In [None]:
# First, install the necessary libraries.
# The 'trl' library provides the SFTTrainer, which simplifies the fine-tuning process.
# The 'accelerate' library is for distributed training.
# The 'bitsandbytes' library is for efficient quantization.
# The 'peft' library is for Parameter-Efficient Fine-Tuning (PEFT), like LoRA.
# The 'datasets' library is for handling and loading datasets.
# The 'transformers' library is the core for loading the model and tokenizer.
# The 'einops' library is required for the specific Gemma model.
# The 'flash_attn' library is highly recommended for faster training on GPUs.

import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
from peft import LoraConfig, get_peft_model
from trl import SFTTrainer

In [None]:
# --- 1. Load the Model and Tokenizer ---
# We will use the BitsAndBytes library to quantize the model to 4-bit,
# which greatly reduces memory usage.
# We'll use 'bfloat16' for computation if the GPU supports it, for better numerical stability.
# The 'flash_attention_2' flag is enabled for faster training.

model_id = "google/gemma-3-270m"
# Check if a GPU is available, otherwise use CPU
device = "cuda" if torch.cuda.is_available() else "cpu"

# Configuration for 4-bit quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=False,
)

In [None]:
device

'cuda'

In [None]:
# Load the base model and tokenizer
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
    use_cache=False
    )
model.config.pretraining_tp = 1  # Required for Gemma

tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"  # To prevent issues with attention masks

In [None]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
from huggingface_hub import login
my_token = "hf_token"
login(token=my_token)

In [None]:
# --- 2. Load and Preprocess the Dataset ---
# We'll use the 'samsum' dataset as a practical example for summarization.
# You can replace this with your own dataset.
dataset_name = "cnn_dailymail"
dataset = load_dataset(dataset_name, '3.0.0', split="train")

# Define a function to format the dataset into a conversational turn
def format_prompt(sample):
    # This format is a simple example. You can customize it to fit your task.
    # The goal is to provide a clear instruction for the model.
    return f"Summarize the following article:\n\n{sample['article']}\n\nSummary:\n{sample['highlights']}"


In [None]:
# --- 3. Configure PEFT (LoRA) ---
# We define the LoraConfig to set up the fine-tuning parameters.
# 'r' is the rank of the update matrices, a lower rank means fewer trainable parameters.
# 'lora_alpha' is a scaling factor.
# 'target_modules' specifies which parts of the model to apply LoRA to. For Gemma,
# these are typically the attention projection layers.
peft_config = LoraConfig(
    r=64,
    lora_alpha=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM"
)

In [None]:
model = get_peft_model(model, peft_config)

# Print a summary of the trainable parameters to see the efficiency gain
model.print_trainable_parameters()


trainable params: 5,898,240 || all params: 273,996,416 || trainable%: 2.1527


In [None]:
# --- 4. Configure Training Arguments ---
# Set up the TrainingArguments. These control the training loop behavior.
training_args = TrainingArguments(
    output_dir="./gemma-dailycnn-lora",
    num_train_epochs=1,  # Number of training epochs
    per_device_train_batch_size=4,  # Batch size per device
    gradient_accumulation_steps=4,  # Accumulate gradients to simulate a larger batch size
    optim="paged_adamw_32bit",  # Optimizer to use
    save_strategy="epoch",  # Save checkpoint at the end of each epoch
    logging_steps=10,  # Log every N steps
    learning_rate=2e-4,  # Learning rate
    fp16=False,  # Set to True if your GPU supports it
    bf16=True,  # Use bfloat16 for computation if possible
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="cosine",
)

In [None]:
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    # max_seq_length=1024,  # Maximum sequence length for the model
    # tokenizer=tokenizer,
    formatting_func=format_prompt,
    args=training_args,
)



In [None]:
# Start the training
trainer.train()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mgummadi_sai[0m ([33mgummadi_sai-srm-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


It is strongly recommended to train Gemma3 models with the `eager` attention implementation instead of `sdpa`. Use `eager` with `AutoModelForCausalLM.from_pretrained('<path-to-checkpoint>', attn_implementation='eager')`.


OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU 0 has a total capacity of 14.74 GiB of which 3.18 GiB is free. Process 9741 has 11.56 GiB memory in use. Of the allocated memory 11.34 GiB is allocated by PyTorch, and 100.31 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
# Save the fine-tuned model and tokenizer
trainer.save_model("gemma-samsum-lora-final")
tokenizer.save_pretrained("gemma-samsum-lora-final")