<a href="https://colab.research.google.com/github/kumarpriyanshu09/Gemma-3-4B-Fine-tuning-with-Unsloth/blob/main/Finetuning_Gemma3_(4B).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%capture
# The cell magic above captures this cell's output instead of displaying it.
# Training stack, installed with --no-deps so pip leaves their dependencies
# alone; xformers is pinned to 0.0.29.post3.
!pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo
# Supporting packages, installed with normal dependency resolution and the
# version bounds given on the command line.
!pip install sentencepiece protobuf "datasets>=3.4.1,<4.0.0" "huggingface_hub>=0.34.0" hf_transfer
# Unsloth itself, again without letting pip touch its dependencies.
!pip install --no-deps unsloth

In [None]:
from unsloth import FastModel
import torch

# Load the "unsloth/gemma-3-4b-it" checkpoint together with its tokenizer.
# 4-bit loading is on; 8-bit loading and full fine-tuning are both off.
model, tokenizer = FastModel.from_pretrained(
    model_name="unsloth/gemma-3-4b-it",
    max_seq_length=2048,
    full_finetuning=False,
    load_in_8bit=False,
    load_in_4bit=True,
)

In [None]:
# Turn the loaded model into a PEFT (LoRA) model.
model = FastModel.get_peft_model(
    model,
    # Scope: language layers, attention and MLP modules; vision layers excluded.
    finetuning_vision_layers=False,
    finetuning_language_layers=True,
    finetuning_attention_modules=True,
    finetuning_mlp_modules=True,
    # LoRA hyperparameters.
    r=8,
    lora_alpha=8,
    lora_dropout=0,
    bias="none",
    # NOTE(review): the SFTConfig in this notebook uses seed 3407; 3047 here
    # may be a transposed digit pair -- confirm which value is intended.
    random_state=3047,
)

In [None]:
from unsloth.chat_templates import get_chat_template
# Configure the tokenizer with the "gemma-3" chat template.
tokenizer = get_chat_template(tokenizer, chat_template="gemma-3")

In [None]:
from datasets import load_dataset
# Load the "train" split of the mlabonne/FineTome-100k dataset.
dataset = load_dataset("mlabonne/FineTome-100k", split = "train")

In [None]:
from unsloth.chat_templates import standardize_data_formats
# Pass the dataset through Unsloth's format standardizer and keep the result.
# The formatting step later in this notebook reads its "conversations" column.
dataset = standardize_data_formats(dataset)

In [None]:
# Spot check: show record 96 (a notebook cell displays its last expression).
dataset[96]

In [None]:
def formatting_prompts_fuc(examples):
    """Render every conversation in a batch to chat-template text.

    Relies on the notebook-level ``tokenizer`` for the chat template.

    Args:
        examples: Batch mapping whose "conversations" entry is a sequence
            of conversations.

    Returns:
        A dict with the single key "text", holding one rendered string per
        conversation, in input order.
    """
    def render(conversation):
        rendered = tokenizer.apply_chat_template(
            conversation,
            tokenize=False,
            add_generation_prompt=False,
        )
        # Drop a leading '<bos>' only; presumably the trainer's own
        # tokenization adds it back -- confirm.
        return rendered.removeprefix('<bos>')

    return {"text": [render(conversation) for conversation in examples["conversations"]]}

# Run the formatter over the dataset in batches; each batch contributes the
# "text" field that the formatter returns.
dataset = dataset.map(formatting_prompts_fuc, batched = True)

In [None]:
# Spot check: show the rendered chat-template text of record 96.
dataset[96]["text"]

In [None]:
from trl import SFTTrainer, SFTConfig
# Supervised fine-tuning trainer that reads the rendered "text" column.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    eval_dataset=None,
    args=SFTConfig(
        dataset_text_field="text",
        # Batching.
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        # Run length: capped at 32 steps; the epoch-based setting stays disabled.
        # num_train_epochs = 1,
        max_steps=32,
        # Optimizer and learning-rate schedule.
        optim="adamw_8bit",
        learning_rate=2e-4,
        weight_decay=0.01,
        lr_scheduler_type="linear",
        warmup_steps=5,
        # Reproducibility and logging.
        seed=3407,
        logging_steps=1,
        report_to="none",
    ),
)

In [None]:
from unsloth.chat_templates import  train_on_responses_only
# Hand the trainer to Unsloth's responses-only helper, giving it the markers
# that open a user turn and a model turn in the rendered text.
trainer = train_on_responses_only(
    trainer,
    response_part="<start_of_turn>model\n",
    instruction_part="<start_of_turn>user\n",
)

In [None]:
# Decode the input ids of record 96 of the trainer's dataset back to text.
tokenizer.decode(trainer.train_dataset[96]["input_ids"])

In [None]:
# Decode the labels of record 96. Entries equal to -100 are replaced by the
# pad token id before decoding, and the pad token is then shown as a space.
tokenizer.decode(
    [
        tokenizer.pad_token_id if label_id == -100 else label_id
        for label_id in trainer.train_dataset[96]["labels"]
    ]
).replace(tokenizer.pad_token, " ")

In [None]:
# Record the GPU memory baseline before training; the final stats cell reads
# `start_gpu_memory` and `max_memory`. Values are in GB, rounded to 3 places.
gpu_stats = torch.cuda.get_device_properties(0)
max_memory = round(gpu_stats.total_memory / 1024**3, 3)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024**3, 3)
print(f"GPU = {gpu_stats.name}. Max Memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

In [None]:
# Run training; the returned stats object supplies `metrics['train_runtime']`
# to the reporting cell that follows.
trainer_stats = trainer.train()

In [None]:
# @title Show final memory and time stats
# Peak reserved GPU memory (GB) and its share of the device total.
used_memory = round(torch.cuda.max_memory_reserved() / 1024**3, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
# Growth over the baseline recorded before training, and its share of the total.
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)

print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

In [None]:
from unsloth.chat_templates import get_chat_template
# Apply the "gemma-3" chat template to the tokenizer again before generating.
tokenizer = get_chat_template(tokenizer, chat_template="gemma-3")

# One user turn containing a single text part.
messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": "Continue the sequence: 1, 1, 2, 3, 5, 8,",
            },
        ],
    },
]
text = tokenizer.apply_chat_template(messages, add_generation_prompt=True)  # Must add for generation

outputs = model.generate(
    **tokenizer([text], return_tensors="pt").to("cuda"),
    max_new_tokens=64,  # Increase for longer outputs!
    # Recommended Gemma-3 settings!
    temperature=1.0,
    top_p=0.95,
    top_k=64,
)
# Show the generated sequences decoded back to text.
tokenizer.batch_decode(outputs)

In [None]:
from transformers import TextStreamer

# Ask one question and stream the reply to the cell output as it is generated.
messages = [{
    "role": "user",
    "content": [{"type": "text", "text": "why is the sky blue?"}],
}]
text = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,  # must be set for generation
)
_ = model.generate(
    **tokenizer([text], return_tensors="pt").to("cuda"),
    max_new_tokens=64,
    temperature=1.0,
    top_p=0.95,
    top_k=64,
    # The keyword is `skip_prompt`. The earlier misspelling was not recognised
    # as that option, so the prompt was echoed into the streamed output.
    streamer=TextStreamer(tokenizer, skip_prompt=True),
)

In [None]:
# Save the model and the tokenizer side by side in the local "gemma-3-lora"
# directory. `model` is the PEFT-wrapped model here, so this presumably stores
# the adapter weights rather than the full base model -- confirm.
model.save_pretrained("gemma-3-lora")
tokenizer.save_pretrained("gemma-3-lora")


In [None]:
# Save a merged copy of the model, with the tokenizer, under "gemma-3-finetune".
model.save_pretrained_merged("gemma-3-finetune", tokenizer)

In [None]:
# Export the model to GGUF under "gemma-3-finetune" with quantization type Q8_0.
model.save_pretrained_gguf("gemma-3-finetune", quantization_type="Q8_0")

In [None]:
# Reload the saved fine-tune and run one streamed generation with it.
# Set the guard to False to skip this cell; everything that depends on the
# reloaded model lives inside the guard.
if True:
    from unsloth import FastModel
    from transformers import TextStreamer

    # Bind BOTH return values. The tokenizer name was previously misspelled,
    # so the reloaded tokenizer was dropped and the in-memory one was used.
    # "gemma-3-lora" is the directory the earlier save_pretrained cell writes;
    # point this at a different directory if the model was saved elsewhere.
    model, tokenizer = FastModel.from_pretrained(
        model_name="gemma-3-lora",
        load_in_4bit=True,
    )

    messages = [{
        "role": "user",
        "content": [{"type": "text", "text": "why is the Gemma-3?"}],
    }]
    text = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,  # must be set for generation
    )

    _ = model.generate(
        **tokenizer([text], return_tensors="pt").to("cuda"),
        max_new_tokens=64,
        temperature=1.0,
        top_p=0.95,
        top_k=64,
        streamer=TextStreamer(tokenizer, skip_prompt=True),
    )