In [None]:
# Standard installs
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps xformers "trl<0.9.0" peft accelerate bitsandbytes
!pip install datasets # If loading from Hugging Face Hub

Collecting unsloth@ git+https://github.com/unslothai/unsloth.git (from unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Cloning https://github.com/unslothai/unsloth.git to /tmp/pip-install-nrp_bws3/unsloth_37005ff5017147fd8c2bb560c947a2b3
  Running command git clone --filter=blob:none --quiet https://github.com/unslothai/unsloth.git /tmp/pip-install-nrp_bws3/unsloth_37005ff5017147fd8c2bb560c947a2b3
  Resolved https://github.com/unslothai/unsloth.git to commit 7a8f99e1890213cdd01a3ab6c3e13174a96e8220
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting unsloth_zoo>=2025.4.1 (from unsloth@ git+https://github.com/unslothai/unsloth.git->unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Downloading unsloth_zoo-2025.4.1-py3-none-any.whl.metadata (8.0 kB)
Collecting tyro (from unsloth@ git+https://github.com/unslothai/unsloth.git

Collecting xformers
  Downloading xformers-0.0.29.post3-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (1.0 kB)
Collecting trl<0.9.0
  Downloading trl-0.8.6-py3-none-any.whl.metadata (11 kB)
Downloading xformers-0.0.29.post3-cp311-cp311-manylinux_2_28_x86_64.whl (43.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.4/43.4 MB[0m [31m57.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trl-0.8.6-py3-none-any.whl (245 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m245.2/245.2 kB[0m [31m23.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: xformers, trl
  Attempting uninstall: trl
    Found existing installation: trl 0.15.2
    Uninstalling trl-0.15.2:
      Successfully uninstalled trl-0.15.2
Successfully installed trl-0.8.6 xformers-0.0.29.post3


In [None]:
import torch
from unsloth import FastLanguageModel
from datasets import load_dataset

# Load Model - Llama 3 8B
# The checkpoint name and context length are named so they are easy to find and tune.
MODEL_NAME = "unsloth/llama-3-8b-Instruct-bnb-4bit"  # Pre-quantized 4bit version
MAX_SEQ_LENGTH = 4096  # Context window

# from_pretrained returns the model together with its matching tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=MODEL_NAME,
    max_seq_length=MAX_SEQ_LENGTH,
    dtype=torch.float16,
    load_in_4bit=True,
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
Unsloth: Failed to patch SmolVLMForConditionalGeneration forward function.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.4.1: Fast Llama patching. Transformers: 4.51.3.
   \\   /|    NVIDIA L4. Num GPUs = 1. Max memory: 22.161 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [None]:
# Enable LoRA
# All adapter settings live in one mapping so they can be reviewed at a glance.
lora_settings = dict(
    r=16,
    # Only the attention projection layers receive adapters.
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing=True,
    random_state=3407,
    use_rslora=True,
    loftq_config=None,
)

# NOTE(review): `model` is rebound to its wrapped version, so re-running this
# cell passes an already-adapted model back in. Restart the kernel for a clean run.
model = FastLanguageModel.get_peft_model(model, **lora_settings)

# Load Dataset
# Only the first 100 rows of the train split are pulled, which keeps this demo run short.
dataset = load_dataset(path="yahma/alpaca-cleaned", split="train[:100]")

# Defensive Preprocessing
def formatting_prompts_func(example):
    """Render one Alpaca-style record as a Llama 3 Instruct conversation.

    The model loaded in this notebook is a Llama 3 Instruct checkpoint, so the
    prompt uses Llama 3's turn markers (``<|start_header_id|>`` /
    ``<|end_header_id|>`` / ``<|eot_id|>``). ChatML markers such as
    ``<|im_start|>`` are not special tokens for this model.

    Args:
        example: Mapping with optional ``"instruction"``, ``"input"`` and
            ``"output"`` entries. Missing or ``None`` entries are treated as
            empty strings.

    Returns:
        A dict with a single ``"text"`` key holding the full training prompt
        (system turn, user turn, and assistant turn ending in ``<|eot_id|>``).
    """

    def _clean(value):
        # Tolerate absent/None fields and trim surrounding whitespace.
        return "" if value is None else str(value).strip()

    instruction = _clean(example.get("instruction"))
    input_text = _clean(example.get("input"))
    output_text = _clean(example.get("output"))

    # The optional input is appended to the instruction on its own line.
    user_prompt = f"{instruction}\n{input_text}" if input_text else instruction

    # `<|begin_of_text|>` is deliberately left out: the tokenizer call made later
    # in this notebook uses default special-token handling, which is expected to
    # prepend the BOS token itself. Adding it here as well would duplicate it.
    full_prompt = (
        "<|start_header_id|>system<|end_header_id|>\n\n"
        "You are a helpful assistant.<|eot_id|>"
        "<|start_header_id|>user<|end_header_id|>\n\n"
        f"{user_prompt}<|eot_id|>"
        "<|start_header_id|>assistant<|end_header_id|>\n\n"
        f"{output_text}<|eot_id|>"
    )

    return {"text": full_prompt}

# Add the rendered "text" column to every record (one example per call).
dataset = dataset.map(function=formatting_prompts_func)

# Tokenization with labels
def tokenize_function(sample):
    """Tokenize the ``"text"`` field and build causal-LM labels that skip padding.

    Every sequence is truncated/padded to 4096 tokens. Labels mirror
    ``input_ids`` except at padded positions (``attention_mask == 0``), which
    are set to -100 so the loss ignores them. Without this masking the model is
    trained mostly to predict padding, because short examples are padded out to
    the full 4096 tokens.

    Args:
        sample: A single record (``sample["text"]`` is a ``str``) or a batch
            (``sample["text"]`` is a list of ``str``), as supplied by
            ``Dataset.map``.

    Returns:
        The tokenizer output with an added ``"labels"`` entry shaped like
        ``"input_ids"``.
    """
    IGNORE_INDEX = -100  # label value skipped by the Hugging Face causal-LM loss

    # NOTE(review): padding every row to 4096 is expensive; dynamic padding via a
    # data collator would be cheaper but needs a matching change to the Trainer.
    model_inputs = tokenizer(
        sample["text"],
        truncation=True,
        padding="max_length",
        max_length=4096,
    )

    def _mask_padding(token_ids, attention):
        # Keep real tokens as targets; blank out padded positions.
        return [tok if keep else IGNORE_INDEX for tok, keep in zip(token_ids, attention)]

    input_ids = model_inputs["input_ids"]
    attention_mask = model_inputs["attention_mask"]

    if isinstance(sample["text"], str):
        # Un-batched call: flat lists of ints.
        model_inputs["labels"] = _mask_padding(input_ids, attention_mask)
    else:
        # Batched call: one list of ints per example.
        model_inputs["labels"] = [
            _mask_padding(ids, mask) for ids, mask in zip(input_ids, attention_mask)
        ]
    return model_inputs

# Tokenize in batches, then drop the raw string columns so that only the
# tokenizer outputs and labels remain.
dataset = dataset.map(tokenize_function, batched=True).remove_columns(
    ["instruction", "input", "output", "text"]
)

# Ready to Train
FastLanguageModel.for_training(model)

from transformers import Trainer, TrainingArguments

# Hyperparameters are built separately from the Trainer so they can be
# inspected or tweaked without touching the trainer construction.
# Effective batch size per optimizer step on one device: 2 x 4 = 8.
training_args = TrainingArguments(
    output_dir="outputs",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    warmup_steps=5,
    max_steps=20,
    learning_rate=2e-4,
    logging_steps=1,
    optim="paged_adamw_32bit",
    # Mixed precision is fixed to fp16, matching the float16 dtype used at load time.
    fp16=True,
    bf16=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
)

trainer.train()

# Save the model
# Model and tokenizer are written side by side into the same directory.
SAVE_DIR = "llama3_finetuned_unsloth"
model.save_pretrained(SAVE_DIR)
tokenizer.save_pretrained(SAVE_DIR)

Unsloth: Already have LoRA adapters! We shall skip this step.


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 100 | Num Epochs = 2 | Total steps = 20
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 13,631,488/8,000,000,000 (0.17% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,2.6585
2,2.712
3,2.7747
4,2.5211
5,2.757
6,2.5292
7,0.1802
8,0.0859
9,0.0931
10,0.0663


('llama3_finetuned_unsloth/tokenizer_config.json',
 'llama3_finetuned_unsloth/special_tokens_map.json',
 'llama3_finetuned_unsloth/tokenizer.json')