In [2]:
# Step 1: Install Dependencies
# Installs Unsloth from its GitHub repository plus the fine-tuning stack used by the later steps.

# https://github.com/unslothai/unsloth
# NOTE(review): no branch/tag/commit is given in the URL, so a rerun may resolve to a
# different Unsloth commit than the one this notebook was last executed with.
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
# trl is constrained to the >=0.18.2,<0.25.0 range; the other three packages are unpinned.
!pip install "trl>=0.18.2,<0.25.0" peft accelerate bitsandbytes

# trl:          SFT (supervised fine-tuning) trainer; version range pinned above
# peft:         LoRA
# accelerate:   distributed training & inference for PyTorch
# bitsandbytes: quantization

Collecting unsloth@ git+https://github.com/unslothai/unsloth.git (from unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Cloning https://github.com/unslothai/unsloth.git to /tmp/pip-install-2j95sga7/unsloth_52d9ff9505ee4cfcb381f4774efab3a8
  Running command git clone --filter=blob:none --quiet https://github.com/unslothai/unsloth.git /tmp/pip-install-2j95sga7/unsloth_52d9ff9505ee4cfcb381f4774efab3a8
  Resolved https://github.com/unslothai/unsloth.git to commit 0779d697e6dc1e78a56053cc53b01c338afdb3ae
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting trl!=0.19.0,<=0.24.0,>=0.18.2 (from unsloth_zoo>=2025.11.3->unsloth@ git+https://github.com/unslothai/unsloth.git->unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Using cached trl-0.24.0-py3-none-any.whl.metadata (11 kB)
Using cached trl-0.24.0-py3-none-any.whl (423 kB)
Ins

In [4]:
# Step 2: Load the llama3 Model & Tokenizer

from unsloth import FastLanguageModel
import torch
from trl import SFTTrainer
from transformers import TrainingArguments, TextStreamer
from datasets import load_dataset

# Loader settings gathered in one place so they are easy to read and tweak.
load_settings = {
    "model_name": "unsloth/llama-3-8b-bnb-4bit",  # pre-quantized 4-bit Llama-3 8B checkpoint
    "max_seq_length": 2048,                       # context length the model is set up for
    "dtype": None,                                # no explicit dtype: the choice is left to Unsloth
    "load_in_4bit": True,                         # load the weights 4-bit quantized
}

# Returns the model together with its matching tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(**load_settings)

==((====))==  Unsloth 2025.11.3: Fast Llama patching. Transformers: 4.57.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/198 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

In [None]:
# Step 3: Prepare custom dataset
# https://huggingface.co/datasets/yahma/alpaca-cleaned

# Alpaca-style prompt template. The three {} slots are filled, in order, with the
# instruction, the (possibly empty) input/context, and the response.
alpaca_prompt = """
Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

# The split name must be "train"; the previous value "trained" is not a split of
# this dataset and made load_dataset fail before any data was read.
dataset = load_dataset("yahma/alpaca-cleaned", split="train")
# selecting the first 500 examples
dataset = dataset.select(range(500))

# End-of-sequence token; format_prompts_func appends it to every formatted example.
EOS_TOKEN = tokenizer.eos_token
def format_prompts_func(examples):
    """Render a batch of Alpaca records into training strings.

    Args:
        examples: Batch mapping with parallel "instruction", "input" and
            "output" sequences (one entry per record).

    Returns:
        A dict with a single "text" key holding one string per record: the
        ``alpaca_prompt`` template filled with instruction, input and output,
        followed by ``EOS_TOKEN``. A falsy input is rendered as an empty string.
    """
    records = zip(examples["instruction"], examples["input"], examples["output"])
    return {
        "text": [
            alpaca_prompt.format(instruction, context if context else "", output) + EOS_TOKEN
            for instruction, context, output in records
        ]
    }

# Add the "text" column by running the formatter over the dataset in batches.
dataset = dataset.map(
    format_prompts_func,
    batched=True,
)



In [None]:
# Step 4: LoRA adapters
# Tell the model what parameters to update

# Names of the projection layers that receive LoRA adapters.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

lora_settings = dict(
    r=16,            # Rank: size of the LoRA matrices; a higher rank means more trainable parameters
    lora_alpha=16,   # Alpha: scaling factor for the LoRA adapters
    target_modules=lora_target_modules,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,  # fixed seed passed to the adapter setup
)

# Wrap the loaded model with the LoRA adapters configured above.
model = FastLanguageModel.get_peft_model(model, **lora_settings)


In [None]:
# Step 5: Configure and Run the SFTTrainer
# Imported here because it is only needed to pick the mixed-precision mode below.
from unsloth import is_bfloat16_supported

# Sequence length used for training. This name was referenced by SFTTrainer below
# without being defined anywhere, so the cell raised NameError on a fresh kernel.
# Keep the value in sync with the max_seq_length=2048 passed to from_pretrained in Step 2.
max_seq_length = 2048

# The model is loaded with dtype=None (precision chosen automatically), so hard-coding
# fp16=True only works on GPUs without bfloat16 support. Select exactly one of
# fp16/bf16 based on what the current GPU supports.
use_bf16 = is_bfloat16_supported()

training_arguments = TrainingArguments(
    output_dir="./outputs",
    num_train_epochs=1, # Number of times to go through the dataset
    per_device_train_batch_size=4,
    gradient_accumulation_steps=8, # Accumulate gradients to simulate a larger batch size (4 * 8 = 32 per optimizer step)
    optim="adamw_8bit",
    learning_rate=2e-4, # A good starting point for fine-tuning
    fp16=not use_bf16,
    bf16=use_bf16,
    logging_steps=1, # Log the loss at every optimizer step
    report_to="none", # Do not send metrics to any external tracker
)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text", # Column produced by format_prompts_func in Step 3
    max_seq_length=max_seq_length,
    args=training_arguments,
)

# Run the training
trainer_stats = trainer.train()

In [None]:
# Step 6: Test Your Fine-tuned Model

# Switch the already-loaded model (LoRA adapters included) into inference mode.
# The helper is called for its in-place effect, as in Unsloth's own examples;
# `model` is deliberately not rebound to the return value, so the name keeps
# pointing at the model regardless of what the helper returns.
FastLanguageModel.for_inference(model)

In [None]:
new_messages = [
    {
        "role": "user",
        "content": "Tell me about the specific legal requirements for starting a tech startup in Delaware."
    }
]

# Prompt the model in the same Alpaca format it was fine-tuned on in Step 3.
# No cell in this notebook configures a chat template on the tokenizer, and the
# training text was built from `alpaca_prompt`, so the question goes into the
# instruction slot while the input and response slots are left empty for the
# model to complete.
instruction = new_messages[-1]["content"]
prompt = alpaca_prompt.format(instruction, "", "")

# Tokenize the single prompt; keep the attention mask so generate() does not
# have to guess which positions are real tokens.
encoded = tokenizer([prompt], return_tensors="pt").to("cuda")
inputs = encoded["input_ids"]

outputs = model.generate(
    input_ids=inputs,
    attention_mask=encoded["attention_mask"],
    max_new_tokens=256,
    use_cache=True,
    do_sample=True,
    temperature=0.7,
    repetition_penalty=1.1,
)

# Decode only the newly generated tokens (everything after the prompt).
print(tokenizer.decode(
    outputs[0][inputs.shape[1]:],
    skip_special_tokens=True
))