Installing Dependencies

In [1]:
# Install Unsloth and essential libraries for Llama 3.2
import os
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps xformers trl peft accelerate bitsandbytes

Collecting unsloth@ git+https://github.com/unslothai/unsloth.git (from unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Cloning https://github.com/unslothai/unsloth.git to /tmp/pip-install-lmxm0yx2/unsloth_078cf7d54cd841cb8dd69703fdcc8e06
  Running command git clone --filter=blob:none --quiet https://github.com/unslothai/unsloth.git /tmp/pip-install-lmxm0yx2/unsloth_078cf7d54cd841cb8dd69703fdcc8e06
  Resolved https://github.com/unslothai/unsloth.git to commit d59ee86feeca4e0f63964d6fa7986a3d8d343a4c
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting unsloth_zoo>=2026.1.3 (from unsloth@ git+https://github.com/unslothai/unsloth.git->unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Downloading unsloth_zoo-2026.1.3-py3-none-any.whl.metadata (32 kB)
Collecting tyro (from unsloth@ git+https://github.com/unslothai/unsloth.git-

Load Model and Add LoRA Adapters

In [2]:
from unsloth import FastLanguageModel
import torch

# --- Loader settings (module-level: later cells reuse max_seq_length) ---
max_seq_length = 2048   # sequence length handed to the loader and, later, to the trainer
dtype = None            # None = let the loader auto-detect the dtype
load_in_4bit = True     # load 4-bit quantized weights to save memory

base_model_kwargs = dict(
    model_name = "unsloth/Llama-3.2-3B-Instruct",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)
model, tokenizer = FastLanguageModel.from_pretrained(**base_model_kwargs)

# --- LoRA adapters: these are the weights that fine-tuning will update ---
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",    # attention projections
    "gate_proj", "up_proj", "down_proj",       # MLP projections
]
lora_kwargs = dict(
    r = 16,                                 # LoRA rank
    target_modules = lora_target_modules,
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,                    # fixed seed for the adapter setup
)
model = FastLanguageModel.get_peft_model(model, **lora_kwargs)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2026.1.3: Fast Llama patching. Transformers: 4.57.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.35G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

Unsloth 2026.1.3 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


Data Preparation

In [3]:
from datasets import load_dataset
import os

from google.colab import files

# Single source of truth for the training file name, so the existence check,
# the upload prompt and the loader can never disagree.
DATA_FILE = "legal_train.jsonl"

# 1. OPTIONAL: upload the file via code (or drag it into the Colab file browser).
if not os.path.exists(DATA_FILE):
    print(f"Please upload your '{DATA_FILE}' file:")
    uploaded = files.upload()
    # Fail here with a clear message if the upload was cancelled or the file has a
    # different name, rather than letting load_dataset fail further down.
    if not os.path.exists(DATA_FILE):
        raise FileNotFoundError(
            f"Expected '{DATA_FILE}' in {os.getcwd()!r} after upload; "
            f"received: {sorted(uploaded)}"
        )

# 2. Load the JSON-lines file from the current working directory
#    (in Colab this is usually /content/).
dataset = load_dataset("json", data_files=DATA_FILE, split="train")

# 3. Prompt template: a fixed system turn, then a user turn (first {}) and an
#    assistant turn (second {}), each turn terminated by <|eot_id|>.
legal_prompt = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>
You are a legal aid assistant. Simplify the following legalese into plain, easy-to-understand English for a non-lawyer.<|eot_id|><|start_header_id|>user<|end_header_id|>
{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
{}<|eot_id|>"""

def formatting_prompts_func(examples):
    """Render a batch of instruction/output pairs into full prompt strings.

    Args:
        examples: mapping holding parallel sequences under the keys
            "instruction" and "output".

    Returns:
        A dict with the single key "text": one `legal_prompt` rendering per
        (instruction, output) pair, in input order.
    """
    pairs = zip(examples["instruction"], examples["output"])
    return {"text": [legal_prompt.format(source, simplified) for source, simplified in pairs]}

# Add the rendered "text" column; batched=True passes column lists to the formatter.
dataset = dataset.map(function=formatting_prompts_func, batched=True)

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/71 [00:00<?, ? examples/s]

Training (The Main Event)

In [5]:
from trl import SFTTrainer
from transformers import TrainingArguments

# Decide the mixed-precision mode once so that exactly one of fp16/bf16 is enabled.
use_bf16 = torch.cuda.is_bf16_supported()

# NOTE(review): in the recorded run (71 examples, effective batch 2 x 4 = 8),
# max_steps = 350 amounted to ~39 epochs -- confirm that many passes is intended.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",        # column produced by formatting_prompts_func
    max_seq_length = max_seq_length,
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        max_steps = 350,
        learning_rate = 2e-4,
        fp16 = not use_bf16,
        bf16 = use_bf16,
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",         # fixed typo: was "output   s" (stray spaces)
    ),
)

# Last expression of the cell: the returned TrainOutput is displayed as the cell result.
trainer.train()

The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 71 | Num Epochs = 39 | Total steps = 350
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 24,313,856 of 3,237,063,680 (0.75% trained)


Step,Training Loss
1,0.5115
2,0.507
3,0.5236
4,0.4941
5,0.4266
6,0.4634
7,0.4499
8,0.4641
9,0.4278
10,0.3236




0,1
train/epoch,▁▁▁▁▂▂▂▂▃▃▃▃▃▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇██
train/global_step,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇▇█
train/grad_norm,█▄▅▃▃▂▁▂▂▂▂▃▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▁
train/learning_rate,████▇▇▇▇▇▆▆▆▅▅▅▅▅▅▅▅▄▄▄▄▃▃▃▃▂▂▂▂▂▁▁▁▁▁▁▁
train/loss,█▆▄▂▄▂▂▂▂▂▂▁▁▂▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
total_flos,3090379145963520.0
train/epoch,38.88889
train/global_step,350.0
train/grad_norm,0.25324
train/learning_rate,0.0
train/loss,0.0925
train_loss,0.11421
train_runtime,400.1803
train_samples_per_second,6.997
train_steps_per_second,0.875


TrainOutput(global_step=350, training_loss=0.11420533995543207, metrics={'train_runtime': 400.1803, 'train_samples_per_second': 6.997, 'train_steps_per_second': 0.875, 'total_flos': 3090379145963520.0, 'train_loss': 0.11420533995543207, 'epoch': 38.888888888888886})

Smoke Test


In [6]:
# Inference smoke test: simplify one sample sentence with the fine-tuned model.
FastLanguageModel.for_inference(model)  # switch Unsloth into its inference mode
complex_text = "Notwithstanding any provisions contained herein to the contrary, the Tenant shall indemnify the Landlord."

# Build the prompt up to the *open* assistant turn. Formatting the training template
# with an empty answer leaves a trailing <|eot_id|>, which would close the assistant
# turn before the model has generated anything -- strip it.
end_of_turn = "<|eot_id|>"
prompt = legal_prompt.format(complex_text, "")
if prompt.endswith(end_of_turn):
    prompt = prompt[: -len(end_of_turn)]

# The template already starts with <|begin_of_text|>; add_special_tokens=False keeps
# the tokenizer from prepending a second one.
inputs = tokenizer([prompt], return_tensors = "pt", add_special_tokens = False).to("cuda")
outputs = model.generate(**inputs, max_new_tokens = 64)

# Decode only the newly generated tokens. Splitting the full decoded text on the word
# "assistant" breaks as soon as the answer itself contains that word.
prompt_length = inputs["input_ids"].shape[1]
response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

print("--- FINETUNED OUTPUT ---")
print(response.strip())

--- FINETUNED OUTPUT ---
You may be required to pay for damages.


Save for Local Use

In [None]:
# Write the model (with its LoRA adapters) and the tokenizer into one folder.
adapter_dir = "legalese_model_lora"
for artifact in (model, tokenizer):
    artifact.save_pretrained(adapter_dir)

# Optional: export to GGUF for Ollama / local use
# model.save_pretrained_gguf("model_gguf", tokenizer, quantization_method = "q4_k_m")

print(f"Training Complete! Download the '{adapter_dir}' folder to your Windows machine.")