In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
from trl import SFTConfig, SFTTrainer, setup_chat_format
import torch
from pathlib import Path
from datetime import datetime

In [2]:
# Base checkpoint to fine-tune.
model_id = "Qwen/Qwen3-0.6B"  # example; use yours

# Use bf16 when a CUDA device supports it, else fp16.
# The availability check runs first so the capability query is never made on a
# machine without CUDA (some PyTorch builds raise there instead of returning False).
dtype = (
    torch.bfloat16
    if torch.cuda.is_available() and torch.cuda.is_bf16_supported()
    else torch.float16
)

tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)

# Per-device memory caps handed to the automatic device placement.
# NOTE(review): integer keys 0 and 1 assume two visible GPUs — adjust for your machine.
max_memory = {
    0: "1GiB",  # keep only ~1GiB of the model on GPU0
    1: "7GiB",  # rest of the model goes here
    "cpu": "24GiB",  # optional spillover/offload safety
}

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=dtype,
    device_map="auto",  # let HF place layers under the caps
    max_memory=max_memory,
    low_cpu_mem_usage=True,  # streams weights in
    offload_folder="./offload",  # only used if weights have to be offloaded to disk
)

# (Training tip) disable the generation cache and enable gradient checkpointing
# to reduce activation memory during training.
model.config.use_cache = False
model.gradient_checkpointing_enable()

# Only install a chat format when the tokenizer does not already ship a chat template.
if not getattr(tokenizer, "chat_template", None):
    model, tokenizer = setup_chat_format(model=model, tokenizer=tokenizer)

# Device holding the first parameter the model yields; when you create batches
# manually, put the inputs on this device.
# NOTE(review): with the caps above this is expected to be GPU 0 — confirm via
# `model.hf_device_map`.
first_device = next(model.parameters()).device

In [3]:
# Dataset whose "train" and "test" splits feed the trainer further down.
dataset_id = "trl-lib/Capybara"
ds = load_dataset(dataset_id)

In [4]:
# Timestamp of this run in a filesystem-friendly form (e.g. "Jan-05-2025-13-45-10").
# NOTE(review): neither value is used in the model name or paths below; kept in
# case other cells rely on them — confirm or fold into the name if uniqueness is wanted.
current_time = datetime.now()
formatted_time = current_time.strftime("%b-%d-%Y-%H-%M-%S")

# Name of the fine-tuned model: base model id with "/" made path-safe, plus "-ft"
fine_tuned_model_name = f'{model_id.replace("/", "--")}-ft'

# Model assets output folder (created if missing; a no-op when it already exists)
model_output_folder = Path.cwd().joinpath(f"models/{fine_tuned_model_name}")
model_output_folder.mkdir(parents=True, exist_ok=True)

# SFTTrainer configuration
sft_config = SFTConfig(
    # Output directory for model assets (passed as a plain string path)
    output_dir=str(model_output_folder),
    # Maximum number of gradient update steps during training.
    max_steps=200,
    # The initial learning rate for the optimizer; a common starting point for fine-tuning.
    learning_rate=5e-5,
    # Number of training samples per device in each batch. Set according to your
    # GPU memory capacity; smaller values help fit models into memory-constrained GPUs.
    per_device_train_batch_size=1,
    # Accumulate gradients over 8 steps: effective batch size = 1 * 8 = 8
    gradient_accumulation_steps=8,
    # Log training metrics (e.g., loss) every 10 steps.
    logging_steps=10,
    # Save a model checkpoint every 100 steps. In case of failure, loss of work
    # is limited to a maximum of 100 steps.
    save_steps=100,
    # Evaluate the model at regular step intervals ...
    eval_strategy="steps",
    # ... namely after every 50 steps.
    eval_steps=50,
    # Request the MPS backend only when the model actually lives on an MPS device.
    # `first_device` is a torch.device, so compare its `.type`; comparing the
    # device object itself to the string "mps" is always False.
    # NOTE(review): recent transformers releases deprecate `use_mps_device`
    # (MPS is picked automatically) — confirm against the installed version.
    use_mps_device=(first_device.type == "mps"),
    # Repository name used for the model on the Hugging Face Hub
    hub_model_id=fine_tuned_model_name,
    # Maximum tokenized sequence length — presumably longer samples are truncated; confirm in TRL docs
    max_length=128,
)

In [5]:
# Pick the two dataset splits up front so the trainer call below stays compact.
train_split = ds["train"]
eval_split = ds["test"]

# Build the SFTTrainer from the model, the configuration defined earlier,
# both splits, and the tokenizer acting as the processing class.
trainer = SFTTrainer(
    model=model,
    args=sft_config,
    train_dataset=train_split,
    eval_dataset=eval_split,
    processing_class=tokenizer,
)

In [6]:
# Run fine-tuning; the returned summary is echoed as this cell's output.
train_output = trainer.train()
train_output

Step,Training Loss,Validation Loss
50,1.7478,1.88018
100,1.7352,1.842487
150,1.7793,1.826471
200,1.8316,1.823102


TrainOutput(global_step=200, training_loss=1.8082774925231933, metrics={'train_runtime': 229.0441, 'train_samples_per_second': 6.986, 'train_steps_per_second': 0.873, 'total_flos': 523354621870080.0, 'train_loss': 1.8082774925231933})

In [7]:
# Save the fine-tuned model (currently disabled; uncomment the line below to write it to disk)
# trainer.save_model(f"./{fine_tuned_model_name}")