<a href="https://colab.research.google.com/github/harshavardhangadila/AI-tasks-using-unsloth.ai/blob/main/Part%201%20Fine%20tuning%20using%20SmolLM2/Finetuning_Using_SmolLM2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# 1) Install the fine-tuning stack (notebook shell magic, runs pip quietly).
#    transformers is pinned to an exact version; the other packages only set a
#    minimum version, and sentencepiece is left unconstrained.
!pip -q install --upgrade \
  "unsloth>=2025.10.0" \
  "transformers==4.57.1" \
  "accelerate>=1.10.0" \
  "datasets>=2.20.0" \
  "trl>=0.23.0" \
  "peft>=0.17.1" \
  "sentencepiece"

# 2) IMPORTANT: import unsloth BEFORE transformers/peft
import sys, platform, torch, os
from unsloth import FastLanguageModel

import transformers, datasets, accelerate, trl, peft
# Report interpreter, OS and library versions so a failing run can be tied
# to a concrete environment.
version_rows = (
    ("Python       :", sys.version.split()[0]),
    ("Platform     :", platform.platform()),
    ("Transformers :", transformers.__version__),
    ("Accelerate   :", accelerate.__version__),
    ("Datasets     :", datasets.__version__),
    ("TRL          :", trl.__version__),
    ("PEFT         :", peft.__version__),
)
for row_label, row_value in version_rows:
    print(row_label, row_value)

# 3) GPU check: print the first CUDA device's name, memory and bf16 support,
#    or a hint on how to enable a GPU runtime when none is visible.
cuda_ok = torch.cuda.is_available()
print("\nCUDA available:", cuda_ok)
if not cuda_ok:
    print("‚ö†Ô∏è No GPU detected. In Colab: Runtime ‚Üí Change runtime type ‚Üí GPU.")
else:
    dev = torch.cuda.get_device_properties(0)
    vram_gb = round(dev.total_memory/1e9,2)
    bf16_ok = torch.cuda.is_bf16_supported()
    print(f"GPU          : {dev.name} | VRAM ~{vram_gb} GB | bf16: {bf16_ok}")

# 4) Load SmolLM2 through Unsloth and run one short greedy generation.
BASE_MODEL = "HuggingFaceTB/SmolLM2-135M"
# Request float16 weights when CUDA is available, float32 otherwise.
# NOTE(review): on GPU this asks for half precision; confirm that matches the
# precision later training steps expect.
if cuda_ok:
    dtype = torch.float16
else:
    dtype = torch.float32

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=BASE_MODEL,
    max_seq_length=1024,
    dtype=dtype,
    load_in_4bit=False,  # no 4-bit quantized loading
)
# Reuse EOS as the padding token when the tokenizer defines none.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Put the model into Unsloth's inference mode before generating.
FastLanguageModel.for_inference(model)

device = "cuda" if cuda_ok else "cpu"
model.to(device)

prompt = "You are a concise assistant.\nUser: Hello!\nAssistant:"
inputs = tokenizer(prompt, return_tensors="pt").to(device)
# Greedy decoding of at most 16 new tokens, without tracking gradients.
with torch.no_grad():
    out = model.generate(**inputs, max_new_tokens=16, do_sample=False)

smoke_text = tokenizer.decode(out[0], skip_special_tokens=True)
print("\n‚úÖ Smoke test output:\n", smoke_text)
if cuda_ok:
    peak_reserved_gb = round(torch.cuda.max_memory_reserved() / 1e9, 3)
    print("\nüíæ Peak reserved GPU memory (GB):", peak_reserved_gb)
print("\n‚úÖ STEP 1 complete ‚Äî environment OK. Reply with the printed output, then we‚Äôll move to STEP 2 (dataset prep).")


[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cudf-cu12 25.6.0 requires pyarrow<20.0.0a0,>=14.0.0; platform_machine == "x86_64", but you have pyarrow 22.0.0 which is incompatible.
pylibcudf-cu12 25.6.0 requires pyarrow<20.0.0a0,>=14.0.0; platform_machine == "x86_64", but you have pyarrow 22.0.0 which is incompatible.[0m[31m
[0mPython       : 3.12.12
Platform     : Linux-6.6.105+-x86_64-with-glibc2.35
Transformers : 4.57.1
Accelerate   : 1.11.0
Datasets     : 4.3.0
TRL          : 0.23.0
PEFT         : 0.17.1

CUDA available: True
GPU          : Tesla T4 | VRAM ~15.83 GB | bf16: False
==((====))==  Unsloth 2025.11.2: Fast Llama patching. Transformers: 4.57.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xfor

In [None]:
## STEP 2 — Load AG News
import os, random, numpy as np, torch
from datasets import load_dataset
from transformers import AutoTokenizer

# --- tokenizer comes from the same checkpoint id as the model ---
BASE_MODEL = "HuggingFaceTB/SmolLM2-135M"
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=True)
# Reuse EOS for padding when the tokenizer ships without a pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
# Appended to every formatted training example.
EOS = tokenizer.eos_token

# Seed Python, NumPy and torch (plus every CUDA device, when present) with
# the same value for reproducibility.
SEED = 42
for seed_rng in (random.seed, np.random.seed, torch.manual_seed):
    seed_rng(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

def load_topic_data():
    """Load the topic-classification dataset, with one fallback.

    Tries ``ag_news`` first. If that raises for any reason, the error is
    printed and ``dbpedia_14`` is loaded instead; a failure of the fallback
    propagates to the caller.

    Returns:
        A ``(dataset, name)`` pair, where ``name`` is ``"ag_news"`` or
        ``"dbpedia_14"`` depending on which load succeeded.
    """
    primary, fallback = "ag_news", "dbpedia_14"
    try:
        dataset = load_dataset(primary)
        print("‚úÖ Loaded dataset: ag_news")
    except Exception as err:
        print("‚ö†Ô∏è Could not load 'ag_news' ‚Üí", err)
        print("‚Ü™Ô∏è Falling back to 'dbpedia_14'...")
        dataset = load_dataset(fallback)
        print("‚úÖ Loaded dataset: dbpedia_14")
        return dataset, fallback
    return dataset, primary

raw, ds_name = load_topic_data()

# Small fixed-size splits so a T4 can train quickly: the first VALID_SIZE rows
# of the train split become validation, the next TRAIN_SIZE rows become training.
VALID_SIZE, TRAIN_SIZE = 4000, 10000
train_raw, test_raw = raw["train"], raw["test"]

if ds_name == "ag_news":
    text_field, label_field = "text", "label"
    LABELS = ["World", "Sports", "Business", "Sci/Tech"]
else:
    text_field, label_field = "content", "label"
    # NOTE(review): dbpedia_14's train split appears to be stored grouped by
    # class, so a contiguous slice would contain a single label. Shuffle with a
    # fixed seed before slicing so every class can appear in both splits.
    train_raw = train_raw.shuffle(seed=42)
    # Prefer the dataset's own class names over opaque placeholders; fall back
    # to "Class_i" (DBPedia has 14 classes) if the feature exposes no names.
    class_names = getattr(train_raw.features[label_field], "names", None)
    LABELS = list(class_names) if class_names else [f"Class_{i}" for i in range(14)]

# Slicing is shared by both datasets (it was duplicated per branch before).
valid_raw = train_raw.select(range(VALID_SIZE))
train_raw = train_raw.select(range(VALID_SIZE, VALID_SIZE + TRAIN_SIZE))

# Integer label id -> human-readable topic name, plus the comma-separated
# label list that is shown inside the prompt.
LABEL_MAP = dict(enumerate(LABELS))
label_set_str = ", ".join(LABELS)

# Prompt template with three positional `{}` slots, filled in this order by
# `PROMPT.format(...)`: the comma-separated label set, the article text, and
# the gold topic that follows "### Topic:".
PROMPT = """You are an assistant that classifies a news article into ONE topic from this set:
{}

### Article:
{}

### Instructions:
Respond with exactly one topic from the set above (no extra words).

### Topic:
{}"""

def format_topic(batch):
    """Turn a batch of (article, label) rows into full prompt strings.

    ``batch`` is read column-wise: the column named by ``text_field`` supplies
    the articles and the column named by ``label_field`` supplies the labels,
    paired in order. Each label is converted to ``int`` and looked up in
    ``LABEL_MAP``; a label with no entry is written as its own string form.
    Every rendered prompt has ``EOS`` appended.

    Returns:
        ``{"text": [...]}`` with one formatted string per input row.
    """
    rendered = [
        PROMPT.format(label_set_str, article, LABEL_MAP.get(int(label), str(label))) + EOS
        for article, label in zip(batch[text_field], batch[label_field])
    ]
    return {"text": rendered}

# Format both splits with the prompt template; the original columns are
# dropped so only the generated "text" column remains.
train, valid = (
    split.map(format_topic, batched=True, remove_columns=split.column_names)
    for split in (train_raw, valid_raw)
)

print(f"‚úÖ train/valid sizes: {len(train)} / {len(valid)}")
print("‚úÖ Example (truncated to 800 chars):\n")
first_example = train["text"][0]
print(first_example[:800])

# Token-length sanity check on the first 64 training examples
# (no padding, truncated at 512 tokens).
enc = tokenizer(train["text"][:64], padding=False, truncation=True, max_length=512)
lengths = list(map(len, enc["input_ids"]))
shortest, longest = min(lengths), max(lengths)
mean_len = sum(lengths)/len(lengths)
print(f"\nToken length stats on 64 samples ‚Äî min:{shortest}, max:{longest}, avg:{mean_len:.1f}")
print("\n‚úÖ STEP 2 complete ‚Äî data is formatted and ready for training.")


README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/18.6M [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/1.23M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/120000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7600 [00:00<?, ? examples/s]

✅ Loaded dataset: ag_news


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

✅ train/valid sizes: 10000 / 4000
✅ Example (truncated to 800 chars):

You are an assistant that classifies a news article into ONE topic from this set:
World, Sports, Business, Sci/Tech

### Article:
Court Deals Blow to Movie Studios (Reuters) Reuters - A federal appeals court on Thursday\delivered a stinging blow to the anti-piracy efforts of major\movie studios and music companies by ruling several Internet\file-sharing software companies are not liable for copyright\infringement.

### Instructions:
Respond with exactly one topic from the set above (no extra words).

### Topic:
Sci/Tech<|endoftext|>

Token length stats on 64 samples — min:82, max:249, avg:125.0

✅ STEP 2 complete — data is formatted and ready for training.


In [None]:
## STEP 3: Sanity train with Unsloth + TRL
import os, torch
from trl import SFTTrainer, SFTConfig

# We already have: model, tokenizer, and the processed datasets: `train`, `valid`
# (with a single column "text" that contains the fully formatted prompt+label)
# If you're in a fresh runtime, make sure those 3 objects exist exactly as before.

# Both mixed-precision flags are switched off below (no bf16 on a T4, and fp16
# is skipped to avoid GradScaler issues); gradient checkpointing is enabled to
# reduce memory use.
torch_dtype_note = "torch.float32"

# Run length and batch shape: a single sanity epoch (raise to 2-3 later),
# batches of 8 per device, no gradient accumulation.
run_args = dict(
    num_train_epochs=1,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=1,
)
# Optimizer and learning-rate schedule.
optim_args = dict(
    learning_rate=5e-5,
    weight_decay=0.05,
    warmup_ratio=0.03,
)
# Log every 20 steps; no checkpoints and no in-training evaluation for this
# quick sanity run.
reporting_args = dict(
    logging_steps=20,
    save_strategy="no",
    eval_strategy="no",
)
# Memory and precision behaviour; unused columns are kept so the "text"
# column reaches SFTTrainer.
memory_args = dict(
    gradient_checkpointing=True,
    remove_unused_columns=False,
    fp16=False,
    bf16=False,
)

cfg = SFTConfig(
    output_dir="smollm2_135m_fullft_agnews_unsloth_fp32",
    **run_args,
    **optim_args,
    **reporting_args,
    **memory_args,
)

# Wire the model, tokenizer, datasets and training config into an SFTTrainer.
trainer = SFTTrainer(
    args=cfg,
    model=model,
    tokenizer=tokenizer,
    # Data: `valid` is only consulted by an explicit evaluate() call, because
    # the config disables evaluation during training.
    train_dataset=train,
    eval_dataset=valid,
    # Each row's "text" column already holds the full prompt + label string;
    # examples are not packed together.
    dataset_text_field="text",
    packing=False,
)

# Print parameter counts and a device/precision summary, then train.
param_sizes = [(p.numel(), p.requires_grad) for p in model.parameters()]
total_params = sum(count for count, _ in param_sizes)
trainable_params = sum(count for count, needs_grad in param_sizes if needs_grad)
print(f"Params: total={total_params/1e6:.1f}M | trainable={trainable_params/1e6:.1f}M (full finetuning)")

# Report the dtype the weights actually have rather than a hard-coded label,
# so the summary stays truthful however the model was loaded. The first
# parameter is used as the representative; "unknown" covers a model with no
# parameters.
first_param = next(model.parameters(), None)
torch_dtype_note = str(first_param.dtype) if first_param is not None else "unknown"

if torch.cuda.is_available():
    gpu = torch.cuda.get_device_properties(0)
    print(f"GPU: {gpu.name} | VRAM ~{round(gpu.total_memory/1e9, 2)} GB | dtype={torch_dtype_note}")

print("‚úÖ Trainer ready. Starting a short fp32 sanity train...")
train_result = trainer.train()

# Wall-clock training time (minutes) and peak reserved GPU memory (GB);
# the memory figure is the literal "CPU" when CUDA is unavailable.
runtime_seconds = train_result.metrics.get("train_runtime", 0)
mins = round(runtime_seconds/60, 2)
if torch.cuda.is_available():
    peak_gb = round(torch.cuda.max_memory_reserved()/1e9, 3)
else:
    peak_gb = "CPU"
print(f"‚è± Runtime (min): {mins}")
print(f"üíæ Peak reserved GPU memory (GB): {peak_gb}")

# Evaluate once on the trainer's eval dataset now that training has finished.
metrics = trainer.evaluate()
print("üìä Eval metrics:", metrics)

# Tiny generation sanity check: wrap the trained model in a text-generation
# pipeline.
from transformers import pipeline

gen = pipeline(
    "text-generation",
    model=trainer.model,
    tokenizer=tokenizer,
    device_map="auto",
    # `torch_dtype` is deprecated in the transformers release this notebook
    # pins (it emits "`torch_dtype` is deprecated! Use `dtype` instead!");
    # `dtype` is the replacement keyword.
    # NOTE(review): the model object is already instantiated, so this likely
    # does not re-cast its weights — confirm if exact precision matters.
    dtype=torch.float32,
)

def build_infer_prompt(
    article_text: str,
    labels: "list[str] | tuple[str, ...]" = ("World", "Sports", "Business", "Sci/Tech"),
) -> str:
    """Build the classification prompt for one article, leaving the topic blank.

    Args:
        article_text: The article to classify; inserted verbatim under
            "### Article:".
        labels: Topic names offered to the model, joined with ", " on the
            second line of the prompt. Defaults to the four topics that were
            previously hard-coded, so existing single-argument calls produce
            exactly the same string as before. Pass a different sequence when
            the model was trained on another label set.

    Returns:
        The prompt text, ending with "### Topic:" followed by a newline so the
        model's continuation is the predicted topic.
    """
    label_line = ", ".join(labels)
    return (
        "You are an assistant that classifies a news article into ONE topic from this set:\n"
        f"{label_line}\n"
        "\n"
        "### Article:\n"
        f"{article_text}\n"
        "\n"
        "### Instructions:\n"
        "Respond with exactly one topic from the set above (no extra words).\n"
        "\n"
        "### Topic:\n"
    )

# Four hand-written articles used as a quick qualitative check of the
# trained model.
samples = [
    "The central bank raised interest rates amid concerns about inflation in the manufacturing sector.",
    "The team clinched the championship title after a stunning overtime victory.",
    "Scientists have unveiled a new quantum processor that dramatically improves error rates.",
    "Trade tensions rise as global markets react to new tariffs on imported goods.",
]

print("\nüîµ Sanity predictions:")
for sample in samples:
    prompt = build_infer_prompt(sample)
    generated = gen(prompt, max_new_tokens=8, do_sample=False)[0]["generated_text"]
    # The pipeline returns prompt + continuation; keep only the first line of
    # what follows the last "### Topic:" marker.
    answer_lines = generated.split("### Topic:")[-1].strip().splitlines()
    # If the model produced nothing (or only whitespace) after the marker,
    # there is no first line; report that instead of raising IndexError.
    pred = answer_lines[0] if answer_lines else "(no output)"
    print(" -", pred)


The model is already on multiple devices. Skipping the move to device specified in `args`.


Params: total=134.5M | trainable=134.5M (full finetuning)
GPU: Tesla T4 | VRAM ~15.83 GB | dtype=torch.float32
✅ Trainer ready. Starting a short fp32 sanity train...


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 10,000 | Num Epochs = 1 | Total steps = 1,250
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 1 x 1) = 8
 "-____-"     Trainable parameters = 134,515,008 of 134,515,008 (100.00% trained)


Step,Training Loss
20,3.5461
40,1.941
60,1.6479
80,1.5686
100,1.5741
120,1.5323
140,1.5193
160,1.5097
180,1.5175
200,1.4713


Unsloth: Will smartly offload gradients to save VRAM!
⏱ Runtime (min): 5.8
💾 Peak reserved GPU memory (GB): 4.534


`torch_dtype` is deprecated! Use `dtype` instead!
Device set to use cuda:0


📊 Eval metrics: {'eval_loss': 1.4429614543914795, 'eval_runtime': 56.047, 'eval_samples_per_second': 71.369, 'eval_steps_per_second': 17.842, 'epoch': 1.0}

🔵 Sanity predictions:
 - Business
 - Sports
 - Sci/Tech
 - Business
