# Classification Chat Template using TinyLlama and Unsloth

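This notebook fine-tunes a 4-bit quantized TinyLlama model with LoRA adapters, using Unsloth to speed up training, so that it classifies AG News articles into four categories (World, Sports, Business, Sci/Tech) through an instruction-style prompt template.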

In [None]:
# Install necessary libraries and update Unsloth
!pip install unsloth
!pip install --upgrade --no-deps "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

# Import required libraries
from unsloth import FastLanguageModel
from datasets import load_dataset
import torch
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer
from transformers import TrainingArguments

# Configuration (max_seq_length is reused by the trainer and at inference time)
max_seq_length = 1024  # Shorter sequences keep training fast
load_in_4bit = True    # 4-bit quantization for memory efficiency
dtype = None           # None = auto-detect dtype

# Load the pre-quantized 4-bit TinyLlama base model (not the instruct variant)
# together with its tokenizer.
pretrained_kwargs = dict(
    model_name="unsloth/tinyllama-bnb-4bit",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)
model, tokenizer = FastLanguageModel.from_pretrained(**pretrained_kwargs)

# Add LoRA adapters including `embed_tokens` and `lm_head`
lora_config = LoraConfig(
    r=16,  # Use a smaller rank for faster training
    lora_alpha=16,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
        "lm_head", "embed_tokens"  # Ensure these are included to avoid NaN issues
    ],
    lora_dropout=0,  # Optimized with no dropout
    bias="none"
)

# Prepare the quantized base model FIRST, then attach the adapters.
# prepare_model_for_kbit_training freezes the parameters of the model it is
# given; calling it after get_peft_model (the previous order) therefore froze
# the freshly added LoRA weights as well, and the trainer reported
# "Number of trainable parameters = 0" while the loss stayed flat.
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)

# Fail fast if the adapters did not end up trainable, instead of silently
# spending a full training run on a frozen model.
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
assert trainable_params > 0, "No trainable parameters: LoRA adapters are frozen."
print(f"Trainable parameters: {trainable_params:,}")

# Enable gradient checkpointing if needed for memory savings
model.gradient_checkpointing_enable()

# Data Preparation: train on the first 50% of the AG News training split
train_split = "train[:50%]"
dataset = load_dataset(path="ag_news", split=train_split)

# Function to format inputs for classification with label IDs
def formatting_prompts_func(examples):
    """Build one complete training text (prompt + answer) per example in a batch.

    The original version stopped the text at "### Response: [" and never wrote
    the label into it, so the language-modelling loss was computed on the
    article and instructions only and the model was never shown the answer.
    Each text now ends with the numeric label ID and the closing bracket,
    e.g. "### Response: [2]".

    Args:
        examples: A batch from ``dataset.map(..., batched=True)`` with a
            "text" list (article strings) and a "label" list (label IDs).

    Returns:
        dict with "text" (the formatted prompt-plus-answer strings, replacing
        the raw articles) and "labels" (the label IDs, passed through as-is).
    """
    labels = examples["label"]  # Use numerical label IDs directly
    articles = examples["text"]
    template = (
        "### Instruction:\n"
        "Classify the following news article into one of these categories: "
        "[0] World, [1] Sports, [2] Business, [3] Sci/Tech.\n\n"
        "### Input:\n{text}\n\n"
        "### Response: [{label}]"
    )
    # The article is passed as a format argument, so braces inside the article
    # text are inserted verbatim and never interpreted as placeholders.
    prompts = [
        template.format(text=article, label=label)
        for article, label in zip(articles, labels)
    ]
    return {"text": prompts, "labels": labels}

# Apply the prompt formatter to the whole dataset, one batch at a time
dataset = dataset.map(function=formatting_prompts_func, batched=True)

# Training configuration
# Pick the mixed-precision mode once: BF16 when the GPU supports it, FP16 otherwise.
bf16_supported = torch.cuda.is_bf16_supported()

training_args = TrainingArguments(
    output_dir="outputs",
    seed=3407,
    # Batch shape
    per_device_train_batch_size=4,  # Adjust to prevent OOM
    gradient_accumulation_steps=4,  # Fewer accumulation steps for faster training
    # Schedule
    num_train_epochs=5,  # Train for more epochs to improve fine-tuning
    learning_rate=2e-5,  # Lower the learning rate for more precise training
    lr_scheduler_type="linear",
    warmup_ratio=0.05,  # Reduced warmup
    # Optimizer
    optim="adamw_8bit",  # Use 8-bit Adam optimizer
    weight_decay=0.01,
    # Precision
    fp16=not bf16_supported,
    bf16=bf16_supported,
    # Logging
    logging_steps=1,
)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=dataset,
    dataset_text_field="text",
    dataset_num_proc=2,
    max_seq_length=max_seq_length,
    packing=True,  # Pack sequences for efficient training
)

# Report the GPU and its memory baseline before training starts.
# start_gpu_memory and max_memory are reused by the post-training report.
gpu_stats = torch.cuda.get_device_properties(0)
max_memory = round(gpu_stats.total_memory / 1024**3, 3)  # bytes -> GB
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024**3, 3)  # bytes -> GB
print(f"GPU: {gpu_stats.name}. Max memory: {max_memory} GB.")
print(f"Reserved memory before training: {start_gpu_memory} GB.")

# Train the model
trainer_stats = trainer.train()

# Show final memory and time stats after training.
# All memory figures are in GB, derived from torch's peak *reserved* memory.
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
# Growth over the pre-training baseline, attributed to training itself.
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
print(f"Training time: {trainer_stats.metrics['train_runtime']} seconds.")
print(f"Peak reserved memory: {used_memory} GB.")
print(f"Memory used for LoRA training: {used_memory_for_lora} GB.")
print(f"Peak memory usage: {used_percentage} %.")
# lora_percentage was computed but never shown; report it alongside the rest.
print(f"Memory used for LoRA training as share of max memory: {lora_percentage} %.")

# Inference
FastLanguageModel.for_inference(model)
model.eval()

# Map label ID back to category name
category_mapping = {
    "0": "World",
    "1": "Sports",
    "2": "Business",
    "3": "Sci/Tech"
}

# Prompt mirrors the one built by formatting_prompts_func and stops right
# after "[" so the model's next tokens are the label ID.
inference_prompt = "### Instruction:\nClassify the following news article into one of these categories: [0] World, [1] Sports, [2] Business, [3] Sci/Tech.\n\n### Input:\nApple releases new iPhone model with improved camera\n\n### Response: ["

# No padding: this is a single prompt, and padding it out to max_seq_length
# would (if the tokenizer pads on the right) place pad tokens between the
# prompt and the generated answer, besides wasting compute on 1024 positions.
inputs = tokenizer(
    [inference_prompt],
    return_tensors="pt",
    truncation=True,
    max_length=max_seq_length
).to("cuda")

# Greedy decoding: a classifier should return the same label on every run.
# (`early_stopping` only affects beam search, so it is dropped along with the
# sampling knobs.)
outputs = model.generate(
    input_ids=inputs['input_ids'],
    attention_mask=inputs['attention_mask'],
    max_new_tokens=3,  # Limit to just the label ID
    do_sample=False
)

generated_text = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]

# Decode only the newly generated tokens and take the first character that is
# a valid label ID. The previous `.strip()[:-1]` removed the last character
# regardless of what it was, which only worked for an exact "<digit>]" output.
prompt_token_count = inputs['input_ids'].shape[1]
completion = tokenizer.decode(outputs[0, prompt_token_count:], skip_special_tokens=True)
classification_result = next((ch for ch in completion if ch in category_mapping), "")

category_name = category_mapping.get(classification_result, "Unknown")
print("Classification result:", category_name)

# Persist the model (with its LoRA adapters) and the tokenizer side by side
save_directory = "lora_model"
model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)


Collecting unsloth@ git+https://github.com/unslothai/unsloth.git (from unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Cloning https://github.com/unslothai/unsloth.git to /tmp/pip-install-6ew3h1dm/unsloth_fbc4fb0fa220480e946fee35e87f9e78
  Running command git clone --filter=blob:none --quiet https://github.com/unslothai/unsloth.git /tmp/pip-install-6ew3h1dm/unsloth_fbc4fb0fa220480e946fee35e87f9e78
  Resolved https://github.com/unslothai/unsloth.git to commit a2f4c9793ecf829ede2cb64f2ca7a909ce3b0884
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
==((====))==  Unsloth 2024.9.post4: Fast Llama patching. Transformers = 4.44.2.
   \\   /|    GPU: NVIDIA A100-SXM4-40GB. Max memory: 39.564 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.1+cu121. CUDA = 8.0. CUDA Toolkit = 12.1.


Generating train split: 0 examples [00:00, ? examples/s]

GPU: NVIDIA A100-SXM4-40GB. Max memory: 39.564 GB.
Reserved memory before training: 1.277 GB.


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 6,998 | Num Epochs = 5
O^O/ \_/ \    Batch size per device = 4 | Gradient Accumulation steps = 4
\        /    Total batch size = 16 | Total steps = 2,185
 "-____-"     Number of trainable parameters = 0
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss
1,2.2713
2,2.255
3,2.2516
4,2.2591
5,2.2475
6,2.2336
7,2.2812
8,2.2373
9,2.2071
10,2.2333


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Training time: 3015.6429 seconds.
Peak reserved memory: 2.766 GB.
Memory used for LoRA training: 1.489 GB.
Peak memory usage: 6.991 %.
Classification result: World




('lora_model/tokenizer_config.json',
 'lora_model/special_tokens_map.json',
 'lora_model/tokenizer.model',
 'lora_model/added_tokens.json',
 'lora_model/tokenizer.json')