In [None]:
# Standard installs
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps xformers "trl<0.9.0" peft accelerate bitsandbytes
!pip install datasets # If loading from Hugging Face Hub

Collecting unsloth@ git+https://github.com/unslothai/unsloth.git (from unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Cloning https://github.com/unslothai/unsloth.git to /tmp/pip-install-mt9vwavh/unsloth_f9b86b3a97d149a29efcbedfce35820b
  Running command git clone --filter=blob:none --quiet https://github.com/unslothai/unsloth.git /tmp/pip-install-mt9vwavh/unsloth_f9b86b3a97d149a29efcbedfce35820b
  Resolved https://github.com/unslothai/unsloth.git to commit 7a8f99e1890213cdd01a3ab6c3e13174a96e8220
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting unsloth_zoo>=2025.4.1 (from unsloth@ git+https://github.com/unslothai/unsloth.git->unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Downloading unsloth_zoo-2025.4.1-py3-none-any.whl.metadata (8.0 kB)
Collecting tyro (from unsloth@ git+https://github.com/unslothai/unsloth.git

Collecting xformers
  Downloading xformers-0.0.29.post3-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (1.0 kB)
Collecting trl<0.9.0
  Downloading trl-0.8.6-py3-none-any.whl.metadata (11 kB)
Downloading xformers-0.0.29.post3-cp311-cp311-manylinux_2_28_x86_64.whl (43.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.4/43.4 MB[0m [31m57.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trl-0.8.6-py3-none-any.whl (245 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m245.2/245.2 kB[0m [31m23.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: xformers, trl
  Attempting uninstall: trl
    Found existing installation: trl 0.15.2
    Uninstalling trl-0.15.2:
      Successfully uninstalled trl-0.15.2
Successfully installed trl-0.8.6 xformers-0.0.29.post3


In [None]:
import torch
from unsloth import FastLanguageModel
from datasets import load_dataset

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
Unsloth: Failed to patch SmolVLMForConditionalGeneration forward function.
🦥 Unsloth Zoo will now patch everything to make training faster!


## Chat Template for Classification (Sentiment Analysis)

In [None]:
# Load the TinyLlama chat checkpoint directly from the Hugging Face Hub.
# The call requests 4-bit loading with a float16 dtype and fixes the sequence
# limit at 2048 tokens; `model` and `tokenizer` are reused by every later cell.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0",  # Hub repository id
    max_seq_length = 2048,
    dtype = torch.float16,
    load_in_4bit = True,
)


# Attach LoRA adapters to the model. Only the four attention projections are
# listed in target_modules, so no MLP layers receive adapters. `random_state`
# is a fixed seed passed to Unsloth.
model = FastLanguageModel.get_peft_model(
    model,
    r = 8,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = True,
    random_state = 3407,
    use_rslora = True,
    loftq_config = None,
)

# Load the first 500 rows of the GLUE SST-2 training split.
dataset = load_dataset("glue", "sst2", split = "train[:500]")  # Only 500 samples

# Format dataset into a chat prompt
def formatting_prompts_func(example):
    """Render one SST-2 row as a ChatML-style sentiment prompt.

    Args:
        example: Mapping with the sentence under ``'sentence'`` and the class
            id under ``'label'``.

    Returns:
        ``{"text": prompt}``; the assistant turn reads ``positive`` when the
        label equals 1 and ``negative`` for every other label value.
    """
    # NOTE(review): the <|im_start|>/<|im_end|> markers are written as plain
    # text here; confirm the loaded tokenizer treats them as intended.
    text, label = example['sentence'], example['label']
    if label == 1:
        label_text = "positive"
    else:
        label_text = "negative"

    return {
        "text": f"<|im_start|>system\nYou are a helpful assistant classifying sentiment.<|im_end|>\n<|im_start|>user\nClassify the sentiment of this sentence: {text}<|im_end|>\n<|im_start|>assistant\n{label_text}<|im_end|>"
    }

# Add a "text" column holding the rendered prompt for every row (applied one
# example at a time, since `batched` is not set).
dataset = dataset.map(formatting_prompts_func)

# Tokenization
def tokenize_function(sample):
    """Tokenize formatted prompts and build labels that ignore padding.

    Args:
        sample: A single example or a batch (``batched=True``) holding the
            prompt string(s) under ``"text"``.

    Returns:
        The tokenizer output with an added ``"labels"`` entry: a copy of
        ``input_ids`` in which every position whose ``attention_mask`` is 0
        is replaced by -100.
    """
    model_inputs = tokenizer(
        sample["text"],
        truncation=True,
        padding="max_length",
        max_length=2048,
    )
    input_ids = model_inputs["input_ids"]
    attention_mask = model_inputs["attention_mask"]

    def _mask_padding(ids, mask):
        # -100 is the label value the loss skips; without it the padded tail
        # (most of each 2048-token row for short prompts) is trained on too.
        return [tok if keep else -100 for tok, keep in zip(ids, mask)]

    # The attention mask is used rather than the pad token id because the pad
    # token may share its id with a real token such as EOS.
    if input_ids and isinstance(input_ids[0], (list, tuple)):
        labels = [_mask_padding(ids, mask) for ids, mask in zip(input_ids, attention_mask)]
    else:
        labels = _mask_padding(input_ids, attention_mask)

    model_inputs["labels"] = labels
    return model_inputs

# Tokenize in batches, then drop the raw columns so only tokenizer outputs
# and labels (plus any untouched dataset columns) remain.
dataset = dataset.map(tokenize_function, batched=True)
dataset = dataset.remove_columns(["sentence", "label", "text"])

# Switch the Unsloth-patched model into training mode.
FastLanguageModel.for_training(model)

from transformers import Trainer, TrainingArguments

# Effective batch size is 4 x 2 = 8, so 30 steps consume 240 of the 500 rows.
# NOTE(review): `report_to` is not set and the recorded run output shows a
# wandb login; confirm that external logging is intended.
trainer = Trainer(
    model = model,
    train_dataset = dataset,
    args = TrainingArguments(
        per_device_train_batch_size = 4,
        gradient_accumulation_steps = 2,
        warmup_steps = 5,
        max_steps = 30,  # Small demo
        learning_rate = 2e-4,
        logging_steps = 1,  # log the loss at every optimizer step
        output_dir = "outputs_chat_template_classification",
        optim = "paged_adamw_32bit",
        fp16 = True,  # mixed precision matches the float16 dtype used at load time
        bf16 = False,
    ),
)

trainer.train()

# Save the trained model (via the PEFT-wrapped `model`) and the tokenizer to
# the same directory.
model.save_pretrained("tinyllama_classification_unsloth")
tokenizer.save_pretrained("tinyllama_classification_unsloth")

==((====))==  Unsloth 2025.4.1: Fast Llama patching. Transformers: 4.51.3.
   \\   /|    NVIDIA L4. Num GPUs = 1. Max memory: 22.161 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Not an error, but Unsloth cannot patch MLP layers with our manual autograd engine since either LoRA adapters
are not enabled or a bias term (like in Qwen) is used.
Unsloth 2025.4.1 patched 22 layers with 22 QKV layers, 22 O layers and 0 MLP layers.


Generating train split:   0%|          | 0/67349 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/872 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1821 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 500 | Num Epochs = 1 | Total steps = 30
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 2
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 2 x 1) = 8
 "-____-"     Trainable parameters = 2,252,800/4,000,000,000 (0.06% trained)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mjayanth-kalyanam[0m ([33mjayanth-kalyanam-san-jose-state-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,13.2757
2,13.2371
3,13.0417
4,13.1578
5,13.4625
6,13.2895
7,13.157
8,13.1296
9,11.6512
10,9.3055


('tinyllama_classification_unsloth/tokenizer_config.json',
 'tinyllama_classification_unsloth/special_tokens_map.json',
 'tinyllama_classification_unsloth/tokenizer.model',
 'tinyllama_classification_unsloth/added_tokens.json',
 'tinyllama_classification_unsloth/tokenizer.json')

## Conversational Chat Finetuning

In [None]:
# This cell reuses `model` and `tokenizer` created in the classification cell;
# run that cell first on a fresh kernel.

from datasets import load_dataset

# Load the first 1000 rows of the ConvAI2 training split.
dataset_conv = load_dataset("conv_ai_2", split="train[:1000]")  # Slightly bigger, but still lightweight

# CONTINUING after loading dataset_conv

# Correct formatting function
def formatting_prompts_conv(example):
    """Render the last two turns of a dialog as a ChatML-style prompt.

    Args:
        example: Mapping whose ``'dialog'`` entry is a list of turns. A turn
            may be a plain string or a record carrying the utterance under
            ``"text"`` (presumably the ConvAI2 layout; verify against the
            dataset card).

    Returns:
        ``{"text": prompt}`` with the second-to-last turn as the user message
        and the last turn as the assistant reply, or ``{"text": ""}`` when
        the dialog has fewer than two turns. The caller must drop the empty
        rows itself.
    """
    def _turn_text(turn):
        # Interpolating a record directly would embed its dict repr
        # ("{'id': ..., 'text': ...}") in the prompt, so pull out the utterance.
        if isinstance(turn, dict):
            return str(turn.get("text") or "")
        return str(turn)

    dialog = example['dialog']
    if len(dialog) < 2:
        return {"text": ""}  # Marker for "not enough dialog turns"

    user_message = _turn_text(dialog[-2])  # Second last message = User
    assistant_response = _turn_text(dialog[-1])  # Last message = Assistant

    prompt = f"<|im_start|>system\nYou are a helpful AI assistant.<|im_end|>\n<|im_start|>user\n{user_message}<|im_end|>\n<|im_start|>assistant\n{assistant_response}<|im_end|>"
    return {"text": prompt}

# Build the prompt column, then actually drop dialogs with fewer than two
# turns: the formatter only marks them with an empty "text", and keeping them
# would feed all-padding rows into training.
dataset_conv = dataset_conv.map(formatting_prompts_conv)
dataset_conv = dataset_conv.filter(lambda row: row["text"] != "")

# Continue with tokenization as before
def tokenize_function_conv(sample):
    """Tokenize conversational prompts and build labels that ignore padding.

    Args:
        sample: A single example or a batch (``batched=True``) holding the
            prompt string(s) under ``"text"``.

    Returns:
        The tokenizer output with an added ``"labels"`` entry: a copy of
        ``input_ids`` in which every position whose ``attention_mask`` is 0
        is replaced by -100.
    """
    model_inputs = tokenizer(
        sample["text"],
        truncation=True,
        padding="max_length",
        max_length=2048,
    )
    input_ids = model_inputs["input_ids"]
    attention_mask = model_inputs["attention_mask"]

    def _mask_padding(ids, mask):
        # -100 is the label value the loss skips; without it the padded tail
        # of every 2048-token row contributes to the loss.
        return [tok if keep else -100 for tok, keep in zip(ids, mask)]

    # Batched calls yield one id list per prompt; single calls yield one list.
    if input_ids and isinstance(input_ids[0], (list, tuple)):
        labels = [_mask_padding(ids, mask) for ids, mask in zip(input_ids, attention_mask)]
    else:
        labels = _mask_padding(input_ids, attention_mask)

    model_inputs["labels"] = labels
    return model_inputs

# Tokenize in batches, then drop the raw dialog and prompt columns.
dataset_conv = dataset_conv.map(tokenize_function_conv, batched=True)
dataset_conv = dataset_conv.remove_columns(["dialog", "text"])

# Training preparation. `model` is the same object trained in the
# classification cell (it is not reloaded here), so this run continues from
# those adapter weights.
FastLanguageModel.for_training(model)

from transformers import Trainer, TrainingArguments

# Effective batch size is 4 x 2 = 8; 40 steps consume 320 examples.
trainer_conv = Trainer(
    model = model,
    train_dataset = dataset_conv,
    args = TrainingArguments(
        per_device_train_batch_size = 4,
        gradient_accumulation_steps = 2,
        warmup_steps = 5,
        max_steps = 40,  # Slightly longer
        learning_rate = 2e-4,
        logging_steps = 1,
        output_dir = "outputs_conversational_chat",
        optim = "paged_adamw_32bit",
        fp16 = True,
        bf16 = False,
        save_strategy="no",  # No intermediate checkpoints; the model is saved once below
    ),
)

trainer_conv.train()

# Save after conversational finetuning
model.save_pretrained("tinyllama_conversational_unsloth")
tokenizer.save_pretrained("tinyllama_conversational_unsloth")

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 1,000 | Num Epochs = 1 | Total steps = 40
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 2
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 2 x 1) = 8
 "-____-"     Trainable parameters = 2,252,800/4,000,000,000 (0.06% trained)


Step,Training Loss
1,4.6918
2,4.6317
3,4.6265
4,4.6318
5,4.5804
6,4.6674
7,4.5813
8,4.5945
9,4.6221
10,4.5806


('tinyllama_conversational_unsloth/tokenizer_config.json',
 'tinyllama_conversational_unsloth/special_tokens_map.json',
 'tinyllama_conversational_unsloth/tokenizer.model',
 'tinyllama_conversational_unsloth/added_tokens.json',
 'tinyllama_conversational_unsloth/tokenizer.json')

## Extending Maximum Context Size of TinyLlama

In [None]:
# This cell reuses `model` and `tokenizer` from the earlier cells.

# Pin the tokenizer's maximum length to 2048 tokens.
# NOTE(review): 2048 is also the max_seq_length the model was loaded with, so
# this cell does not go beyond the load-time limit; confirm whether an actual
# context extension was intended.
tokenizer.model_max_length = 2048

# Two synthetic examples, each a short phrase repeated 350 times.
# NOTE(review): the token count of these strings is not checked here; confirm
# they reach the 2048-token limit if "near-max length" matters.
dummy_texts = ["Hello " * 350, "This is a simulated long context. " * 350]

from datasets import Dataset
# One row per dummy string, stored under the "text" column.
dataset_context = Dataset.from_list([{"text": text} for text in dummy_texts])

# Tokenization
def tokenize_long(sample):
    """Tokenize long-context examples and build labels that ignore padding.

    Args:
        sample: A single example or a batch (``batched=True``) holding the
            text under ``"text"``.

    Returns:
        The tokenizer output with an added ``"labels"`` entry: a copy of
        ``input_ids`` in which every position whose ``attention_mask`` is 0
        is replaced by -100.
    """
    model_inputs = tokenizer(
        sample["text"],
        truncation=True,
        padding="max_length",
        max_length=2048,  # NOT 4096
    )
    input_ids = model_inputs["input_ids"]
    attention_mask = model_inputs["attention_mask"]

    def _mask_padding(ids, mask):
        # -100 is the label value the loss skips; any example shorter than
        # 2048 tokens would otherwise be trained on its padded tail.
        return [tok if keep else -100 for tok, keep in zip(ids, mask)]

    # Batched calls yield one id list per text; single calls yield one list.
    if input_ids and isinstance(input_ids[0], (list, tuple)):
        labels = [_mask_padding(ids, mask) for ids, mask in zip(input_ids, attention_mask)]
    else:
        labels = _mask_padding(input_ids, attention_mask)

    model_inputs["labels"] = labels
    return model_inputs

# Tokenize the two dummy rows in one batch. The "text" column is left in the
# dataset here, unlike the other cells, which remove it.
dataset_context = dataset_context.map(tokenize_long, batched=True)

# Training preparation. `model` is the object already trained by the earlier
# cells; it is not reloaded here.
FastLanguageModel.for_training(model)

from transformers import Trainer, TrainingArguments

# Batch size 1 with no accumulation: 20 steps over 2 rows means each row is
# seen 10 times. No warmup is configured in this run.
trainer_context = Trainer(
    model = model,
    train_dataset = dataset_context,
    args = TrainingArguments(
        per_device_train_batch_size = 1,
        gradient_accumulation_steps = 1,
        max_steps = 20,
        learning_rate = 2e-4,
        logging_steps = 1,
        output_dir = "outputs_extend_context",
        optim = "paged_adamw_32bit",
        fp16 = True,
        bf16 = False,
        save_strategy="no",  # no intermediate checkpoints
    ),
)

trainer_context.train()

# Save the model and tokenizer to the same directory.
model.save_pretrained("tinyllama_extended_context_unsloth")
tokenizer.save_pretrained("tinyllama_extended_context_unsloth")

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 2 | Num Epochs = 10 | Total steps = 20
O^O/ \_/ \    Batch size per device = 1 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (1 x 1 x 1) = 1
 "-____-"     Trainable parameters = 2,252,800/4,000,000,000 (0.06% trained)


Step,Training Loss
1,8.2722
2,17.3182
3,8.2722
4,17.3182
5,8.2722
6,3.541
7,3.541
8,4.2693
9,0.0265
10,3.9872


('tinyllama_extended_context_unsloth/tokenizer_config.json',
 'tinyllama_extended_context_unsloth/special_tokens_map.json',
 'tinyllama_extended_context_unsloth/tokenizer.model',
 'tinyllama_extended_context_unsloth/added_tokens.json',
 'tinyllama_extended_context_unsloth/tokenizer.json')

## Multi-Dataset Single Finetuning

In [None]:
# This cell reuses `model` and `tokenizer` from the earlier cells; the model
# carries whatever training those cells already applied.

from datasets import load_dataset, concatenate_datasets

# Load the first 500 rows of the GLUE SST-2 training split.
dataset_classify = load_dataset("glue", "sst2", split="train[:500]")

def formatting_prompts_classify(example):
    """Build a ChatML-style sentiment-classification prompt from an SST-2 row.

    Args:
        example: Mapping providing ``'sentence'`` (the text to classify) and
            ``'label'`` (the class id).

    Returns:
        A one-key dict ``{"text": prompt}``. The assistant turn is
        ``positive`` only when the label equals 1; any other value yields
        ``negative``.
    """
    text = example['sentence']
    label = example['label']

    # Default to the negative class and upgrade only on an exact match with 1.
    label_text = "negative"
    if label == 1:
        label_text = "positive"

    prompt = f"<|im_start|>system\nYou are a helpful assistant classifying sentiment.<|im_end|>\n<|im_start|>user\nClassify the sentiment of this sentence: {text}<|im_end|>\n<|im_start|>assistant\n{label_text}<|im_end|>"
    return dict(text=prompt)

# Add the prompt column, then drop the raw SST-2 fields. "text" is kept
# because tokenization happens after the two datasets are merged.
dataset_classify = dataset_classify.map(formatting_prompts_classify)
dataset_classify = dataset_classify.remove_columns(["sentence", "label"])

# Load the first 1000 rows of the ConvAI2 training split.
dataset_converse = load_dataset("conv_ai_2", split="train[:1000]")

def formatting_prompts_converse(example):
    """Turn the final exchange of a dialog into a ChatML-style prompt.

    Args:
        example: Mapping whose ``'dialog'`` entry is a list of turns. Each
            turn may be a plain string or a record with the utterance under
            ``"text"`` (presumably the ConvAI2 layout; verify against the
            dataset card).

    Returns:
        ``{"text": prompt}`` using the second-to-last turn as the user
        message and the last turn as the assistant reply, or
        ``{"text": ""}`` when fewer than two turns are available. The caller
        must drop the empty rows itself.
    """
    dialog = example['dialog']
    if len(dialog) < 2:
        return {"text": ""}

    # Formatting a record directly would place its dict repr in the prompt,
    # so reduce each of the two turns to its utterance string first.
    utterances = [
        str(turn.get("text") or "") if isinstance(turn, dict) else str(turn)
        for turn in dialog[-2:]
    ]
    user_message, assistant_response = utterances

    prompt = f"<|im_start|>system\nYou are a helpful AI assistant.<|im_end|>\n<|im_start|>user\n{user_message}<|im_end|>\n<|im_start|>assistant\n{assistant_response}<|im_end|>"
    return {"text": prompt}

# Build the prompt column, then drop dialogs with fewer than two turns: the
# formatter marks them with an empty "text", and merging them would add
# all-padding rows to the training set.
dataset_converse = dataset_converse.map(formatting_prompts_converse)
dataset_converse = dataset_converse.filter(lambda row: row["text"] != "")
dataset_converse = dataset_converse.remove_columns(["dialog"])

# Merge both datasets (classification rows first, then conversational rows).
dataset_merged = concatenate_datasets([dataset_classify, dataset_converse])

# Tokenization
def tokenize_merged(sample):
    """Tokenize merged multi-task prompts and build labels that ignore padding.

    Args:
        sample: A single example or a batch (``batched=True``) holding the
            prompt string(s) under ``"text"``.

    Returns:
        The tokenizer output with an added ``"labels"`` entry: a copy of
        ``input_ids`` in which every position whose ``attention_mask`` is 0
        is replaced by -100.
    """
    model_inputs = tokenizer(
        sample["text"],
        truncation=True,
        padding="max_length",
        max_length=2048,
    )
    input_ids = model_inputs["input_ids"]
    attention_mask = model_inputs["attention_mask"]

    def _mask_padding(ids, mask):
        # -100 is the label value the loss skips; without it the padded tail
        # of every 2048-token row contributes to the loss.
        return [tok if keep else -100 for tok, keep in zip(ids, mask)]

    # Batched calls yield one id list per prompt; single calls yield one list.
    if input_ids and isinstance(input_ids[0], (list, tuple)):
        labels = [_mask_padding(ids, mask) for ids, mask in zip(input_ids, attention_mask)]
    else:
        labels = _mask_padding(input_ids, attention_mask)

    model_inputs["labels"] = labels
    return model_inputs

# Tokenize the merged prompts in batches, then drop the raw prompt column.
dataset_merged = dataset_merged.map(tokenize_merged, batched=True)
dataset_merged = dataset_merged.remove_columns(["text"])

# Training preparation. `model` is the object already trained by the earlier
# cells; it is not reloaded here.
FastLanguageModel.for_training(model)

from transformers import Trainer, TrainingArguments

# Effective batch size is 4 x 2 = 8; 50 steps consume 400 examples.
trainer_merged = Trainer(
    model = model,
    train_dataset = dataset_merged,
    args = TrainingArguments(
        per_device_train_batch_size = 4,
        gradient_accumulation_steps = 2,
        warmup_steps = 5,
        max_steps = 50,  # Slightly bigger since two tasks mixed
        learning_rate = 2e-4,
        logging_steps = 1,
        output_dir = "outputs_multi_task",
        optim = "paged_adamw_32bit",
        fp16 = True,
        bf16 = False,
        save_strategy="no",  # no intermediate checkpoints
    ),
)

trainer_merged.train()

# Save the final model and tokenizer to the same directory.
model.save_pretrained("tinyllama_multi_task_unsloth")
tokenizer.save_pretrained("tinyllama_multi_task_unsloth")

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 1,500 | Num Epochs = 1 | Total steps = 50
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 2
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 2 x 1) = 8
 "-____-"     Trainable parameters = 2,252,800/4,000,000,000 (0.06% trained)


Step,Training Loss
1,4.551
2,4.5727
3,4.5661
4,4.5798
5,4.6116
6,4.526
7,4.5907
8,4.5225
9,4.6144
10,4.537


('tinyllama_multi_task_unsloth/tokenizer_config.json',
 'tinyllama_multi_task_unsloth/special_tokens_map.json',
 'tinyllama_multi_task_unsloth/tokenizer.model',
 'tinyllama_multi_task_unsloth/added_tokens.json',
 'tinyllama_multi_task_unsloth/tokenizer.json')