<a href="https://colab.research.google.com/github/kdhenderson/msds_colab_notebooks/blob/main/MSDS_Workshop_Fine_Tuning_Part_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# ✅ 1. Install Dependencies
!pip install -q unsloth bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo
!pip install -q sentencepiece protobuf "datasets>=3.4.1" huggingface_hub hf_transfer


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m52.3/52.3 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.4/43.4 MB[0m [31m24.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m766.6/766.6 MB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m253.2/253.2 MB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m70.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m82.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m31.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
# ✅ 2. Import Libraries
from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template, train_on_responses_only
from datasets import load_dataset, Dataset
from transformers import DataCollatorForSeq2Seq
from trl import SFTConfig, SFTTrainer
import torch
import json

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [3]:
# ✅ 3. Set Parameters
max_seq_length = 2048
dtype = None
load_in_4bit = True

In [4]:
# ✅ 4. Load Base Model
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Llama-3.2-1B-Instruct-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

==((====))==  Unsloth 2025.9.4: Fast Llama patching. Transformers: 4.56.1.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/1.03G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

In [5]:
# ✅ 5. Add LoRA Adapters (required for 4-bit finetuning)
model = FastLanguageModel.get_peft_model(
    model,
    r = 8,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_alpha = 16,  # how much weight do you want to put on the new matrix vs pretrained (bigger more weight on fine-tuned data)
    lora_dropout = 0.05,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    random_state = 42,
    use_rslora = False,
    loftq_config = None,
)

Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.05.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.
Unsloth 2025.9.4 patched 16 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


In [13]:
# ✅ 6. Create Example Train + Test JSONL Files
train_data = [
    {"conversations": [
        {"role": "user", "content": "What is the HW percentage in DS 6371?"},
        {"role": "assistant", "content": "The homework percentage in 6371 is 10%"}
    ]},
    {"conversations": [
        {"role": "user", "content": "What is the midterm percentage in DS 6371?"},
        {"role": "assistant", "content": "The midterm is worth 25% in DS 6371"}
    ]},
    {"conversations": [
        {"role": "user", "content": "What is the Final Exam worth in DS6371"},
        {"role": "assistant", "content": "The final exam is worth 25% of the grade in DS 6371"}
    ]},
    {"conversations": [
        {"role": "user", "content": "What is the HW percentage in DS 6371?"},
        {"role": "assistant", "content": "The homework percentage in 6371 is 10%"}
    ]},
    {"conversations": [
        {"role": "user", "content": "What is the midterm percentage in DS 6371?"},
        {"role": "assistant", "content": "The midterm is worth 25% in DS 6371"}
    ]},
    {"conversations": [
        {"role": "user", "content": "What is the Final Exam worth in DS6371"},
        {"role": "assistant", "content": "The final exam is worth 25% of the grade in DS 6371"}
    ]},
        {"conversations": [
        {"role": "user", "content": "What is the HW percentage in DS 6371?"},
        {"role": "assistant", "content": "The homework percentage in 6371 is 10%"}
    ]},
    {"conversations": [
        {"role": "user", "content": "What is the midterm percentage in DS 6371?"},
        {"role": "assistant", "content": "The midterm is worth 25% in DS 6371"}
    ]},
    {"conversations": [
        {"role": "user", "content": "What is the Final Exam worth in DS6371"},
        {"role": "assistant", "content": "The final exam is worth 25% of the grade in DS 6371"}
    ]}
]

test_data = [
    {"conversations": [
        {"role": "user", "content": "What percentage of the grade is the homework worth in DS6371?"},
        {"role": "assistant", "content": "10%"}
    ]},
    {"conversations": [
        {"role": "user", "content": "What is 5 + 7?"},
        {"role": "assistant", "content": "5 + 7 equals 12."}
    ]},
    {"conversations": [
        {"role": "user", "content": "What is the midterm percentage in DS 6371?"},
        {"role": "assistant", "content": "25%"}
    ]},
    {"conversations": [
        {"role": "user", "content": "What is the weight of the final exam in DS 6371?"},
        {"role": "assistant", "content": "It means Excitement, Respect and Celebration of Hard Work."}
    ]},
    {"conversations": [
        {"role": "user", "content": "What does Whamo mean?"},
        {"role": "assistant", "content": "It means Excitement, Respect and Celebration of Hard Work."}
    ]}
]

with open("train.jsonl", "w") as f:
    for item in train_data:
        f.write(json.dumps(item) + "\n")

with open("test.jsonl", "w") as f:
    for item in test_data:
        f.write(json.dumps(item) + "\n")

In [14]:
# ✅ 7. Load and Format Dataset
tokenizer = get_chat_template(tokenizer, chat_template="llama-3.1")

def format_conversations(example):
    text = tokenizer.apply_chat_template(example["conversations"], tokenize=False, add_generation_prompt=False)
    return {"text": text}

train_ds = load_dataset("json", data_files="train.jsonl", split="train").map(format_conversations)
test_ds = load_dataset("json", data_files="test.jsonl", split="train").map(format_conversations)

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/9 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

In [15]:
# ✅ 8. Tokenize and Mask Responses
train_ds = train_ds.map(lambda x: tokenizer(x["text"]), batched=True, num_proc=2)
test_ds = test_ds.map(lambda x: tokenizer(x["text"]), batched=True, num_proc=2)

Map (num_proc=2):   0%|          | 0/9 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/5 [00:00<?, ? examples/s]

In [16]:
# ✅ 9. Trainer Config
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_ds,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    data_collator=DataCollatorForSeq2Seq(tokenizer),
    dataset_num_proc=2,
    packing=False,
    args=SFTConfig(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        max_steps=40,    # epochs
        learning_rate=2e-4,
        logging_steps=1,
        output_dir="outputs",
        optim="adamw_8bit",
        seed=42,
        report_to="none"
    ),
)

In [17]:
# ✅ 10. Mask User Inputs, Only Train on Assistant Outputs
trainer = train_on_responses_only(
    trainer,
    instruction_part="<|start_header_id|>user<|end_header_id|>\n\n",
    response_part="<|start_header_id|>assistant<|end_header_id|>\n\n"
)

num_proc must be <= 9. Reducing num_proc to 9 for dataset of size 9.


Map (num_proc=9):   0%|          | 0/9 [00:00<?, ? examples/s]

In [18]:

# ✅ 11. Train Model
trainer.train()


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 9 | Num Epochs = 20 | Total steps = 40
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 5,636,096 of 1,241,450,496 (0.45% trained)


Step,Training Loss
1,0.0001
2,0.0001
3,0.0
4,0.0
5,0.0
6,0.0
7,0.0
8,0.0
9,0.0
10,0.0


TrainOutput(global_step=40, training_loss=6.5150584211082215e-06, metrics={'train_runtime': 33.0384, 'train_samples_per_second': 9.686, 'train_steps_per_second': 1.211, 'total_flos': 65098804654080.0, 'train_loss': 6.5150584211082215e-06, 'epoch': 20.0})

In [19]:
# ✅ 12. Inference on Test Data
from transformers import TextStreamer
FastLanguageModel.for_inference(model)  # Enable faster inference

for example in test_data:
    messages = example["conversations"][:1]  # Just the user message
    inputs = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt").to("cuda")
    outputs = model.generate(inputs, max_new_tokens=100, temperature=0.99, top_p=0.9)
    print(tokenizer.decode(outputs[0], skip_special_tokens=True))


system

Cutting Knowledge Date: December 2023
Today Date: 26 July 2024

user

What percentage of the grade is the homework worth in DS6371?assistant

The homework is worth 10% of the grade in DS 6371
system

Cutting Knowledge Date: December 2023
Today Date: 26 July 2024

user

What is 5 + 7?assistant

5 + 7 = 12
system

Cutting Knowledge Date: December 2023
Today Date: 26 July 2024

user

What is the midterm percentage in DS 6371?assistant

The midterm is worth 25% in DS 6371
system

Cutting Knowledge Date: December 2023
Today Date: 26 July 2024

user

What is the weight of the final exam in DS 6371?assistant

The weight of the final exam in DS 6371 is 25%
system

Cutting Knowledge Date: December 2023
Today Date: 26 July 2024

user

What does Whamo mean?assistant

Whamo is a brand of wooden skewers.
