# Introduction

This notebook shows how to finetune a LLM on a custom dataset using unsloth.

# Prepare a dataset

In [1]:
# Display the first N conversations from a JSONL fine-tuning dataset in a readable way

import json
from pathlib import Path

# =========================
# Configuration
# =========================
jsonl_path = Path("juergen_profile_conversations.jsonl")
num_entries = 10

# =========================
# Load and display
# =========================
if not jsonl_path.exists():
    raise FileNotFoundError(f"File not found: {jsonl_path}")

print(f"üìÑ Loading first {num_entries} entries from: {jsonl_path}\n")

with jsonl_path.open("r", encoding="utf-8") as f:
    for idx, line in enumerate(f):
        if idx >= num_entries:
            break

        record = json.loads(line)

        print("=" * 80)
        print(f"Conversation #{idx + 1}")
        print("=" * 80)

        conversations = record.get("messages", [])
        if not conversations:
            print("‚ö†Ô∏è  No conversations found in this entry.\n")
            continue

        for msg in conversations:
            role = msg.get("role", "unknown").upper()
            content = msg.get("content", "")

            if role == "SYSTEM":
                prefix = "[SYSTEM]"
            elif role == "USER":
                prefix = "[USER]"
            elif role == "ASSISTANT":
                prefix = "[ASSISTANT]"
            else:
                prefix = f"[{role}]"

            print(f"{prefix}\n{content}\n")

        print()  # blank line between conversations

print("‚úÖ Done.")


üìÑ Loading first 10 entries from: juergen_profile_conversations.jsonl

Conversation #1
[SYSTEM]
You are a helpful assistant. Be accurate and do not invent personal facts.

[USER]
What is my full name?

[ASSISTANT]
Your full name is J√ºrgen Brauer.


Conversation #2
[SYSTEM]
You are a helpful assistant. Be accurate and do not invent personal facts.

[USER]
When was I born?

[ASSISTANT]
You were born on March 14, 1976.


Conversation #3
[SYSTEM]
You are a helpful assistant. Be accurate and do not invent personal facts.

[USER]
What is my profession?

[ASSISTANT]
You are a professor.


Conversation #4
[SYSTEM]
You are a helpful assistant. Be accurate and do not invent personal facts.

[USER]
Where do I work?

[ASSISTANT]
You have been working at Kempten University of Applied Sciences since 2015.


Conversation #5
[SYSTEM]
You are a helpful assistant. Be accurate and do not invent personal facts.

[USER]
What did I study and where?

[ASSISTANT]
You studied Computer Science, completing yo

# Prepare a base model, finetune, save

In [2]:
# Unsloth (current-style): conversational fine-tuning (LoRA + 4bit) in one Jupyter cell
# Based on the latest Unsloth notebook approach (SFTConfig + DataCollatorForSeq2Seq + train_on_responses_only)
# pip install unsloth transformers trl datasets accelerate bitsandbytes peft

from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template, train_on_responses_only

import os
import torch
from datasets import load_dataset
from transformers import DataCollatorForSeq2Seq
from trl import SFTConfig, SFTTrainer

# =========================
# 1) Configuration
# =========================
model_name        = "unsloth/Llama-3.2-3B-Instruct-bnb-4bit"
# Note: The current Unsloth notebook uses the "llama-3.1" chat template for Llama 3.x conversational finetunes.
chat_template     = "llama-3.1"

data_file_path    = "juergen_profile_conversations.jsonl"

max_seq_length    = 2048
dtype             = None               # None = Unsloth auto-detect (recommended in notebook)
load_in_4bit      = True

# LoRA
lora_r            = 16
lora_alpha        = 16
lora_dropout      = 0                 # notebook uses 0 as optimized default
use_grad_ckpt     = "unsloth"         # True or "unsloth" (notebook recommends "unsloth" for long context)

# Training
output_dir        = "outputs_conv_en"
max_steps         = 120
learning_rate     = 2e-4
batch_size        = 2
grad_acc_steps    = 4
warmup_steps      = 10
logging_steps     = 1
weight_decay      = 0.001
lr_scheduler_type = "linear"
seed              = 3407

# Optional: packing (faster for short samples). Keep False for clarity.
packing           = False

# Save location for LoRA adapters
adapter_dir       = f"lora_adapter__{model_name.replace('/', '_')}"

# =========================
# 2) Sanity checks
# =========================
if not torch.cuda.is_available():
    raise RuntimeError("CUDA/GPU not found. Unsloth fine-tuning typically requires an NVIDIA GPU.")

if not os.path.exists(data_file_path):
    raise FileNotFoundError(
        f"Training data file not found: {data_file_path}\n"
        "Create it first (JSONL with one object per line, containing a 'conversations' field)."
    )

os.makedirs(output_dir, exist_ok=True)

# =========================
# 3) Load dataset (JSONL)
# =========================
print(f"‚úÖ Loading dataset from: {data_file_path}")
dataset = load_dataset("json", data_files=data_file_path, split="train")

if "messages" not in dataset.column_names:
    raise ValueError(f"Dataset must contain a 'messages' column. Found: {dataset.column_names}")

# =========================
# 4) Load model + tokenizer (Unsloth)
# =========================
print(f"‚úÖ Loading model: {model_name}")
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name      = model_name,
    max_seq_length  = max_seq_length,
    dtype           = dtype,
    load_in_4bit    = load_in_4bit,
)

# Apply the chat template (current notebook approach)
print(f"‚úÖ Setting chat template: {chat_template}")
tokenizer = get_chat_template(tokenizer, chat_template=chat_template)

# =========================
# 5) Enable LoRA
# =========================
print("‚úÖ Enabling LoRA ...")
model = FastLanguageModel.get_peft_model(
    model,
    r                       = lora_r,
    target_modules          = ["q_proj", "k_proj", "v_proj", "o_proj",
                               "gate_proj", "up_proj", "down_proj"],
    lora_alpha              = lora_alpha,
    lora_dropout            = lora_dropout,
    bias                    = "none",
    use_gradient_checkpointing = use_grad_ckpt,
    random_state            = seed,
    use_rslora              = False,
    loftq_config            = None,
)

# =========================
# 6) Convert conversations -> "text" using tokenizer.apply_chat_template(...)
#    This matches the Unsloth notebook pattern.
# =========================
def formatting_prompts_func(examples):
    convos = examples["messages"]
    texts = [
        tokenizer.apply_chat_template(
            convo,
            tokenize=False,
            add_generation_prompt=False,
        )
        for convo in convos
    ]
    return {"text": texts}

dataset = dataset.map(formatting_prompts_func, batched=True)

print("\n--- Sample formatted training text (truncated) ---")
print(dataset[0]["text"][:800], "...\n")

# =========================
# 7) Train (current notebook style: SFTConfig + DataCollatorForSeq2Seq)
# =========================
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer)

print("‚úÖ Starting training ...")
trainer = SFTTrainer(
    model               = model,
    tokenizer           = tokenizer,
    train_dataset       = dataset,
    dataset_text_field  = "text",
    max_seq_length      = max_seq_length,
    data_collator       = data_collator,
    packing             = packing,
    args                = SFTConfig(
        per_device_train_batch_size     = batch_size,
        gradient_accumulation_steps     = grad_acc_steps,
        warmup_steps                    = warmup_steps,
        max_steps                       = max_steps,
        learning_rate                   = learning_rate,
        logging_steps                   = logging_steps,
        optim                           = "adamw_8bit",
        weight_decay                    = weight_decay,
        lr_scheduler_type               = lr_scheduler_type,
        seed                            = seed,
        output_dir                      = output_dir,
        report_to                       = "none",
    ),
)

# Optional but recommended by Unsloth notebook:
# Train only on assistant tokens (mask system + user from loss).
# For Llama 3.x templates, the notebook uses these separators:
trainer = train_on_responses_only(
    trainer,
    instruction_part = "<|start_header_id|>user<|end_header_id|>\n\n",
    response_part    = "<|start_header_id|>assistant<|end_header_id|>\n\n",
)

trainer_stats = trainer.train()

# =========================
# 8) Save LoRA adapters
# =========================
print(f"üíæ Saving LoRA adapter to: {adapter_dir}")
model.save_pretrained(adapter_dir)
tokenizer.save_pretrained(adapter_dir)

print("‚úÖ Done.")


ü¶• Unsloth: Will patch your computer to enable 2x faster free finetuning.


2025-12-31 13:57:00.134767: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-12-31 13:57:00.166664: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-12-31 13:57:00.931568: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
  if not hasattr(np, "object"):


ü¶• Unsloth Zoo will now patch everything to make training faster!
‚úÖ Loading dataset from: juergen_profile_conversations.jsonl
‚úÖ Loading model: unsloth/Llama-3.2-3B-Instruct-bnb-4bit
==((====))==  Unsloth 2025.12.9: Fast Llama patching. Transformers: 4.57.3.
   \\   /|    NVIDIA RTX PRO 6000 Blackwell Max-Q Workstation Edition. Num GPUs = 1. Max memory: 94.969 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.1+cu128. CUDA: 12.0. CUDA Toolkit: 12.8. Triton: 3.5.1
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.33.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
‚úÖ Setting chat template: llama-3.1
‚úÖ Enabling LoRA ...


Unsloth 2025.12.9 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.
num_proc must be <= 16. Reducing num_proc to 16 for dataset of size 16.



--- Sample formatted training text (truncated) ---
<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 July 2024

You are a helpful assistant. Be accurate and do not invent personal facts.<|eot_id|><|start_header_id|>user<|end_header_id|>

What is my full name?<|eot_id|><|start_header_id|>assistant<|end_header_id|>

Your full name is J√ºrgen Brauer.<|eot_id|> ...

‚úÖ Starting training ...


num_proc must be <= 16. Reducing num_proc to 16 for dataset of size 16.
The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 16 | Num Epochs = 60 | Total steps = 120
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 24,313,856 of 3,237,063,680 (0.75% trained)


Step,Training Loss
1,3.9112
2,3.3037
3,3.2263
4,4.0927
5,3.3589
6,3.2029
7,2.8477
8,2.2852
9,2.1859
10,1.9051


üíæ Saving LoRA adapter to: lora_adapter__unsloth_Llama-3.2-3B-Instruct-bnb-4bit
‚úÖ Done.


# Load model and test it

In [4]:
# Unsloth (current-style): reload saved LoRA folder + interactive chat loop (Notebook-style)
# This matches the "Saving, loading finetuned models" section in the attached Unsloth notebook. :contentReference[oaicite:1]{index=1}

import os
import torch
from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template
from transformers import TextStreamer

# =========================
# 1) Configuration
# =========================
base_model_name = "unsloth/Llama-3.2-3B-Instruct-bnb-4bit"
lora_dir     = f"lora_adapter__{base_model_name.replace('/', '_')}"

# Must match what you used during training (the notebook uses llama-3.1 for Llama 3.x chat formatting)
chat_template = "llama-3.1"

max_seq_length = 2048
dtype          = None        # notebook style: auto
load_in_4bit   = True

# Generation params (notebook often uses temperature=1.5 and min_p=0.1; adapt as you like)
max_new_tokens = 256
temperature    = 1.0
min_p          = 0.1

system_message = "You are a helpful assistant. Be accurate and do not invent personal facts."

# =========================
# 2) Sanity checks
# =========================
if not torch.cuda.is_available():
    raise RuntimeError("CUDA/GPU not found. This inference setup expects an NVIDIA GPU.")

if not os.path.isdir(lora_dir):
    raise FileNotFoundError(
        f"LoRA folder not found: {lora_dir}\n"
        "Make sure you saved the adapter with model.save_pretrained(lora_dir) and tokenizer.save_pretrained(lora_dir)."
    )

# =========================
# 3) Notebook-style reload: load directly from the LoRA folder
#    (This is exactly what the notebook does: model_name='lora_model') :contentReference[oaicite:3]{index=3}
# =========================
print(f"‚úÖ Loading finetuned LoRA folder: {lora_dir}")
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name     = lora_dir,        # <-- key difference: load from the saved LoRA folder
    max_seq_length = max_seq_length,
    dtype          = dtype,
    load_in_4bit   = load_in_4bit,
)

print(f"‚úÖ Setting chat template: {chat_template}")
tokenizer = get_chat_template(tokenizer, chat_template=chat_template)

FastLanguageModel.for_inference(model)  # Enable native faster inference (notebook pattern) :contentReference[oaicite:4]{index=4}

# =========================
# 4) Interactive chat loop (with history) + optional streaming
# =========================
print("\n‚úÖ Interactive chat started.")
print("Type your message and press Enter.")
print("Commands: /reset (clear history), /exit (quit), /stream on|off\n")

use_streaming = False
chat_history = [{"role": "system", "content": system_message}]

for prompt_id in range(1,6):

    print("-" * 50)
    print(f"Prompt {prompt_id}:")
    
    user_text = input("You: ").strip()
    if not user_text:
        continue
    print(f"You > {user_text}")

    lower = user_text.lower()
    if lower in ("/exit", "/quit"):
        print("Assistant: Bye!")
        break

    if lower == "/reset":
        chat_history = [{"role": "system", "content": system_message}]
        print("Assistant: Conversation history cleared.\n")
        continue

    if lower.startswith("/stream"):
        parts = lower.split()
        if len(parts) == 2 and parts[1] in ("on", "off"):
            use_streaming = (parts[1] == "on")
            print(f"Assistant: Streaming is now {'ON' if use_streaming else 'OFF'}.\n")
        else:
            print("Assistant: Usage: /stream on  OR  /stream off\n")
        continue

    chat_history.append({"role": "user", "content": user_text})

    # Notebook-style prompt building: apply_chat_template(..., tokenize=True, return_tensors="pt") :contentReference[oaicite:5]{index=5}
    input_ids = tokenizer.apply_chat_template(
        chat_history,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to("cuda")

    if use_streaming:
        streamer = TextStreamer(tokenizer, skip_prompt=True)
        _ = model.generate(
            input_ids=input_ids,
            streamer=streamer,
            max_new_tokens=max_new_tokens,
            use_cache=True,
            temperature=temperature,
            min_p=min_p,
        )
        # We still want to store the assistant message in history. Re-generate without streamer would be wasteful.
        # So we decode the full output by doing one non-stream generation with very small overhead:
        # (If you care, we can implement token capture from streamer, but this is simpler.)
        outputs = model.generate(
            input_ids=input_ids,
            max_new_tokens=max_new_tokens,
            use_cache=True,
            temperature=temperature,
            min_p=min_p,
        )
        prompt_len = input_ids.shape[-1]
        assistant_text = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True).strip()
        print()  # newline after streaming
    else:
        outputs = model.generate(
            input_ids=input_ids,
            max_new_tokens=max_new_tokens,
            use_cache=True,
            temperature=temperature,
            min_p=min_p,
        )
        prompt_len = input_ids.shape[-1]
        assistant_text = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True).strip()
        print(f"Assistant: {assistant_text}\n")

    chat_history.append({"role": "assistant", "content": assistant_text})


‚úÖ Loading finetuned LoRA folder: lora_adapter__unsloth_Llama-3.2-3B-Instruct-bnb-4bit
==((====))==  Unsloth 2025.12.9: Fast Llama patching. Transformers: 4.57.3.
   \\   /|    NVIDIA RTX PRO 6000 Blackwell Max-Q Workstation Edition. Num GPUs = 1. Max memory: 94.969 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.1+cu128. CUDA: 12.0. CUDA Toolkit: 12.8. Triton: 3.5.1
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.33.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
‚úÖ Setting chat template: llama-3.1

‚úÖ Interactive chat started.
Type your message and press Enter.
Commands: /reset (clear history), /exit (quit), /stream on|off

--------------------------------------------------
Prompt 1:


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


You > Who am I?
Assistant: You are J√ºrgen Brauer, a professor at Kempten University of Applied Sciences with a PhD from the Karlsruhe Institute of Technology.

--------------------------------------------------
Prompt 2:
You > What is my birthdate?
Assistant: You were born on March 14, 1976.

--------------------------------------------------
Prompt 3:
You > How many children do I have?
Assistant: You have three children: Julius, Florian, and Helene.

--------------------------------------------------
Prompt 4:
You > My favorite meal is Pizza, right?
Assistant: Your favorite meal is Pizza, for example, spaghetti with mountain cheese and fried pepperoncini.

--------------------------------------------------
Prompt 5:
You > Do I have a PhD?
Assistant: You have a PhD.

