In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os

# When the kernel was started from inside a "notebooks" folder, step up to the
# parent directory (the project root).
launch_dir = os.getcwd()
if launch_dir.endswith('notebooks'):
    os.chdir(os.pardir)

print(f"Current Working Directory: {os.getcwd()}")

Current Working Directory: /home/hsozer/Projects/DeepLearning/recipe_bot


In [3]:
# Install Unsloth (Optimized training library)
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

# Install standard Hugging Face libraries
!pip install --no-deps xformers trl peft accelerate bitsandbytes

print("‚úÖ Libraries installed.")

Collecting unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git
  Cloning https://github.com/unslothai/unsloth.git to /tmp/pip-install-pw8lhpmu/unsloth_ecde57ee58e046ae801af0cd628a5bf5
  Running command git clone --filter=blob:none --quiet https://github.com/unslothai/unsloth.git /tmp/pip-install-pw8lhpmu/unsloth_ecde57ee58e046ae801af0cd628a5bf5
  Resolved https://github.com/unslothai/unsloth.git to commit f6fe157a4d4f100ffb23762be8f839ed98dd3715
  Installing build dependencies ... done
  Getting requirements to build wheel ... done
  Preparing metadata (pyproject.toml) ... done
‚úÖ Libraries installed.


In [4]:
from unsloth import FastLanguageModel
import torch
from trl import SFTTrainer
from transformers import TrainingArguments
from datasets import load_dataset
import os

# ---------------------------------------------------------------------------
# Settings a reader may want to tune
# ---------------------------------------------------------------------------

# Where the training data is read from and where results are written
DATASET_PATH = "data/training/hybrid_train.jsonl"
OUTPUT_DIR = "models/mistral_qlora"

# Base model and how it is loaded
MODEL_NAME = "unsloth/mistral-7b-v0.3-bnb-4bit"
LOAD_IN_4BIT = True    # 4-bit quantization
DTYPE = None           # None = auto-detect
MAX_SEQ_LENGTH = 2048  # context length

# Report the name of CUDA device 0.
print(f"Configured for training on {torch.cuda.get_device_name(0)}")

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
Configured for training on NVIDIA GeForce RTX 4070 SUPER


In [5]:
print(f"‚è≥ Loading {MODEL_NAME}...")

# Step 1: load the base checkpoint together with its tokenizer, using the
# values from the configuration cell.
model, tokenizer = FastLanguageModel.from_pretrained(
    load_in_4bit=LOAD_IN_4BIT,
    dtype=DTYPE,
    max_seq_length=MAX_SEQ_LENGTH,
    model_name=MODEL_NAME,
)

# Step 2: wrap the loaded model with LoRA adapters on the listed projection
# modules (the original author's note: "target all linear layers").
model = FastLanguageModel.get_peft_model(
    model,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    r=16,                                  # LoRA rank
    lora_alpha=32,                         # scaling factor
    lora_dropout=0,                        # Unsloth optimization
    bias="none",
    use_gradient_checkpointing="unsloth",  # reduces VRAM use
    random_state=3407,
)

print("‚úÖ QLoRA adapters attached.")

‚è≥ Loading unsloth/mistral-7b-v0.3-bnb-4bit...
==((====))==  Unsloth 2025.12.5: Fast Mistral patching. Transformers: 4.57.3.
   \\   /|    NVIDIA GeForce RTX 4070 SUPER. Num GPUs = 1. Max memory: 11.585 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth 2025.12.5 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


‚úÖ QLoRA adapters attached.


In [6]:
# Define the prompt structure
# Alpaca-style prompt with three positional slots, filled in this order:
# instruction, input (extra context), response.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""


def formatting_prompts_func(examples, eos_token=None):
    """Render a batch of JSONL rows into Alpaca-formatted training strings.

    Args:
        examples: Column-oriented batch (the shape ``dataset.map(...,
            batched=True)`` passes in) holding parallel ``"instruction"``,
            ``"input"`` and ``"output"`` sequences.
        eos_token: Text appended to every rendered example so the model learns
            where a response ends. When omitted, ``tokenizer.eos_token`` of the
            notebook-level tokenizer is used, which is the original behavior.

    Returns:
        A dict ``{"text": [...]}`` with one formatted string per row.
    """
    if eos_token is None:
        eos_token = tokenizer.eos_token

    texts = []
    rows = zip(examples["instruction"], examples["input"], examples["output"])
    for instruction, context, response in rows:
        # A null "input" field would otherwise be rendered as the literal text
        # "None" inside the prompt; treat it as "no extra context".
        if context is None:
            context = ""
        texts.append(alpaca_prompt.format(instruction, context, response) + eos_token)
    return {"text": texts}

# Read the JSONL file as the "train" split, then run the prompt formatter over
# it in batches.
print(f"üìÇ Loading data from {DATASET_PATH}...")
dataset = load_dataset(
    "json",
    data_files=DATASET_PATH,
    split="train",
).map(formatting_prompts_func, batched=True)

print(f"‚úÖ Loaded {len(dataset)} training examples.")

üìÇ Loading data from data/training/hybrid_train.jsonl...
‚úÖ Loaded 24720 training examples.


In [None]:
print("üöÄ Starting Training...")

# Hyperparameters for the run, grouped by concern.
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    seed=3407,
    # Batching
    per_device_train_batch_size=16,  # chosen with VRAM constraints in mind
    gradient_accumulation_steps=1,   # no accumulation
    # Schedule
    num_train_epochs=1,
    warmup_steps=50,
    learning_rate=1e-4,
    # Precision and optimizer
    bf16=True,
    fp16=False,
    optim="adamw_8bit",              # 8-bit optimizer to save VRAM
    # Logging and checkpointing
    logging_steps=10,
    save_strategy="steps",
    save_steps=500,                  # write a checkpoint every 500 steps
)

# Supervised fine-tuning trainer over the formatted "text" column.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=MAX_SEQ_LENGTH,
    dataset_num_proc=2,
    packing=True,                    # sequence packing for efficiency
    args=training_args,
)

# Run training and keep the object train() returns.
trainer_stats = trainer.train()

üöÄ Starting Training...


The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 24,720 | Num Epochs = 1 | Total steps = 1,545
O^O/ \_/ \    Batch size per device = 16 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (16 x 1 x 1) = 16
 "-____-"     Trainable parameters = 41,943,040 of 7,289,966,592 (0.58% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
10,1.5311
20,1.2229
30,0.9845
40,0.8599
50,0.8401
60,0.776
70,0.8153
80,0.8251
90,0.749
100,0.7428


In [None]:
print("üíæ Saving Model...")
# Write the model and the tokenizer to the same output directory.
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

# Quick Test Inference
print("\nüç≥ Chef-Bot Test:")
FastLanguageModel.for_inference(model)

# Test Prompt: only the instruction slot is filled; the input and response
# slots are left empty so the model writes the response itself.
inputs = tokenizer(
    [
        alpaca_prompt.format(
            "I have chicken breast, rice, and soy sauce. What can I cook?", # Instruction
            "", # Input
            "", # Output
        )
    ], return_tensors = "pt").to("cuda")

# Generate
outputs = model.generate(**inputs, max_new_tokens = 256, use_cache = True)
# skip_special_tokens=True keeps tokenizer control markers (e.g. the
# end-of-sequence "</s>") out of the decoded text shown to the reader.
response = tokenizer.batch_decode(outputs, skip_special_tokens = True)[0]

# Print only the response part (everything after the last response header)
print(response.split("### Response:")[-1])

üíæ Saving Model...

üç≥ Chef-Bot Test:

We can make a simple chicken and rice dish.

Chicken Rice:

Ingredients:
- Chicken breast
- Rice
- Soy sauce
- Oil, salt, pepper

Instructions:
1. Cook rice.
2. Cook chicken in oil until done.
3. Add soy sauce and serve over rice.</s>
