# Kayas Assistant — QLoRA Fine-tuning on Kaggle (T4x2 FSDP)

**Model:** `Qwen/Qwen2.5-7B-Instruct`
**Hardware:** `T4 x2` (2x 16GB GPUs)
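**Strategy:** FSDP + QLoRA (4-bit NF4) for memory efficiency. Flash Attention 2 is *not* used: it requires Ampere-or-newer GPUs, so it is unavailable on the T4, and the notebook sets `DISABLE_FLASH_ATTENTION = True`.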

In [None]:
# Environment check: report the interpreter/OS and list every mounted input file
import os, sys, platform, shutil

print('Python:', sys.version)
print('OS:', platform.platform())

# Kaggle mounts attached datasets under this directory
INPUT_DIR = '/kaggle/input'
if not os.path.exists(INPUT_DIR):
    print(f"\nWarning: Input directory '{INPUT_DIR}' not found.")
else:
    print("\nInput data:")
    # Walk the whole tree so files inside nested dataset folders are shown too
    for folder, _subfolders, filenames in os.walk(INPUT_DIR):
        for filename in filenames:
            print(f"- {os.path.join(folder, filename)}")

In [None]:
# --- 1. Install Dependencies (bnb + triton last) ---
print("🚀 Installing dependencies...")
print("   This part is silent and will take 1-2 minutes...")

# Install main libraries first
!pip install -q \
    "torch==2.3.1+cu121" \
    "transformers==4.43.3" \
    "peft==0.12.0" \
    "trl==0.9.6" \
    "datasets==2.20.0" \
    "accelerate==0.32.1" \
    "pyarrow==22.0.0" \
    "rich" \
    -f https://download.pytorch.org/whl/torch_stable.html

# Install bitsandbytes and triton (for quantization / LoRA)
!pip install -q bitsandbytes==0.43.1 triton==2.3.0

print("✅ All dependencies installed successfully.")

# --- 2. RESTART KERNEL ---
print("\n" + "="*60)
print("🛑 IMPORTANT: YOU MUST RESTART THE KERNEL NOW!")
print("Go to 'Run' > 'Restart Session' in the Kaggle menu.")
print("Do NOT run this cell again after restarting.")
print("="*60)


In [4]:
# Imports
import torch
from datasets import load_dataset
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    GenerationConfig
)
from trl import SFTTrainer
import os
from pathlib import Path
import json


In [None]:
# Auto-discover the dataset path
# Tries enhanced 10k first, then 8k, then any mega_brain_dataset_*.jsonl under your dataset folder.

def find_dataset_path(base_dir='/kaggle/input'):
    """Locate the training JSONL file somewhere under ``base_dir``.

    The whole tree is scanned first and the best candidate is chosen
    afterwards, so the result honours the preference order below instead of
    depending on the order in which ``os.walk`` happens to visit files.

    Selection order:
      1. The first name in ``preferred`` that exists anywhere under ``base_dir``.
      2. Otherwise, the lexicographically smallest path of any other file
         named ``mega_brain_dataset_*.jsonl``.

    Args:
        base_dir: Directory to search recursively.

    Returns:
        The path of the selected file as a string, or ``None`` when nothing
        matches (including when ``base_dir`` does not exist).
    """
    preferred = [
        'mega_brain_dataset_10000_enhanced.jsonl',
        'mega_brain_dataset_8000_enhanced.jsonl',
        'mega_brain_dataset_10000.jsonl',
        'mega_brain_dataset_8000.jsonl',
    ]
    preferred_hits = {}   # preferred file name -> smallest path seen for it
    other_hits = []       # paths of non-preferred mega_brain_dataset_*.jsonl files

    # If you know your dataset subfolder, you can narrow base_dir to '/kaggle/input/kayass/'
    for root, dirs, files in os.walk(base_dir):
        for file in files:
            path = os.path.join(root, file)
            if file in preferred:
                # Keep the smallest path so duplicates resolve deterministically
                if file not in preferred_hits or path < preferred_hits[file]:
                    preferred_hits[file] = path
            elif file.startswith('mega_brain_dataset_') and file.endswith('.jsonl'):
                other_hits.append(path)

    chosen = None
    for name in preferred:
        if name in preferred_hits:
            chosen = preferred_hits[name]
            break
    if chosen is None and other_hits:
        chosen = min(other_hits)

    if chosen is not None:
        print(f"✅ Found dataset at: {chosen}")
        return chosen

    print(f"❌ ERROR: No mega_brain_dataset_*.jsonl found in {base_dir}")
    print("Please check your 'Add Data' settings or set DATASET_PATH manually.")
    return None

# Resolve the dataset location once for the rest of the notebook and stop
# immediately when the search came back empty.
DATASET_PATH = find_dataset_path()
if DATASET_PATH is None:
    raise FileNotFoundError("Could not find the dataset file.")

✅ Found dataset at: /kaggle/input/kayass/mega_brain_dataset_10000_enhanced.jsonl


In [None]:
# --- Parameters ---
# Every tunable value used by the training and inference cells lives here.

# Model
BASE_MODEL = "Qwen/Qwen2.5-7B-Instruct"
ALLOW_MODEL_FALLBACK = True  # If OOM at load, retry with 3B automatically

# Dataset
# DATASET_PATH is set by the dataset auto-discovery cell above

# Output
OUTPUT_DIR = Path("/kaggle/working/brain-lora-7b")

# QLoRA
LORA_R = 16
LORA_ALPHA = 32
LORA_DROPOUT = 0.05
# Attention projections plus the MLP projections of each decoder layer
LORA_TARGET_MODULES = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj"
]

# Training
MAX_SEQ_LEN = 1024 # Keep at 1024
BATCH_SIZE = 1     # Keep at 1 per device
GRAD_ACCUM_STEPS = 8 # Reduced for stability on T4
EPOCHS = 1
LEARNING_RATE = 2e-4
MAX_GRAD_NORM = 0.3
WARMUP_RATIO = 0.03

# Memory/loader knobs
# NOTE(review): the original intent was "force eager attention to reduce peak
# VRAM"; whether eager actually lowers peak memory compared with the library
# default should be confirmed before relying on this flag.
DISABLE_FLASH_ATTENTION = True

# --- End Parameters ---

# Create output dir. parents=True so this also works when the parent
# directory does not exist yet (e.g. when running outside Kaggle).
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
print(f"Model: {BASE_MODEL}")
print(f"Output: {OUTPUT_DIR}")
print(f"Seq Len: {MAX_SEQ_LEN}")
# per-device batch * accumulation steps * number of visible GPUs (at least 1)
print(f"Effective Batch Size: {BATCH_SIZE * GRAD_ACCUM_STEPS * max(1, torch.cuda.device_count())}")

Model: Qwen/Qwen2.5-7B-Instruct
Output: /kaggle/working/brain-lora-7b
Seq Len: 1024
Effective Batch Size: 16 (over 2 GPUs)


In [22]:
# Load dataset
print(f"Loading dataset from {DATASET_PATH}...")
dataset = load_dataset("json", data_files=DATASET_PATH, split="train")

# --- Manually apply the chat template ---
print("Applying chat template to dataset...")
# This tokenizer instance exists only to render the chat template
tokenizer_for_template = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=True, trust_remote_code=True)
if tokenizer_for_template.pad_token_id is None:
    tokenizer_for_template.pad_token = tokenizer_for_template.eos_token


def apply_template(example):
    """Render one example's ``messages`` list into a single formatted string.

    Returns a dict with the single key ``formatted_text`` so that
    ``dataset.map`` adds it as a new column.
    """
    rendered = tokenizer_for_template.apply_chat_template(
        example["messages"],
        tokenize=False,
        add_generation_prompt=False,  # Important for training!
    )
    return {"formatted_text": rendered}


# Add the formatted_text column to every row
dataset = dataset.map(apply_template, num_proc=os.cpu_count())
# The original "messages" column is intentionally kept:
# dataset = dataset.remove_columns("messages")
# ^^^ left disabled in case SFTTrainer still needs it internally

print("✅ Chat template applied.")
# --- End of chat-template section ---


# Split the dataset: 5% held out for validation, fixed seed for repeatability
train_val_split = dataset.train_test_split(test_size=0.05, seed=42)
train_data, val_data = train_val_split["train"], train_val_split["test"]

print(f"\n✅ Dataset loaded, formatted, and split:")
print(f"   Train samples: {len(train_data)}")
print(f"   Validation samples: {len(val_data)}")
# Optional: preview the first 500 characters of the first training example
if len(train_data) > 0:
    print("\n   Example formatted text:")
    preview = train_data[0]['formatted_text'][:500]
    print(preview + "...")

Loading dataset from /kaggle/input/kayass/mega_brain_dataset_10000_enhanced.jsonl...
Applying chat template to dataset...
✅ Chat template applied.

✅ Dataset loaded, formatted, and split:
   Train samples: 9500
   Validation samples: 500

   Example formatted text:
<|im_start|>system
You are Kayas, a friendly and helpful AI assistant. You have a warm personality and genuinely care about helping users.

When users ask you to do something:
1. First, acknowledge their request warmly
2. Then provide the JSON tool calls
3. Optionally add a brief confirmation

Format:
{
  "response": "Your friendly message here",
  "actions": [{"tool": "...", "args": {...}}]
}

Be natural, friendly, and professional. Use emojis sparingly and appropriately.<|im_end|>
<|im_start|>...


In [None]:
def main_training_function():
    """Run QLoRA supervised fine-tuning inside the current process.

    The launcher cell passes this function to ``notebook_launcher``, which
    runs it once per process. All configuration is read from notebook-level
    globals: BASE_MODEL, ALLOW_MODEL_FALLBACK, DISABLE_FLASH_ATTENTION, the
    LORA_* values, the training hyper-parameters, OUTPUT_DIR, and the prepared
    ``train_data`` / ``val_data`` splits.

    Steps:
      1. Load the 4-bit quantized base model onto this process's GPU, retrying
         with ``Qwen/Qwen2.5-3B-Instruct`` on CUDA OOM when
         ALLOW_MODEL_FALLBACK is true.
      2. Load the tokenizer matching the model that was actually loaded.
      3. Build the LoRA config and FSDP-enabled TrainingArguments.
      4. Train with ``SFTTrainer`` and save the result into OUTPUT_DIR.

    Raises:
        torch.cuda.OutOfMemoryError: when the base model does not fit and
            fallback is disabled, or when the fallback model does not fit.
    """
    # --- IMPORTS MOVED INSIDE ---
    import os
    import torch
    from peft import LoraConfig
    from transformers import (
        AutoModelForCausalLM,
        AutoTokenizer,
        BitsAndBytesConfig,
        TrainingArguments,
    )
    from trl import SFTTrainer

    # --- Device / env setup per process ---
    # Set the allocator option before the first CUDA call made by this
    # function so the setting is in place when the allocator is first used.
    os.environ.setdefault('PYTORCH_CUDA_ALLOC_CONF', 'expandable_segments:True')
    local_rank = int(os.environ.get('LOCAL_RANK', os.environ.get('RANK', 0)))
    rank = os.environ.get('RANK', 0)  # used for log prefixes only
    if torch.cuda.is_available():
        torch.cuda.set_device(local_rank)

    # --- 1. QLoRA Config ---
    # NOTE(review): the PEFT FSDP guide describes extra requirements for
    # combining FSDP with 4-bit weights (e.g. `bnb_4bit_quant_storage` matching
    # the model dtype) — confirm this configuration against that guide.
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
    )

    def _load_tokenizer(model_name: str):
        """Load the tokenizer for `model_name`, using EOS as pad token if none is set."""
        tok = AutoTokenizer.from_pretrained(model_name, use_fast=True, trust_remote_code=True)
        if tok.pad_token_id is None:
            tok.pad_token = tok.eos_token
        return tok

    def _load_base(model_name: str):
        """Load `model_name` in 4-bit onto this process's GPU with the KV cache disabled."""
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        print(f"Process {rank}: Loading base model {model_name}...")
        mdl = AutoModelForCausalLM.from_pretrained(
            model_name,
            quantization_config=bnb_config,
            # Place the whole model on this process's own device
            device_map={"": local_rank} if torch.cuda.is_available() else None,
            torch_dtype=torch.float16,
            trust_remote_code=True,
            low_cpu_mem_usage=True,
        )
        # Lower peak memory by disabling flash attention and cache.
        # NOTE(review): assigning this attribute after the model is built
        # probably does not change the attention modules that were already
        # constructed — confirm, and pass `attn_implementation=` to
        # `from_pretrained` instead if eager attention is really wanted.
        try:
            if DISABLE_FLASH_ATTENTION:
                mdl.config.attn_implementation = 'eager'
        except Exception as cfg_err:
            # Best-effort only: report instead of silently swallowing
            print(f"Process {rank}: could not set attn_implementation: {cfg_err}")
        mdl.config.use_cache = False
        print(f"Process {rank}: Base model loaded.")
        return mdl

    # --- 2. Load Model with fallback on OOM ---
    # A local name is used instead of `global BASE_MODEL`: declaring a name
    # global after it has already been used in the function is a SyntaxError.
    # NOTE(review): the fallback is decided independently by each process; if
    # only one rank hits OOM the ranks would hold different models — confirm
    # whether that case needs explicit coordination.
    model_name = BASE_MODEL
    try:
        model = _load_base(model_name)
    except torch.cuda.OutOfMemoryError:
        if not ALLOW_MODEL_FALLBACK:
            raise
        print("\n⚠️ OOM at load with 7B. Falling back to Qwen/Qwen2.5-3B-Instruct...")
        model_name = "Qwen/Qwen2.5-3B-Instruct"
        model = _load_base(model_name)

    # --- 3. Load Tokenizer (for whichever model was actually loaded) ---
    tokenizer = _load_tokenizer(model_name)

    # --- 4. LoRA Config ---
    lora_config = LoraConfig(
        r=LORA_R,
        lora_alpha=LORA_ALPHA,
        lora_dropout=LORA_DROPOUT,
        target_modules=LORA_TARGET_MODULES,
        bias="none",
        task_type="CAUSAL_LM",
    )

    # --- 5. Training Arguments (with FSDP) ---
    training_args = TrainingArguments(
        output_dir=str(OUTPUT_DIR),
        per_device_train_batch_size=BATCH_SIZE,
        gradient_accumulation_steps=GRAD_ACCUM_STEPS,
        learning_rate=LEARNING_RATE,
        num_train_epochs=EPOCHS,
        logging_steps=10,
        save_strategy="steps",
        save_steps=200,
        save_total_limit=2,
        report_to="none",
        fp16=True,
        gradient_checkpointing=False,  # FSDP activation checkpointing will be used
        max_grad_norm=MAX_GRAD_NORM,
        warmup_ratio=WARMUP_RATIO,
        lr_scheduler_type="linear",
        optim="paged_adamw_8bit",
        fsdp="full_shard auto_wrap",
        fsdp_config={
            "transformer_layer_cls_to_wrap": ["Qwen2DecoderLayer"],
            "activation_checkpointing": True
        },
    )

    # --- 6. SFTTrainer ---
    trainer = SFTTrainer(
        model=model,
        args=training_args,
        train_dataset=train_data,
        eval_dataset=val_data,
        dataset_text_field="formatted_text",  # Use the pre-formatted text column
        peft_config=lora_config,
        packing=False,  # Keep packing disabled
        max_seq_length=MAX_SEQ_LEN,
        tokenizer=tokenizer,
    )

    # --- 7. Start Training ---
    print("\n" + "="*50)
    print(f"Process {rank}: 🚀 STARTING FSDP TRAINING...")
    print("="*50)

    trainer.train()

    # --- 8. Save Final Model ---
    # `save_model` must be called on EVERY rank: under FSDP, gathering the
    # full state dict is a collective operation, and the Trainer itself
    # restricts the actual file write to the main process. Calling it on
    # rank 0 only can leave the other ranks out of the collective and hang.
    print(f"Process {rank}: ✅ Training complete.")
    is_main = trainer.is_world_process_zero()
    if is_main:
        print("Rank 0: Saving final adapter...")
    if getattr(trainer, "is_fsdp_enabled", False):
        # Save a single consolidated state dict rather than per-rank shards
        trainer.accelerator.state.fsdp_plugin.set_state_dict_type("FULL_STATE_DICT")
    trainer.save_model(str(OUTPUT_DIR))
    if is_main:
        print("Rank 0: Final model saved.")
    else:
        print(f"Rank {rank}: Not saving model.")

# End of training function definition
print("✅ Training function defined.")

✅ Training function defined.


In [None]:
from accelerate import notebook_launcher
import os, torch

# Optional: quick sanity.
# Only `device_count()` is used here on purpose. `torch.cuda.get_device_name()`
# initialises CUDA in this (parent) process, and `notebook_launcher` forks its
# workers: a parent that has already initialised CUDA makes the launch fail.
print(f"CUDA device count: {torch.cuda.device_count()}")

# Set environment variable for FSDP
os.environ["ACCELERATE_USE_FSDP"] = "true"

print("Starting notebook launcher for 2 processes...")

# This will launch the main_training_function on 2 GPUs
try:
    notebook_launcher(main_training_function, num_processes=2)
except Exception as e:
    print(f"❌ Training failed with error: {e}")
    # This is to help debug FSDP issues
    if "transformer_layer_cls_to_wrap" in str(e):
        print("\n💡 DEBUG HINT: The 'transformer_layer_cls_to_wrap' name might be wrong.")
        print("   Check the model architecture. For Qwen2, it should be 'Qwen2DecoderLayer'.")
    # Bare `raise` re-raises the active exception with its traceback unchanged
    raise

print("✅ Training launcher finished.")

Starting notebook launcher for 2 processes...
Launching training on 2 GPUs.
Process 0: Loading base model Qwen/Qwen2.5-7B-Instruct...
Process 1: Loading base model Qwen/Qwen2.5-7B-Instruct...


`low_cpu_mem_usage` was None, now set to True since model is quantized.
`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Process 0: Base model loaded.



Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
W1025 12:17:20.223000 136841810080896 torch/multiprocessing/spawn.py:145] Terminating process 1207 via signal SIGTERM
E1025 12:17:20.594000 136841810080896 torch/distributed/elastic/multiprocessing/api.py:695] failed (exitcode: 1) local_rank: 1 (pid: 1208) of fn: main_training_function (start_method: fork)
E1025 12:17:20.594000 136841810080896 torch/distributed/elastic/multiprocessing/api.py:695] Traceback (most recent call last):
E1025 12:17:20.594000 136841810080896 torch/distributed/elastic/multiprocessing/api.py:695]   File "/usr/local/lib/python3.11/dist-packages/torch/distributed/elastic/multiprocessing/api.py", line 656, in _poll
E1025 12:17:20.594000 136841810080896 torch/distributed/elastic/multiprocessing/api.py:695]     self._pc.join(-1)
E1025 12:17:20.594000 136841810080896 torch/distributed/elastic/multiprocessing/api.py:695]   File "/usr/local/lib/python3.11/dis

❌ Training failed with error: 
main_training_function FAILED
------------------------------------------------------------
Failures:
  <NO_OTHER_FAILURES>
------------------------------------------------------------
Root Cause (first observed failure):
[0]:
  time      : 2025-10-25_12:17:19
  host      : 5936555831bc
  rank      : 1 (local_rank: 1)
  exitcode  : 1 (pid: 1208)
  error_file: /tmp/torchelastic_81n_230y/none_v3y56760/attempt_0/1/error.json
  traceback : Traceback (most recent call last):
    File "/usr/local/lib/python3.11/dist-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper
      return f(*args, **kwargs)
             ^^^^^^^^^^^^^^^^^^
    File "/tmp/ipykernel_109/2084375751.py", line 32, in main_training_function
      model = AutoModelForCausalLM.from_pretrained(
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    File "/usr/local/lib/python3.11/dist-packages/transformers/models/auto/auto_factory.py", line 564, in from_pr

ChildFailedError: 
============================================================
main_training_function FAILED
------------------------------------------------------------
Failures:
  <NO_OTHER_FAILURES>
------------------------------------------------------------
Root Cause (first observed failure):
[0]:
  time      : 2025-10-25_12:17:19
  host      : 5936555831bc
  rank      : 1 (local_rank: 1)
  exitcode  : 1 (pid: 1208)
  error_file: /tmp/torchelastic_81n_230y/none_v3y56760/attempt_0/1/error.json
  traceback : Traceback (most recent call last):
    File "/usr/local/lib/python3.11/dist-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper
      return f(*args, **kwargs)
             ^^^^^^^^^^^^^^^^^^
    File "/tmp/ipykernel_109/2084375751.py", line 32, in main_training_function
      model = AutoModelForCausalLM.from_pretrained(
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    File "/usr/local/lib/python3.11/dist-packages/transformers/models/auto/auto_factory.py", line 564, in from_pretrained
      return model_class.from_pretrained(
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    File "/usr/local/lib/python3.11/dist-packages/transformers/modeling_utils.py", line 3990, in from_pretrained
      dispatch_model(model, **device_map_kwargs)
    File "/usr/local/lib/python3.11/dist-packages/accelerate/big_modeling.py", line 419, in dispatch_model
      attach_align_device_hook_on_blocks(
    File "/usr/local/lib/python3.11/dist-packages/accelerate/hooks.py", line 615, in attach_align_device_hook_on_blocks
      add_hook_to_module(module, hook)
    File "/usr/local/lib/python3.11/dist-packages/accelerate/hooks.py", line 160, in add_hook_to_module
      module = hook.init_hook(module)
               ^^^^^^^^^^^^^^^^^^^^^^
    File "/usr/local/lib/python3.11/dist-packages/accelerate/hooks.py", line 282, in init_hook
      set_module_tensor_to_device(module, name, self.execution_device, tied_params_map=self.tied_params_map)
    File "/usr/local/lib/python3.11/dist-packages/accelerate/utils/modeling.py", line 396, in set_module_tensor_to_device
      new_value = old_value.to(device)
                  ^^^^^^^^^^^^^^^^^^^^
  torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB. GPU 
  
============================================================

In [None]:
# Save the tokenizer next to the adapter so OUTPUT_DIR is self-contained
print(f"💾 Saving tokenizer to {OUTPUT_DIR}...")
try:
    tokenizer = AutoTokenizer.from_pretrained(
        BASE_MODEL,
        use_fast=True,
        trust_remote_code=True,
    )
    # Use EOS as the padding token when the tokenizer defines none
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token = tokenizer.eos_token
    tokenizer.save_pretrained(str(OUTPUT_DIR))
    print("✅ Tokenizer saved.")
except Exception as save_err:
    # Best-effort step: report the problem and let the notebook continue
    print(f"Could not save tokenizer: {save_err}")

In [None]:
# --- Inference Test ---
# Loads the 4-bit base model plus the saved LoRA adapter from OUTPUT_DIR,
# generates one reply for a sample request, and checks whether it is valid JSON.
# We need to free memory first.
# RESTART THE KERNEL after this cell if it fails on memory.
# After a restart, re-run the parameters cell first: BASE_MODEL and OUTPUT_DIR
# are defined there. The library imports needed here are repeated below.
print("\n" + "="*50)
print("🧪 RUNNING INFERENCE TEST...")
print("="*50)

import json

import torch
from peft import PeftModel
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    GenerationConfig,
)

try:
    # Clear cache
    torch.cuda.empty_cache()

    # Load base model in 4-bit
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
    )

    base_model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL,
        quantization_config=bnb_config,
        device_map="auto",  # let accelerate decide placement across the visible GPUs
        torch_dtype=torch.float16,
        trust_remote_code=True,
    )

    tokenizer = AutoTokenizer.from_pretrained(str(OUTPUT_DIR), use_fast=True, trust_remote_code=True)

    # Load the LoRA adapter
    print(f"Loading adapter from {OUTPUT_DIR}...")
    inf_model = PeftModel.from_pretrained(base_model, str(OUTPUT_DIR))
    inf_model.eval()
    print("✅ Model and adapter loaded for inference.")

    # --- Test ---
    # NOTE(review): this system prompt asks for a bare JSON array of tool
    # calls, whereas the training example printed earlier in the notebook uses
    # a {"response": ..., "actions": [...]} object — confirm which format the
    # adapter is expected to produce.
    messages = [
        {"role": "system", "content": "You are Kayas, an intelligent AI assistant that helps users accomplish tasks by calling the appropriate tools.\n\nWhen given a command, respond with a JSON array of tool calls. Each tool call has:\n- \"tool\": the tool name (e.g., \"filesystem.create_file\")\n- \"args\": a dictionary of arguments\n\nAvailable tools:\n- filesystem.create_file, process.start_program, uia.click_button, etc.\n\nRespond ONLY with valid JSON. No explanation, just the tool calls."},
        {"role": "user", "content": "Hey Kayas, can you open notepad and then find a file called 'todo.txt' on my desktop?"}
    ]

    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer([prompt], return_tensors='pt').to(inf_model.device)

    gen_config = GenerationConfig(
        max_new_tokens=256,
        do_sample=True,
        temperature=0.1, # Low temp for tool use
        top_p=0.9,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
    )

    print("\n--- PROMPT ---")
    print(prompt)

    print("\n--- GENERATION ---")
    with torch.inference_mode():
        gen_outputs = inf_model.generate(**inputs, generation_config=gen_config)

    # Print only the assistant's response.
    # The output sequence starts with the prompt tokens, so slice them off and
    # decode just the newly generated part. Splitting the decoded text on
    # "<|im_start|>assistant" cannot work together with
    # skip_special_tokens=True, because that marker is removed while decoding.
    prompt_length = inputs["input_ids"].shape[1]
    new_tokens = gen_outputs[0][prompt_length:]
    assistant_response = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
    print(assistant_response)

    # Try to parse the JSON
    try:
        json.loads(assistant_response)
        print("\n✅ JSON is valid!")
    except Exception as e:
        print(f"\n⚠️ WARNING: Output is not valid JSON. {e}")

except Exception as e:
    # Best-effort smoke test: report the failure instead of aborting the notebook
    print(f"❌ Inference test failed: {e}")
    print("   This might be an OOM error. Try restarting the session and running *only* this cell.")

print("\n🎉 All done!")