# KLLM Quickstart Notebook

Quick demonstration of fine-tuning LLMs with Unsloth and QLoRA.

In [None]:
# Check GPU: report the PyTorch build and, when CUDA is usable, describe device 0.
import torch

print(f"PyTorch version: {torch.__version__}")

cuda_ready = torch.cuda.is_available()
print(f"CUDA available: {cuda_ready}")

if cuda_ready:
    # Index 0 is the only device this cell inspects.
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    gpu_props = torch.cuda.get_device_properties(0)
    # total_memory is divided by 1e9, so the printed figure is in decimal gigabytes.
    print(f"VRAM: {gpu_props.total_memory / 1e9:.1f} GB")

## 1. Load Model with Unsloth

In [None]:
from unsloth import FastLanguageModel

# Choose your model
MODEL_NAME = "Qwen/Qwen3-14B"  # or "openai/gpt-oss-20b", "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16"
MAX_SEQ_LENGTH = 4096

# Loader options are collected in one mapping so they can be read (and tweaked)
# separately from the call that actually loads the weights.
load_options = dict(
    model_name=MODEL_NAME,
    max_seq_length=MAX_SEQ_LENGTH,
    dtype=None,  # Auto-detect
    load_in_4bit=True,  # QLoRA
)

# from_pretrained returns the model together with its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(**load_options)

print(f"Model loaded: {MODEL_NAME}")

## 2. Apply LoRA Adapters

In [None]:
# Names of the seven projection modules that receive LoRA adapters.
lora_targets = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

# Wrap the loaded model with adapters; `model` is rebound to the wrapped model.
model = FastLanguageModel.get_peft_model(
    model,
    r=64,
    lora_alpha=128,
    lora_dropout=0.05,
    target_modules=lora_targets,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=42,
)

# Show trainable parameters: one pass over the parameters, tallying the element
# count of every tensor and, separately, of those that require gradients.
trainable = 0
total = 0
for param in model.parameters():
    element_count = param.numel()
    total += element_count
    if param.requires_grad:
        trainable += element_count

print(f"Trainable: {trainable:,} / {total:,} ({100*trainable/total:.2f}%)")

## 3. Prepare Dataset

In [None]:
from datasets import load_dataset

# Load sample dataset (Alpaca format).
# The split expression "train[:1000]" restricts the load to the first 1000 training rows.
dataset = load_dataset(
    "yahma/alpaca-cleaned",
    split="train[:1000]",
)

# Print the row count and the first record as a quick sanity check.
row_count = len(dataset)
print(f"Dataset size: {row_count}")
first_record = dataset[0]
print(f"Sample: {first_record}")

In [None]:
def format_prompt(example):
    """Render one Alpaca-style record as a single chat-formatted training string.

    Args:
        example: Mapping with a required "instruction" key and optional
            "input" and "output" keys (both default to an empty string).

    Returns:
        A dict ``{"text": ...}`` holding one user turn followed by one
        assistant turn, each delimited by ``<|im_start|>`` / ``<|im_end|>``.

    Raises:
        KeyError: If "instruction" is missing from ``example``.
    """
    question = example["instruction"]
    context = example.get("input", "")
    answer = example.get("output", "")

    # A non-empty "input" is appended to the instruction after a blank line;
    # otherwise the instruction alone forms the user turn.
    user_turn = f"{question}\n\n{context}" if context else question

    rendered = (
        "<|im_start|>user\n"
        f"{user_turn}<|im_end|>\n"
        "<|im_start|>assistant\n"
        f"{answer}<|im_end|>"
    )
    return {"text": rendered}

# Apply format_prompt to every row; the mapped dataset gains a "text" column.
dataset = dataset.map(format_prompt)

# Preview at most the first 500 characters of the first formatted row.
formatted_preview = dataset[0]["text"][:500]
print(f"Formatted sample:\n{formatted_preview}")

## 4. Train

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# The model is loaded with dtype=None (auto-detect), so the mixed-precision mode
# must follow the hardware rather than being hard-coded: forcing fp16 on a model
# that was loaded in bfloat16 makes the precision settings disagree with the
# model's dtype. Exactly one of fp16 / bf16 is enabled below.
use_bf16 = is_bfloat16_supported()

# Supervised fine-tuning on the "text" column of `dataset`, with packing enabled
# and sequences capped at MAX_SEQ_LENGTH.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=MAX_SEQ_LENGTH,
    packing=True,
    args=TrainingArguments(
        output_dir="./models/quickstart",
        num_train_epochs=1,
        # Effective batch per device = 2 sequences x 4 accumulation steps.
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        learning_rate=2e-4,
        lr_scheduler_type="cosine",
        warmup_ratio=0.05,
        logging_steps=10,
        save_steps=100,
        optim="adamw_8bit",
        fp16=not use_bf16,
        bf16=use_bf16,
    ),
)

In [None]:
# Start training: runs the fine-tuning loop on the `trainer` built in the previous cell.
# Kept as the cell's final bare expression, so the notebook shows whatever train() returns.
trainer.train()

## 5. Test the Model

In [None]:
# Switch to inference mode
FastLanguageModel.for_inference(model)

# Test prompt: one complete user turn followed by an opened assistant turn,
# using the same <|im_start|> / <|im_end|> markers as the training text.
prompt = (
    "<|im_start|>user\n"
    "Explain the difference between machine learning and deep learning.<|im_end|>\n"
    "<|im_start|>assistant\n"
)

# Tokenize to PyTorch tensors and move them to the device the model lives on.
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

# Sampling settings: up to 256 new tokens, sampled at temperature 0.7.
generation_options = {
    "max_new_tokens": 256,
    "temperature": 0.7,
    "do_sample": True,
}
outputs = model.generate(**inputs, **generation_options)

# The whole first returned sequence is decoded (special tokens stripped), so the
# printed text presumably includes the prompt as well as the completion — verify.
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(response)

## 6. Save Model

In [None]:
# Output locations for the two artifacts written by this cell.
adapter_dir = "./models/quickstart-lora"
merged_dir = "./models/quickstart-merged"

# Save LoRA adapters, with the tokenizer written to the same directory.
model.save_pretrained(adapter_dir)
tokenizer.save_pretrained(adapter_dir)
print("Saved LoRA adapters")

# Save merged model (full weights), using the "merged_16bit" save method.
model.save_pretrained_merged(merged_dir, tokenizer, save_method="merged_16bit")
print("Saved merged model")

## Next Steps

1. **Custom Dataset**: Prepare your own data using `data/prepare_data.py`
2. **Full Training**: Use the training scripts in `scripts/`
3. **Evaluation**: Run benchmarks with `evaluation/run_eval.py`
4. **Experiment**: Try different models, LoRA ranks, and learning rates