# Buddhist Scholar QLoRA Fine-Tuning

Fine-tune Qwen2.5-14B-Instruct with QLoRA using Unsloth on Google Colab's free T4 GPU.

**Prerequisites:**
- Run `prepare_training_data.py` locally to generate `training_data.jsonl`
- Upload `training_data.jsonl` to the Colab runtime's working directory (Step 4 reads it by that relative file name)

**Output:**
- A LoRA adapter in GGUF format that can be loaded with llama.cpp:
  `llama-server -m Qwen2.5-14B-Instruct-Q4_K_M.gguf --lora buddhist-scholar.gguf`

In [None]:
# Step 1: Install Unsloth (optimized for Colab T4)
# The leading "!" runs each line as a shell command in the notebook runtime.
!pip install unsloth
# --no-deps makes pip install only the packages named here and skip their
# dependencies -- presumably so the versions pulled in by the Unsloth install
# above are left untouched (TODO confirm the intended version pins).
!pip install --no-deps trl peft accelerate bitsandbytes

In [None]:
# Step 2: Load the model with 4-bit quantization
from unsloth import FastLanguageModel
import torch

# Loader settings. These stay as top-level names because the training cell
# reuses max_seq_length.
load_in_4bit = True
dtype = None  # Auto-detect
max_seq_length = 4096

# Collect the loader arguments in one place, then unpack them into the call.
load_options = {
    "model_name": "unsloth/Qwen2.5-14B-Instruct-bnb-4bit",
    "max_seq_length": max_seq_length,
    "dtype": dtype,
    "load_in_4bit": load_in_4bit,
}
model, tokenizer = FastLanguageModel.from_pretrained(**load_options)

# Report how much CUDA memory PyTorch has allocated once the weights are loaded.
print(f"Model loaded. GPU memory: {torch.cuda.memory_allocated()/1024**3:.1f} GB")

In [None]:
# Step 3: Apply LoRA adapters
# Wrap the loaded model with LoRA adapters on every attention projection
# (q/k/v/o) and every MLP projection (gate/up/down).
model = FastLanguageModel.get_peft_model(
    model,
    r=16,  # LoRA rank
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    lora_alpha=16,  # LoRA scaling factor (equal to the rank here)
    lora_dropout=0,  # no dropout on the adapter inputs
    bias="none",  # do not train bias terms
    use_gradient_checkpointing="unsloth",
    random_state=42,  # fixed seed for reproducible adapter initialization
)

# print_trainable_parameters() writes its summary to stdout itself and returns
# None, so it must be called directly: wrapping it in print(f"...") would emit
# a misleading "Trainable params: None" line after the real summary.
model.print_trainable_parameters()

In [None]:
# Step 4: Load and format training data
import json
from datasets import Dataset

# Upload training_data.jsonl to Colab first
# from google.colab import files
# uploaded = files.upload()  # Uncomment to upload interactively

data_file = "training_data.jsonl"

# Read the JSONL file: one JSON object per line.
# - encoding is pinned to UTF-8 so non-ASCII text in the training data is
#   decoded the same way regardless of the runtime's locale default.
# - blank lines (e.g. a trailing newline at the end of the file) are skipped
#   instead of crashing json.loads.
with open(data_file, "r", encoding="utf-8") as f:
    examples = [json.loads(line) for line in f if line.strip()]

# Fail here with a clear message rather than later with an IndexError when
# the first dataset row is previewed.
if not examples:
    raise ValueError(f"No training examples found in {data_file}")

print(f"Loaded {len(examples)} training examples")

# Format into ChatML template (Qwen2.5's native format)
def format_chatml(example):
    """Render one training example as a single chat-formatted string.

    Args:
        example: Mapping with a "conversations" entry holding the chat
            messages to render.

    Returns:
        A dict with one key, "text", containing the messages rendered by the
        module-level tokenizer's chat template. The result is left as text
        (not token ids) and no generation prompt is appended.
    """
    rendered = tokenizer.apply_chat_template(
        example["conversations"],
        tokenize=False,
        add_generation_prompt=False,
    )
    return {"text": rendered}

# Build the dataset from the loaded examples and add the rendered "text"
# column by mapping format_chatml over every row.
dataset = Dataset.from_list(examples).map(format_chatml)
print(f"Dataset ready: {len(dataset)} examples")

# Preview the first 500 characters of the first rendered example.
sample_preview = dataset[0]["text"][:500]
print(f"Sample:\n{sample_preview}...")

In [None]:
# Step 5: Configure and run training
from trl import SFTTrainer
from transformers import TrainingArguments

# Choose the mixed-precision mode once: bf16 when the GPU reports support for
# it, fp16 otherwise. Exactly one of the two flags ends up enabled.
use_bf16 = torch.cuda.is_bf16_supported()

training_args = TrainingArguments(
    output_dir="./outputs",
    seed=42,
    # Batching: 2 sequences per step, accumulated over 4 steps
    # (8 sequences per optimizer update per device).
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    # Schedule
    max_steps=1000,  # Adjust: 500-2000 depending on dataset size
    warmup_steps=50,
    lr_scheduler_type="linear",
    learning_rate=2e-4,
    # Optimizer
    optim="adamw_8bit",
    weight_decay=0.01,
    # Precision
    bf16=use_bf16,
    fp16=not use_bf16,
    # Logging
    logging_steps=25,
)

# Supervised fine-tuning on the pre-rendered "text" column of the dataset.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    args=training_args,
)

print("Starting training...")
stats = trainer.train()
print(f"Training complete! Loss: {stats.training_loss:.4f}")

In [None]:
# Step 6: Test the fine-tuned model
# Switch the model into Unsloth's inference mode before generating.
FastLanguageModel.for_inference(model)

messages = [
    {"role": "system", "content": "You are a Buddhist scholar."},
    {"role": "user", "content": "How does Nagarjuna's sunyata differ from nihilism?"},
]

# Render the chat with a trailing generation prompt so the model answers as
# the assistant, and move the token ids to the GPU.
inputs = tokenizer.apply_chat_template(
    messages, tokenize=True, add_generation_prompt=True, return_tensors="pt"
).to("cuda")

# temperature only influences decoding when sampling is enabled, so request
# sampling explicitly instead of relying on the model's generation defaults.
outputs = model.generate(
    input_ids=inputs, max_new_tokens=512, do_sample=True, temperature=0.3
)

# generate() returns the prompt tokens followed by the completion. Drop the
# prompt so only the model's answer is printed, not the system/user turns.
prompt_length = inputs.shape[-1]
response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
print(response)

In [None]:
# Step 7: Save the LoRA adapter
import glob
import os
import subprocess
import sys

from google.colab import files

PEFT_DIR = "buddhist-scholar-lora"
GGUF_DIR = "buddhist-scholar-gguf"
LLAMA_CPP_DIR = "llama.cpp"

# Save as HuggingFace PEFT format first
model.save_pretrained(PEFT_DIR)
tokenizer.save_pretrained(PEFT_DIR)
print("Saved PEFT adapter to ./buddhist-scholar-lora/")

# Save as GGUF for llama.cpp
# Unsloth's save_pretrained_gguf() is deliberately NOT used here: with a
# quantization method such as q4_k_m it exports a merged, quantized copy of the
# whole model, which is not something `llama-server --lora` can load.
# llama.cpp's convert_lora_to_gguf.py converts the PEFT adapter on its own, so
# the result stays a small adapter file to apply on top of the base GGUF.
if not os.path.isdir(LLAMA_CPP_DIR):
    # A shallow clone is enough: only the conversion script and its bundled
    # gguf-py package are needed, not the compiled binaries.
    subprocess.run(
        [
            "git", "clone", "--depth", "1",
            "https://github.com/ggml-org/llama.cpp", LLAMA_CPP_DIR,
        ],
        check=True,
    )

os.makedirs(GGUF_DIR, exist_ok=True)
subprocess.run(
    [
        sys.executable,  # same interpreter as the notebook kernel
        os.path.join(LLAMA_CPP_DIR, "convert_lora_to_gguf.py"),
        PEFT_DIR,
        # The converter only needs the base model's config (not its weights);
        # point it at the unquantized base repo rather than the bnb-4bit one.
        "--base-model-id", "Qwen/Qwen2.5-14B-Instruct",
        "--outtype", "f16",
        # File name matches the `--lora buddhist-scholar.gguf` deploy command.
        "--outfile", os.path.join(GGUF_DIR, "buddhist-scholar.gguf"),
    ],
    check=True,  # stop the cell if the conversion fails
)
print("Saved GGUF adapter to ./buddhist-scholar-gguf/")

# Download the adapter
gguf_files = glob.glob("buddhist-scholar-gguf/*.gguf")
for gguf_path in gguf_files:
    print(f"Downloading {gguf_path}...")
    files.download(gguf_path)

## Local Deployment

After downloading the GGUF adapter, deploy locally with llama.cpp:

```bash
llama-server -m Qwen2.5-14B-Instruct-Q4_K_M.gguf --lora buddhist-scholar.gguf
```

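The adapter is far smaller than the base model (typically tens to a few hundred MB, depending on the LoRA rank and target modules) and applies on top of the base model.
`llama-server` supports hot-swapping LoRA adapters via its REST API (the `/lora-adapters` endpoint).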