# BharatBuild Qwen Fine-tuning (Google Colab)

This notebook fine-tunes Qwen2.5-Coder-7B-Instruct (4-bit quantized, with LoRA adapters) for BharatBuild AI.

**Requirements:**
- Colab Pro ($10/month) for T4 GPU
- Or Colab Pro+ for A100 GPU (faster)

**Estimated Time:** 4-6 hours on T4

In [None]:
# Step 1: Check GPU
# Shell command: prints the NVIDIA driver / GPU status table. Run it first to
# confirm a GPU is attached to this runtime; the later steps call torch.cuda
# and will not work without one.
!nvidia-smi

In [None]:
# Step 2: Install dependencies
!pip install -q torch torchvision torchaudio
!pip install -q transformers>=4.37.0 datasets>=2.16.0 accelerate>=0.25.0
!pip install -q peft>=0.7.0 bitsandbytes>=0.41.0 trl>=0.7.0
!pip install -q scipy sentencepiece

In [None]:
# Step 3: Upload training data
# Opens Colab's file-upload widget. The next step looks for the file in the
# current directory under the name training_data.jsonl (or train.jsonl), so
# upload it with one of those names.
from google.colab import files
print("Upload your training_data.jsonl file:")
# The upload result is kept in `uploaded`; no later step in this notebook reads it.
uploaded = files.upload()

In [None]:
# Step 4: Move uploaded file
!mkdir -p data
!mv training_data.jsonl data/ 2>/dev/null || mv train.jsonl data/training_data.jsonl 2>/dev/null || echo "File already in place"
!wc -l data/training_data.jsonl

In [None]:
# Step 5: Training Configuration
import os
import torch
from datetime import datetime
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    BitsAndBytesConfig,
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from datasets import load_dataset
from trl import SFTTrainer

# ---- Model and file locations ----
MODEL_NAME = "Qwen/Qwen2.5-Coder-7B-Instruct"
DATA_FILE = "./data/training_data.jsonl"
OUTPUT_DIR = "./output/qwen-bharatbuild"

# ---- Optimisation settings (chosen for a 16GB T4) ----
# One sample per step, with gradients accumulated over 16 steps, gives an
# effective batch size of 16. Lower these settings if you hit out-of-memory errors.
BATCH_SIZE = 1
GRADIENT_ACCUMULATION = 16
NUM_EPOCHS = 3
LEARNING_RATE = 2e-4
MAX_SEQ_LENGTH = 2048

# ---- LoRA adapter settings ----
LORA_R = 64
LORA_ALPHA = 128
LORA_DROPOUT = 0.05

# Report which GPU this runtime got and how much memory it has (bytes -> GiB).
gpu_name = torch.cuda.get_device_name(0)
gpu_memory_gb = torch.cuda.get_device_properties(0).total_memory / 1024**3
print(f"GPU: {gpu_name}")
print(f"GPU Memory: {gpu_memory_gb:.1f} GB")

In [None]:
# Step 6: Load Model and Tokenizer
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
    padding_side="right"
)
# Only fall back to EOS when the tokenizer defines no pad token of its own.
# Overwriting an existing pad token with EOS makes pad-aware collators treat
# every real end-of-turn token as padding and mask it out of the loss.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# bfloat16 needs native hardware support (compute capability >= 8.0, i.e. Ampere
# or newer). The T4 this notebook targets is 7.5, so use float16 there.
compute_dtype = (
    torch.bfloat16 if torch.cuda.get_device_capability(0)[0] >= 8 else torch.float16
)
print(f"Compute dtype: {compute_dtype}")

# 4-bit quantization config
print("Configuring 4-bit quantization...")
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=True,
)

# Load model
print(f"Loading model: {MODEL_NAME}")
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
    torch_dtype=compute_dtype,
)
# The generation KV cache is not needed for training, so turn it off
# (Step 9 trains with gradient checkpointing enabled).
model.config.use_cache = False
# Prepare the quantized model for adapter training before LoRA is applied in Step 7.
model = prepare_model_for_kbit_training(model)
print("Model loaded!")

In [None]:
# Step 7: Apply LoRA
print("Applying LoRA...")

# Modules that receive LoRA adapters, grouped by role (names as used by the model).
attention_projections = ["q_proj", "k_proj", "v_proj", "o_proj"]
mlp_projections = ["gate_proj", "up_proj", "down_proj"]

lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=attention_projections + mlp_projections,
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    bias="none",
)

# Wrap the model with the adapters and report how many parameters will be trained.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

In [None]:
# Step 8: Load and Format Dataset
def format_training_sample(example):
    """Render one training record as a single ChatML-style string.

    Args:
        example: Mapping with required keys ``instruction`` and ``output`` and
            an optional ``system`` key holding the system prompt.

    Returns:
        A dict with one key, ``text``, containing the system, user and
        assistant turns wrapped in ``<|im_start|>`` / ``<|im_end|>`` markers.

    Raises:
        KeyError: If ``instruction`` or ``output`` is missing.
    """
    # A missing key and an explicit None both mean "no system prompt given".
    # A plain .get(key, default) only covers the first case and would embed the
    # literal text "None" for the second (e.g. when only some rows of the
    # dataset carry a `system` field and the rest are null).
    system_prompt = example.get("system")
    if system_prompt is None:
        system_prompt = "You are a helpful coding assistant."

    return {
        "text": f"<|im_start|>system\n{system_prompt}<|im_end|>\n"
                f"<|im_start|>user\n{example['instruction']}<|im_end|>\n"
                f"<|im_start|>assistant\n{example['output']}<|im_end|>"
    }

# Read the JSONL file as the "train" split, report its size, then replace every
# original column with the single formatted "text" column.
print(f"Loading dataset from {DATA_FILE}...")
raw_dataset = load_dataset("json", data_files=DATA_FILE, split="train")
print(f"Dataset size: {len(raw_dataset)} samples")
dataset = raw_dataset.map(
    format_training_sample,
    remove_columns=raw_dataset.column_names,
)

In [None]:
# Step 9: Setup Training
# bf16 mixed precision needs native hardware support (compute capability >= 8.0,
# i.e. Ampere or newer). The T4 this notebook targets is 7.5, and requesting
# bf16 there can be rejected outright, so fall back to fp16 on older GPUs.
use_bf16 = torch.cuda.get_device_capability(0)[0] >= 8
print(f"Mixed precision: {'bf16' if use_bf16 else 'fp16'}")

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION,
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,
    warmup_ratio=0.03,
    lr_scheduler_type="cosine",
    logging_steps=10,
    save_steps=500,
    save_total_limit=2,  # keep only the two most recent checkpoints on disk
    bf16=use_bf16,
    fp16=not use_bf16,
    gradient_checkpointing=True,
    max_grad_norm=0.3,
    optim="paged_adamw_32bit",
    report_to="none",  # no external experiment tracker
)

# NOTE(review): tokenizer / dataset_text_field / max_seq_length / packing are
# passed directly to SFTTrainer here; newer TRL releases expect them on
# SFTConfig instead. Confirm against the installed TRL version.
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    tokenizer=tokenizer,
    dataset_text_field="text",
    max_seq_length=MAX_SEQ_LENGTH,
    packing=True,
)
print("Trainer ready!")

In [None]:
# Step 10: START TRAINING
# Run the fine-tuning loop, with start and end timestamps framed by a ruler
# so they are easy to find in the cell output.
ruler = "=" * 60

print(ruler)
print("Starting training...")
print(f"Start time: {datetime.now()}")
print(ruler)

trainer.train()

print(f"\n{ruler}")
print("Training complete!")
print(f"End time: {datetime.now()}")
print(ruler)

In [None]:
# Step 11: Save Model
# Write the trained model and the tokenizer side by side into <OUTPUT_DIR>/final
# so the directory can be loaded on its own later.
final_dir = f"{OUTPUT_DIR}/final"

print("Saving model...")
trainer.save_model(final_dir)
tokenizer.save_pretrained(final_dir)
print(f"Model saved to: {final_dir}")

In [None]:
# Step 12: Download Model
!zip -r qwen-bharatbuild-finetuned.zip {OUTPUT_DIR}/final

from google.colab import files
files.download('qwen-bharatbuild-finetuned.zip')

In [None]:
# Alternative: Upload to Google Drive
from google.colab import drive
drive.mount('/content/drive')

!cp -r {OUTPUT_DIR}/final /content/drive/MyDrive/qwen-bharatbuild-finetuned/