# LoRA Fine-tuning Example

This notebook demonstrates Low-Rank Adaptation (LoRA) for efficient fine-tuning of large language models.

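**Key Benefits:**
- Only ~0.1-1% of the model's parameters are trainable
- Roughly 10x less GPU memory than full fine-tuning
- Multiple task-specific adapters can share a single base model
- No added inference latency once the adapter is merged into the base weights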

In [None]:
# Install required libraries
!pip install -q transformers datasets accelerate peft bitsandbytes evaluate

In [None]:
import warnings

import torch
from datasets import load_dataset
from peft import (
    LoraConfig,
    get_peft_model,
    TaskType,
    prepare_model_for_kbit_training
)
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
warnings.filterwarnings('ignore')

# Pick the compute device once and report what was found
has_cuda = torch.cuda.is_available()
device = torch.device("cuda" if has_cuda else "cpu")
print(f"Using device: {device}")
if has_cuda:
    # Device index 0 is the GPU described in the printout
    gpu_props = torch.cuda.get_device_properties(0)
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"Memory: {gpu_props.total_memory / 1e9:.2f} GB")

## 1. Load Pre-trained Model

In [None]:
# Use a smaller model for demonstration
model_name = "microsoft/phi-2"  # 2.7B parameters
# Alternative: "meta-llama/Llama-2-7b-hf" (requires HF token)

# Load tokenizer. The EOS token is reused as the padding token, and padding
# is applied on the right-hand side of each sequence.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Load model in 8-bit for memory efficiency.
# The quantization settings go through an explicit BitsAndBytesConfig rather
# than the bare `load_in_8bit=True` keyword, which is deprecated on
# `from_pretrained`.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    device_map="auto",
    trust_remote_code=True
)

# Prepare model for k-bit training
model = prepare_model_for_kbit_training(model)

print(f"Model loaded: {model_name}")
print(f"Total parameters: {sum(p.numel() for p in model.parameters()):,}")

## 2. Configure LoRA

In [None]:
# Modules that receive LoRA adapters: the query and value projections.
# Further candidates kept for reference: "k_proj" (key), "o_proj" (output).
# NOTE(review): module names are architecture-specific — confirm them against
# model.named_modules() when switching to a different base model.
lora_target_modules = ["q_proj", "v_proj"]

# LoRA hyperparameters
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,  # Causal language modelling
    r=16,  # Rank: higher means more capacity but more parameters
    lora_alpha=32,  # Scaling parameter
    target_modules=lora_target_modules,
    lora_dropout=0.1,  # Dropout for regularization
    bias="none",  # Biases are not trained
)

# Wrap the base model with the adapters and report the trainable share
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

## 3. Prepare Training Data

In [None]:
# Pull the Alpaca instruction data and keep only the first 1,000 rows
# of the train split so the demo trains quickly
dataset = load_dataset("tatsu-lab/alpaca", split="train").select(range(1000))

# Format prompts
def format_instruction(sample):
    """Render one dataset row as a three-section training prompt.

    Args:
        sample: Mapping with 'instruction', 'input' and 'output' entries.

    Returns:
        A string with '### Instruction:', '### Input:' and '### Response:'
        sections separated by blank lines. A falsy 'input' is shown as 'N/A'.
    """
    input_text = sample['input'] or 'N/A'
    sections = (
        f"### Instruction:\n{sample['instruction']}",
        f"### Input:\n{input_text}",
        f"### Response:\n{sample['output']}",
    )
    return "\n\n".join(sections)

# Tokenize dataset
def tokenize_function(examples):
    """Tokenize a list of dataset rows into fixed-length training features.

    Args:
        examples: Iterable of row mappings, each accepted by
            ``format_instruction``.

    Returns:
        The tokenizer output with one plain Python list per example (each
        padded/truncated to 256 tokens), plus a ``labels`` entry that is a
        copy of ``input_ids``.
    """
    instructions = [format_instruction(sample) for sample in examples]
    # No `return_tensors` here: tensors carry a leading batch axis, and when
    # stored row by row each example would end up shaped (1, 256), which
    # later collates into a 3-D batch instead of (batch, 256).
    model_inputs = tokenizer(
        instructions,
        truncation=True,
        padding="max_length",
        max_length=256,
    )
    model_inputs["labels"] = [list(ids) for ids in model_inputs["input_ids"]]
    return model_inputs


def _tokenize_batch(batch):
    """Turn a columnar ``map(batched=True)`` batch into rows and tokenize them."""
    columns = list(batch.keys())
    num_rows = len(batch[columns[0]]) if columns else 0
    rows = [{col: batch[col][i] for col in columns} for i in range(num_rows)]
    return dict(tokenize_function(rows))


# Apply tokenization (batched, so every stored row is a flat list of token ids)
tokenized_dataset = dataset.map(
    _tokenize_batch,
    batched=True,
    remove_columns=dataset.column_names
)

# Each example must be a flat sequence of exactly max_length token ids
assert len(tokenized_dataset[0]["input_ids"]) == 256

print(f"Dataset size: {len(tokenized_dataset)}")
print(f"\nExample instruction:\n{format_instruction(dataset[0])}")

## 4. Configure Training

In [None]:
# Training hyperparameters for the demo run
training_args = TrainingArguments(
    output_dir="./lora_results",
    num_train_epochs=3,  # Superseded by max_steps below while max_steps > 0
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,  # Effective batch size = 16
    warmup_steps=50,
    max_steps=200,  # Limit steps for demo
    learning_rate=3e-4,  # Higher LR for LoRA
    fp16=True,
    logging_steps=10,
    save_strategy="steps",
    save_steps=50,
    # No eval for demo: evaluation is left at the library default ("no").
    # The explicit keyword is omitted because its name differs between
    # transformers releases and the install cell does not pin a version.
    report_to="none",
    optim="paged_adamw_8bit",  # Memory-efficient optimizer
    gradient_checkpointing=True,
)

# Data collator for causal language modelling (mlm=False)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

print("Training configuration ready")
print(f"Effective batch size: {training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps}")
print(f"Total steps: {training_args.max_steps}")

## 5. Train Model

In [None]:
# Initialize trainer.
# The tokenizer is not handed to Trainer: batching is done by the explicit
# `data_collator`, and the keyword for passing a tokenizer was renamed across
# transformers releases (the install cell does not pin a version).
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)

# Train
print("Starting LoRA fine-tuning...")
trainer.train()

# Save LoRA adapter, with the tokenizer files alongside it so the
# output directory is self-contained
model.save_pretrained("./lora_adapter")
tokenizer.save_pretrained("./lora_adapter")
print("\nLoRA adapter saved!")

## 6. Inference with LoRA

In [None]:
def generate_response(instruction, input_text=""):
    """Generate a completion for an instruction with the fine-tuned model.

    Builds the same '### Instruction / ### Input / ### Response' prompt used
    for training, samples up to 100 new tokens (temperature 0.7, top-p 0.9)
    and returns only the newly generated text.

    Args:
        instruction: The task description placed in the instruction section.
        input_text: Optional extra context; a falsy value is shown as 'N/A'.

    Returns:
        The decoded continuation, stripped of surrounding whitespace.
    """
    prompt = f"""### Instruction:
{instruction}

### Input:
{input_text if input_text else 'N/A'}

### Response:
"""

    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=256)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    prompt_length = inputs["input_ids"].shape[1]

    # Switch to eval mode so dropout is disabled while sampling; the model
    # may still be in training mode after trainer.train().
    model.eval()
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=100,
            temperature=0.7,
            do_sample=True,
            top_p=0.9,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

    # Decode only the tokens produced after the prompt. Splitting the decoded
    # text on "### Response:" would drop part of the answer whenever the model
    # repeats that header in its own output.
    generated_tokens = outputs[0][prompt_length:]
    response = tokenizer.decode(generated_tokens, skip_special_tokens=True)
    return response.strip()

# Sample (instruction, input) pairs used to spot-check the fine-tuned model
test_instructions = [
    ("What is machine learning?", ""),
    ("Translate the following to French:", "Hello, how are you?"),
    (
        "Summarize this text:",
        "LoRA is a technique for efficient fine-tuning of large language models. It works by decomposing weight updates into low-rank matrices.",
    ),
    ("Write a haiku about AI", ""),
]

for instruction, input_text in test_instructions:
    print(f"\nInstruction: {instruction}")
    # The input line is printed only for examples that have one
    if input_text:
        print(f"Input: {input_text}")
    response = generate_response(instruction, input_text)
    print(f"Response: {response}")
    print(50 * "-")

## 7. QLoRA - 4-bit Training (Optional)

In [None]:
from transformers import BitsAndBytesConfig

# 4-bit quantization settings for QLoRA: NF4 quantization with double
# quantization enabled and bfloat16 as the compute dtype
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Loading the base model with this config (left disabled in the demo):
# model_4bit = AutoModelForCausalLM.from_pretrained(
#     model_name,
#     quantization_config=bnb_config,
#     device_map="auto",
#     trust_remote_code=True
# )

# Summary printed for the reader, one line per point
print("\n".join([
    "QLoRA allows training 70B+ models on consumer GPUs!",
    "- 4-bit base model",
    "- 16-bit LoRA adapters",
    "- Matches full fine-tuning performance",
    "- 10x memory reduction",
]))

## 8. Merge and Export

In [None]:
# Folding the LoRA weights into the base model yields a standard model
# without the adapter layers. The calls are left disabled in this demo.
# NOTE(review): the base model in this notebook is loaded in 8-bit — confirm
# that merging into quantized weights is acceptable, or reload the base model
# in full precision before merging.

# merged_model = model.merge_and_unload()
# merged_model.save_pretrained("./merged_model")

# Summary printed for the reader, one line per point
print("\n".join([
    "After merging:",
    "- No additional inference latency",
    "- Standard model format",
    "- Can be deployed normally",
    "\nAdapter size: ~100MB vs Full model: ~5-10GB",
]))

## Key Takeaways

### LoRA Benefits
1. **Efficiency**: Only 0.1-1% of parameters trainable
2. **Memory**: 10x reduction in GPU memory
3. **Speed**: 3-10x faster training
4. **Flexibility**: Multiple task adapters on one model
5. **Deployment**: No inference overhead after merging

### Best Practices
- Start with r=8-16 for rank
- Set alpha = 2 × rank
- Target Q and V projections first
- Use higher learning rates (1e-4 to 1e-3)
- Apply dropout (0.05-0.1) to prevent overfitting

### When to Use LoRA
- Limited GPU memory
- Need multiple task-specific models
- Rapid experimentation
- Edge deployment with adapter swapping