# LoRA PEFT Fine Tuning (Letter Target Variable)

## Load Data

In [1]:
from datasets import load_dataset
import sys
import os
sys.path.append(os.path.abspath(os.path.join('..'))) 
from src.helper_functions import format_letter_finetuning


# Load model
usml_raw = load_dataset("GBaker/MedQA-USMLE-4-options")
usml_train = usml_raw['train']

print(usml_train)

  from .autonotebook import tqdm as notebook_tqdm


Dataset({
    features: ['question', 'answer', 'options', 'meta_info', 'answer_idx', 'metamap_phrases'],
    num_rows: 10178
})


## Pre-process the Dataset (Letter Strategy)

In [2]:
formatted_train = usml_train.map(
    format_letter_finetuning,
    remove_columns=usml_train.column_names
)

print(formatted_train[0]['prompt'])
print(formatted_train[0]['completion'])

Map: 100%|██████████| 10178/10178 [00:00<00:00, 27829.96 examples/s]

Question: A 23-year-old pregnant woman at 22 weeks gestation presents with burning upon urination. She states it started 1 day ago and has been worsening despite drinking more water and taking cranberry extract. She otherwise feels well and is followed by a doctor for her pregnancy. Her temperature is 97.7°F (36.5°C), blood pressure is 122/77 mmHg, pulse is 80/min, respirations are 19/min, and oxygen saturation is 98% on room air. Physical exam is notable for an absence of costovertebral angle tenderness and a gravid uterus. Which of the following is the best treatment for this patient?
A. Ampicillin
B. Ceftriaxone
C. Doxycycline
D. Nitrofurantoin
Answer:
D





## Verify GPU and Tokenize Dataset

In [3]:
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    DataCollatorForSeq2Seq,
    Trainer,
    BitsAndBytesConfig  # New for QLoRA
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training  # Updated imports


# --- GPU Verification (Enhanced) ---
assert torch.cuda.is_available(), "GPU not detected!"
print(f"CUDA version: {torch.version.cuda}")
print(f"VRAM: {torch.cuda.get_device_properties(0).total_memory/1e9:.2f}GB")
torch.cuda.empty_cache()  # Clear cache before loading model

# --- Tokenizer Setup (Llama 3 Specific) ---
model_name = "meta-llama/Meta-Llama-3-8B"
tokenizer = AutoTokenizer.from_pretrained(model_name, token=True)

# Critical Llama 3 tokenizer settings
tokenizer.pad_token = tokenizer.eos_token  # Required for padding
tokenizer.padding_side = "left"  # Essential for autoregressive generation
assert tokenizer.pad_token is not None, "Padding token not set correctly!"  # Validation

# --- Tokenization Function (Optimized) ---
def tokenize_function(examples):
    # Combine prompt + completion
    texts = [p + c for p, c in zip(examples['prompt'], examples['completion'])]
    
    # Tokenize with fixed-length padding
    tokenized = tokenizer(
        texts,
        truncation=True,
        max_length=512,
        padding="max_length",  # Ensures consistent length
        return_tensors="pt"  # Returns PyTorch tensors
    )
    
    # Create labels (mask prompt tokens)
    prompt_lens = [len(tokenizer(p)['input_ids']) for p in examples['prompt']]
    labels = tokenized["input_ids"].clone()
    for i, plen in enumerate(prompt_lens):
        labels[i, :plen] = -100  # Mask prompt tokens
    
    tokenized["labels"] = labels
    return tokenized

# --- Dataset Tokenization (With Validation) ---
tokenized_dataset = formatted_train.map(
    tokenize_function,
    batched=True,
    remove_columns=['prompt', 'completion'],
    batch_size=8  # Smaller batches prevent OOM during tokenization
)

# Verify dataset structure
required_columns = ["input_ids", "attention_mask", "labels"]
assert all(col in tokenized_dataset.features for col in required_columns), \
       f"Tokenized dataset missing required columns: {required_columns}"

# For Hugging Face datasets (recommended)

random_seed = 42
train_val_split = tokenized_dataset.train_test_split(
    test_size=0.1,  # 10% validation
    shuffle=True,
    seed = random_seed
)
train_data = train_val_split["train"]
val_data = train_val_split["test"]  

CUDA version: 12.6
VRAM: 23.57GB


## Load Model, Configure LoRA, Collate Data, and Train

In [None]:
# --- 1. Memory Optimization ---
torch.cuda.empty_cache()

# --- 2. QLoRA 4-bit Quantization ---
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True  # Extra memory savings
)

# --- 3. Model Loading (No CPU offloading) ---
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Meta-Llama-3-8B",
    quantization_config=quant_config,
    device_map="auto",  # Automatically uses GPU(s)
    token=True
)
model = prepare_model_for_kbit_training(model)

# --- 4. LoRA Config (Optimized for QLoRA) ---
lora_config = LoraConfig(
    r=8,  # Can use higher rank than standard LoRA
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],  # Focus on attention layers
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, lora_config)

# --- 5. Critical QLoRA Optimizations ---
model.config.use_cache = False  # Saves 20% VRAM
model.config.pretraining_tp = 1  # Disables tensor parallelism

# --- 6. Training Arguments (Updated for QLoRA + Validation) ---
training_args = TrainingArguments(
    output_dir="./llama8b-usmle-qlora-letters",
    # Batch Configuration
    per_device_train_batch_size=1,           # Reduced from 2 for stability
    gradient_accumulation_steps=16,          # Effective batch=16
    # Learning Rate
    learning_rate=3e-5,                      # Reduced from 5e-5
    lr_scheduler_type="cosine_with_restarts", # Better for unstable loss
    warmup_steps=150,                        # Longer warmup
    # Optimization
    optim="paged_adamw_8bit",
    weight_decay=0.01,                       # NEW: Regularization
    max_grad_norm=0.3,                       # Gradient clipping
    # Memory Management
    fp16=True,
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
    # Monitoring & Saving
    logging_steps=5,
    eval_strategy="steps",              # NEW: Critical for validation
    eval_steps=100,                          # Evaluate every 100 steps
    save_strategy="steps",
    save_steps=100,
    load_best_model_at_end=True,             # NEW: Keep best checkpoint
    metric_for_best_model="eval_loss",       # NEW
    greater_is_better=False,                 # NEW: Lower eval_loss=better
    report_to="none"
)

# --- 7. Memory Verification ---
print(f"Available VRAM: {torch.cuda.mem_get_info()[0]/1e9:.2f}GB")
model.print_trainable_parameters()

# --- 8. Trainer (Updated with Validation) ---
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,                # From train_test_split
    eval_dataset=val_data,                   # NEW: Validation set
    data_collator=DataCollatorForSeq2Seq(
        tokenizer,
        pad_to_multiple_of=8,
        label_pad_token_id=-100
    ),

)
# --- 9. Train ---
trainer.train()

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Available VRAM: 10.98GB
trainable params: 3,407,872 || all params: 8,033,669,120 || trainable%: 0.0424


Step,Training Loss,Validation Loss
100,4.3609,4.017844
200,3.8667,3.94394
300,3.7106,3.928877
400,3.839,3.921076
500,3.8487,3.914184
600,3.7624,3.910863
700,3.995,3.908091
800,4.0957,3.906157
900,3.8755,3.904726
1000,3.8856,3.903224


TrainOutput(global_step=1719, training_loss=3.9048510280163082, metrics={'train_runtime': 43036.5054, 'train_samples_per_second': 0.639, 'train_steps_per_second': 0.04, 'total_flos': 6.338426213656166e+17, 'train_loss': 3.9048510280163082, 'epoch': 3.0})

## Save QLoRA Weights

In [5]:
# 1. Save adapters (PEFT automatically excludes base model)
model.save_pretrained(
    "llama8b-usmle-qlora_letters",
    safe_serialization=True,  # Uses modern .safetensors format
    max_shard_size="200MB" ) # Optional: splits large adapters

## Push Model to Hugging Face

In [6]:
import os
from huggingface_hub import HfApi, create_repo

# 1. Configuration - MUST MATCH YOUR SAVE PATH!
SAVE_DIR = "llama8b-usmle-qlora_letters"  # Exactly matches save_pretrained() path
MODEL_NAME = "usmle-llama8b-qlora_letters"  
USERNAME = "jihbr"

# 2. Verify saved files (immediately after save_pretrained)
required_files = {
    "adapter_config.json": "QLoRA configuration",
    "adapter_model.safetensors": "Adapter weights",
    "README.md": "Model card"  # Recommended to add
}

print(f"Verifying files in: {os.path.abspath(SAVE_DIR)}")
print("Found files:", os.listdir(SAVE_DIR))

for file, desc in required_files.items():
    file_path = os.path.join(SAVE_DIR, file)
    assert os.path.exists(file_path), f"Missing {desc} at {file_path}\nFull path: {os.path.abspath(file_path)}"

# 3. Prepare for Upload
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"  # Faster uploads

# 4. Create Hub repository
try:
    repo_url = create_repo(
        repo_id=f"{USERNAME}/{MODEL_NAME}",
        repo_type="model",
        token=True,
        exist_ok=True
    )
    print(f"Created repository: {repo_url}")
except Exception as e:
    raise RuntimeError(f"Repo creation failed. Does the token have write access?\n{str(e)}")

# 5. Upload with progress tracking
api = HfApi()
try:
    upload_result = api.upload_folder(
        folder_path=SAVE_DIR,
        repo_id=f"{USERNAME}/{MODEL_NAME}",
        commit_message=f"USMLE QLoRA adapters - {MODEL_NAME}",
        allow_patterns=[
            "adapter_config.json",
            "adapter_model.safetensors",
            "README.md",
            "training_args.bin"
        ],
        ignore_patterns=["*.bin", "pytorch_model*"],
    )
    print(f"✅ Upload successful!\nView at: https://huggingface.co/{USERNAME}/{MODEL_NAME}")
except Exception as e:
    print(f"❌ Upload failed. Check your internet connection and HF token.\n{str(e)}")
    # Suggested recovery:
    print("Try re-running just the upload:")
    print(f"huggingface-cli upload {USERNAME}/{MODEL_NAME} {SAVE_DIR}/*")

Verifying files in: /home/jupyter/SIADS-699_PEFT-Testers/notebooks/llama8b-usmle-qlora
Found files: ['adapter_model.safetensors', 'adapter_config.json', 'README.md']
Created repository: https://huggingface.co/jihbr/usmle-llama8b-qlora


Uploading...:   0%|          | 0.00/13.6M [00:00<?, ?B/s]

✅ Upload successful!
View at: https://huggingface.co/jihbr/usmle-llama8b-qlora
