# LoRA PEFT Fine Tuning

## Load Data

In [1]:
from datasets import load_dataset
import sys
import os
sys.path.append(os.path.abspath(os.path.join('..'))) 
from src.helper_functions import format_mcf_finetuning


# Load the MedQA-USMLE 4-option dataset (a dataset, not a model) and keep its 'train' split
usml_raw = load_dataset("GBaker/MedQA-USMLE-4-options")
usml_train = usml_raw['train']
# Printing the split shows its feature names and row count
print(usml_train)

  from .autonotebook import tqdm as notebook_tqdm


Dataset({
    features: ['question', 'answer', 'options', 'meta_info', 'answer_idx', 'metamap_phrases'],
    num_rows: 10178
})


## Pre-process the Dataset

In [2]:
# Run every raw record through format_mcf_finetuning and drop all of the
# original columns from the result.
raw_columns = usml_train.column_names
formatted_train = usml_train.map(format_mcf_finetuning, remove_columns=raw_columns)

# Sanity check: show the first formatted example, prompt first, then completion.
first_example = formatted_train[0]
for field in ('prompt', 'completion'):
    print(first_example[field])

Map: 100%|██████████| 10178/10178 [00:00<00:00, 26938.48 examples/s]

Question: A 23-year-old pregnant woman at 22 weeks gestation presents with burning upon urination. She states it started 1 day ago and has been worsening despite drinking more water and taking cranberry extract. She otherwise feels well and is followed by a doctor for her pregnancy. Her temperature is 97.7°F (36.5°C), blood pressure is 122/77 mmHg, pulse is 80/min, respirations are 19/min, and oxygen saturation is 98% on room air. Physical exam is notable for an absence of costovertebral angle tenderness and a gravid uterus. Which of the following is the best treatment for this patient?
A. Ampicillin
B. Ceftriaxone
C. Doxycycline
D. Nitrofurantoin
Answer:
Nitrofurantoin





## Verify GPU and Tokenize Dataset

In [3]:
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    DataCollatorForSeq2Seq,
    Trainer
)
from peft import LoraConfig, get_peft_model

# 1. GPU Verification
# Fail fast when PyTorch cannot see a CUDA device.
assert torch.cuda.is_available(), "GPU not detected!"

# Report the CUDA version PyTorch was built against and the total memory of device 0.
cuda_version = torch.version.cuda
print(f"CUDA version: {cuda_version}")
total_vram_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
print(f"VRAM: {total_vram_gb:.2f}GB")

# Hand cached, currently unused allocator memory back to the driver.
torch.cuda.empty_cache()

# 2. Tokenizer Setup
# token=True asks the loader to authenticate with the locally stored Hugging Face
# token (presumably required because the checkpoint is access-gated — confirm).
model_name = "meta-llama/Llama-2-7b-hf"
tokenizer = AutoTokenizer.from_pretrained(model_name, token=True)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
# Pad on the left. NOTE(review): with left padding the real tokens no longer start
# at index 0, so any label masking done by absolute position must account for the
# padding offset.
tokenizer.padding_side = "left"

# 3. Tokenization Function
def tokenize_function(examples, max_length=512):
    """Tokenize a batch of prompt/completion pairs and build loss labels.

    Each prompt is concatenated with its completion, tokenized, truncated and
    padded to ``max_length``. The returned ``labels`` equal ``input_ids`` on
    completion tokens only; prompt tokens and padding positions are set to
    -100 so they are excluded from the loss.

    Args:
        examples: Batch mapping with parallel ``'prompt'`` and ``'completion'``
            sequences of strings (the shape ``Dataset.map(batched=True)``
            passes in).
        max_length: Fixed sequence length to truncate/pad to. Defaults to 512.

    Returns:
        The tokenizer output with an added ``"labels"`` tensor of the same
        shape as ``"input_ids"``.
    """
    texts = [p + c for p, c in zip(examples['prompt'], examples['completion'])]

    tokenized = tokenizer(
        texts,
        truncation=True,
        max_length=max_length,
        padding="max_length",
        return_tensors="pt"
    )

    # Number of tokens each prompt occupies when tokenized on its own
    # (including any special tokens the tokenizer prepends).
    prompt_lens = [len(tokenizer(p)['input_ids']) for p in examples['prompt']]

    input_ids = tokenized["input_ids"]
    attention_mask = tokenized["attention_mask"]

    labels = input_ids.clone()
    # Padding must never contribute to the loss. The pad token may share its
    # id with a real token (e.g. EOS), so mask by attention_mask, not by id.
    labels[attention_mask == 0] = -100

    for i, plen in enumerate(prompt_lens):
        # Indices of the real (non-padding) tokens in this row. Masking the
        # first `plen` of them hides the prompt whether the tokenizer pads on
        # the left or on the right; masking `labels[i, :plen]` would only hit
        # padding when the row is left-padded.
        real_positions = attention_mask[i].nonzero(as_tuple=True)[0]
        labels[i, real_positions[:plen]] = -100

    tokenized["labels"] = labels
    return tokenized

# 4. Apply tokenization
# batched=True passes tokenize_function a dict of column lists (8 examples per
# call); the raw text columns are dropped from the resulting dataset.
tokenized_dataset = formatted_train.map(
    tokenize_function,
    batched=True,
    remove_columns=['prompt', 'completion'],
    batch_size=8  # Smaller batches for tokenization
)


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.3.1 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/Users/joshhaber/SIADS-699_PEFT-Testers/.venv/lib/python3.11/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/Users/joshhaber/SIADS-699_PEFT-Testers/.venv/lib/python3.11/site-packages/traitlets/config/application.py", line 1075, in launch_instance
    app.start()
  File "/Users/joshhaber/SIADS-699_PEFT-Testers/.venv/lib/python3.11/site-packages/ipykernel/kernelapp

'NoneType' object has no attribute 'cadam32bit_grad_fp32'


AssertionError: GPU not detected!

## Load Model, Configure LoRA, Collate Data, and Train

In [None]:
# 5. Model Loading (FP16)
# Load the base causal LM named by `model_name` with float16 weights.
# device_map="auto" leaves device placement to the loader; token=True
# authenticates with the locally stored Hugging Face token.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",
    token=True
)

# 6. LoRA Configuration (Standard, no quantization prep)
lora_config = LoraConfig(
    r=8,  # LoRA rank; reduced from 16 for memory
    lora_alpha=16,  # scaling factor; set to 2x the rank here
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],  # the four attention projections
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)
# Wrap the base model with the LoRA adapters. `model` is rebound to the wrapped
# model, so re-running this cell without reloading the base model wraps it again.
model = get_peft_model(model, lora_config)

# 7. Verify parameters
model.print_trainable_parameters()  # Should show ~0.1% of parameters

# 8. Training Arguments (Memory-optimized)
# Effective batch size per device is 2 x 8 = 16 sequences per optimizer step.
# NOTE(review): the base model is loaded with float16 weights and fp16 mixed
# precision is enabled here; confirm the installed PEFT version keeps the LoRA
# weights in float32, otherwise the gradient scaler may reject fp16 gradients.
training_args = TrainingArguments(
    output_dir="./llama7b-usmle-lora",
    per_device_train_batch_size=2,  # Reduced from 4
    gradient_accumulation_steps=8,  # Increased to compensate
    warmup_ratio=0.1,  # warm up over the first 10% of training steps
    num_train_epochs=3,
    learning_rate=3e-4,
    fp16=True,
    logging_steps=10,
    save_strategy="epoch",  # write a checkpoint at the end of every epoch
    optim="adamw_torch",
    gradient_checkpointing=True,  # recompute activations in backward to save memory
    gradient_checkpointing_kwargs={"use_reentrant": False},
    report_to="none"  # disable external experiment trackers
)

# 9. Data Collator
# Batches examples for the Trainer; label padding uses -100 so padded label
# positions are ignored by the loss, and lengths are rounded up to a multiple of 8.
# The dataset is already padded to a fixed length during tokenization, so the
# collator presumably adds no further padding — confirm.
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=-100,
    pad_to_multiple_of=8
)

# 10. Pre-Training Memory Check
# Report the GPU memory currently allocated by PyTorch tensors, in GB.
print(f"Pre-train VRAM: {torch.cuda.memory_allocated()/1e9:.2f}GB used")

# 11. Trainer
# Wire the LoRA-wrapped model, training arguments, tokenized dataset and
# collator together. No evaluation dataset is passed, so only training runs.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)

# Start training
trainer.train()

## Save LoRA Weights

In [None]:
# 1. Save adapters (PEFT automatically excludes base model)
# The original call was missing its closing parenthesis, which made the cell a
# SyntaxError.
model.save_pretrained(
    "llama7b-usmle-lora",
    safe_serialization=True,  # Uses modern .safetensors format
    # NOTE(review): kept from the original; confirm that the PEFT
    # save_pretrained in use actually honours this option for adapter files.
    max_shard_size="200MB"
)

## Push Model to Hugging Face

In [None]:
# 1. Define your custom model name
MODEL_NAME = "usmle-llama7b-lora"
USERNAME = "jihbr"  # Your Hugging Face username
REPO_ID = f"{USERNAME}/{MODEL_NAME}"

# Local directory the adapters were saved to. The file check and the upload
# must both use this same directory; the original checked
# "llama7b-usmle-lora-subset" but uploaded "llama7b-usmle-lora".
ADAPTER_DIR = "llama7b-usmle-lora"

# 2. Verify critical files
import os
required_files = {
    "adapter_config.json": "LoRA configuration",
    "adapter_model.safetensors": "Adapter weights"
}
for file, desc in required_files.items():
    assert os.path.exists(os.path.join(ADAPTER_DIR, file)), f"Missing {desc} file"

# 3. Accelerated upload via hf_transfer
# NOTE(review): huggingface_hub may read this variable only when it is first
# imported; if the library was already imported earlier in this kernel, the
# assignment may have no effect. Confirm, and make sure the hf_transfer
# package is installed.
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

# 4. Upload with custom name
from huggingface_hub import HfApi, create_repo

# Create the repository (no error if it already exists)
repo_url = create_repo(
    repo_id=REPO_ID,
    repo_type="model",
    token=True,
    exist_ok=True
)
print(f"Created repository: {repo_url}")

# Upload only adapter files and the README from the verified directory;
# any *.bin / pytorch_model* files are skipped.
HfApi().upload_folder(
    folder_path=ADAPTER_DIR,
    repo_id=REPO_ID,
    repo_type="model",
    commit_message="Add USMLE LoRA adapters - Medical QA Fine-Tune",
    ignore_patterns=["*.bin", "pytorch_model*"],
    allow_patterns=["adapter_*", "README*"]
)

print("✅ Adapters uploaded successfully to:")
print(f"https://huggingface.co/{REPO_ID}")