# DoRA PEFT Fine Tuning (Letter Target Variable)

## Load Data

In [1]:
from datasets import load_dataset
import sys
import os
sys.path.append(os.path.abspath(os.path.join('..'))) 
from src.helper_functions import format_letter_finetuning


# Load model
usml_raw = load_dataset("GBaker/MedQA-USMLE-4-options")
usml_train = usml_raw['train']
print(usml_train)

  from .autonotebook import tqdm as notebook_tqdm


Dataset({
    features: ['question', 'answer', 'options', 'meta_info', 'answer_idx', 'metamap_phrases'],
    num_rows: 10178
})


## Pre-process the Dataset (Letter Strategy)

In [2]:
formatted_train = usml_train.map(
    format_letter_finetuning,
    remove_columns=usml_train.column_names
)

print(formatted_train[0]['prompt'])
print(formatted_train[0]['completion'])

Question: A 23-year-old pregnant woman at 22 weeks gestation presents with burning upon urination. She states it started 1 day ago and has been worsening despite drinking more water and taking cranberry extract. She otherwise feels well and is followed by a doctor for her pregnancy. Her temperature is 97.7°F (36.5°C), blood pressure is 122/77 mmHg, pulse is 80/min, respirations are 19/min, and oxygen saturation is 98% on room air. Physical exam is notable for an absence of costovertebral angle tenderness and a gravid uterus. Which of the following is the best treatment for this patient?
A. Ampicillin
B. Ceftriaxone
C. Doxycycline
D. Nitrofurantoin
Answer:
D


## Verify GPU and Tokenize Dataset

In [3]:
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    DataCollatorForSeq2Seq,
    Trainer,
    BitsAndBytesConfig  
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training  # Updated imports

# 1. GPU Verification
assert torch.cuda.is_available(), "GPU not detected!"
print(f"CUDA version: {torch.version.cuda}")
print(f"VRAM: {torch.cuda.get_device_properties(0).total_memory/1e9:.2f}GB")
torch.cuda.empty_cache()

# 2. Tokenizer Setup
model_name = "meta-llama/Meta-Llama-3-8B"
tokenizer = AutoTokenizer.from_pretrained(model_name, token=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

# 3. Tokenization Function
def tokenize_function(examples):
    texts = [p + c for p, c in zip(examples['prompt'], examples['completion'])]
    
    tokenized = tokenizer(
        texts,
        truncation=True,
        max_length=512,
        padding="max_length",
        return_tensors="pt"
    )
    
    # Create labels (mask prompt tokens)
    prompt_lens = [len(tokenizer(p)['input_ids']) for p in examples['prompt']]
    labels = tokenized["input_ids"].clone()
    for i, plen in enumerate(prompt_lens):
        labels[i, :plen] = -100
    
    tokenized["labels"] = labels
    return tokenized

# 4. Apply tokenization
tokenized_dataset = formatted_train.map(
    tokenize_function,
    batched=True,
    remove_columns=['prompt', 'completion'],
    batch_size=8  # Smaller batches for tokenization
)

# Verify dataset structure
required_columns = ["input_ids", "attention_mask", "labels"]
assert all(col in tokenized_dataset.features for col in required_columns), \
       f"Tokenized dataset missing required columns: {required_columns}"

# For Hugging Face datasets (recommended)

random_seed = 42
train_val_split = tokenized_dataset.train_test_split(
    test_size=0.1,  # 10% validation
    shuffle=True,
    seed = random_seed
)
train_data = train_val_split["train"]
val_data = train_val_split["test"] 

CUDA version: 12.6
VRAM: 23.57GB


## Load Model, Configure DoRA, Collate Data, and Train

In [None]:
# --- 1. Memory Optimization ---
torch.cuda.empty_cache()

# --- 2. QLoRA 4-bit Quantization ---
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True
)

# --- 3. Model Loading ---
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Meta-Llama-3-8B",
    quantization_config=quant_config,
    device_map="auto",
    token=True
)
model = prepare_model_for_kbit_training(model)

# --- 4. DoRA Config (Optimized for Low VRAM) ---
lora_config = LoraConfig(
    r=8,  # Reduced rank (original was 16)
    lora_alpha=16,  # Reduced alpha (original was 32)
    target_modules=["q_proj", "v_proj"],  # Same as before
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    use_dora=True  # Keep DoRA enabled
)

model = get_peft_model(model, lora_config)

# --- 5. Critical Memory Optimizations ---
model.config.use_cache = False
model.config.pretraining_tp = 1

# --- 6. Training Arguments (VRAM-Friendly) ---
training_args = TrainingArguments(
    output_dir="./llama8b-usmle-dora-lowvram-letters",
    # Batch Configuration
    per_device_train_batch_size=1,  # Already minimal
    gradient_accumulation_steps=8,  # Reduced from 16
    # Learning Rate
    learning_rate=2e-5,
    lr_scheduler_type="cosine",
    warmup_ratio=0.05,  # Shorter warmup
    # Optimization
    optim="paged_adamw_8bit",
    weight_decay=0.01,
    max_grad_norm=0.3,
    # Memory Management
    fp16=True,
    gradient_checkpointing=True,  # Critical for low VRAM
    gradient_checkpointing_kwargs={"use_reentrant": False},
    # Evaluation/Logging
    eval_strategy="no",  
    logging_steps=10,
    save_strategy="steps",
    save_steps=100,
    report_to="none"
)

# --- 7. Memory Verification ---
print(f"Available VRAM: {torch.cuda.mem_get_info()[0]/1e9:.2f}GB")
model.print_trainable_parameters()

# --- 8. Trainer (No Validation) ---
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    data_collator=DataCollatorForSeq2Seq(
        tokenizer,
        pad_to_multiple_of=8,
        label_pad_token_id=-100
    )
)

# --- 9. Train ---
trainer.train()

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Available VRAM: 2.04GB
trainable params: 3,571,712 || all params: 8,033,832,960 || trainable%: 0.0445


Step,Training Loss
10,4.1167
20,4.0072
30,4.1291
40,4.062
50,3.9951
60,3.864
70,3.8928
80,4.3843
90,3.9396
100,4.1529


TrainOutput(global_step=3435, training_loss=3.879001127684481, metrics={'train_runtime': 46542.0957, 'train_samples_per_second': 0.59, 'train_steps_per_second': 0.074, 'total_flos': 6.33856452502487e+17, 'train_loss': 3.879001127684481, 'epoch': 3.0})

## Save DoRA Weights

In [5]:
# 1. Save adapters (PEFT automatically excludes base model)
model.save_pretrained(
    "llama8b-usmle-dora-letters",
    safe_serialization=True,  # Uses modern .safetensors format
    max_shard_size="200MB" ) # Optional: splits large adapters

## Push Model to Hugging Face

In [6]:
# 1. Define your custom model name
MODEL_NAME = "usmle-llama8b-dora-letters"  
USERNAME = "jihbr"  # Your Hugging Face username

# 2. Verify critical files
import os
required_files = {
    "adapter_config.json": "DoRA configuration",
    "adapter_model.safetensors": "Adapter weights"
}
for file, desc in required_files.items():
    assert os.path.exists(f"llama8b-usmle-dora/{file}"), f"Missing {desc} file"

# 3. Accelerated upload (5x faster in GCP)
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

# 4. Upload with custom name
from huggingface_hub import HfApi, create_repo

# Create the repository with your custom name
repo_url = create_repo(
    repo_id=f"{USERNAME}/{MODEL_NAME}",  # Changed here
    repo_type="model",
    token=True,
    exist_ok=True
)
print(f"Created repository: {repo_url}")

# Upload to your named repository
HfApi().upload_folder(
    folder_path="llama8b-usmle-dora_letters",
    repo_id=f"{USERNAME}/{MODEL_NAME}",  # Changed here
    repo_type="model",
    commit_message="Add USMLE DoRA adapters - Medical QA Fine-Tune",
    ignore_patterns=["*.bin", "pytorch_model*"],
    allow_patterns=["adapter_*", "README*"]
)

print("✅ Adapters uploaded successfully to:")
print(f"https://huggingface.co/{USERNAME}/{MODEL_NAME}")  # Changed here

Created repository: https://huggingface.co/jihbr/usmle-llama8b-dora


Uploading...:   0%|          | 0.00/14.3M [00:00<?, ?B/s]

✅ Adapters uploaded successfully to:
https://huggingface.co/jihbr/usmle-llama8b-dora
