# Testing PEFT configurations on a subset of the training data

In [1]:
from datasets import load_dataset
import sys
import os
sys.path.append(os.path.abspath(os.path.join('..'))) 
from src.helper_functions import format_mcf_finetuning


# Load the MedQA-USMLE (4-option) dataset and keep a small sample of the
# training split for quick PEFT experiments.
usml_raw = load_dataset("GBaker/MedQA-USMLE-4-options")
usml_train = usml_raw['train']
shuffled_train = usml_train.shuffle(seed=42)  # seeded shuffle for a repeatable sample
sample_train = shuffled_train.select(range(100))
print(sample_train)

  from .autonotebook import tqdm as notebook_tqdm


Dataset({
    features: ['question', 'answer', 'options', 'meta_info', 'answer_idx', 'metamap_phrases'],
    num_rows: 100
})


Preprocess the subset

In [2]:
# Run every sampled row through format_mcf_finetuning and drop the original
# columns so that only the fields produced by the formatter remain.
formatted_train_subset = sample_train.map(
    format_mcf_finetuning,
    remove_columns=sample_train.column_names,
)

# Sanity check: show the prompt and completion of the first formatted example.
first_example = formatted_train_subset[0]
print(first_example['prompt'])
print(first_example['completion'])

Question: A 35-year-old woman comes to your office with a variety of complaints. As part of her evaluation, she undergoes laboratory testing which reveals the presence of anti-centromere antibodies. All of the following symptoms and signs would be expected to be present EXCEPT:
A. Pallor, cyanosis, and erythema of the hands
B. Blanching vascular abnormalities
C. Hypercoagulable state
D. Heartburn and regurgitation
Answer:
Hypercoagulable state


In [None]:
# Dependencies for quantized loading, LoRA adaptation and training.
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    DataCollatorForSeq2Seq,
    BitsAndBytesConfig,
    Trainer,
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# Load the LLaMA-2 tokenizer; token=True authenticates the download with the
# locally stored Hugging Face token.
model_name = "meta-llama/Llama-2-7b-hf"
tokenizer = AutoTokenizer.from_pretrained(model_name, token=True)
# Reuse EOS as the padding token so that padded batches can be built.
tokenizer.pad_token = tokenizer.eos_token
# Pad on the right for training: the label mask built in tokenize_function
# counts prompt tokens from the start of each sequence, which only lines up
# with the text when padding comes after it. Left padding is the usual choice
# for batched generation, not for fine-tuning.
tokenizer.padding_side = "right"

# Tokenization function
def tokenize_function(examples):
    """Tokenize a batch of prompt/completion pairs and build loss labels.

    Args:
        examples: Batch dict with parallel lists under 'prompt' and
            'completion'.

    Returns:
        The tokenizer output for ``prompt + completion`` (truncated and padded
        to 512 tokens) with an extra "labels" entry. A label equals the input
        id on completion tokens and is -100 on prompt tokens and on padding.
    """
    texts = [p + c for p, c in zip(examples['prompt'], examples['completion'])]
    tokenized = tokenizer(
        texts,
        truncation=True,
        max_length=512,
        padding="max_length"
    )
    # Number of tokens each prompt occupies when tokenized on its own.
    prompt_lens = [len(tokenizer(p)['input_ids']) for p in examples['prompt']]

    # Build the labels from the attention mask rather than from raw positions,
    # so the result is correct for both left and right padding and padding
    # positions never contribute to the loss.
    labels = []
    for plen, ids, mask in zip(
        prompt_lens, tokenized["input_ids"], tokenized["attention_mask"]
    ):
        row = []
        real_seen = 0  # count of non-padding tokens visited so far
        for tok, keep in zip(ids, mask):
            if not keep:
                row.append(-100)
                continue
            # The first `plen` real tokens belong to the prompt.
            row.append(tok if real_seen >= plen else -100)
            real_seen += 1
        labels.append(row)

    tokenized["labels"] = labels
    return tokenized

# Apply tokenize_function to the formatted subset one batch at a time.
tokenized_dataset = formatted_train_subset.map(
    tokenize_function,
    batched=True,
)

# 4-bit NF4 quantization with double quantization; compute runs in float16.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

# Load the quantized base model, letting device_map="auto" decide placement.
# token=True authenticates the download with the stored Hugging Face token.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    token=True,
    torch_dtype=torch.float16,
    device_map="auto",
    quantization_config=quant_config,
)

# Make the quantized model ready for k-bit training before attaching adapters.
model = prepare_model_for_kbit_training(model)

# LoRA adapters on the query, value and key projections.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj", "k_proj"],
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)
model = get_peft_model(model, lora_config)

# Training configuration for the subset run.
# Effective batch size per device is 4 * 4 = 16 (batch size x accumulation).
# No evaluation strategy is passed: "no" is already the default, and the
# `evaluation_strategy` keyword has been renamed to `eval_strategy` in recent
# transformers releases, so omitting it keeps this cell working on both.
training_args = TrainingArguments(
    output_dir="./llama7b-usmle-lora-subset",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    warmup_ratio=0.1,
    num_train_epochs=3,
    learning_rate=3e-4,
    fp16=True,
    logging_steps=5,
    save_strategy="epoch",  # one checkpoint per epoch inside output_dir
    optim="paged_adamw_32bit",
    gradient_checkpointing=True,  # trade compute for memory
    report_to="none"  # no external experiment tracker
)

# Collator that pads inputs and labels per batch; label padding uses -100 and
# lengths are rounded up to a multiple of 8.
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    pad_to_multiple_of=8,
    label_pad_token_id=-100,
)

# Wire the model, arguments, data and collator together.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset,
)

# Report which GPU (device 0) is in use and how much memory it has.
gpu_name = torch.cuda.get_device_name(0)
print(f"Using device: {gpu_name}")
gpu_props = torch.cuda.get_device_properties(0)
print(f"VRAM: {gpu_props.total_memory/1e9:.2f}GB")

# Run the fine-tuning loop.
trainer.train()

# Write the trained PEFT model to a local folder.
adapter_dir = "llama7b-usmle-lora-subset"
model.save_pretrained(adapter_dir)

Map: 100%|██████████| 100/100 [00:01<00:00, 65.95 examples/s]
Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Now save the fine-tuned model to a Google Cloud Storage bucket (the local copy is only temporary).

In [None]:
from google.cloud import storage

# Create the bucket if it does not exist yet (safe to re-run).
client = storage.Client()
bucket_name = "subset_models"  # Your bucket name

# lookup_bucket returns None when the bucket is not found, which separates
# "already exists" from genuine failures (credentials, permissions, network).
# Those failures now raise here instead of being printed and ignored.
bucket = client.lookup_bucket(bucket_name)
if bucket is None:
    bucket = client.create_bucket(bucket_name, location="us-central1")
    print(f"Bucket created: gs://{bucket_name}")
else:
    print(f"Bucket already exists: gs://{bucket_name}")

In [None]:
import shutil

# Local folder written by model.save_pretrained. It is also the Trainer's
# output_dir, so it can contain per-epoch checkpoint sub-folders.
local_dir = "llama7b-usmle-lora-subset"

# Upload to GCS
client = storage.Client()
bucket = client.bucket("subset_models")
gcs_path = "llama7b-usmle-lora-subset"  # Folder name in bucket

# Upload all files, descending into sub-folders: a flat os.listdir would hand
# directory names to upload_from_filename, which only accepts regular files.
for root, _dirs, files in os.walk(local_dir):
    for file_name in files:
        local_path = os.path.join(root, file_name)
        # Blob names always use "/" regardless of the local path separator.
        rel_path = os.path.relpath(local_path, local_dir).replace(os.sep, "/")
        blob = bucket.blob(f"{gcs_path}/{rel_path}")
        blob.upload_from_filename(local_path)
        print(f"Uploaded: gs://subset_models/{gcs_path}/{rel_path}")

# Clean up local files (optional); only reached if every upload succeeded.
shutil.rmtree(local_dir, ignore_errors=True)