In [1]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
)
from peft import LoraConfig, PeftModel, get_peft_model
from trl import SFTTrainer


print(f"GPU name: {torch.cuda.get_device_name(0)}")
print(f"GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1e9} GB")

2025-08-29 09:38:36.899620: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1756460316.922629     153 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1756460316.929710     153 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


GPU name: Tesla P100-PCIE-16GB
GPU memory: 17.059545088 GB


In [2]:
from huggingface_hub import login

login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.svâ€¦

In [8]:
# Set seed for reproducibility
import random

def set_seed(seed=11):
    """Set all seeds to ensure reproducibility"""
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_seed()

### Load Dataset

In [3]:
from datasets import load_dataset

# Load PubMedQA dataset
pubmedqa = load_dataset("qiaojin/PubMedQA", "pqa_labeled")

# Split the dataset
train_test_split = pubmedqa["train"].train_test_split(test_size=0.1)
dataset = train_test_split

# Examine the dataset
print(dataset)
print(dataset["train"][0])

DatasetDict({
    train: Dataset({
        features: ['pubid', 'question', 'context', 'long_answer', 'final_decision'],
        num_rows: 900
    })
    test: Dataset({
        features: ['pubid', 'question', 'context', 'long_answer', 'final_decision'],
        num_rows: 100
    })
})
{'pubid': 20101129, 'question': 'Is prophylactic fixation a cost-effective method to prevent a future contralateral fragility hip fracture?', 'context': {'contexts': [': A previous hip fracture more than doubles the risk of a contralateral hip fracture. Pharmacologic and environmental interventions to prevent hip fracture have documented poor compliance. The purpose of this study was to examine the cost-effectiveness of prophylactic fixation of the uninjured hip to prevent contralateral hip fracture.', ': A Markov state-transition model was used to evaluate the cost and quality-adjusted life-years (QALYs) for unilateral fixation of hip fracture alone (including internal fixation or arthroplasty) compared 

### Preprocess the Data

In [4]:
from transformers import AutoTokenizer

# Load tokenizer for Mistral-7B-Instruct
model_name = "mistralai/Mistral-7B-Instruct-v0.3"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token  # Set pad token to EOS token

def format_prompt(example):
    # Format the prompt using Mistral's instruction format
    prompt = f"<s>[INST] Context: {example['context']}\n\nQuestion: {example['question']} [/INST] {example['final_decision']}</s>"
    return {"text": prompt}

def tokenize_function(examples):
    # Tokenize the formatted prompts
    return tokenizer(examples["text"], truncation=True, max_length=512, padding="max_length")

# Apply formatting and tokenization
dataset = dataset.map(format_prompt)
tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=dataset["train"].column_names)

Map:   0%|          | 0/900 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/900 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

### Fine-tuning


In [5]:
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# Configure 4-bit quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=False,
)

# Load the model with 4-bit quantization
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
)

# Prepare the model for training
model = prepare_model_for_kbit_training(model)

# Configure LoRA (Low-Rank Adaptation)
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
)

# Apply LoRA to the model
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.55G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

trainable params: 41,943,040 || all params: 7,289,966,592 || trainable%: 0.5754


### Train the Model

In [6]:
from transformers import TrainingArguments, Trainer

# Define training arguments
training_args = TrainingArguments(
    output_dir="./mistral-pubmedqa",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    num_train_epochs=3,
    logging_steps=10,
    save_steps=500,
    evaluation_strategy="steps",
    eval_steps=500,
    fp16=True,  # Use mixed precision
    optim="paged_adamw_8bit",
    report_to="wandb",
    run_name="mistral-pubmedqa-finetune",
    gradient_checkpointing=True,
    ddp_find_unused_parameters=False,
    remove_unused_columns=False,
)

TypeError: TrainingArguments.__init__() got an unexpected keyword argument 'evaluation_strategy'

In [None]:
# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    data_collator=lambda data: {
        "input_ids": torch.stack([item["input_ids"] for item in data]),
        "attention_mask": torch.stack([item["attention_mask"] for item in data]),
        "labels": torch.stack([item["input_ids"] for item in data]),
    },
)

# Start training
trainer.train()

### Save the model

In [None]:
# Save the model
trainer.save_model("./mistral-pubmedqa-finetune")

# Save the tokenizer
tokenizer.save_pretrained("./mistral-pubmedqa-finetune")

# Save the LoRA adapter separately
model.save_pretrained("./mistral-pubmedqa-adapter")

### Evaluation

In [None]:
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

def evaluate_pubmedqa(model, tokenizer, dataset):
    """Evaluate the model on PubMedQA test set"""
    predictions = []
    references = []
    
    # Set model to evaluation mode
    model.eval()
    
    # Process each example in the test set
    for example in dataset:
        # Format the prompt
        prompt = f"<s>[INST] Context: {example['context']}\n\nQuestion: {example['question']} [/INST]"
        
        # Tokenize the prompt
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        
        # Generate answer
        with torch.no_grad():
            outputs = model.generate(
                input_ids=inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                max_new_tokens=10,
                do_sample=False,
                pad_token_id=tokenizer.eos_token_id
            )
        
        # Decode the generated text
        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        
        # Extract the answer (yes/no/maybe)
        # The answer should be after the [/INST] token
        answer = generated_text.split("[/INST]")[-1].strip().lower()
        
        # Normalize the answer to match the expected format
        if "yes" in answer:
            pred = "yes"
        elif "no" in answer:
            pred = "no"
        elif "maybe" in answer:
            pred = "maybe"
        else:
            # If unclear, default to maybe
            pred = "maybe"
        
        predictions.append(pred)
        references.append(example["final_decision"].lower())
    
    # Calculate metrics
    accuracy = accuracy_score(references, predictions)
    f1 = f1_score(references, predictions, average="weighted")
    
    # Create confusion matrix
    cm = confusion_matrix(references, predictions, labels=["yes", "no", "maybe"])
    
    # Plot confusion matrix
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=["yes", "no", "maybe"], yticklabels=["yes", "no", "maybe"])
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title('Confusion Matrix - PubMedQA Evaluation')
    plt.savefig('./pubmedqa_confusion_matrix.png')
    
    # Print metrics
    print(f"PubMedQA Evaluation Results:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"F1 Score: {f1:.4f}")
    
    return {
        "accuracy": accuracy,
        "f1": f1,
        "confusion_matrix": cm
    }

# Evaluate on PubMedQA test set
pubmedqa_results = evaluate_pubmedqa(model, tokenizer, dataset["test"])

### Evaluation on MMLU

In [None]:
from lm_eval import evaluator
from lm_eval.models.huggingface import HFLM

def evaluate_mmlu(model, tokenizer):
    """Evaluate the model on MMLU dataset"""
    # Create HFLM model for lm-eval
    hf_model = HFLM(
        pretrained=model,
        tokenizer=tokenizer,
        batch_size=4,
        device="cuda"
    )
    
    # Define MMLU tasks
    # We'll evaluate on a subset of MMLU tasks for demonstration
    # You can add more tasks as needed
    tasks = [
        "mmlu_anatomy",
        "mmlu_clinical_knowledge",
        "mmlu_college_medicine",
        "mmlu_medical_genetics",
        "mmlu_professional_medicine",
        "mmlu_virology"
    ]
    
    # Run evaluation
    results = evaluator.simple_evaluate(
        model=hf_model,
        tasks=tasks,
        num_fewshot=5,
        limit=50  # Limit to 50 examples per task for faster evaluation
    )
    
    # Print results
    print("\nMMLU Evaluation Results:")
    for task in tasks:
        if task in results["results"]:
            print(f"{task}: {results['results'][task]['acc']:.4f}")
    
    # Calculate average accuracy
    avg_acc = np.mean([results["results"][task]["acc"] for task in tasks if task in results["results"]])
    print(f"\nAverage MMLU Accuracy: {avg_acc:.4f}")
    
    return results

# Evaluate on MMLU
mmlu_results = evaluate_mmlu(model, tokenizer)

In [None]:
import json

# Save evaluation results
evaluation_results = {
    "pubmedqa": pubmedqa_results,
    "mmlu": mmlu_results
}

with open("./evaluation_results.json", "w") as f:
    json.dump(evaluation_results, f, indent=4)

print("Evaluation results saved to evaluation_results.json")

### Inference

In [None]:
from transformers import pipeline

# Load the fine-tuned model
model_path = "./mistral-pubmedqa-finetune"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path)

# Create a text generation pipeline
qa_pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Example question
context = "Recent studies have shown that regular physical activity can reduce the risk of cardiovascular disease by up to 35%."
question = "Does exercise reduce heart disease risk?"

# Format the prompt
prompt = f"<s>[INST] Context: {context}\n\nQuestion: {question} [/INST]"

# Generate answer
result = qa_pipeline(prompt, max_new_tokens=10, num_return_sequences=1)
print(result[0]['generated_text'])