In [20]:
# For cloud GPU instances (runpod.io): uncomment the line below to install the CUDA 12.1 build of PyTorch 2.1.0

# !pip install torch==2.1.0+cu121 -f https://download.pytorch.org/whl/torch_stable.html

In [26]:
from datasets import load_dataset
import torch

# Download the "pqa_labeled" configuration of PubMedQA from the Hugging Face Hub
dataset = load_dataset(path="qiaojin/PubMedQA", name="pqa_labeled")

# Preprocessing function
def preprocess(example):
    """Turn one raw PubMedQA record into a prompt/label pair.

    Args:
        example: A record providing ``context`` (a mapping whose ``contexts``
            entry is a list of passage strings), ``question`` and
            ``final_decision``.

    Returns:
        dict with two string fields:
            ``input_text`` - the prompt, ending with the cue ``"Answer:"``.
            ``label``      - the normalised decision taken from the record.
    """
    # Combine context passages into one string, then append the question
    context = " ".join(example["context"]["contexts"])
    input_text = f"Context: {context}\n\nQuestion: {example['question']}\n\nAnswer:"

    # Keep the dataset's own decision. PubMedQA annotates "yes" / "no" /
    # "maybe"; mapping everything that is not "yes" to "no" would silently
    # train the model on wrong targets for every "maybe" example.
    label = str(example["final_decision"]).strip().lower()

    return {
        "input_text": input_text,
        "label": label
    }

# Apply preprocessing: every record becomes {"input_text", "label"}; the raw
# columns are dropped so only the derived fields remain.
processed_dataset = dataset.map(
    preprocess,
    remove_columns=["pubid", "question", "context", "long_answer", "final_decision"]
)

# Split dataset: hold out 10% for evaluation. The seed is fixed so that the
# train/eval partition is identical on every run (Restart & Run All
# reproduces the same evaluation set).
train_test = processed_dataset["train"].train_test_split(test_size=0.1, seed=42)
train_dataset = train_test["train"]
eval_dataset = train_test["test"]

README.md: 0.00B [00:00, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

### Load model & tokenizer

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch

# 4-bit bitsandbytes settings: NF4 quantization type, double quantization
# enabled, and float16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

# Checkpoint to fine-tune; the tokenizer and the model come from the same repo id
model_name = "openai/gpt-oss-20b"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the causal LM with the 4-bit config above, letting `device_map="auto"`
# decide device placement.
# NOTE(review): trust_remote_code=True allows Python code shipped in the model
# repo to run locally - confirm it is actually required for this checkpoint.
# NOTE(review): confirm this checkpoint accepts a bitsandbytes config (it may
# already ship its own quantization settings).
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=bnb_config,
    trust_remote_code=True,
)

# Set pad token
# Fall back to EOS only when the tokenizer has no pad token of its own, so a
# dedicated pad token is never overwritten.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
# Take the id from the tokenizer rather than from model.config.eos_token_id:
# the config value may differ from the tokenizer's (or be a list of ids), and
# the model and tokenizer must agree on which id means "padding".
model.config.pad_token_id = tokenizer.pad_token_id

### LoRA

In [None]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# Run PEFT's k-bit preparation helper on the quantized model before adding adapters
model = prepare_model_for_kbit_training(model)

# Adapter hyper-parameters: rank-16 LoRA (alpha 32, dropout 0.05) on the
# modules named "q_proj" and "v_proj"; bias terms are left untouched.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj"],
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

# Wrap the model with the adapters and report trainable vs. total parameters
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()  # expected to be a small fraction of a percent - TODO confirm against the printed value

### Training

In [None]:
def tokenize_and_format(example, max_prompt_length=512, max_answer_length=8):
    """Build one fixed-length causal-LM training example (prompt + answer).

    The prompt and the answer are tokenized separately, concatenated, and only
    then padded on the right, so that:
      * ``input_ids``, ``attention_mask`` and ``labels`` always have the same
        length (``max_prompt_length + max_answer_length``);
      * no padding sits between the prompt and the answer;
      * the loss is computed on the answer tokens (plus one EOS) only -
        prompt and padding positions are set to -100.

    Args:
        example: Record with ``input_text`` (prompt) and ``label`` (answer).
        max_prompt_length: Maximum number of prompt tokens kept.
        max_answer_length: Maximum number of answer tokens, EOS included.

    Returns:
        dict with ``input_ids``, ``attention_mask`` and ``labels`` as
        equal-length lists of ints.
    """
    # Prompt: truncate, but do NOT pad yet (padding here would end up in the
    # middle of the sequence once the answer is appended).
    # NOTE: truncation happens on the tokenizer's configured side; an over-long
    # context can therefore push the trailing "Answer:" cue out of the prompt.
    prompt_ids = list(
        tokenizer(
            example["input_text"],
            truncation=True,
            max_length=max_prompt_length,
        )["input_ids"]
    )

    # Answer: a continuation of the prompt, so no special tokens; reserve one
    # slot for EOS so the model learns where the answer stops.
    answer_ids = list(
        tokenizer(
            example["label"],
            add_special_tokens=False,
            truncation=True,
            max_length=max(max_answer_length - 1, 1),
        )["input_ids"]
    )
    if tokenizer.eos_token_id is not None:
        answer_ids.append(tokenizer.eos_token_id)
    answer_ids = answer_ids[:max_answer_length]

    # Combine input and label tokens
    input_ids = prompt_ids + answer_ids
    attention_mask = [1] * len(input_ids)
    # Create labels for loss calculation: ignore the prompt, supervise the answer
    labels = [-100] * len(prompt_ids) + answer_ids

    # Right-pad all three fields to one fixed length so examples can be stacked
    pad_id = tokenizer.pad_token_id
    if pad_id is None:
        pad_id = tokenizer.eos_token_id
    pad_len = max_prompt_length + max_answer_length - len(input_ids)
    input_ids += [pad_id] * pad_len
    attention_mask += [0] * pad_len
    labels += [-100] * pad_len

    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels,
    }

# Run the formatting function over both splits (one example at a time)
train_dataset = train_dataset.map(function=tokenize_and_format)
eval_dataset = eval_dataset.map(function=tokenize_and_format)

# Switch both splits to the "torch" output format
train_dataset.set_format(type="torch")
eval_dataset.set_format(type="torch")

In [None]:
from transformers import TrainingArguments, Trainer

# Training arguments
training_args = TrainingArguments(
    output_dir="./gpt-pubmedqa-20b",
    # 2 examples per device x 8 accumulation steps = 16 examples per optimizer step (per device)
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=8,
    learning_rate=2e-5,
    num_train_epochs=3,
    logging_steps=10,
    # Checkpoint and evaluate once per epoch
    save_strategy="epoch",
    # `evaluation_strategy` was renamed to `eval_strategy`; transformers
    # releases recent enough to load this checkpoint are expected to reject
    # the old keyword.
    eval_strategy="epoch",
    # fp16 mixed precision, matching the float16 compute dtype of the 4-bit config
    fp16=True,
    bf16=False,
    optim="paged_adamw_8bit",
    warmup_ratio=0.03,
    lr_scheduler_type="cosine",
    report_to="wandb",
    run_name="gpt-20b-pubmedqa",
    gradient_checkpointing=True,
    ddp_find_unused_parameters=False,
    # Keep every dataset column; the custom collator picks the fields it needs
    remove_unused_columns=False,
)

In [None]:
# Build the Trainer. The collator stacks each per-example tensor field into a
# batch tensor, which requires every example to have the same length.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=lambda data: {
        field: torch.stack([item[field] for item in data])
        for field in ("input_ids", "attention_mask", "labels")
    },
)

# Run the fine-tuning loop
trainer.train()

# Write the trained model and the tokenizer to the same output directory
trainer.save_model("./gpt-pubmedqa-20b-final")
tokenizer.save_pretrained("./gpt-pubmedqa-20b-final")

### Evaluate

In [None]:
from peft import PeftModel
import torch

# Reload the base checkpoint in float16 (no bitsandbytes config is passed here)
base_model = AutoModelForCausalLM.from_pretrained(
    "openai/gpt-oss-20b",
    torch_dtype=torch.float16,
    trust_remote_code=True,
    device_map="auto",
)

# Attach the LoRA adapter from the directory written after training
model = PeftModel.from_pretrained(model=base_model, model_id="./gpt-pubmedqa-20b-final")

# Inference function
def ask_medical_question(context, question, max_new_tokens=10):
    """Answer a question about ``context`` with the fine-tuned model.

    The prompt uses the same "Context / Question / Answer:" layout as the
    training data. Decoding is greedy, so the same input always produces the
    same answer.

    Args:
        context: Background text the answer should be based on.
        question: The question to answer.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The generated continuation only (prompt excluded), stripped of
        surrounding whitespace.
    """
    prompt = f"Context: {context}\n\nQuestion: {question}\n\nAnswer:"
    # Follow the model's own device instead of hard-coding "cuda": the model
    # is loaded with device_map="auto".
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            # Greedy decoding: sampling would make the answer vary between calls
            do_sample=False,
            pad_token_id=tokenizer.pad_token_id,
        )

    # Decode only the newly generated tokens. Splitting the full decoded text
    # on "Answer:" breaks when that string also occurs in the generated text.
    prompt_length = inputs["input_ids"].shape[1]
    answer = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    return answer.strip()

# Smoke test: run the inference helper on a placeholder context/question pair
context = "Patient shows symptoms X, Y, and Z."
question = "Should we administer treatment A?"
print(ask_medical_question(context=context, question=question))