BERT

In [1]:
import os
os.environ["WANDB_DISABLED"] = "true"

#1. Install dependencies
!pip install -q transformers datasets peft accelerate evaluate textattack

#2. Imports
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from peft import LoraConfig, get_peft_model
from datasets import DatasetDict

#3. Load Dataset (PubMedQA)
raw = load_dataset("pubmed_qa", "pqa_labeled")

print(raw)

# Split train into train + temp (70/30)
train_temp = raw["train"].train_test_split(test_size=0.3, seed=42)

# Split temp into validation + test (50/50)
val_test = train_temp["test"].train_test_split(test_size=0.5, seed=42)

# Build dataset dict with all splits
dataset = DatasetDict({
    "train": train_temp["train"],       # 800 examples
    "validation": val_test["train"],    # 100 examples
    "test": val_test["test"]            # 100 examples
})

print(dataset)

raw = dataset

# Map labels to ids
label2id = {"yes": 0, "no": 1, "maybe": 2}
id2label = {v: k for k, v in label2id.items()}

# Preprocessing function
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Preprocessing function

def preprocess(ex):
    # Convert question (list of strings) into one string
    question = " ".join(ex["question"]) if isinstance(ex["question"], list) else str(ex["question"])

    # Extract context text
    if isinstance(ex["context"], dict) and "contexts" in ex["context"]:
        context = " ".join(ex["context"]["contexts"])
    elif isinstance(ex["context"], list):
        context = " ".join([c.get("text", str(c)) for c in ex["context"]])
    else:
        context = str(ex["context"])

    # Combine question and context
    text = question + " " + context

    return tokenizer(text, truncation=True, padding="max_length", max_length=256)


sample = raw["train"][0]["context"]
print(type(sample))
print(sample)

print(type(raw["train"][0]["question"]), raw["train"][0]["question"])
print(type(raw["train"][0]["context"]), raw["train"][0]["context"].keys())


train = raw["train"].map(preprocess, batched=False)
val = raw["validation"].map(preprocess, batched=False)

train = train.map(lambda ex: {"labels": label2id[ex["final_decision"]]})
val = val.map(lambda ex: {"labels": label2id[ex["final_decision"]]})

train.set_format(type="torch", columns=["input_ids","attention_mask","labels"])
val.set_format(type="torch", columns=["input_ids","attention_mask","labels"])

#4. Load BERT Model and Apply LoRA
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased", num_labels=3, id2label=id2label, label2id=label2id
)

lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["query", "value"],
    lora_dropout=0.1,
    bias="none",
    task_type="SEQ_CLS"
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()  # confirm trainable params




[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/55.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.5/55.5 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m41.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m24.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m

README.md: 0.00B [00:00, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['pubid', 'question', 'context', 'long_answer', 'final_decision'],
        num_rows: 1000
    })
})
DatasetDict({
    train: Dataset({
        features: ['pubid', 'question', 'context', 'long_answer', 'final_decision'],
        num_rows: 700
    })
    validation: Dataset({
        features: ['pubid', 'question', 'context', 'long_answer', 'final_decision'],
        num_rows: 150
    })
    test: Dataset({
        features: ['pubid', 'question', 'context', 'long_answer', 'final_decision'],
        num_rows: 150
    })
})


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

<class 'dict'>
{'contexts': ["To examine gout patients' knowledge of their condition, including the central role of achieving and maintaining the serum urate (SU) goal with the use of urate-lowering therapy (ULT).", 'This study of 612 gout patients was conducted at a Veterans Affairs medical center. Gout patients were included based on administrative diagnostic codes and receipt of at least 1 allopurinol prescription over a 1-year period. Questionnaires were mailed to patients and linked to medical records data. The questionnaire included gout-specific knowledge questions, the Patient Activation Measure, and self-reported health outcomes. Knowledge was assessed descriptively. Multivariable logistic regression was used to determine predictors of SU goal knowledge. Associations of knowledge with health outcomes were examined in exploratory analyses.', 'The questionnaire had a 62% response rate. Only 14% of patients knew their SU goal, while the majority answered correctly for the other 5

Map:   0%|          | 0/700 [00:00<?, ? examples/s]

Map:   0%|          | 0/150 [00:00<?, ? examples/s]

Map:   0%|          | 0/700 [00:00<?, ? examples/s]

Map:   0%|          | 0/150 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 297,219 || all params: 109,781,766 || trainable%: 0.2707


In [2]:
#5. Training Setup

training_args = TrainingArguments(
    output_dir="pubmedqa_lora",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=5e-5,
    num_train_epochs=3,
    # evaluation_strategy="epoch",
    eval_strategy="epoch",
    save_strategy="epoch",         # FIX: save checkpoint each epoch
    logging_steps=50,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    report_to="none"   # <--- disables wandb/tensorboard
)


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = logits.argmax(-1)
    acc = (preds == labels).astype(float).mean().item()
    return {"accuracy": acc}

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train,
    eval_dataset=val,
    # tokenizer=tokenizer,
    processing_class=tokenizer,
    compute_metrics=compute_metrics
)

#6. Fine-Tune
trainer.train()

#7. Evaluate & Adversarial Robustness Check
print("Evaluation Results:", trainer.evaluate())


#8. Save LoRA Adapter
# model.save_pretrained("pubmedqa_lora_adapter")
trainer.save_model("pubmedqa_lora_full")
tokenizer.save_pretrained("pubmedqa_lora_adapter")

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.974691,0.54
2,1.027500,0.970892,0.54
3,0.944700,0.970625,0.54


Evaluation Results: {'eval_loss': 0.9746913909912109, 'eval_accuracy': 0.54, 'eval_runtime': 2.1767, 'eval_samples_per_second': 68.91, 'eval_steps_per_second': 4.594, 'epoch': 3.0}


('pubmedqa_lora_adapter/tokenizer_config.json',
 'pubmedqa_lora_adapter/special_tokens_map.json',
 'pubmedqa_lora_adapter/vocab.txt',
 'pubmedqa_lora_adapter/added_tokens.json',
 'pubmedqa_lora_adapter/tokenizer.json')

GPT-Neo

In [3]:
import os
os.environ["WANDB_DISABLED"] = "true"

#1. Install dependencies
!pip install -q transformers datasets peft accelerate evaluate textattack

#2. Imports
import torch
from datasets import load_dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from peft import LoraConfig, get_peft_model

#3. Load Dataset (PubMedQA)
raw = load_dataset("pubmed_qa", "pqa_labeled")

# Split into train/val/test
train_temp = raw["train"].train_test_split(test_size=0.3, seed=42)
val_test = train_temp["test"].train_test_split(test_size=0.5, seed=42)

dataset = DatasetDict({
    "train": train_temp["train"],
    "validation": val_test["train"],
    "test": val_test["test"]
})

# Map labels
label2id = {"yes": 0, "no": 1, "maybe": 2}
id2label = {v: k for k, v in label2id.items()}

# Tokenizer (GPT-Neo)
model_name = "EleutherAI/gpt-neo-125M"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# GPT-style models (GPT-2, GPT-Neo) have no pad_token, so reuse eos_token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# also sync with model config
from transformers import AutoConfig
config = AutoConfig.from_pretrained(model_name)
config.pad_token_id = tokenizer.pad_token_id

# Preprocessing
def preprocess(ex):
    question = " ".join(ex["question"]) if isinstance(ex["question"], list) else str(ex["question"])
    if isinstance(ex["context"], dict) and "contexts" in ex["context"]:
        context = " ".join(ex["context"]["contexts"])
    elif isinstance(ex["context"], list):
        context = " ".join([c.get("text", str(c)) for c in ex["context"]])
    else:
        context = str(ex["context"])
    text = question + " " + context
    return tokenizer(text, truncation=True, padding="max_length", max_length=256)

train = dataset["train"].map(preprocess, batched=False)
val   = dataset["validation"].map(preprocess, batched=False)

train = train.map(lambda ex: {"labels": label2id[ex["final_decision"]]})
val   = val.map(lambda ex: {"labels": label2id[ex["final_decision"]]})

train.set_format(type="torch", columns=["input_ids","attention_mask","labels"])
val.set_format(type="torch", columns=["input_ids","attention_mask","labels"])

#@4. Load GPT-Neo Model + LoRA

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=3,
    id2label=id2label,
    label2id=label2id,
    pad_token_id=tokenizer.pad_token_id  # ensure model knows pad id
)

lora_config = LoraConfig(
    r=16,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type="SEQ_CLS"
)


model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

tokenizer_config.json:   0%|          | 0.00/727 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/357 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

Map:   0%|          | 0/700 [00:00<?, ? examples/s]

Map:   0%|          | 0/150 [00:00<?, ? examples/s]

Map:   0%|          | 0/700 [00:00<?, ? examples/s]

Map:   0%|          | 0/150 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/526M [00:00<?, ?B/s]

Some weights of GPTNeoForSequenceClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 592,128 || all params: 125,793,024 || trainable%: 0.4707


In [4]:
#5. Training Setup
training_args = TrainingArguments(
    output_dir="pubmedqa_gptneo_lora",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=5e-5,
    num_train_epochs=3,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=50,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    report_to="none"
)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = logits.argmax(-1)
    acc = (preds == labels).astype(float).mean().item()
    return {"accuracy": acc}

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train,
    eval_dataset=val,
    processing_class=tokenizer,
    compute_metrics=compute_metrics
)

# 6. Fine-Tune
trainer.train()

#7. Evaluate
print("Evaluation Results:", trainer.evaluate())

#8. Save Adapter
# model.save_pretrained("pubmedqa_gptneo_lora_adapter")
trainer.save_model("pubmedqa_gptneo_lora_full")
tokenizer.save_pretrained("pubmedqa_gptneo_lora_full")

Epoch,Training Loss,Validation Loss,Accuracy
1,1.3054,1.344457,0.413333
2,1.1948,1.305185,0.413333
3,1.2061,1.294275,0.42


Evaluation Results: {'eval_loss': 1.2942745685577393, 'eval_accuracy': 0.42, 'eval_runtime': 2.7301, 'eval_samples_per_second': 54.942, 'eval_steps_per_second': 6.959, 'epoch': 3.0}


('pubmedqa_gptneo_lora_full/tokenizer_config.json',
 'pubmedqa_gptneo_lora_full/special_tokens_map.json',
 'pubmedqa_gptneo_lora_full/vocab.json',
 'pubmedqa_gptneo_lora_full/merges.txt',
 'pubmedqa_gptneo_lora_full/added_tokens.json',
 'pubmedqa_gptneo_lora_full/tokenizer.json')

In [5]:
from transformers import pipeline

# Load fine-tuned model + tokenizer
pipe = pipeline("text-classification", model=model, tokenizer=tokenizer)

# Run inference on a new example
text = "Does increased uric acid lead to gout development?"
pred = pipe(text)
print(pred)


Device set to use cuda:0


[{'label': 'yes', 'score': 0.94881272315979}]


In [6]:
# Run inference on a new example
text = "Do mitochondria play a role in remodeling lace plant leaves during programmed cell death?"
pred = pipe(text)
print(pred)

[{'label': 'yes', 'score': 0.9463178515434265}]


In [8]:
from transformers import pipeline
from peft import PeftModel

# Reload full fine-tuned model (base + adapter + classifier head)
adapted_model = AutoModelForSequenceClassification.from_pretrained(
    "pubmedqa_gptneo_lora_full",
    num_labels=3,
    id2label=id2label,
    label2id=label2id
)

pipe = pipeline("text-classification", model=adapted_model, tokenizer=tokenizer)
print("With adapter:", pipe("Does increased uric acid lead to gout development?"))

# Reload base model only (no adapter, no trained head)
base_model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=3,
    id2label=id2label,
    label2id=label2id,
    pad_token_id=tokenizer.pad_token_id
)

pipe_unlearned = pipeline("text-classification", model=base_model, tokenizer=tokenizer)
print("Without adapter:", pipe_unlearned("Does increased uric acid lead to gout development?"))


Some weights of GPTNeoForSequenceClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cuda:0


With adapter: [{'label': 'yes', 'score': 0.94881272315979}]


Some weights of GPTNeoForSequenceClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cuda:0


Without adapter: [{'label': 'maybe', 'score': 0.6752314567565918}]
