In [None]:

!pip install -q datasets pandas

from datasets import load_dataset
import pandas as pd
import numpy as np

# Load the "train" split of the 4-option MedQA-USMLE dataset.
ds = load_dataset("openlifescienceai/MedQA-USMLE-4-options-hf", split="train")

# Basic cleaning: convert to a DataFrame and discard every row that has a
# missing value in any field the rest of the notebook reads.
required_cols = ["sent1", "sent2"] + [f"ending{k}" for k in range(4)] + ["label"]
df = ds.to_pandas().dropna(subset=required_cols)

# Whitespace normalization: newlines and runs of blanks become single spaces.
def clean_text(x):
    """Collapse all whitespace in a string to single spaces.

    Parameters
    ----------
    x : Any
        Value to normalize. Only ``str`` values are modified.

    Returns
    -------
    Any
        For a string: the same text with every run of whitespace (spaces,
        tabs, newlines) replaced by one space and no leading/trailing
        whitespace. Any non-string value is returned unchanged.
    """
    if isinstance(x, str):
        # str.split() with no argument splits on any whitespace, newlines
        # included, and drops empty pieces, so a single join is sufficient.
        return " ".join(x.split())
    return x

# Normalize whitespace in every text column.
for col in ["sent1", "sent2", "ending0", "ending1", "ending2", "ending3"]:
    df[col] = df[col].apply(clean_text)

# Keep only rows whose label is a valid option index (0-3).
# The explicit .copy() detaches the filtered frame so the column assignments
# below cannot trigger SettingWithCopyWarning when rows are actually dropped.
df["label"] = df["label"].astype(int)
df = df[df["label"].isin([0, 1, 2, 3])].copy()

# Full question text = sent1 followed by sent2. sent2 can be an empty string,
# so strip the joined text to avoid leaving a dangling trailing space.
df["question"] = (df["sent1"] + " " + df["sent2"]).str.strip()

option_cols = ["ending0", "ending1", "ending2", "ending3"]
letters = ["A", "B", "C", "D"]

# Gold answer as a letter and as the text of the matching option.
df["correct_letter"] = df["label"].map(dict(enumerate(letters)))

# Vectorized lookup: for each row pick the option column given by its label
# (replaces a row-wise DataFrame.apply and also works on an empty frame).
option_matrix = df[option_cols].to_numpy()
label_idx = df["label"].to_numpy()
df["correct_text"] = option_matrix[np.arange(len(df)), label_idx]

print("Cleaned dataset size:", len(df))
df.head(3)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/735 [00:00<?, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/5.12M [00:00<?, ?B/s]

data/validation-00000-of-00001.parquet:   0%|          | 0.00/648k [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/667k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/10178 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1272 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1273 [00:00<?, ? examples/s]

Cleaned dataset size: 10178


Unnamed: 0,id,sent1,sent2,ending0,ending1,ending2,ending3,label,question,correct_letter,correct_text
0,train-00000,A 23-year-old pregnant woman at 22 weeks gesta...,,Ampicillin,Ceftriaxone,Doxycycline,Nitrofurantoin,3,A 23-year-old pregnant woman at 22 weeks gesta...,D,Nitrofurantoin
1,train-00001,A 3-month-old baby died suddenly at night whil...,,Placing the infant in a supine position on a f...,Keeping the infant covered and maintaining a h...,Application of a device to maintain the sleepi...,Avoiding pacifier use during sleep,0,A 3-month-old baby died suddenly at night whil...,A,Placing the infant in a supine position on a f...
2,train-00002,A mother brings her 3-week-old infant to the p...,,Abnormal migration of ventral pancreatic bud,Complete failure of proximal duodenum to recan...,Abnormal hypertrophy of the pylorus,Failure of lateral body folds to move ventrall...,0,A mother brings her 3-week-old infant to the p...,A,Abnormal migration of ventral pancreatic bud


In [None]:
# 1. Authenticate with the Hugging Face Hub (opens an interactive login prompt).
from huggingface_hub import login
login()

from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# 2. Use the GPU when one is visible to torch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Using device:", device)

model_name = "google/gemma-3-4b-it"

# 3. Tokenizer first, then the model weights.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# On GPU: bfloat16 weights, with placement delegated to device_map="auto".
# On CPU: float32 weights, moved to the device explicitly after loading.
on_gpu = device == "cuda"
load_kwargs = (
    {"device_map": "auto", "torch_dtype": torch.bfloat16}
    if on_gpu
    else {"torch_dtype": torch.float32}
)
model = AutoModelForCausalLM.from_pretrained(model_name, **load_kwargs)
if not on_gpu:
    model = model.to(device)

# Inference only: switch the model to evaluation mode.
model.eval()

print("Gemma-3 4B model loaded ")

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

Using device: cpu


tokenizer_config.json:   0%|          | 0.00/1.16M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.69M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/35.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json:   0%|          | 0.00/90.6k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.64G [00:00<?, ?B/s]

In [None]:
import re, time
import pandas as pd

# Prompt construction for a four-option multiple-choice question.
def build_mcq_prompt_from_parts(question_text, options):
    """Assemble the multiple-choice prompt that is sent to the model.

    Parameters
    ----------
    question_text : str
        The question stem.
    options : sequence
        Answer options; items 0-3 are rendered as choices A-D. Fewer than
        four items raises ``IndexError``; extra items are ignored.

    Returns
    -------
    str
        The prompt text, ending with ``"Answer:"`` so the model's
        continuation is the answer itself.
    """
    choice_lines = [
        f"{letter}. {options[idx]}" for idx, letter in enumerate("ABCD")
    ]
    prompt_lines = [
        "You are a medical expert answering USMLE-style questions.",
        "",
        "Question:",
        f"{question_text}",
        "",
        "Options:",
        *choice_lines,
        "",
        "Give ONLY the single best option letter: A, B, C, or D.",
        "Answer:",
    ]
    return "\n".join(prompt_lines)


def gemma_predict_letter(question_text, options, max_new_tokens=4):
    """Ask the model to answer one multiple-choice question.

    Builds the prompt with ``build_mcq_prompt_from_parts``, generates greedily
    (``do_sample=False``) and extracts the first standalone option letter from
    the generated continuation.

    Parameters
    ----------
    question_text : str
        The question stem.
    options : sequence
        The four answer options, in A-D order.
    max_new_tokens : int, default 4
        Upper bound on the number of generated tokens.

    Returns
    -------
    str or None
        ``"A"``, ``"B"``, ``"C"`` or ``"D"``; ``None`` when the continuation
        contains no standalone option letter.
    """
    prompt = build_mcq_prompt_from_parts(question_text, options)
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    with torch.no_grad():
        out_ids = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,
        )

    # The generated sequence starts with the prompt tokens. Decode only what
    # comes after them instead of string-matching the decoded prompt, which
    # breaks if decoding does not reproduce the prompt text exactly and could
    # then pick up a letter that belongs to the prompt itself.
    prompt_len = inputs["input_ids"].shape[1]
    completion = tokenizer.decode(
        out_ids[0][prompt_len:], skip_special_tokens=True
    ).strip()

    # First option letter that stands alone as a word, e.g. "C" or "C.".
    m = re.search(r"\b([ABCD])\b", completion)
    return m.group(1) if m else None

#paraphrase
def paraphrase_question_with_gemma(question_text, max_new_tokens=96):
    """Have the model reword a question while keeping its clinical content.

    Only the question text is paraphrased. Generation is greedy
    (``do_sample=False``), so the same input produces the same output.

    Parameters
    ----------
    question_text : str
        The original question.
    max_new_tokens : int, default 96
        Upper bound on the number of generated tokens. NOTE(review): a
        paraphrase longer than this is cut off mid-text; confirm the default
        is large enough for the questions being used.

    Returns
    -------
    str
        The model's continuation after the prompt, stripped of surrounding
        whitespace. Empty if nothing was generated.
    """
    prompt = (
        "You are a medical doctor.\n\n"
        "Rewrite the following clinical question in different words while keeping "
        "ALL the medical meaning and clinical facts the same.\n"
        "Do NOT change the clinical facts. Do NOT add new information.\n"
        "Output ONLY the rewritten question, nothing else.\n\n"
        f"Original question:\n{question_text}\n\n"
        "Rewritten question:"
    )

    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    with torch.no_grad():
        out_ids = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,
        )

    # The generated sequence starts with the prompt tokens. Decoding only the
    # tokens after them keeps the whole paraphrase (including multi-line
    # output) and never returns prompt text, even when the decoded prompt does
    # not match the original prompt string character for character.
    prompt_len = inputs["input_ids"].shape[1]
    new_tokens = out_ids[0][prompt_len:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()



In [None]:
N_QUESTIONS = 1000

# Fixed-seed random sample of the cleaned questions. The size is clamped to
# the number of available rows because DataFrame.sample raises ValueError when
# asked for more rows than exist (sampling is without replacement here).
n_eval = min(N_QUESTIONS, len(df))
subset = df.sample(n_eval, random_state=0).reset_index(drop=True)

records = []
t0 = time.time()

# Three model calls per question: answer the original, paraphrase it, answer
# the paraphrase. The answer options are identical for both predictions.
for i, row in subset.iterrows():
    question_orig = row["question"]
    options = [row[c] for c in option_cols]
    correct = row["correct_letter"]

    #  baseline prediction on original question
    base_pred = gemma_predict_letter(question_orig, options)
    base_correct = (base_pred == correct)

    #  paraphrase the question (options unchanged)
    question_para = paraphrase_question_with_gemma(question_orig)

    # prediction on paraphrased question
    para_pred = gemma_predict_letter(question_para, options)
    para_correct = (para_pred == correct)

    # did the model flip its answer? (a missing prediction, None, counts as
    # its own answer value in this comparison)
    flip = (base_pred != para_pred)

    records.append({
        "qid": i,  # position within the sampled subset, not the dataset id
        "question_original": question_orig,
        "question_paraphrased": question_para,
        "option_A": options[0],
        "option_B": options[1],
        "option_C": options[2],
        "option_D": options[3],
        "correct_letter": correct,
        "baseline_pred": base_pred,
        "baseline_correct": base_correct,
        "paraphrased_pred": para_pred,
        "paraphrased_correct": para_correct,
        "prediction_flip": flip,
    })

    # Progress report every 10 questions.
    if (i + 1) % 10 == 0:
        elapsed = time.time() - t0
        print(f"Processed {i+1}/{n_eval}  |  {elapsed:.1f}s elapsed")

# Save results: one row per evaluated question.
para_df = pd.DataFrame(records)
out_name = "gemma_paraphrase_results.csv"
para_df.to_csv(out_name, index=False)

elapsed_total = time.time() - t0
print(f"\n Saved {out_name}")
print("Rows:", len(para_df))
print("Total time:", round(elapsed_total, 1), "seconds")

para_df.head()
