# Install Necessary Libraries

In [1]:
!pip install -q transformers accelerate torch torchvision sentencepiece xformers


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m117.2/117.2 MB[0m [31m21.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
!pip install -q xformers

# Access Files (Google Drive Setup)

## Mount to Google Drive

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## File Directories

In [4]:
import pandas as pd

compiled_df = pd.read_parquet("/content/drive/MyDrive/DATA 298A/compiled_df.parquet")
mcq_df = pd.read_parquet("/content/drive/MyDrive/DATA 298A/mcq_df.parquet").reset_index(drop=True)
yn_df = pd.read_parquet("/content/drive/MyDrive/DATA 298A/yn_df.parquet").reset_index(drop=True)

In [5]:
yn_df.head()

Unnamed: 0,dataset_name,id_in_dataset,question,options,answer_label,question_type,prompt_text
0,pubmedqa,0,Do mitochondria play a role in remodelling lac...,Answer Choices:\nA. Yes\nB. No,yes,Y/N,Question:\nDo mitochondria play a role in remo...
1,pubmedqa,1,Landolt C and snellen e acuity: differences in...,Answer Choices:\nA. Yes\nB. No,no,Y/N,Question:\nLandolt C and snellen e acuity: dif...
2,pubmedqa,4,Can tailored interventions increase mammograph...,Answer Choices:\nA. Yes\nB. No,yes,Y/N,Question:\nCan tailored interventions increase...
3,pubmedqa,9,A short stay or 23-hour ward in a general and ...,Answer Choices:\nA. Yes\nB. No,yes,Y/N,Question:\nA short stay or 23-hour ward in a g...
4,pubmedqa,11,Therapeutic anticoagulation in the trauma pati...,Answer Choices:\nA. Yes\nB. No,no,Y/N,Question:\nTherapeutic anticoagulation in the ...


In [6]:
yn_df['question_type'].unique()

array(['Y/N'], dtype=object)

# Add Model (MedGemma-4B IT)

In [7]:
yn_df.loc[0, 'prompt_text']

'Question:\nDo mitochondria play a role in remodelling lace plant leaves during programmed cell death?\n\nAnswer Choices:\nA. Yes\nB. No'

In [8]:
yn_df.loc[0, 'options']

'Answer Choices:\nA. Yes\nB. No'

## Log Into HuggingFace to Access Model

In [9]:
from huggingface_hub import login
from transformers import AutoProcessor, AutoModelForVision2Seq
from PIL import Image
import torch
from transformers import pipeline

login(new_session=False)

## Generate Pipeline for Model Usage

In [10]:
text2text_generator = pipeline(
    "text-generation",
    model="google/medgemma-4b-it"
)

config.json:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/90.6k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.64G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/156 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.16M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.69M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/35.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

chat_template.jinja:   0%|          | 0.00/1.53k [00:00<?, ?B/s]

Device set to use cuda:0


In [11]:
prompt_ab = f"{yn_df.loc[0, 'prompt_text']} Answer (respond with EXACTLY one answer: A or B):"

## Running Input into Model For Response

In this use case, I am setting the model max key usage to 3, because I want to just see the response the model is giving when prompted to give just a yes or no answer

In [12]:
out = text2text_generator(
    prompt_ab,
    max_new_tokens=3,
    temperature=0.001,
    top_p=0.9,
    repetition_penalty=1.1,
    no_repeat_ngram_size=4,
    do_sample=False,
    return_full_text=False
)

The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


In [13]:
out # Output generated by the model

[{'generated_text': '\nB\n\n'}]

In [14]:
yn_df.loc[0, 'answer_label'] # What the real answer is

'yes'

# Extract Y/N Answer

In [16]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, LogitsProcessor, LogitsProcessorList, pipeline


Unlike pipeline, allows fine-grained control over generation

In [17]:
MODEL_ID = "google/medgemma-4b-it"
tok = AutoTokenizer.from_pretrained(MODEL_ID)
if tok.pad_token_id is None:
    tok.pad_token_id = tok.eos_token_id
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map="auto")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



Function to extract model's tokens for 'A' and 'B'. We use this later to ensure the model outputs the answer it deems fit as the solution to the question.

In [18]:
def token_ids_for(chars={"A","B"}):
    ids = set()
    for i in range(len(tok)):
        s = tok.decode([i], skip_special_tokens=True).strip()
        if len(s) == 1 and s in chars:
            ids.add(i)
    return sorted(ids)

In [19]:
AB_IDS = token_ids_for({"A","B"})
assert AB_IDS, "Could not find A/B token ids."

Ensures that when an answer it outputted, the solution is either 'A' or 'B', and nothing else (will need to be changed later depending on the question type)

In [20]:
class OnlyABFirstToken(LogitsProcessor):
    def __init__(self, allowed_ids):
        super().__init__()
        self.allowed_ids = torch.tensor(allowed_ids)  # Ensures model only has vocabulary of the allowed tokens
        self.step0 = True

    def __call__(self, input_ids, scores):
        # On step 0, set all non-allowed_ids to negative scores so they aren't chosen in solution. Only 'A' and 'B'
        if self.step0:
            mask = torch.full_like(scores, float("-inf"))
            mask[:, self.allowed_ids] = scores[:, self.allowed_ids]
            scores = mask
            self.step0 = False
        return scores

Function that creates the pipeline to handle the prompt, and generate what the model believes to be the true answer to the question.

In [21]:
def predict_AB(question_block: str) -> str:
    prompt = f"""
    {question_block}
    Answer (respond with EXACTLY one character: A or B):
    """

    inputs = f(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        out_ids = model.generate(
            **inputs,
            max_new_tokens=1,
            do_sample=False,
            logits_processor=processors,
            pad_token_id=tok.pad_token_id,
            eos_token_id=tok.eos_token_id,
        )

    # decode only the newly generated piece
    gen_part = out_ids[0, inputs["input_ids"].shape[1]:]
    text = tok.decode(gen_part, skip_special_tokens=True).strip()
    return text[:1]

Instantiates custom logits class to enforce pipeline rules for responses (the one that makes the model only choose certain tokens)

In [22]:
processors = LogitsProcessorList([OnlyABFirstToken(AB_IDS)])

Runs the prompt from the dataset against the pipelne, and outputs the answer. This allows us to see if the real answer lined up with the model's, and prints the accuracy of the model at the end.

In [23]:
import re

def ab_to_yesno(a_or_b: str) -> str:
    return "yes" if a_or_b.upper() == "A" else "no"

preds = []
gold = []

for i in range(25):
    qblock = yn_df.loc[i, "prompt_text"]
    pred_ab = predict_AB(qblock)
    pred = ab_to_yesno(pred_ab)
    y = str(yn_df.loc[i, "answer_label"]).strip().lower()
    preds.append(pred); gold.append(y)
    print(f"real: {y} | predicted: {pred_ab!r} → {pred}")

acc = sum(p==g for p,g in zip(preds,gold)) / len(gold)
print("Accuracy:", acc)


real: yes | predicted: 'A' → yes
real: no | predicted: 'A' → yes
real: yes | predicted: 'A' → yes
real: yes | predicted: 'A' → yes
real: no | predicted: 'A' → yes
real: yes | predicted: 'A' → yes
real: yes | predicted: 'A' → yes
real: yes | predicted: 'A' → yes
real: yes | predicted: 'A' → yes
real: yes | predicted: 'A' → yes
real: yes | predicted: 'A' → yes
real: yes | predicted: 'A' → yes
real: yes | predicted: 'A' → yes
real: yes | predicted: 'A' → yes
real: no | predicted: 'B' → no
real: yes | predicted: 'A' → yes
real: yes | predicted: 'B' → no
real: yes | predicted: 'A' → yes
real: yes | predicted: 'A' → yes
real: no | predicted: 'A' → yes
real: yes | predicted: 'A' → yes
real: yes | predicted: 'A' → yes
real: yes | predicted: 'A' → yes
real: yes | predicted: 'A' → yes
real: yes | predicted: 'A' → yes
Accuracy: 0.84


# Extract Reasonings

Since we no longer need fine-grained control of what the model is saying, we instantiate another pipeline to generate the explanation for the answer it returned (regardless of if it was right or not).

## Generate Resoning for Answer Choice

In [44]:
prompt_exp = f"""{yn_df.loc[4, 'prompt_text']}

You already answered: {preds[4]}
Explain why this answer is correct.
Output only the explanation text."""

out = text2text_generator(
    prompt_exp,
    max_new_tokens=200,
    repetition_penalty=1.1,
    no_repeat_ngram_size=4,
    do_sample=False,
    return_full_text=False
)


Question:
Therapeutic anticoagulation in the trauma patient: is it safe?

Answer Choices:
A. Yes
B. No

You already answered: yes
Explain why this answer is correct.
Output only the explanation text.

The use of therapeutic anticoagulation in trauma patients is a complex issue with potential benefits and risks. While therapeutic anticoagulation can be beneficial in certain situations, such as preventing venous thromboembolism (VTE) in immobilized patients or treating specific bleeding disorders, it also carries significant risks, particularly in the setting of acute trauma where there may be ongoing hemorrhage. The decision to initiate therapeutic anticoagulation should be made on an individual basis after careful consideration of the patient's clinical condition, risk factors for VTE, and potential benefits versus risks. It is important to note that guidelines from organizations like the American College of Chest Physicians (ACCP) provide recommendations for the use of prophylactic an

## Clean Model Response to Remove Repetition in Answer

If the answer does not use all the tokens available, the model tends to repeat itself with another explanation of the results (often using similar vocabulary). Since we only want a single explanation, we will cut this off from the answer given by the model.

In [49]:
gen = out[0]["generated_text"].strip()

cut_markers = ["Final Answer:", "Answer:", "Final answer:"]
for m in cut_markers:
    if m in gen:
        gen = gen.split(m)[0].strip()

print(prompt_exp)
print()
print(f"Justification: {gen}")


Question:
Therapeutic anticoagulation in the trauma patient: is it safe?

Answer Choices:
A. Yes
B. No

You already answered: yes
Explain why this answer is correct.
Output only the explanation text.

Justification: The use of therapeutic anticoagulation in trauma patients is a complex issue with potential benefits and risks. While therapeutic anticoagulation can be beneficial in certain situations, such as preventing venous thromboembolism (VTE) in immobilized patients or treating specific bleeding disorders, it also carries significant risks, particularly in the setting of acute trauma where there may be ongoing hemorrhage. The decision to initiate therapeutic anticoagulation should be made on an individual basis after careful consideration of the patient's clinical condition, risk factors for VTE, and potential benefits versus risks. It is important to note that guidelines from organizations like the American College of Chest Physicians (ACCP) provide recommendations for the use of 