In [39]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import re
from difflib import SequenceMatcher

In [40]:
df = pd.read_json("hf://datasets/UCSC-VLAA/MedReason/ours_quality_33000.jsonl", lines=True)

df['question_char_len'] = df['question'].apply(len)
df['answer_char_len'] = df['answer'].apply(len)

df['question_word_count'] = df['question'].apply(lambda x: len(x.split()))
df['answer_word_count'] = df['answer'].apply(lambda x: len(x.split()))

In [41]:
def view_sample_answers(dataset, num_rows=5):
    for i in range(num_rows):
        print(dataset.iloc[i]['answer'])
        print('*' * 50)

def view_sample_options(dataset, num_rows=5):
    for i in range(num_rows):
        print(dataset.iloc[i]['options'])
        print('*' * 50)

def view_sample_questions(dataset, num_rows=5):
    for i in range(num_rows):
        print(dataset.iloc[i]['question'])
        print('*' * 50)

# MedMCQA

In [42]:
medmcqa_qs = df[df['dataset_name'] == 'medmcqa'].copy()
medmcqa_qs

Unnamed: 0,dataset_name,id_in_dataset,question,answer,reasoning,options,question_char_len,answer_char_len,question_word_count,answer_word_count
0,medmcqa,7131,Urogenital Diaphragm is made up of the followi...,Colle's fascia. Explanation: Colle's fascia do...,Finding reasoning paths:\n1. Urogenital diaphr...,Answer Choices:\nA. Deep transverse Perineus\n...,57,343,9,54
1,medmcqa,7133,Child with Type I Diabetes. What is the advise...,After 5 years. Explanation: Screening for diab...,**Finding reasoning paths:**\n\n1. Type 1 Diab...,Answer Choices:\nA. After 5 years\nB. After 2 ...,104,592,18,96
2,medmcqa,7134,Most sensitive test for H pylori is-,Biopsy urease test. Explanation: <P>Davidson&;...,**Finding reasoning paths:**\n\n1. Consider th...,Answer Choices:\nA. Fecal antigen test\nB. Bio...,36,178,7,21
3,medmcqa,7137,Ligation of the common hepatic aery will compr...,Right gastric and right gastroepiploic aery. E...,**Finding reasoning paths:**\n\n1. Common hepa...,Answer Choices:\nA. Right and Left gastric aer...,65,403,11,54
4,medmcqa,7138,Typhoid investigation of choice in 1st week,Blood culture. Explanation: (A) Blood culture ...,Finding reasoning paths:\n\n1. Consider the pa...,Answer Choices:\nA. Blood culture\nB. Widal te...,43,1360,7,239
...,...,...,...,...,...,...,...,...,...,...
29157,medmcqa,7125,A 60-year-old nursing home resident presents w...,Contact precautions. Explanation: There are fo...,### Finding Reasoning Paths:\n1. **Clinical Pr...,Answer Choices:\nA. Standard precautions\nB. C...,340,1610,45,238
29158,medmcqa,7126,Transcripton is inhibited by:,Actinomycin D. Explanation: A i.e. Actinomycin,### Finding Reasoning Paths:\n1. mRNA transcri...,Answer Choices:\nA. Actinomycin D\nB. Amanitin...,29,46,4,6
29159,medmcqa,7128,Triage is –,Categorisation of the patients and treating th...,### Finding Reasoning Paths:\n1. **Severity of...,Answer Choices:\nA. Treating the most serious ...,11,715,3,103
29160,medmcqa,7129,All of the following are true in respect of he...,Autosomal recessive disorder. Explanation: Her...,### Finding Reasoning Paths:\n1. **Complement ...,Answer Choices:\nA. Deficiency of C1 inhibitor...,89,531,14,73


In [43]:
def count_answer_choices(text):
    if pd.isnull(text):
        return 0
    lines = text.strip().split('\n')
    choice_lines = lines[1:]
    count = sum(bool(re.match(r'^[A-Z]\.', line.strip())) for line in choice_lines)
    return count

medmcqa_qs['num_choices'] = medmcqa_qs['options'].apply(count_answer_choices)

In [44]:
def clean_text(text):
    """Lowercase, remove newlines/tabs/extra spaces, strip, remove trailing period."""
    text = text.lower()
    text = re.sub(r'\s+', ' ', text)  # replace all whitespace (including \n, \t) with single space
    text = text.strip()
    text = text.rstrip('.')
    return text

def extract_mcq_label_fuzzy(answer_text, options_text, threshold=0.9):
    """
    Extracts MCQ letter (A-D) corresponding to the correct answer using fuzzy matching.
    Only uses cleaned text for matching; original text remains unchanged.
    """
    # Clean main answer (before Explanation:)
    answer_main = clean_text(answer_text.split("Explanation:")[0])

    # Extract options
    matches = re.findall(r'([A-D])\.\s*(.*)', options_text)

    best_letter = None
    best_ratio = 0

    for letter, text in matches:
        option_text = clean_text(text)
        ratio = SequenceMatcher(None, answer_main, option_text).ratio()
        if ratio > best_ratio and ratio >= threshold:
            best_ratio = ratio
            best_letter = letter

    return best_letter

In [45]:
medmcqa_qs['answer_label'] = medmcqa_qs.apply(lambda row: extract_mcq_label_fuzzy(row['answer'], row['options']), axis=1)

medmcqa_qs['answer_label'].value_counts(dropna=False)

Unnamed: 0_level_0,count
answer_label,Unnamed: 1_level_1
A,1921
B,1554
C,1528
D,1194


In [46]:
medmcqa_qs["question_type"] = "MCQ"
medmcqa_qs

Unnamed: 0,dataset_name,id_in_dataset,question,answer,reasoning,options,question_char_len,answer_char_len,question_word_count,answer_word_count,num_choices,answer_label,question_type
0,medmcqa,7131,Urogenital Diaphragm is made up of the followi...,Colle's fascia. Explanation: Colle's fascia do...,Finding reasoning paths:\n1. Urogenital diaphr...,Answer Choices:\nA. Deep transverse Perineus\n...,57,343,9,54,4,C,MCQ
1,medmcqa,7133,Child with Type I Diabetes. What is the advise...,After 5 years. Explanation: Screening for diab...,**Finding reasoning paths:**\n\n1. Type 1 Diab...,Answer Choices:\nA. After 5 years\nB. After 2 ...,104,592,18,96,4,A,MCQ
2,medmcqa,7134,Most sensitive test for H pylori is-,Biopsy urease test. Explanation: <P>Davidson&;...,**Finding reasoning paths:**\n\n1. Consider th...,Answer Choices:\nA. Fecal antigen test\nB. Bio...,36,178,7,21,4,B,MCQ
3,medmcqa,7137,Ligation of the common hepatic aery will compr...,Right gastric and right gastroepiploic aery. E...,**Finding reasoning paths:**\n\n1. Common hepa...,Answer Choices:\nA. Right and Left gastric aer...,65,403,11,54,4,D,MCQ
4,medmcqa,7138,Typhoid investigation of choice in 1st week,Blood culture. Explanation: (A) Blood culture ...,Finding reasoning paths:\n\n1. Consider the pa...,Answer Choices:\nA. Blood culture\nB. Widal te...,43,1360,7,239,4,A,MCQ
...,...,...,...,...,...,...,...,...,...,...,...,...,...
29157,medmcqa,7125,A 60-year-old nursing home resident presents w...,Contact precautions. Explanation: There are fo...,### Finding Reasoning Paths:\n1. **Clinical Pr...,Answer Choices:\nA. Standard precautions\nB. C...,340,1610,45,238,4,B,MCQ
29158,medmcqa,7126,Transcripton is inhibited by:,Actinomycin D. Explanation: A i.e. Actinomycin,### Finding Reasoning Paths:\n1. mRNA transcri...,Answer Choices:\nA. Actinomycin D\nB. Amanitin...,29,46,4,6,4,A,MCQ
29159,medmcqa,7128,Triage is –,Categorisation of the patients and treating th...,### Finding Reasoning Paths:\n1. **Severity of...,Answer Choices:\nA. Treating the most serious ...,11,715,3,103,4,B,MCQ
29160,medmcqa,7129,All of the following are true in respect of he...,Autosomal recessive disorder. Explanation: Her...,### Finding Reasoning Paths:\n1. **Complement ...,Answer Choices:\nA. Deficiency of C1 inhibitor...,89,531,14,73,4,D,MCQ


# PubMedQA

In [47]:
pubmedqa_qs = df[df['dataset_name'] == "pubmedqa"].copy()
pubmedqa_qs

Unnamed: 0,dataset_name,id_in_dataset,question,answer,reasoning,options,question_char_len,answer_char_len,question_word_count,answer_word_count
13055,pubmedqa,0,Do mitochondria play a role in remodelling lac...,The final decision is: yes. Results depicted m...,### Finding Reasoning Paths:\n1. Mitochondria ...,Answer Choices:\nA. Yes\nB. No,90,645,14,102
13056,pubmedqa,1,Landolt C and snellen e acuity: differences in...,The final decision is: no. Using the charts de...,### Finding Reasoning Paths:\n1. Strabismus ->...,Answer Choices:\nA. Yes\nB. No,68,258,10,43
13057,pubmedqa,4,Can tailored interventions increase mammograph...,The final decision is: yes. The effects of the...,### Finding Reasoning Paths:\n1. **Effectivene...,Answer Choices:\nA. Yes\nB. No,68,816,9,110
13058,pubmedqa,9,A short stay or 23-hour ward in a general and ...,The final decision is: yes. This data demonstr...,### Finding Reasoning Paths:\n1. **Bed Efficie...,Answer Choices:\nA. Yes\nB. No,95,333,16,55
13059,pubmedqa,11,Therapeutic anticoagulation in the trauma pati...,The final decision is: no. Trauma patients hav...,### Finding Reasoning Paths:\n1. **Response to...,Answer Choices:\nA. Yes\nB. No,62,333,9,46
...,...,...,...,...,...,...,...,...,...,...
13653,pubmedqa,993,Are many colorectal cancers due to missed aden...,The final decision is: no. In normal daily pra...,### Finding Reasoning Paths:\n1. Colorectal ca...,Answer Choices:\nA. Yes\nB. No,51,179,8,28
13654,pubmedqa,994,Does desflurane alter left ventricular functio...,The final decision is: no. This study demonstr...,### Finding Reasoning Paths:\n1. Desflurane's ...,Answer Choices:\nA. Yes\nB. No,112,238,15,33
13655,pubmedqa,995,Does gestational age misclassification explain...,The final decision is: no. Gestational age mis...,### Finding Reasoning Paths:\n1. Investigate w...,Answer Choices:\nA. Yes\nB. No,115,255,14,34
13656,pubmedqa,997,Is peak concentration needed in therapeutic dr...,The final decision is: no. These results sugge...,### Finding Reasoning Paths:\n1. Vancomycin ->...,Answer Choices:\nA. Yes\nB. No,74,195,10,31


In [48]:
def extract_decision(text):
    # Regex to find "yes" or "no" after "The final decision is:"
    match = re.search(r"The final decision is:\s*(yes|no|maybe)", text, flags=re.IGNORECASE)
    if match:
        return match.group(1).lower()   # return 'yes' or 'no'
    return None

# Apply to your dataframe
pubmedqa_qs['answer_label'] = pubmedqa_qs['answer'].apply(extract_decision)

# Count distribution
print(pubmedqa_qs['answer_label'].value_counts())

answer_label
yes      421
no       181
maybe      1
Name: count, dtype: int64


In [49]:
# Remove row with answer_label 'maybe'
pubmedqa_qs = pubmedqa_qs[pubmedqa_qs['answer_label'] != 'maybe']
pubmedqa_qs[['question', 'answer', 'answer_label']]

Unnamed: 0,question,answer,answer_label
13055,Do mitochondria play a role in remodelling lac...,The final decision is: yes. Results depicted m...,yes
13056,Landolt C and snellen e acuity: differences in...,The final decision is: no. Using the charts de...,no
13057,Can tailored interventions increase mammograph...,The final decision is: yes. The effects of the...,yes
13058,A short stay or 23-hour ward in a general and ...,The final decision is: yes. This data demonstr...,yes
13059,Therapeutic anticoagulation in the trauma pati...,The final decision is: no. Trauma patients hav...,no
...,...,...,...
13653,Are many colorectal cancers due to missed aden...,The final decision is: no. In normal daily pra...,no
13654,Does desflurane alter left ventricular functio...,The final decision is: no. This study demonstr...,no
13655,Does gestational age misclassification explain...,The final decision is: no. Gestational age mis...,no
13656,Is peak concentration needed in therapeutic dr...,The final decision is: no. These results sugge...,no


In [50]:
pubmedqa_qs["question_type"] = "Y/N"
pubmedqa_qs

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pubmedqa_qs["question_type"] = "Y/N"


Unnamed: 0,dataset_name,id_in_dataset,question,answer,reasoning,options,question_char_len,answer_char_len,question_word_count,answer_word_count,answer_label,question_type
13055,pubmedqa,0,Do mitochondria play a role in remodelling lac...,The final decision is: yes. Results depicted m...,### Finding Reasoning Paths:\n1. Mitochondria ...,Answer Choices:\nA. Yes\nB. No,90,645,14,102,yes,Y/N
13056,pubmedqa,1,Landolt C and snellen e acuity: differences in...,The final decision is: no. Using the charts de...,### Finding Reasoning Paths:\n1. Strabismus ->...,Answer Choices:\nA. Yes\nB. No,68,258,10,43,no,Y/N
13057,pubmedqa,4,Can tailored interventions increase mammograph...,The final decision is: yes. The effects of the...,### Finding Reasoning Paths:\n1. **Effectivene...,Answer Choices:\nA. Yes\nB. No,68,816,9,110,yes,Y/N
13058,pubmedqa,9,A short stay or 23-hour ward in a general and ...,The final decision is: yes. This data demonstr...,### Finding Reasoning Paths:\n1. **Bed Efficie...,Answer Choices:\nA. Yes\nB. No,95,333,16,55,yes,Y/N
13059,pubmedqa,11,Therapeutic anticoagulation in the trauma pati...,The final decision is: no. Trauma patients hav...,### Finding Reasoning Paths:\n1. **Response to...,Answer Choices:\nA. Yes\nB. No,62,333,9,46,no,Y/N
...,...,...,...,...,...,...,...,...,...,...,...,...
13653,pubmedqa,993,Are many colorectal cancers due to missed aden...,The final decision is: no. In normal daily pra...,### Finding Reasoning Paths:\n1. Colorectal ca...,Answer Choices:\nA. Yes\nB. No,51,179,8,28,no,Y/N
13654,pubmedqa,994,Does desflurane alter left ventricular functio...,The final decision is: no. This study demonstr...,### Finding Reasoning Paths:\n1. Desflurane's ...,Answer Choices:\nA. Yes\nB. No,112,238,15,33,no,Y/N
13655,pubmedqa,995,Does gestational age misclassification explain...,The final decision is: no. Gestational age mis...,### Finding Reasoning Paths:\n1. Investigate w...,Answer Choices:\nA. Yes\nB. No,115,255,14,34,no,Y/N
13656,pubmedqa,997,Is peak concentration needed in therapeutic dr...,The final decision is: no. These results sugge...,### Finding Reasoning Paths:\n1. Vancomycin ->...,Answer Choices:\nA. Yes\nB. No,74,195,10,31,no,Y/N


# medqa

In [51]:
medqa_qs = df[df['dataset_name'] == "medqa"].copy()
medqa_qs

Unnamed: 0,dataset_name,id_in_dataset,question,answer,reasoning,options,question_char_len,answer_char_len,question_word_count,answer_word_count
2011,medqa,7595,A 62-year-old man presents with multiple episo...,Small cell carcinoma of the lung,**Finding reasoning paths:**\n\n1. Hemoptysis ...,Answer Choices:\nA. Tuberculoma\nB. Small cell...,553,32,94,6
2012,medqa,7596,An 84-year-old man is brought to the physician...,Denudation of tubular basement membrane,**Finding reasoning paths:**\n\n1. Decreased u...,Answer Choices:\nA. Leukocytic infiltration of...,582,39,99,5
2013,medqa,7598,"A 44-year-old man, with a history of intraveno...",Trimethoprim-sulfamethoxazole,Finding reasoning paths:\n1. Non-productive co...,"Answer Choices:\nA. Isoniazid, rifabutin, pyra...",1123,29,175,1
2014,medqa,7599,A 33-year-old woman comes to the physician bec...,"Optic neuritis\n""",**Finding reasoning paths:**\n\n1. Visual impa...,Answer Choices:\nA. Retinal detachment\nB. Nar...,963,16,165,3
2015,medqa,7600,A 23-year-old man is brought to the emergency ...,Schizophreniform disorder,Finding reasoning paths:\n1. Behavioral abnorm...,Answer Choices:\nA. Schizoaffective disorder\n...,791,25,130,2
...,...,...,...,...,...,...,...,...,...,...
32677,medqa,995,A 51-year-old man is bitten by a cottonmouth v...,Glucocorticoid taper with antihistamines,### Finding Reasoning Paths:\n1. **Pruritis an...,Answer Choices:\nA. Glucocorticoid taper with ...,1009,40,156,4
32678,medqa,996,A 47-year-old man presents to the clinic for a...,Bed bug bite,### Finding Reasoning Paths:\n1. **Pruritus an...,Answer Choices:\nA. Cutaneous larva migrans\nB...,750,12,128,3
32679,medqa,997,A 24-year-old woman presents to her primary ca...,Motile and helical-shaped bacteria,### Finding Reasoning Paths:\n1. The patient h...,Answer Choices:\nA. Gram-negative coccobacillu...,812,34,134,4
32680,medqa,998,"A 70-year-old man presents with fever, headach...",Ampicillin,"### Finding Reasoning Paths:\n1. Fever, headac...",Answer Choices:\nA. Ampicillin\nB. Ceftriaxone...,808,10,120,1


In [52]:
medqa_qs['num_choices'] = medqa_qs['options'].apply(count_answer_choices)

print(medqa_qs['num_choices'].value_counts())


num_choices
4    8016
Name: count, dtype: int64


In [53]:
def extract_mcq_label_fuzzy(answer_text, options_text, threshold=0.9):
    """
    Extracts MCQ letter (A-D) corresponding to the correct answer using fuzzy matching.
    Only uses cleaned text for matching; original text remains unchanged.
    """
    answer_main = clean_text(answer_text)

    # Extract options
    matches = re.findall(r'([A-D])\.\s*(.*)', options_text)

    best_letter = None
    best_ratio = 0

    for letter, text in matches:
        option_text = clean_text(text)
        ratio = SequenceMatcher(None, answer_main, option_text).ratio()
        if ratio > best_ratio and ratio >= threshold:
            best_ratio = ratio
            best_letter = letter

    return best_letter

In [54]:
medqa_qs['answer_label'] = medqa_qs.apply(lambda row: extract_mcq_label_fuzzy(row['answer'], row['options'], 0.80), axis=1)
medqa_qs['answer_label'].value_counts(dropna=False)


Unnamed: 0_level_0,count
answer_label,Unnamed: 1_level_1
B,2074
A,2055
C,2028
D,1859


In [55]:
medqa_qs['answer_label'].value_counts(dropna=False).sum()

np.int64(8016)

In [56]:
medqa_qs["question_type"] = "MCQ"
medqa_qs

Unnamed: 0,dataset_name,id_in_dataset,question,answer,reasoning,options,question_char_len,answer_char_len,question_word_count,answer_word_count,num_choices,answer_label,question_type
2011,medqa,7595,A 62-year-old man presents with multiple episo...,Small cell carcinoma of the lung,**Finding reasoning paths:**\n\n1. Hemoptysis ...,Answer Choices:\nA. Tuberculoma\nB. Small cell...,553,32,94,6,4,B,MCQ
2012,medqa,7596,An 84-year-old man is brought to the physician...,Denudation of tubular basement membrane,**Finding reasoning paths:**\n\n1. Decreased u...,Answer Choices:\nA. Leukocytic infiltration of...,582,39,99,5,4,C,MCQ
2013,medqa,7598,"A 44-year-old man, with a history of intraveno...",Trimethoprim-sulfamethoxazole,Finding reasoning paths:\n1. Non-productive co...,"Answer Choices:\nA. Isoniazid, rifabutin, pyra...",1123,29,175,1,4,D,MCQ
2014,medqa,7599,A 33-year-old woman comes to the physician bec...,"Optic neuritis\n""",**Finding reasoning paths:**\n\n1. Visual impa...,Answer Choices:\nA. Retinal detachment\nB. Nar...,963,16,165,3,4,D,MCQ
2015,medqa,7600,A 23-year-old man is brought to the emergency ...,Schizophreniform disorder,Finding reasoning paths:\n1. Behavioral abnorm...,Answer Choices:\nA. Schizoaffective disorder\n...,791,25,130,2,4,C,MCQ
...,...,...,...,...,...,...,...,...,...,...,...,...,...
32677,medqa,995,A 51-year-old man is bitten by a cottonmouth v...,Glucocorticoid taper with antihistamines,### Finding Reasoning Paths:\n1. **Pruritis an...,Answer Choices:\nA. Glucocorticoid taper with ...,1009,40,156,4,4,A,MCQ
32678,medqa,996,A 47-year-old man presents to the clinic for a...,Bed bug bite,### Finding Reasoning Paths:\n1. **Pruritus an...,Answer Choices:\nA. Cutaneous larva migrans\nB...,750,12,128,3,4,B,MCQ
32679,medqa,997,A 24-year-old woman presents to her primary ca...,Motile and helical-shaped bacteria,### Finding Reasoning Paths:\n1. The patient h...,Answer Choices:\nA. Gram-negative coccobacillu...,812,34,134,4,4,C,MCQ
32680,medqa,998,"A 70-year-old man presents with fever, headach...",Ampicillin,"### Finding Reasoning Paths:\n1. Fever, headac...",Answer Choices:\nA. Ampicillin\nB. Ceftriaxone...,808,10,120,1,4,A,MCQ


# MMLU

In [57]:
MMLU_qs = df[df['dataset_name'] == "MMLU"].copy()
MMLU_qs

Unnamed: 0,dataset_name,id_in_dataset,question,answer,reasoning,options,question_char_len,answer_char_len,question_word_count,answer_word_count
12749,MMLU,1000,A 22-year-old woman comes to the office becaus...,BK virus,### Finding Reasoning Paths:\n1. **Immunosuppr...,Answer Choices:\nA. Adenovirus\nB. BK virus\nC...,695,8,100,2
12750,MMLU,1001,A 53-year-old man comes to the physician becau...,Superficial inguinal,### Finding Reasoning Paths:\n1. Anal mass loc...,Answer Choices:\nA. Internal iliac\nB. Poplite...,394,20,68,2
12751,MMLU,1002,A study is being conducted to assess mesotheli...,Minimize ascertainment bias,### Finding Reasoning Paths:\n1. Asbestos expo...,Answer Choices:\nA. Address confounding\nB. De...,624,27,99,3
12752,MMLU,1004,A 31-year-old man with a 5-year history of HIV...,Imiquimod,### Finding Reasoning Paths:\n1. **HIV infecti...,Answer Choices:\nA. Acyclovir\nB. Imiquimod\nC...,1244,9,195,1
12753,MMLU,1005,A 71-year-old woman is brought to the emergenc...,Vascular dementia,### Finding Reasoning Paths:\n1. **Hypertensio...,Answer Choices:\nA. Amyotrophic lateral sclero...,1394,17,235,2
...,...,...,...,...,...,...,...,...,...,...
24974,MMLU,995,You are visiting an 86-year-old woman in her h...,Arrange for consultation with a home hospice team,### Finding Reasoning Paths:\n1. Chronic obstr...,Answer Choices:\nA. Admit the patient to the h...,1436,49,239,8
24975,MMLU,996,A 37-year-old woman comes to the physician bec...,Streptococcus pneumoniae,### Finding Reasoning Paths:\n1. **Purulent na...,Answer Choices:\nA. Haemophilus influenzae typ...,774,24,126,2
24976,MMLU,997,A 42-year-old woman is brought to the emergenc...,Laparotomy,### Finding Reasoning Paths:\n1. **Abnormal br...,Answer Choices:\nA. CT scan of the abdomen\nB....,629,10,106,1
24977,MMLU,998,A 19-year-old woman who is a regular patient c...,Polycystic ovarian syndrome,### Finding Reasoning Paths:\n1. **Irregular m...,Answer Choices:\nA. Androgen-producing ovarian...,1151,27,199,3


In [58]:
MMLU_qs['num_choices'] = MMLU_qs['options'].apply(count_answer_choices)

print(MMLU_qs['num_choices'].value_counts())

num_choices
4    827
Name: count, dtype: int64


In [59]:
MMLU_qs['answer_label'] = MMLU_qs.apply(lambda row: extract_mcq_label_fuzzy(row['answer'], row['options'], 0.9), axis=1)
MMLU_qs['answer_label'].value_counts(dropna=False)

Unnamed: 0_level_0,count
answer_label,Unnamed: 1_level_1
D,257
C,196
B,190
A,184


In [60]:
MMLU_qs["question_type"] = "MCQ"
MMLU_qs

Unnamed: 0,dataset_name,id_in_dataset,question,answer,reasoning,options,question_char_len,answer_char_len,question_word_count,answer_word_count,num_choices,answer_label,question_type
12749,MMLU,1000,A 22-year-old woman comes to the office becaus...,BK virus,### Finding Reasoning Paths:\n1. **Immunosuppr...,Answer Choices:\nA. Adenovirus\nB. BK virus\nC...,695,8,100,2,4,B,MCQ
12750,MMLU,1001,A 53-year-old man comes to the physician becau...,Superficial inguinal,### Finding Reasoning Paths:\n1. Anal mass loc...,Answer Choices:\nA. Internal iliac\nB. Poplite...,394,20,68,2,4,D,MCQ
12751,MMLU,1002,A study is being conducted to assess mesotheli...,Minimize ascertainment bias,### Finding Reasoning Paths:\n1. Asbestos expo...,Answer Choices:\nA. Address confounding\nB. De...,624,27,99,3,4,C,MCQ
12752,MMLU,1004,A 31-year-old man with a 5-year history of HIV...,Imiquimod,### Finding Reasoning Paths:\n1. **HIV infecti...,Answer Choices:\nA. Acyclovir\nB. Imiquimod\nC...,1244,9,195,1,4,B,MCQ
12753,MMLU,1005,A 71-year-old woman is brought to the emergenc...,Vascular dementia,### Finding Reasoning Paths:\n1. **Hypertensio...,Answer Choices:\nA. Amyotrophic lateral sclero...,1394,17,235,2,4,D,MCQ
...,...,...,...,...,...,...,...,...,...,...,...,...,...
24974,MMLU,995,You are visiting an 86-year-old woman in her h...,Arrange for consultation with a home hospice team,### Finding Reasoning Paths:\n1. Chronic obstr...,Answer Choices:\nA. Admit the patient to the h...,1436,49,239,8,4,B,MCQ
24975,MMLU,996,A 37-year-old woman comes to the physician bec...,Streptococcus pneumoniae,### Finding Reasoning Paths:\n1. **Purulent na...,Answer Choices:\nA. Haemophilus influenzae typ...,774,24,126,2,4,D,MCQ
24976,MMLU,997,A 42-year-old woman is brought to the emergenc...,Laparotomy,### Finding Reasoning Paths:\n1. **Abnormal br...,Answer Choices:\nA. CT scan of the abdomen\nB....,629,10,106,1,4,D,MCQ
24977,MMLU,998,A 19-year-old woman who is a regular patient c...,Polycystic ovarian syndrome,### Finding Reasoning Paths:\n1. **Irregular m...,Answer Choices:\nA. Androgen-producing ovarian...,1151,27,199,3,4,D,MCQ


# Compile

In [61]:
desired_cols = ["dataset_name", "id_in_dataset", "question", "options", "answer_label", "question_type"]

compiled_df = pd.concat([medmcqa_qs[desired_cols], pubmedqa_qs[desired_cols], medqa_qs[desired_cols], MMLU_qs[desired_cols]])
compiled_df

Unnamed: 0,dataset_name,id_in_dataset,question,options,answer_label,question_type
0,medmcqa,7131,Urogenital Diaphragm is made up of the followi...,Answer Choices:\nA. Deep transverse Perineus\n...,C,MCQ
1,medmcqa,7133,Child with Type I Diabetes. What is the advise...,Answer Choices:\nA. After 5 years\nB. After 2 ...,A,MCQ
2,medmcqa,7134,Most sensitive test for H pylori is-,Answer Choices:\nA. Fecal antigen test\nB. Bio...,B,MCQ
3,medmcqa,7137,Ligation of the common hepatic aery will compr...,Answer Choices:\nA. Right and Left gastric aer...,D,MCQ
4,medmcqa,7138,Typhoid investigation of choice in 1st week,Answer Choices:\nA. Blood culture\nB. Widal te...,A,MCQ
...,...,...,...,...,...,...
24974,MMLU,995,You are visiting an 86-year-old woman in her h...,Answer Choices:\nA. Admit the patient to the h...,B,MCQ
24975,MMLU,996,A 37-year-old woman comes to the physician bec...,Answer Choices:\nA. Haemophilus influenzae typ...,D,MCQ
24976,MMLU,997,A 42-year-old woman is brought to the emergenc...,Answer Choices:\nA. CT scan of the abdomen\nB....,D,MCQ
24977,MMLU,998,A 19-year-old woman who is a regular patient c...,Answer Choices:\nA. Androgen-producing ovarian...,D,MCQ


In [62]:
# check nulls
compiled_df.isnull().sum()

Unnamed: 0,0
dataset_name,0
id_in_dataset,0
question,0
options,0
answer_label,0
question_type,0


In [63]:
# reset index
compiled_df = compiled_df.reset_index(drop=True)
compiled_df

Unnamed: 0,dataset_name,id_in_dataset,question,options,answer_label,question_type
0,medmcqa,7131,Urogenital Diaphragm is made up of the followi...,Answer Choices:\nA. Deep transverse Perineus\n...,C,MCQ
1,medmcqa,7133,Child with Type I Diabetes. What is the advise...,Answer Choices:\nA. After 5 years\nB. After 2 ...,A,MCQ
2,medmcqa,7134,Most sensitive test for H pylori is-,Answer Choices:\nA. Fecal antigen test\nB. Bio...,B,MCQ
3,medmcqa,7137,Ligation of the common hepatic aery will compr...,Answer Choices:\nA. Right and Left gastric aer...,D,MCQ
4,medmcqa,7138,Typhoid investigation of choice in 1st week,Answer Choices:\nA. Blood culture\nB. Widal te...,A,MCQ
...,...,...,...,...,...,...
15637,MMLU,995,You are visiting an 86-year-old woman in her h...,Answer Choices:\nA. Admit the patient to the h...,B,MCQ
15638,MMLU,996,A 37-year-old woman comes to the physician bec...,Answer Choices:\nA. Haemophilus influenzae typ...,D,MCQ
15639,MMLU,997,A 42-year-old woman is brought to the emergenc...,Answer Choices:\nA. CT scan of the abdomen\nB....,D,MCQ
15640,MMLU,998,A 19-year-old woman who is a regular patient c...,Answer Choices:\nA. Androgen-producing ovarian...,D,MCQ


In [64]:
def make_prompt(row):
    q = str(row.get("question", "")).strip()
    opts = str(row.get("options", "")).strip()

    # Build final text: question first, then answer choices
    if opts and not pd.isnull(opts):
        return f"Question:\n{q}\n\n{opts}"
    else:
        return f"Question:\n{q}"


# Apply to dataframe
compiled_df["prompt_text"] = compiled_df.apply(make_prompt, axis=1)


In [65]:
for i in range(20):
    print(compiled_df.iloc[i]['prompt_text'])
    print('*' * 50)

Question:
Urogenital Diaphragm is made up of the following, except:

Answer Choices:
A. Deep transverse Perineus
B. Perinial membrane
C. Colle's fascia
D. Sphincter Urethrae
**************************************************
Question:
Child with Type I Diabetes. What is the advised time for fundus examinations from the time of diagnosis?

Answer Choices:
A. After 5 years
B. After 2 years
C. After 10 years
D. At the time of diagnosis
**************************************************
Question:
Most sensitive test for H pylori is-

Answer Choices:
A. Fecal antigen test
B. Biopsy urease test
C. Serological test
D. Urea breath test
**************************************************
Question:
Ligation of the common hepatic aery will compromise blood flow in

Answer Choices:
A. Right and Left gastric aery
B. Right gastric and sho gastric aeries
C. Right gastroepiploic and sho gastric aeries
D. Right gastric and right gastroepiploic aery
**************************************************
Ques

In [69]:
compiled_df

Unnamed: 0,dataset_name,id_in_dataset,question,options,answer_label,question_type,prompt_text
0,medmcqa,7131,Urogenital Diaphragm is made up of the followi...,Answer Choices:\nA. Deep transverse Perineus\n...,C,MCQ,Question:\nUrogenital Diaphragm is made up of ...
1,medmcqa,7133,Child with Type I Diabetes. What is the advise...,Answer Choices:\nA. After 5 years\nB. After 2 ...,A,MCQ,Question:\nChild with Type I Diabetes. What is...
2,medmcqa,7134,Most sensitive test for H pylori is-,Answer Choices:\nA. Fecal antigen test\nB. Bio...,B,MCQ,Question:\nMost sensitive test for H pylori is...
3,medmcqa,7137,Ligation of the common hepatic aery will compr...,Answer Choices:\nA. Right and Left gastric aer...,D,MCQ,Question:\nLigation of the common hepatic aery...
4,medmcqa,7138,Typhoid investigation of choice in 1st week,Answer Choices:\nA. Blood culture\nB. Widal te...,A,MCQ,Question:\nTyphoid investigation of choice in ...
...,...,...,...,...,...,...,...
15637,MMLU,995,You are visiting an 86-year-old woman in her h...,Answer Choices:\nA. Admit the patient to the h...,B,MCQ,Question:\nYou are visiting an 86-year-old wom...
15638,MMLU,996,A 37-year-old woman comes to the physician bec...,Answer Choices:\nA. Haemophilus influenzae typ...,D,MCQ,Question:\nA 37-year-old woman comes to the ph...
15639,MMLU,997,A 42-year-old woman is brought to the emergenc...,Answer Choices:\nA. CT scan of the abdomen\nB....,D,MCQ,Question:\nA 42-year-old woman is brought to t...
15640,MMLU,998,A 19-year-old woman who is a regular patient c...,Answer Choices:\nA. Androgen-producing ovarian...,D,MCQ,Question:\nA 19-year-old woman who is a regula...


In [66]:
# view total question count
len(compiled_df)

15642

In [67]:
# view question type counts
compiled_df['question_type'].value_counts()

Unnamed: 0_level_0,count
question_type,Unnamed: 1_level_1
MCQ,15040
Y/N,602


In [68]:
# export compiled_df as parquet
compiled_df.to_parquet("compiled_df.parquet")

In [70]:
# split df based on question type
mcq_df = compiled_df[compiled_df['question_type'] == 'MCQ']
yn_df = compiled_df[compiled_df['question_type'] == 'Y/N']

# export as parquet
mcq_df.to_parquet("mcq_df.parquet")
yn_df.to_parquet("yn_df.parquet")
