In [1]:
from lxml import etree
from datasets import Dataset

def parse_liveqa_xml(path):
    """
    Parse a LiveQA Medical TREC‑2017 XML file and return a list of dicts:
      - questionid : string
      - fRef       : string
      - subject    : string
      - message    : string
      - subquestions: list of {
            subqid : string,
            focus  : string,
            type   : string,
            answers: list of strings
        }
    """
    tree = etree.parse(path)
    root = tree.getroot()

    records = []
    for q in root.findall("NLM-QUESTION"):
        record = {
            # coerce attributes to str (never None)
            "questionid": str(q.get("questionid", "")),
            "fRef":       str(q.get("fRef", "")),
            "subject":    (q.findtext("SUBJECT") or "").strip(),
            "message":    (q.findtext("MESSAGE") or "").strip(),
            "subquestions": []
        }

        for sq in q.findall("SUB-QUESTIONS/SUB-QUESTION"):
            sq_data = {
                "subqid": str(sq.get("subqid", "")),           # force string
                "focus":  (sq.findtext("ANNOTATIONS/FOCUS") or "").strip(),
                "type":   (sq.findtext("ANNOTATIONS/TYPE") or "").strip(),
                "answers": []
            }
            for ans in sq.findall("ANSWERS/ANSWER"):
                text = ans.text or ""
                sq_data["answers"].append(text.strip())
            record["subquestions"].append(sq_data)

        records.append(record)
    return records


In [2]:
train1_list = parse_liveqa_xml("LiveQA_MedicalTask_TREC2017/TrainDataset/TREC-2017-LiveQA-Medical-Train-1.xml")
train2_list = parse_liveqa_xml("LiveQA_MedicalTask_TREC2017/TrainDataset/TREC-2017-LiveQA-Medical-Train-2.xml")

train1 = Dataset.from_list(train1_list)
train2 = Dataset.from_list(train2_list)

print(train1)           
print(train1[0])       
print(train1[-1]["subquestions"]) 

print(train2)           
print(train2[0])        
print(train2[-1]["subquestions"])  

Dataset({
    features: ['questionid', 'fRef', 'subject', 'message', 'subquestions'],
    num_rows: 200
})
{'questionid': 'Q1', 'fRef': '11373', 'subject': '', 'message': 'Literature on Cardiac amyloidosis.  Please let me know where I can get literature on Cardiac amyloidosis.  My uncle died yesterday from this disorder.  Since this is such a rare disorder, and to honor his memory, I would like to distribute literature at his funeral service.  I am a retired NIH employee, so I am familiar with the campus in case you have literature at NIH that I can come and pick up.  Thank you', 'subquestions': [{'answers': ['Cardiac amyloidosis is a disorder caused by deposits of an abnormal protein (amyloid) in the heart tissue. These deposits make it hard for the heart to work properly.', 'The term "amyloidosis" refers not to a single disease but to a collection of diseases in which a protein-based infiltrate deposits in tissues as beta-pleated sheets. The subtype of the disease is determined by wh

In [3]:
from datasets import concatenate_datasets, load_dataset
trec_dataset = concatenate_datasets([train1, train2]).shuffle(seed=42)
print(len(trec_dataset))

446


In [4]:
rows = []
for rec in trec_dataset:
    msg = rec.get('message', '').strip()
    for sub in rec.get('subquestions', []):
        for ans in sub.get('answers', []):
            rows.append({
                'message': msg,
                'answer': ans.strip()
            })

In [5]:
import pandas as pd
df = pd.DataFrame(rows, columns=['message', 'answer'])
df["instruction"] = "Answer the medical question."
df.columns = ["input", "output","instruction"]
df.head()

Unnamed: 0,input,output,instruction
0,I have had Tourette syndrome since age seven. ...,What disorders are associated with TS? Many i...,Answer the medical question.
1,I have undergone Total Hip Replacement of both...,When Is Revision Surgery Necessary? Hip repla...,Answer the medical question.
2,atypical pnuemonia. what is the possibility o...,Most patients with pneumonia due to mycoplasma...,Answer the medical question.
3,"Dear Sir, My mother is 80 years old. Her eye ...","If you have low vision, eyeglasses, contact le...",Answer the medical question.
4,I am a 75 year old man. I have 2 inguinal hern...,An inguinal hernia happens when contents of th...,Answer the medical question.


In [6]:
print(df.shape)
df = df.dropna()
print(df.shape)

(634, 3)
(634, 3)


In [7]:
medicationqa = load_dataset("truehealth/medicationqa")
medalpca = load_dataset("medalpaca/medical_meadow_medical_flashcards")
medquad = load_dataset("keivalya/MedQuad-MedicalQnADataset")['train']
print(medicationqa)
print(medalpca)
print(medquad)

DatasetDict({
    train: Dataset({
        features: ['Question', 'Focus (Drug)', 'Question Type', 'Answer', 'Section Title', 'URL'],
        num_rows: 690
    })
})
DatasetDict({
    train: Dataset({
        features: ['input', 'output', 'instruction'],
        num_rows: 33955
    })
})
Dataset({
    features: ['qtype', 'Question', 'Answer'],
    num_rows: 16407
})


In [8]:
medqa = medicationqa["train"].to_pandas()
medqa["instruction"] = "Answer the medical question."
medqa = medqa.rename(columns={"Question": "input", "Answer": "output"})
df_medqa = medqa[["instruction", "input", "output"]]
print(df_medqa.shape)
df_medqa = df_medqa.dropna()
print(df_medqa.shape)
df_medqa.head()

(690, 3)
(689, 3)


Unnamed: 0,instruction,input,output
0,Answer the medical question.,how does rivatigmine and otc sleep medicine in...,tell your doctor and pharmacist what prescript...
1,Answer the medical question.,how does valium affect the brain,Diazepam is a benzodiazepine that exerts anxio...
2,Answer the medical question.,what is morphine,Morphine is a pain medication of the opiate fa...
3,Answer the medical question.,what are the milligrams for oxycodone e,… 10 mg … 20 mg … 40 mg … 80 mg ...
4,Answer the medical question.,81% aspirin contain resin and shellac in it. ?,Inactive Ingredients Ingredient Name


In [9]:
mq = medquad.to_pandas()
mq["instruction"] = "Answer the medical question."
mq = mq.rename(columns={"Question": "input", "Answer": "output"})
df_mq = mq[["instruction", "input", "output"]]
print(df_mq.shape)
df_mq = df_mq.dropna()
print(df_mq.shape)
df_mq.head()

(16407, 3)
(16407, 3)


Unnamed: 0,instruction,input,output
0,Answer the medical question.,Who is at risk for Lymphocytic Choriomeningiti...,LCMV infections can occur after exposure to fr...
1,Answer the medical question.,What are the symptoms of Lymphocytic Choriomen...,LCMV is most commonly recognized as causing ne...
2,Answer the medical question.,Who is at risk for Lymphocytic Choriomeningiti...,Individuals of all ages who come into contact ...
3,Answer the medical question.,How to diagnose Lymphocytic Choriomeningitis (...,"During the first phase of the disease, the mos..."
4,Answer the medical question.,What are the treatments for Lymphocytic Chorio...,"Aseptic meningitis, encephalitis, or meningoen..."


In [10]:
alp = medalpca["train"].to_pandas()
df_alp = alp[["instruction", "input", "output"]]
df_alp["instruction"] = "Answer the medical question."
print(df_alp.shape)
df_alp = df_alp.dropna()
print(df_alp.shape)
df_alp.head()

(33955, 3)
(33955, 3)


Unnamed: 0,instruction,input,output
0,Answer the medical question.,What is the relationship between very low Mg2+...,Very low Mg2+ levels correspond to low PTH lev...
1,Answer the medical question.,What leads to genitourinary syndrome of menopa...,Low estradiol production leads to genitourinar...
2,Answer the medical question.,What does low REM sleep latency and experienci...,Low REM sleep latency and experiencing halluci...
3,Answer the medical question.,What are some possible causes of low PTH and h...,"PTH-independent hypercalcemia, which can be ca..."
4,Answer the medical question.,How does the level of anti-müllerian hormone r...,The level of anti-müllerian hormone is directl...


In [11]:
import pandas as pd, pathlib
dfs = {p.name: pd.read_csv(p) for p in pathlib.Path(r"C:\Users\vasal\Study\TSAI\SOAI_Capstone_Project\SFT\medicalQAdataset").glob("*.csv")}
dfs.keys()

dict_keys(['CancerQA.csv', 'Diabetes_and_Digestive_and_Kidney_DiseasesQA.csv', 'Disease_Control_and_PreventionQA.csv', 'Genetic_and_Rare_DiseasesQA.csv', 'growth_hormone_receptorQA.csv', 'Heart_Lung_and_BloodQA.csv', 'MedicalQuestionAnswering.csv', 'Neurological_Disorders_and_StrokeQA.csv', 'OtherQA.csv', 'SeniorHealthQA.csv'])

In [12]:
medicalqadataset = []
for i in dfs.keys():
    print(f"{i}: {dfs[i].shape}")
    medicalqadataset.append(dfs[i])
    
medicalqadataset = pd.concat(medicalqadataset, ignore_index=True)
print(medicalqadataset.shape) 

CancerQA.csv: (729, 4)
Diabetes_and_Digestive_and_Kidney_DiseasesQA.csv: (1192, 4)
Disease_Control_and_PreventionQA.csv: (270, 4)
Genetic_and_Rare_DiseasesQA.csv: (5388, 4)
growth_hormone_receptorQA.csv: (5430, 4)
Heart_Lung_and_BloodQA.csv: (559, 4)
MedicalQuestionAnswering.csv: (16406, 4)
Neurological_Disorders_and_StrokeQA.csv: (1088, 4)
OtherQA.csv: (981, 4)
SeniorHealthQA.csv: (769, 4)
(32812, 4)


In [13]:
medicalqadataset["instruction"] = "Answer the medical question."
medicalqadataset = medicalqadataset.rename(columns={"Question": "input", "Answer": "output"})
df_mqadata = medicalqadataset[["instruction", "input", "output"]]
print(df_mqadata.shape)
df_mqadata = df_mqadata.dropna()
print(df_mqadata.shape)
df_mqadata.head()

(32812, 3)
(32812, 3)


Unnamed: 0,instruction,input,output
0,Answer the medical question.,What is (are) Non-Small Cell Lung Cancer ?,Key Points\n - Non-small ce...
1,Answer the medical question.,Who is at risk for Non-Small Cell Lung Cancer? ?,Smoking is the major risk factor for non-small...
2,Answer the medical question.,What are the symptoms of Non-Small Cell Lung C...,Signs of non-small cell lung cancer include a ...
3,Answer the medical question.,How to diagnose Non-Small Cell Lung Cancer ?,Tests that examine the lungs are used to detec...
4,Answer the medical question.,What is the outlook for Non-Small Cell Lung Ca...,Certain factors affect prognosis (chance of re...


In [14]:
print(df.shape, df_medqa.shape, df_mq.shape, df_alp.shape, df_mqadata.shape)

(634, 3) (689, 3) (16407, 3) (33955, 3) (32812, 3)


In [15]:
print(df.isna().sum())
print(df_medqa.isna().sum())
print(df_mq.isna().sum())
print(df_alp.isna().sum())
print(df_mqadata.isna().sum())

input          0
output         0
instruction    0
dtype: int64
instruction    0
input          0
output         0
dtype: int64
instruction    0
input          0
output         0
dtype: int64
instruction    0
input          0
output         0
dtype: int64
instruction    0
input          0
output         0
dtype: int64


In [16]:
final_df = pd.concat([df, df_medqa, df_mq, df_alp, df_mqadata], ignore_index=True)
final_df.shape

(84497, 3)

In [17]:
final_df.head()

Unnamed: 0,input,output,instruction
0,I have had Tourette syndrome since age seven. ...,What disorders are associated with TS? Many i...,Answer the medical question.
1,I have undergone Total Hip Replacement of both...,When Is Revision Surgery Necessary? Hip repla...,Answer the medical question.
2,atypical pnuemonia. what is the possibility o...,Most patients with pneumonia due to mycoplasma...,Answer the medical question.
3,"Dear Sir, My mother is 80 years old. Her eye ...","If you have low vision, eyeglasses, contact le...",Answer the medical question.
4,I am a 75 year old man. I have 2 inguinal hern...,An inguinal hernia happens when contents of th...,Answer the medical question.


In [18]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84497 entries, 0 to 84496
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   input        84497 non-null  object
 1   output       84497 non-null  object
 2   instruction  84497 non-null  object
dtypes: object(3)
memory usage: 1.9+ MB


In [31]:
final_df.to_csv("final_dataset.csv", index=False)