In [7]:
# prompt: write a code to download git repo

!git clone https://github.com/abachaa/MedQuAD.git


fatal: destination path 'MedQuAD' already exists and is not an empty directory.


In [1]:
import os
import xml.etree.ElementTree as ET

def extract_text(element):
    """
    Return the combined text of an XML element and all of its descendants.

    Leading and trailing whitespace is trimmed from the combined text. When
    the element is missing (``None``) or contains nothing but whitespace,
    the placeholder string "No data provided" is returned instead.
    """
    placeholder = "No data provided"
    if element is None:
        return placeholder

    # itertext() yields the element's own text plus that of every sub-element
    combined = "".join(element.itertext()).strip()
    if not combined:
        return placeholder
    return combined

def parse_medquad_directory(base_directory):
    """
    Collect question/answer records from every ``.xml`` file under a directory.

    The directory tree is walked recursively. Each XML file is parsed, and one
    record is produced for every ``QAPair`` element that has a non-empty
    answer. Files that cannot be read or parsed are reported with ``print``
    and skipped, so one bad file never aborts the whole run.

    Args:
        base_directory: Root directory to search for ``.xml`` files.

    Returns:
        A list of dicts with the keys ``document_id``, ``focus``,
        ``question``, ``answer`` and ``question_type``. Directories and files
        are visited in sorted order, so the position of each record in the
        list is the same on every run and every filesystem.
    """
    all_qa_pairs = []

    for root_dir, dirs, files in os.walk(base_directory):
        # Sorting in place controls the order in which os.walk descends;
        # downstream code addresses records by their position in the result.
        dirs.sort()
        for file in sorted(files):
            if not file.endswith(".xml"):
                continue
            file_path = os.path.join(root_dir, file)

            try:
                root = ET.parse(file_path).getroot()
            except Exception as e:
                # Best effort: report the unreadable file and move on.
                print(f"Error parsing file {file_path}: {e}")
                continue

            # Document-level fields shared by every pair in this file
            document_id = root.attrib.get("id", "Unknown")
            focus = extract_text(root.find("Focus"))

            for qa_pair in root.findall(".//QAPair"):
                answer = extract_text(qa_pair.find("Answer"))
                if answer == "No data provided":
                    # A pair without an answer is of no use for retrieval.
                    continue

                # find() returns None when <Question> is absent. Compare with
                # `is None` explicitly: an Element without children is falsy.
                question_element = qa_pair.find("Question")
                if question_element is None:
                    question_type = "Unknown"
                else:
                    question_type = question_element.attrib.get("qtype", "Unknown")

                all_qa_pairs.append({
                    "document_id": document_id,
                    "focus": focus,
                    "question": extract_text(question_element),
                    "answer": answer,
                    "question_type": question_type,
                })

    return all_qa_pairs

import pprint

# Path to the MedQuAD directory
base_directory = "MedQuAD"

# Parse the dataset
qa_pairs = parse_medquad_directory(base_directory)
print(f"Total QA pairs parsed: {len(qa_pairs)}")

# Display one QA pair as a sanity check. Record 2000 is shown when the dataset
# is large enough; for a smaller result the index is clamped to the last
# record so a partial dataset does not raise an IndexError.
sample_index = 2000
if qa_pairs:
    pprint.pprint(qa_pairs[min(sample_index, len(qa_pairs) - 1)])
else:
    pprint.pprint("No QA pairs found.")


Total QA pairs parsed: 16407
{'answer': 'What are the signs and symptoms of Deafness, autosomal dominant '
           'nonsyndromic sensorineural 17? The Human Phenotype Ontology '
           'provides the following list of signs and symptoms for Deafness, '
           'autosomal dominant nonsyndromic sensorineural 17. If the '
           'information is available, the table below includes how often the '
           'symptom is seen in people with this condition. You can use the '
           'MedlinePlus Medical Dictionary to look up the definitions for '
           'these medical terms. Signs and Symptoms Approximate number of '
           'patients (when available) Autosomal dominant inheritance - '
           'High-frequency hearing impairment - Juvenile onset - The Human '
           'Phenotype Ontology (HPO) has collected information on how often a '
           'sign or symptom occurs in a condition. Much of this information '
           'comes from Orphanet, a European rare disea

In [2]:
# Install the faiss-cpu package; the next cell imports it as `faiss`.
%pip install faiss-cpu




[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip





In [3]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# Sentence-embedding model, used for the stored questions and for queries
model = SentenceTransformer('all-MiniLM-L6-v2')

# Embed every stored question; questions[i] comes from qa_pairs[i]
questions = [pair['question'] for pair in qa_pairs]
embeddings = model.encode(questions)

# Flat L2-distance index sized to the width of the embedding vectors
dimension = embeddings.shape[1]
faiss_index = faiss.IndexFlatL2(dimension)
faiss_index.add(np.array(embeddings))

# Position in the index -> the Q&A record that position was built from
qa_mapping = dict(enumerate(qa_pairs))

# Retrieval function
def retrieve_answer(user_query):
    """
    Return the stored Q&A record whose question is closest to ``user_query``.

    The query is encoded with the module-level ``model``, the single nearest
    neighbour is looked up in ``faiss_index``, and the entry of ``qa_mapping``
    stored under that neighbour's index is returned.
    """
    query_vector = model.encode([user_query])
    # search() returns (distances, indices); only the indices are needed here
    neighbour_ids = faiss_index.search(query_vector, k=1)[1]
    best_match = neighbour_ids[0][0]
    return qa_mapping[best_match]


  from tqdm.autonotebook import tqdm, trange


In [4]:
# Smoke-test the retriever with a single question and pretty-print the match
query = "What is (are) Acid Lipase Disease ?"
result = retrieve_answer(user_query=query)
pprint.pprint(result)

{'answer': 'Acid lipase disease or deficiency occurs when the enzyme needed to '
           'break down certain fats that are normally digested by the body is '
           'lacking or missing, resulting in the toxic buildup of these fats '
           'in the bodys cells and tissues. These fatty substances, called '
           'lipids, include fatty acids, oils, and cholesterol. Two rare lipid '
           'storage diseases are caused by the deficiency of the enzyme '
           'lysosomal acid lipase:\n'
           '                \n'
           'Wolmans disease (also known as acid lipase deficiency) is an '
           'autosomal recessive disorder marked by the buildup of cholesteryl '
           'esters (normally a tranport form of cholesterol that brings '
           'nutrients into the cells and carries out waste) and triglycerides '
           '(a chemical form in which fats exist in the body). Infants with '
           'the disorder appear normal at birth but quickly develop '
 

In [5]:
import faiss
import pickle

# Write the FAISS index to disk
faiss.write_index(faiss_index, "faiss_index.bin")

# Store the index-position -> Q&A record mapping next to it, so the two files
# can be loaded together later
with open("qa_mapping.pkl", "wb") as mapping_file:
    pickle.dump(qa_mapping, mapping_file)
