In [None]:
# Install the notebook's dependencies into the environment of the *running kernel*.
# `%pip` is used instead of `!pip` because the shell form runs whichever `pip` is
# first on PATH, which is not necessarily the interpreter backing this kernel.
%pip install -qU "elasticsearch<9" sentence-transformers==2.7.0 xmltodict

In [None]:
import os
import xmltodict
from elasticsearch import Elasticsearch, helpers
from sentence_transformers import SentenceTransformer
from getpass import getpass
import tqdm as notebook_tqdm

In [None]:
# Embedding model used further down to vectorise every question and answer.
# NOTE(review): the index mapping declares 768-dimensional vectors; presumably that
# matches this model's output size — confirm before swapping the model.
# NOTE(review): E5-family models are documented as expecting "query: " / "passage: "
# input prefixes; none are added where this model is called — confirm whether that
# is intentional and consistent with how search-time queries are encoded.
model = SentenceTransformer("intfloat/multilingual-e5-base")

In [None]:
# Ask for the Elastic connection details interactively with getpass so that the
# values are typed at run time instead of being hard-coded in the notebook.
ELASTIC_CLOUD_ID = getpass("Elastic Cloud ID: ")

# API key handed to the Elasticsearch client as `api_key`.
ELASTIC_API_KEY = getpass("Elastic Api Key: ")

In [None]:
# Build the Elasticsearch client.
# The first positional parameter of `Elasticsearch` is `hosts` (node URLs), so a
# Cloud ID passed positionally is misread as a URL. A Cloud ID has to be supplied
# through the `cloud_id` keyword. For backward compatibility an http(s) endpoint
# URL typed at the "Elastic Cloud ID" prompt is still accepted and routed to `hosts`.
_es_endpoint = ELASTIC_CLOUD_ID.strip()
if _es_endpoint.lower().startswith(("http://", "https://")):
    es = Elasticsearch(hosts=_es_endpoint, api_key=ELASTIC_API_KEY)
else:
    es = Elasticsearch(cloud_id=_es_endpoint, api_key=ELASTIC_API_KEY)

# Name of the Elasticsearch index that receives the MedQuAD Q&A documents.
INDEX_NAME = "medquad_index_with_embeddings_gemini_multilingual"

# Reusable field definitions: exact-match keywords, English-analysed full text,
# and 768-dimensional dense vectors indexed for cosine similarity.
_FIELD_TEMPLATES = {
    "keyword": {"type": "keyword"},
    "english_text": {"type": "text", "analyzer": "english"},
    "vector": {
        "type": "dense_vector",
        "dims": 768,
        "index": True,
        "similarity": "cosine",
    },
}

# (field name, template name) pairs, in the order they appear in the mapping.
_FIELD_LAYOUT = (
    ("document_id", "keyword"),
    ("source", "keyword"),
    ("url", "keyword"),
    ("focus", "english_text"),
    ("cuis", "keyword"),
    ("semantic_types", "keyword"),
    ("semantic_group", "keyword"),
    ("synonyms", "english_text"),
    ("question_id", "keyword"),
    ("question_type", "keyword"),
    ("question", "english_text"),
    ("answer", "english_text"),
    ("file_path", "keyword"),
    ("question_embedding", "vector"),
    ("answer_embedding", "vector"),
)

# Index body: every field gets its own copy of the template so that no two
# properties share a dict object.
mapping = {
    "mappings": {
        "properties": {
            field_name: dict(_FIELD_TEMPLATES[template_name])
            for field_name, template_name in _FIELD_LAYOUT
        }
    }
}

# Make sure the target index is there: leave an existing one untouched,
# otherwise create it with the mapping defined above.
if es.indices.exists(index=INDEX_NAME):
    print(f"Index {INDEX_NAME} already exists.")
else:
    es.indices.create(index=INDEX_NAME, body=mapping)
    print(f"Created index: {INDEX_NAME}")

In [None]:

def _as_dict(node):
    """Return ``node`` when it is a dict, otherwise an empty dict.

    xmltodict yields ``None`` for an empty element and a plain string for a
    text-only element; neither supports ``.get``, so both are treated here as
    "element without children".
    """
    return node if isinstance(node, dict) else {}


def parse_medquad_xml(file_path):
    """Parse one MedQuAD XML file into a list of flat Q&A records.

    Args:
        file_path: Path of the XML file; it is read as UTF-8 text.

    Returns:
        A list with one dict per usable ``QAPair`` element. Every dict holds the
        document-level fields (``document_id``, ``source``, ``url``, ``focus``,
        ``file_path``, ``cuis``, ``semantic_types``, ``semantic_group``,
        ``synonyms``) plus ``question_id``, ``question_type``, ``question`` and
        ``answer``. ``synonyms`` is always a list and ``answer`` is always a
        string (``""`` when the element is missing or empty). The list is empty
        when the document has no Q&A pairs.

    Raises:
        OSError: If the file cannot be opened.
        Exception: Whatever ``xmltodict.parse`` raises for malformed XML is
            propagated unchanged.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        xml_dict = xmltodict.parse(f.read())

    # `.get(key, {})` is not enough on its own: an element that is present but
    # empty comes back as None, so every container is normalised with _as_dict.
    doc = _as_dict(xml_dict.get("Document"))
    base_info = {
        "document_id": doc.get("@id"),
        "source": doc.get("@source"),
        "url": doc.get("@url"),
        "focus": doc.get("Focus"),
        "file_path": file_path,
    }

    anns = _as_dict(doc.get("FocusAnnotations"))
    umls = _as_dict(anns.get("UMLS"))
    base_info["cuis"] = _as_dict(umls.get("CUIs")).get("CUI")
    base_info["semantic_types"] = _as_dict(umls.get("SemanticTypes")).get("SemanticType")
    base_info["semantic_group"] = umls.get("SemanticGroup")

    # A single <Synonym> is parsed as a scalar, several as a list; always expose a list.
    syns = _as_dict(anns.get("Synonyms")).get("Synonym")
    if isinstance(syns, list):
        base_info["synonyms"] = syns
    elif syns:
        base_info["synonyms"] = [syns]
    else:
        base_info["synonyms"] = []

    # Same scalar-vs-list normalisation for the Q&A pairs.
    qa_pairs = _as_dict(doc.get("QAPairs")).get("QAPair") or []
    if not isinstance(qa_pairs, list):
        qa_pairs = [qa_pairs]

    docs = []
    for qa in qa_pairs:
        if not isinstance(qa, dict):
            # Empty <QAPair/>: nothing to index.
            continue
        question = qa.get("Question")
        if isinstance(question, dict):
            # Usual shape: attributes plus the text under "#text".
            question_attrs, question_text = question, question.get("#text")
        else:
            # Attribute-less question (plain string) or empty element (None).
            question_attrs, question_text = {}, question
        docs.append({
            **base_info,
            "question_id": question_attrs.get("@qid"),
            "question_type": question_attrs.get("@qtype"),
            "question": question_text,
            # An empty <Answer/> parses to None; keep the "" default in that case too.
            "answer": qa.get("Answer") or "",
        })
    return docs

def index_medquad_one_by_one(base_folder):
    """Embed and index every MedQuAD Q&A pair found under ``base_folder``.

    The folder is walked recursively; each ``.xml`` file is parsed with
    ``parse_medquad_xml`` and every resulting record is sent to Elasticsearch
    with its own ``es.index`` call. The index is refreshed once at the end and a
    summary line is printed.

    Failures are isolated: a file that cannot be parsed is reported and skipped,
    and a record that cannot be embedded or indexed is reported without dropping
    the remaining records of the same file.

    Args:
        base_folder: Root directory that contains the MedQuAD XML files.

    Returns:
        None. The number of indexed documents is only printed.
    """
    total_docs = 0
    for root, _, files in os.walk(base_folder):
        for file in files:
            if not file.endswith(".xml"):
                continue
            full_path = os.path.join(root, file)

            try:
                parsed_docs = parse_medquad_xml(full_path)
            except Exception as e:
                print(f"⚠️ Error parsing {file}: {e}")
                continue

            for doc in parsed_docs:
                try:
                    # Work on a copy so the parsed record is not mutated.
                    source = dict(doc)
                    # NOTE(review): E5-family models are documented as expecting
                    # "query: " / "passage: " prefixes; none are added here because
                    # the search-time encoding is not visible — confirm.
                    for text_field, vector_field in (
                        ("question", "question_embedding"),
                        ("answer", "answer_embedding"),
                    ):
                        text = source.get(text_field)
                        # Missing or empty text has nothing to embed; the vector
                        # field is simply left out of the document.
                        if text:
                            source[vector_field] = model.encode(text).tolist()
                    es.index(index=INDEX_NAME, document=source)
                    total_docs += 1
                except Exception as e:
                    print(f"⚠️ Error indexing {doc.get('question_id')} from {file}: {e}")
    es.indices.refresh(index=INDEX_NAME)
    print(f"✅ Indexed {total_docs} Q&A pairs successfully!")

# Run the ingestion over the "./MedQuAD" folder (a path relative to the
# notebook's working directory).
index_medquad_one_by_one("./MedQuAD")