#### __DISEASE HIGHER-LEVEL CATEGORY MAPPING__
___

In [1]:
# Loading the packages
import json
import os
import time
import xml.etree.ElementTree as ET
from collections import defaultdict

import pandas as pd
import requests

In [2]:
# Load the disease mapping from disease_dict.json.
# Judging by the entries displayed in the next cell, keys are MeSH descriptor
# UIDs and values are disease names.
# JSON text is UTF-8; the encoding is passed explicitly so that decoding does
# not depend on the locale the kernel happens to run under.
with open("/mnt/iusers01/fatpou01/bmh01/msc-bioinf-2024-2025/h44063jg/gut_microbiome_project/resources/disease_dict.json", "r", encoding="utf-8") as f:
    disease_dict = json.load(f)

In [3]:
# Display the loaded mapping to check that the JSON was read as expected
disease_dict

{'D006262': 'Health',
 'D015209': 'Cholangitis, Sclerosing',
 'D003093': 'Colitis, Ulcerative',
 'D043183': 'Irritable Bowel Syndrome',
 'D001714': 'Bipolar Disorder',
 'D003863': 'Depression',
 'D007410': 'Intestinal Diseases',
 'D008171': 'Lung Diseases',
 'D012559': 'Schizophrenia',
 'D013959': 'Thyroid Diseases',
 'D001327': 'Autoimmune Diseases',
 'D003967': 'Diarrhea',
 'D003920': 'Diabetes Mellitus',
 'D003015': 'Clostridium Infections',
 'D015212': 'Inflammatory Bowel Diseases',
 'D003248': 'Constipation',
 'D008881': 'Migraine Disorders',
 'D004827': 'Epilepsy',
 'D002318': 'Cardiovascular Diseases',
 'D002446': 'Celiac Disease',
 'D007674': 'Kidney Diseases',
 'D001289': 'Attention Deficit Disorder with Hyperactivity',
 'D008107': 'Liver Diseases',
 'D000067877': 'Autism Spectrum Disorder',
 'D010661': 'Phenylketonurias',
 'D000544': 'Alzheimer Disease',
 'D009765': 'Obesity',
 'D009767': 'Obesity, Morbid',
 'D006973': 'Hypertension',
 'D007234': 'Infant, Premature',
 'D00132

In [None]:
# === File paths ===
# Location of the MeSH descriptor XML file (desc2025.xml); it is read with
# ET.parse() in the parsing step further down.
MESH_XML_PATH = "/mnt/iusers01/fatpou01/bmh01/msc-bioinf-2024-2025/h44063jg/gm_repository/desc2025.xml"

In [None]:


# === Step 1: Parse the XML ===
print("📂 Parsing MeSH XML...")
tree = ET.parse(MESH_XML_PATH)
root = tree.getroot()

# Lookup tables, all filled in one pass over the <DescriptorRecord> elements:
#   uid_to_name          DescriptorUI -> DescriptorName/String
#   uid_to_tree_numbers  DescriptorUI -> list of its TreeNumber texts
#   tree_number_to_uid   TreeNumber   -> DescriptorUI
uid_to_name, uid_to_tree_numbers, tree_number_to_uid = {}, {}, {}

for descriptor in root.iterfind("DescriptorRecord"):
    descriptor_ui = descriptor.findtext("DescriptorUI")
    numbers = [node.text for node in descriptor.iterfind(".//TreeNumberList/TreeNumber")]

    uid_to_name[descriptor_ui] = descriptor.findtext("DescriptorName/String")
    uid_to_tree_numbers[descriptor_ui] = numbers
    # Reverse index: every tree number of this record points back to its UI.
    tree_number_to_uid.update(dict.fromkeys(numbers, descriptor_ui))

print(f"✅ Indexed {len(uid_to_name)} descriptors and {len(tree_number_to_uid)} tree paths.\n")

# === Step 2: Use existing `disease_dict` already loaded ===
# `disease_dict` must already be defined by the JSON-loading cell earlier in the
# notebook; `uid_to_name`, `uid_to_tree_numbers` and `tree_number_to_uid` must
# already be defined by Step 1.

# === Step 3: For each disease, find its tree numbers, parents, and top-level categories ===
def _descriptor_name_at(tree_number):
    """Return the descriptor name registered for `tree_number`.

    Returns None when the tree number is not in `tree_number_to_uid` or when
    the matching descriptor has no name in `uid_to_name`.
    """
    node_uid = tree_number_to_uid.get(tree_number)
    if not node_uid:
        return None
    return uid_to_name.get(node_uid) or None


output = {}
# Disease UIDs for which the MeSH index holds no tree numbers. They still get
# an entry in `output` (with empty lists), but are reported below instead of
# passing through silently.
unmapped_uids = []

for uid, disease_name in disease_dict.items():
    tree_numbers = uid_to_tree_numbers.get(uid, [])
    if not tree_numbers:
        unmapped_uids.append(uid)

    parents = set()
    top_categories = set()

    for tn in tree_numbers:
        parts = tn.split('.')

        # Immediate parent: the same tree number with its last segment removed.
        # A single-segment tree number has no parent.
        if len(parts) > 1:
            parent_name = _descriptor_name_at('.'.join(parts[:-1]))
            if parent_name:
                parents.add(parent_name)

        # Topmost category: the descriptor sitting at the first segment alone.
        top_name = _descriptor_name_at(parts[0])
        if top_name:
            top_categories.add(top_name)

    output[uid] = {
        "name": disease_name,
        "tree_numbers": tree_numbers,
        "parents": sorted(parents),
        "top_categories": sorted(top_categories)
    }

if unmapped_uids:
    print(f"⚠️ {len(unmapped_uids)} disease UID(s) have no MeSH tree numbers: {unmapped_uids}")

# === Step 4: Save result ===
# `os` and `json` are imported in the notebook's import cell at the top.
output_path = "/mnt/iusers01/fatpou01/bmh01/msc-bioinf-2024-2025/h44063jg/gut_microbiome_project/resources/disease_category.json"

# Create directory if it doesn't exist
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Save JSON; the encoding is explicit so the file is written the same way
# regardless of the locale the kernel runs under.
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(output, f, indent=2)

print(f"✅ Saved full disease hierarchy info to: {output_path}")



📂 Parsing MeSH XML...
✅ Indexed 30956 descriptors and 64883 tree paths.

✅ Saved full disease hierarchy info to: /mnt/iusers01/fatpou01/bmh01/msc-bioinf-2024-2025/h44063jg/gut_microbiome_project/resources/disease_category.json
