### <span style="color:teal"> __DISEASE DICTIONARY MAPPING__ </span>
____

In [1]:
import pandas as pd
import json
import requests
import time
import os
import xml.etree.ElementTree as ET
from tqdm import tqdm
from collections import defaultdict

This block constructs a clean disease dictionary and enriches it with MeSH hierarchy so the dataset can be grouped biologically in later steps. It begins by resolving each input MeSH disease code (`disease_list`) to a MeSH UID via NCBI E-utilities (`esearch`), handling timeouts and logging any codes that cannot be resolved; it then batches those UIDs through `esummary` to fetch the preferred disease term and builds `disease_dict` (code → human-readable name), assigning `"Unknown"` where no UID or term is found and saving the result to JSON. Next, it parses the local MeSH descriptor XML (`desc2025.xml`) and indexes the thesaurus (UID → name, UID → tree numbers, and tree number → UID). Using these indices, it augments each entry with its full set of MeSH tree numbers, its direct parents (by trimming each tree path), and its top-level MeSH categories (by taking the first segment of each tree number and looking up the corresponding descriptor). That enriched structure is written to `disease_category.json` as a **traceable, intermediate file** capturing the full hierarchy for audit and inspection. Finally, it distills a compact **disease classification** mapping (disease code → list of top categories) and saves it as `disease_classification.json`; **this classification is the artifact intended for downstream analyses** (e.g., stratifying cohorts, filtering by category families, or summarizing burden by MeSH sections), while `disease_category.json` remains an intermediate reference to support transparency and reproducibility.

In [2]:
# Read the GMRepo sample-to-disease table ('sample_to_disease_info.txt'); the file is tab-separated.
df = pd.read_csv(
    "/mnt/iusers01/fatpou01/bmh01/msc-bioinf-2024-2025/h44063jg/gm_repository/sample_to_disease_info.txt",
    sep='\t',
)

# Basic descriptive counts for the 'disease' column.
disease_col = df['disease']
total_entries = df.shape[0]                  # number of rows in the table
missing_disease = disease_col.isna().sum()   # rows with no disease label
unique_without_nan = disease_col.nunique()   # distinct labels; nunique() ignores NaN by default

# Emit the summary as one multi-line print (same text as printing line by line).
summary_lines = [
    "📊 Disease Column Summary",
    "----------------------------",
    f"🔹 Total rows             : {total_entries}",
    f"🔹 Missing disease labels: {missing_disease}",
    f"🔹 Unique values (without NaN) - Total diseases types in the whole dataset: {unique_without_nan}",
]
print("\n".join(summary_lines))

# Sorted list of the distinct, non-null disease labels; consumed by the lookup cells below.
distinct_diseases = disease_col.dropna().unique()
disease_list = sorted(distinct_diseases.tolist())

📊 Disease Column Summary
----------------------------
🔹 Total rows             : 66118
🔹 Missing disease labels: 6969
🔹 Unique values (without NaN) - Total diseases types in the whole dataset: 92


In [3]:
# Fetching disease names (phenotypes) from the NCBI-MeSH website
# The NCBI API key is read from the environment so the credential is never
# stored in the notebook. When NCBI_API_KEY is unset the value is None and
# `requests` omits None-valued query parameters, so the calls still work
# (NCBI applies a lower request-rate limit to key-less clients).
# NOTE(review): a literal key used to be committed on this line - treat it as
# exposed and rotate it in the NCBI account settings.
API_KEY = os.environ.get("NCBI_API_KEY")
BATCH_SIZE = 200  # maximum number of UIDs sent per esummary request

def get_uids_for_mesh_ids(mesh_ids):
    """Resolve each MeSH disease code to an NCBI UID through E-utilities `esearch`.

    One request is sent per code against the `mesh` database, with a 10 s
    timeout, followed by a 0.2 s pause. The first entry of the returned
    `idlist` is taken as the UID for that code.

    Parameters
    ----------
    mesh_ids : iterable
        Disease codes to look up; each one is passed as the search `term`.

    Returns
    -------
    tuple (dict, list)
        A mapping of code -> first UID returned, and the list of codes for
        which the search returned no IDs or the request raised an error
        (errors are printed, not re-raised).
    """
    endpoint = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    resolved = {}
    unresolved = []

    progress = tqdm(
        mesh_ids,
        desc="Fetching disease names (phenotyes) from NLM-MeSH website using MESH IDs (disease codes)",
    )
    for code in progress:
        query = dict(db='mesh', term=code, retmode='json', api_key=API_KEY)
        try:
            response = requests.get(endpoint, params=query, timeout=10)
            response.raise_for_status()
            hits = response.json().get('esearchresult', {}).get('idlist', [])
            if not hits:
                unresolved.append(code)
            else:
                resolved[code] = hits[0]
        except Exception as err:
            # Best effort: log the failure and carry on with the next code.
            print(f"Error resolving {code}: {err}")
            unresolved.append(code)
        # Pause between requests to keep the call rate down.
        time.sleep(0.2)

    return resolved, unresolved

def fetch_mesh_terms_batch(uid_batch):
    """Fetch the first MeSH term for each UID in one E-utilities `esummary` call.

    Parameters
    ----------
    uid_batch : list of str
        UIDs to look up; they are comma-joined into a single `id` parameter.

    Returns
    -------
    dict
        UID -> first entry of that record's `ds_meshterms` list. UIDs whose
        record is missing, malformed, or has an empty `ds_meshterms` list are
        left out. An empty dict is returned when `uid_batch` is empty (no
        request is made) or when the request itself fails (the error is
        printed, not re-raised).
    """
    # Nothing to look up: skip the network round-trip entirely.
    if not uid_batch:
        return {}

    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi"
    params = {
        'db': 'mesh',
        'id': ",".join(uid_batch),
        'retmode': 'json',
        'api_key': API_KEY
    }
    try:
        r = requests.get(url, params=params, timeout=15)
        r.raise_for_status()
        data = r.json().get('result', {})

        terms = {}
        for uid in uid_batch:
            entry = data.get(uid)
            if not isinstance(entry, dict):
                continue
            mesh_terms = entry.get('ds_meshterms')
            # Guard against an empty list: indexing it would raise and, via the
            # blanket handler below, throw away every other result in the batch.
            if mesh_terms:
                terms[uid] = mesh_terms[0]
        return terms
    except Exception as e:
        # Best effort: a failed batch yields no terms instead of stopping the run.
        print(f"Batch fetch error: {e}")
        return {}

# Step 1: Resolve every disease code to a UID
disease_dict = {}
uid_map, mesh_not_found = get_uids_for_mesh_ids(disease_list)
uids = list(uid_map.values())

# Step 2: Look up the MeSH term for each UID, BATCH_SIZE UIDs per request
batch_starts = range(0, len(uids), BATCH_SIZE)
for start in tqdm(batch_starts, desc="Fetching MeSH terms in batch"):
    terms = fetch_mesh_terms_batch(uids[start:start + BATCH_SIZE])
    for mesh_id, uid in uid_map.items():
        if uid not in terms:
            continue
        term = str(terms[uid])
        # Use an integer key when the code is purely numeric, otherwise keep it as-is.
        try:
            key = int(mesh_id)
        except Exception:
            key = mesh_id
        disease_dict[key] = term

# Step 3: Every code without a term gets the placeholder 'Unknown'
for mesh_id in disease_list:
    try:
        key = int(mesh_id)
    except Exception:
        key = mesh_id
    disease_dict.setdefault(key, "Unknown")

# Step 4: Persist as JSON (keys are stringified for serialisation)
serialisable = {str(code): name for code, name in disease_dict.items()}
with open("/mnt/iusers01/fatpou01/bmh01/msc-bioinf-2024-2025/h44063jg/gmp_jms/disease_dict.json", "w") as dict_file:
    json.dump(serialisable, dict_file, indent=2)

# Final report
found_count = sum(name != "Unknown" for name in disease_dict.values())
unknown_count = len(disease_dict) - found_count
not_found_count = len(mesh_not_found)

print(f"\nFinished: {len(disease_dict)} MeSH IDs processed")
print(f"Found (with term): {found_count}")
print(f"Unknown: {unknown_count}")
print(f"Not found (no UID resolved): {not_found_count}")

# List the unresolved codes only when there are any.
if mesh_not_found:
    print(f"MeSH IDs not found: {mesh_not_found}")

Fetching disease names (phenotyes) from NLM-MeSH website using MESH IDs (disease codes): 100%|██████████| 92/92 [00:50<00:00,  1.82it/s]
Fetching MeSH terms in batch: 100%|██████████| 1/1 [00:00<00:00,  1.81it/s]


Finished: 92 MeSH IDs processed
Found (with term): 92
Unknown: 0
Not found (no UID resolved): 0





In [4]:
# === File paths ===
MESH_XML_PATH = "/mnt/iusers01/fatpou01/bmh01/msc-bioinf-2024-2025/h44063jg/gmp_jms/desc2025.xml"


# === Step 1: Parse the XML ===
print("📂 Parsing MeSH XML...")
tree = ET.parse(MESH_XML_PATH)
root = tree.getroot()

# Build three lookup tables from the descriptor records:
#   uid_to_name          DescriptorUI -> descriptor name
#   uid_to_tree_numbers  DescriptorUI -> list of its tree numbers
#   tree_number_to_uid   tree number  -> DescriptorUI
uid_to_name = {}
uid_to_tree_numbers = {}
tree_number_to_uid = {}

for record in root.findall("DescriptorRecord"):
    uid = record.findtext("DescriptorUI")
    name = record.findtext("DescriptorName/String")
    tree_numbers = [tn.text for tn in record.findall(".//TreeNumberList/TreeNumber")]

    uid_to_name[uid] = name
    uid_to_tree_numbers[uid] = tree_numbers

    for tn in tree_numbers:
        tree_number_to_uid[tn] = uid

print(f"✅ Indexed {len(uid_to_name)} descriptors and {len(tree_number_to_uid)} tree paths.\n")

# === Step 2: Use existing `disease_dict` already loaded ===
# (Assume disease_dict is already defined earlier in the notebook/script)
# Its keys are matched against DescriptorUI values below, so a key that is not
# a DescriptorUI string (e.g. an integer key) will simply not be found.

# === Step 3: For each disease, find its tree numbers, parents, and top-level categories ===
output = {}
missing_from_mesh = []  # disease codes that have no record in the descriptor file

for uid, disease_name in disease_dict.items():
    if uid not in uid_to_tree_numbers:
        missing_from_mesh.append(uid)

    tree_numbers = uid_to_tree_numbers.get(uid, [])
    parents = set()
    top_categories = set()

    for tn in tree_numbers:
        parts = tn.split('.')

        # Direct parent: the same tree path with its last segment removed.
        if len(parts) > 1:
            parent_uid = tree_number_to_uid.get('.'.join(parts[:-1]))
            parent_name = uid_to_name.get(parent_uid) if parent_uid else None
            if parent_name:
                parents.add(parent_name)

        # Top-level category: the descriptor whose tree number is the first segment.
        top_uid = tree_number_to_uid.get(parts[0])
        top_name = uid_to_name.get(top_uid) if top_uid else None
        if top_name:
            top_categories.add(top_name)

    output[uid] = {
        "name": disease_name,
        "tree_numbers": tree_numbers,
        "parents": sorted(parents),
        "top_categories": sorted(top_categories)
    }

# Surface codes that could not be placed in the hierarchy instead of silently
# writing them out with empty tree numbers and categories.
if missing_from_mesh:
    print(f"⚠️ {len(missing_from_mesh)} disease code(s) not found in the MeSH descriptor file: {missing_from_mesh}")

# === Step 4: Save result ===
output_path = "/mnt/iusers01/fatpou01/bmh01/msc-bioinf-2024-2025/h44063jg/gmp_jms/disease_category.json"

# Create directory if it doesn't exist (`os` comes from the notebook's import cell)
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Save JSON
with open(output_path, "w") as f:
    json.dump(output, f, indent=2)

print(f"✅ Saved full disease hierarchy info to: {output_path}")

📂 Parsing MeSH XML...
✅ Indexed 30956 descriptors and 64883 tree paths.

✅ Saved full disease hierarchy info to: /mnt/iusers01/fatpou01/bmh01/msc-bioinf-2024-2025/h44063jg/gmp_jms/disease_category.json


In [5]:
# Reload the two JSON artifacts as plain dictionaries.
# NOTE(review): both files are read from .../microbiome_project/resources/, whereas the
# earlier cells of this notebook write disease_dict.json and disease_category.json under
# .../gmp_jms/. Confirm the files here are the ones produced above (or point both steps at
# the same location); otherwise a fresh run may pick up stale copies.
disease_dict_path = "/mnt/iusers01/fatpou01/bmh01/msc-bioinf-2024-2025/h44063jg/microbiome_project/resources/disease_dict.json"
disease_category_path = "/mnt/iusers01/fatpou01/bmh01/msc-bioinf-2024-2025/h44063jg/microbiome_project/resources/disease_category.json"

# Disease code -> disease name
with open(disease_dict_path, "r") as dict_file:
    disease_dict = json.load(dict_file)

# Disease code -> hierarchy metadata (the cell below reads its "top_categories" entry)
with open(disease_category_path, "r") as category_file:
    disease_category = json.load(category_file)

In [6]:
# Sorted list of every distinct, non-empty top category found in disease_category
unique_top_categories = sorted(
    filter(
        None,  # drop empty strings
        set().union(*(meta.get("top_categories", []) for meta in disease_category.values())),
    )
)

# Compact mapping: disease code -> list of its top categories
disease_code_to_top_categories = dict(
    (code, meta.get("top_categories", []))
    for code, meta in disease_category.items()
)

# Write the mapping to JSON (relative path: the file lands in the current working directory)
with open("disease_classification.json", "w") as out_file:
    json.dump(disease_code_to_top_categories, out_file, indent=4)

print("Saved to disease_classification.json")


# Set of all categories that occur anywhere in the mapping
unique_categories = set().union(*disease_code_to_top_categories.values())

print(f"Number of unique top categories: {len(unique_categories)}")
unique_categories  # optional, to see them

Saved to disease_classification.json
Number of unique top categories: 24


{'Bacterial Infections',
 'Behavior and Behavior Mechanisms',
 'Cardiovascular Diseases',
 'Congenital, Hereditary, and Neonatal Diseases and Abnormalities',
 'Digestive System Diseases',
 'Endocrine System Diseases',
 'Eye Diseases',
 'Health',
 'Hemic and Lymphatic Diseases',
 'Immune System Diseases',
 'Infant, Newborn, Diseases',
 'Infections',
 'Mental Disorders',
 'Musculoskeletal Diseases',
 'Neoplasms',
 'Nervous System Diseases',
 'Nutritional and Metabolic Diseases',
 'Pregnancy',
 'Respiratory Tract Diseases',
 'Signs and Symptoms, Digestive',
 'Skin and Connective Tissue Diseases',
 'Stomatognathic Diseases',
 'Urogenital Diseases',
 'Viral Infections'}