### <span style="color:teal">__DISEASE DICTIONARY MAPPING__</span>
____

In [1]:
# Loading packages
import json
import os
import time

import numpy as np
import pandas as pd
import requests
from tqdm import tqdm

In [2]:
# Read the main dataset (GMRepo 'sample_to_disease_info.txt', tab-separated) into a DataFrame
SAMPLE_TO_DISEASE_PATH = "/mnt/iusers01/fatpou01/bmh01/msc-bioinf-2024-2025/h44063jg/gm_repository/sample_to_disease_info.txt"
df = pd.read_csv(SAMPLE_TO_DISEASE_PATH, sep="\t")

In [3]:
# Summarise the 'disease' column: row count, missing labels and distinct labels
disease_col = df['disease']

total_entries = df.shape[0]
missing_disease = disease_col.isna().sum()
unique_without_nan = disease_col.nunique()  # nunique() ignores NaN by default

summary_lines = [
    "📊 Disease Column Summary",
    "----------------------------",
    f"🔹 Total rows             : {total_entries}",
    f"🔹 Missing disease labels: {missing_disease}",
    f"🔹 Unique values (without NaN): {unique_without_nan}",
]
print("\n".join(summary_lines))

# Distinct non-null disease labels in ascending order
disease_list = disease_col.dropna().unique().tolist()
disease_list.sort()



📊 Disease Column Summary
----------------------------
🔹 Total rows             : 66118
🔹 Missing disease labels: 6969
🔹 Unique values (without NaN): 92


In [None]:
# Fetching disease names (phenotypes) from the NCBI MeSH database
#
# The NCBI API key is a personal credential, so it is read from the
# NCBI_API_KEY environment variable instead of being stored in the notebook.
# NOTE(review): the key that used to be hard-coded here has been exposed and
# should be regenerated in the NCBI account settings.
# When the variable is unset (or empty) API_KEY is None; `requests` omits
# None-valued query parameters, so the lookups then run without a key.
API_KEY = os.environ.get("NCBI_API_KEY") or None
if API_KEY is None:
    print("NCBI_API_KEY is not set: E-utilities requests will be sent without an API key.")

# Number of UIDs sent per batched request when fetching MeSH terms
BATCH_SIZE = 200

def get_uids_for_mesh_ids(mesh_ids):
    """Resolve MeSH identifiers to Entrez UIDs through the NCBI esearch endpoint.

    Every identifier is sent as the search term against the ``mesh`` database
    and the first entry of the returned ``idlist`` is kept as its UID.

    Parameters
    ----------
    mesh_ids : iterable
        Identifiers to look up; one esearch request is issued per element.

    Returns
    -------
    tuple of (dict, list)
        ``uid_map`` maps each resolved identifier to its UID, and
        ``not_found`` lists the identifiers whose search returned no UID or
        whose request failed (the error is printed, not raised).
    """
    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    # Parameters shared by every request; only 'term' changes per identifier.
    base_params = {
        'db': 'mesh',
        'retmode': 'json',
        'api_key': API_KEY
    }
    uid_map = {}
    not_found = []
    # A single Session keeps the HTTPS connection open across the whole loop
    # instead of opening a fresh connection for every identifier.
    with requests.Session() as session:
        for mesh_id in tqdm(mesh_ids, desc="Fetching disease names (phenotyes) from NCBI-MeSH website using MESH IDs (disease codes)"):
            try:
                r = session.get(url, params=dict(base_params, term=mesh_id), timeout=10)
                r.raise_for_status()
                ids = r.json().get('esearchresult', {}).get('idlist', [])
                if ids:
                    uid_map[mesh_id] = ids[0]
                else:
                    not_found.append(mesh_id)
            except Exception as e:
                # Best effort: report the failure and carry on with the next identifier.
                print(f"Error resolving {mesh_id}: {e}")
                not_found.append(mesh_id)
            # Short pause after each request (presumably to stay within NCBI's rate limits).
            time.sleep(0.2)
    return uid_map, not_found

def fetch_mesh_terms_batch(uid_batch):
    """Fetch the first MeSH term of each UID with one esummary request.

    Parameters
    ----------
    uid_batch : list of str
        UIDs to summarise; they are joined with commas into a single ``id``
        query parameter.

    Returns
    -------
    dict
        Maps each UID whose record carries a non-empty ``ds_meshterms`` list to
        the first entry of that list. UIDs missing from the response, or whose
        record has no terms, are left out. An empty dict is returned for an
        empty batch and when the request or its decoding fails (the error is
        printed, not raised).
    """
    # Nothing to look up: skip the network round trip entirely.
    if not uid_batch:
        return {}

    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi"
    params = {
        'db': 'mesh',
        'id': ",".join(uid_batch),
        'retmode': 'json',
        'api_key': API_KEY
    }
    try:
        r = requests.get(url, params=params, timeout=15)
        r.raise_for_status()
        data = r.json().get('result', {})
        terms = {}
        for uid in uid_batch:
            record = data.get(uid)
            if not isinstance(record, dict):
                continue
            mesh_terms = record.get('ds_meshterms')
            # Skip records with an empty or malformed term list instead of letting
            # one bad record raise and discard the whole batch.
            if isinstance(mesh_terms, (list, tuple)) and mesh_terms:
                terms[uid] = mesh_terms[0]
        return terms
    except Exception as e:
        print(f"Batch fetch error: {e}")
        return {}

def _as_dict_key(mesh_id):
    """Return ``int(mesh_id)`` when that conversion succeeds, otherwise ``mesh_id`` unchanged."""
    try:
        return int(mesh_id)
    except Exception:
        return mesh_id


# Step 1: resolve every disease code to a UID
uid_map, mesh_not_found = get_uids_for_mesh_ids(disease_list)
uids = list(uid_map.values())
disease_dict = {}

# Step 2: fetch the MeSH terms in slices of BATCH_SIZE UIDs and record the
# term of every code whose UID came back in the current slice
for start in tqdm(range(0, len(uids), BATCH_SIZE), desc="Fetching MeSH terms in batch"):
    batch_terms = fetch_mesh_terms_batch(uids[start:start + BATCH_SIZE])
    disease_dict.update(
        (_as_dict_key(mesh_id), str(batch_terms[uid]))
        for mesh_id, uid in uid_map.items()
        if uid in batch_terms
    )

# Step 3: any code still without a term is labelled 'Unknown'
for mesh_id in disease_list:
    disease_dict.setdefault(_as_dict_key(mesh_id), "Unknown")

# Step 4: write the mapping to JSON (keys converted to strings)
output_path = "/mnt/iusers01/fatpou01/bmh01/msc-bioinf-2024-2025/h44063jg/gut_microbiome_project/resources/disease_dict.json"
serialisable = {str(code): term for code, term in disease_dict.items()}
with open(output_path, "w") as handle:
    json.dump(serialisable, handle, indent=2)

# Final report
labels = list(disease_dict.values())
unknown_count = labels.count("Unknown")
found_count = len(labels) - unknown_count
not_found_count = len(mesh_not_found)

print(f"\nFinished: {len(disease_dict)} MeSH IDs processed")
print(f"Found (with term): {found_count}")
print(f"Unknown: {unknown_count}")
print(f"Not found (no UID resolved): {not_found_count}")

if mesh_not_found:
    print(f"MeSH IDs not found: {mesh_not_found}")

Fetching disease names (phenotyes) from NCBI-MeSH website using MESH IDs (disease codes):   0%|          | 0/92 [00:00<?, ?it/s]

Fetching disease names (phenotyes) from NCBI-MeSH website using MESH IDs (disease codes): 100%|██████████| 92/92 [00:46<00:00,  1.99it/s]
Fetching MeSH terms in batch: 100%|██████████| 1/1 [00:00<00:00,  1.91it/s]


Finished: 92 MeSH IDs processed
Found (with term): 92
Unknown: 0
Not found (no UID resolved): 0



