### <span style="color:teal"> __TAXONOMY DICTIONARY MAPPING__
____

In [1]:
# Loading packages
import pandas as pd
import numpy as np
import json
import requests
import time
from tqdm import tqdm

In [2]:
# Load the main dataset (GMRepo 'species_abundance.txt')
df = pd.read_csv("/mnt/iusers01/fatpou01/bmh01/msc-bioinf-2024-2025/h44063jg/gm_repository/species_abundance.txt", sep = '\t')

__TAXONOMY DICTIONARY__
____

In [3]:
taxon_id = df['ncbi_taxon_id'].unique()

In [6]:
#get update taxonomy species ID from NCBI or uniprot

API_KEY = "87fdd45a6ee743fecdbd1b7e9f010d669109"
BATCH_SIZE = 200

# Ensure your taxon_id list exists and convert to strings
taxon_id = [str(x) for x in taxon_id]

# Final dictionary and counters
taxon_dict = {}
from_ncbi = 0
from_uniprot = 0
unresolved = 0

def fetch_ncbi_batch(batch_ids):
    """Query NCBI for names (species preferred, fallback to higher rank)."""
    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi"
    params = {
        'db': 'taxonomy',
        'id': ",".join(batch_ids),
        'retmode': 'json',
        'api_key': API_KEY
    }
    species = {}
    fallback = {}
    try:
        r = requests.get(url, params=params, timeout=15)
        if r.status_code == 200:
            result = r.json().get("result", {})
            for tid in batch_ids:
                record = result.get(tid, {})
                name = record.get("scientificname")
                rank = record.get("rank", "")
                if name:
                    if rank == "species":
                        species[tid] = name
                    else:
                        fallback[tid] = name
    except Exception as e:
        print(f" NCBI error for {batch_ids[:3]}: {e}")
    return species, fallback

def get_name_from_uniprot(tax_id):
    """Query UniProt as fallback."""
    url = f"https://rest.uniprot.org/taxonomy/{tax_id}"
    try:
        r = requests.get(url, timeout=10)
        if r.status_code == 200:
            return r.json().get("scientificName")
    except Exception as e:
        print(f" UniProt error for {tax_id}: {e}")
    return None

# Step 1: Query NCBI
missing_ids = []
for i in tqdm(range(0, len(taxon_id), BATCH_SIZE), desc="NCBI Batch lookup"):
    batch = taxon_id[i:i+BATCH_SIZE]
    species_dict, fallback_dict = fetch_ncbi_batch(batch)

    for tid in batch:
        tid_int = int(tid)
        if tid in species_dict:
            taxon_dict[tid_int] = species_dict[tid]
            from_ncbi += 1
        elif tid in fallback_dict:
            taxon_dict[tid_int] = fallback_dict[tid]
            from_ncbi += 1
        else:
            missing_ids.append(tid)

    time.sleep(0.4)  # NCBI rate limit

# Step 2: Query UniProt
for tid in tqdm(missing_ids, desc="UniProt fallback"):
    name = get_name_from_uniprot(tid)
    tid_int = int(tid)
    if name:
        taxon_dict[tid_int] = name
        from_uniprot += 1
    else:
        taxon_dict[tid_int] = "Unknown"
        unresolved += 1

# Step 3: Save the complete dictionary
with open("taxon_dict.json", "w") as f:
    json.dump({str(k): v for k, v in taxon_dict.items()}, f, indent=2)

# Step 4: Report summary
print("\n✅ Taxonomic Name Resolution Summary")
print("------------------------------------")
print(f"🔹 Total Taxon IDs Provided     : {len(taxon_id)}")
print(f"🔹 Resolved from NCBI           : {from_ncbi}")
print(f"🔹 Resolved from UniProt        : {from_uniprot}")
print(f"🔹 Unresolved IDs               : {unresolved}")
print(f"🔹 Final Entries in Dictionary  : {len(taxon_dict)}")


# Save final dictionary
with open("/mnt/iusers01/fatpou01/bmh01/msc-bioinf-2024-2025/h44063jg/gut_microbiome_project1/resources/taxon_dict.json", "w") as f:
    json.dump({str(k): v for k, v in taxon_dict.items()}, f, indent=2)

print("\n💾 Files saved:")
print(" - taxon_dict.json")




NCBI Batch lookup: 100%|██████████| 45/45 [00:35<00:00,  1.27it/s]
UniProt fallback: 100%|██████████| 86/86 [00:40<00:00,  2.14it/s]


✅ Taxonomic Name Resolution Summary
------------------------------------
🔹 Total Taxon IDs Provided     : 8910
🔹 Resolved from NCBI           : 8824
🔹 Resolved from UniProt        : 85
🔹 Unresolved IDs               : 1
🔹 Final Entries in Dictionary  : 8910

💾 Files saved:
 - taxon_dict.json





In [7]:
taxon_dict

{1678: 'Bifidobacterium',
 561: 'Escherichia',
 816: 'Bacteroides',
 1301: 'Streptococcus',
 1279: 'Staphylococcus',
 375288: 'Parabacteroides',
 32207: 'Rothia',
 102106: 'Collinsella',
 1378: 'Gemella',
 572511: 'Blautia',
 53335: 'Pantoea',
 40544: 'Sutterella',
 292632: 'Subdoligranulum',
 216816: 'Bifidobacterium longum',
 1681: 'Bifidobacterium bifidum',
 562: 'Escherichia coli',
 1680: 'Bifidobacterium adolescentis',
 1304: 'Streptococcus salivarius',
 821: 'Phocaeicola vulgatus',
 1282: 'Staphylococcus epidermidis',
 820: 'Bacteroides uniformis',
 43675: 'Rothia mucilaginosa',
 1290: 'Staphylococcus hominis',
 74426: 'Collinsella aerofaciens',
 47678: 'Bacteroides caccae',
 823: 'Parabacteroides distasonis',
 28037: 'Streptococcus mitis',
 1379: 'Gemella haemolysans',
 1280: 'Staphylococcus aureus',
 357276: 'Phocaeicola dorei',
 28116: 'Bacteroides ovatus',
 33038: 'Mediterraneibacter gnavus',
 46506: 'Bacteroides stercoris',
 204516: 'Phocaeicola massiliensis',
 818: 'Bactero