### <span style="color:teal"> __TAXON-CLASS DICTIONARY__
___

In [1]:
# Loading packages
import pandas as pd
import numpy as np
import json
import requests
import time
from tqdm import tqdm
import requests
import xml.etree.ElementTree as ET


In [2]:
# Load JSON file into dictionary
with open("/mnt/iusers01/fatpou01/bmh01/msc-bioinf-2024-2025/h44063jg/gut_microbiome_project/resources/taxon_dict.json", "r") as f:
    taxon_dict = json.load(f)
len(taxon_dict)


8910

In [3]:
import requests
import xml.etree.ElementTree as ET
import time

# === Your API key here
NCBI_API_KEY = "87fdd45a6ee743fecdbd1b7e9f010d669109"

# === Function to fetch taxonomy XML for a batch of taxon IDs
def fetch_phylum_batch(taxon_ids):
    joined_ids = ",".join(str(t) for t in taxon_ids)
    params = {
        "db": "taxonomy",
        "id": joined_ids,
        "retmode": "xml",
        "api_key": NCBI_API_KEY
    }
    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"

    try:
        response = requests.get(url, params=params)
        response.raise_for_status()
        return response.text
    except Exception as e:
        print(f"⚠️ Error fetching batch {taxon_ids[:3]}...: {e}")
        return None

# === Main script to fetch phylum for each TaxID
def batch_get_phylum(taxon_dict, batch_size=100):
    all_taxids = [int(tid) for tid in taxon_dict.keys() if tid != "-1"]
    taxon_phylum_map = {"-1": "Unknown"}

    for i in range(0, len(all_taxids), batch_size):
        batch = all_taxids[i:i + batch_size]
        print(f"🔍 Processing batch {i+1} to {i+len(batch)} of {len(all_taxids)}...")

        xml_data = fetch_phylum_batch(batch)
        if not xml_data:
            # Retry once after delay
            time.sleep(1)
            xml_data = fetch_phylum_batch(batch)
            if not xml_data:
                for tid in batch:
                    taxon_phylum_map[str(tid)] = "Error"
                continue

        # Parse each Taxon block individually
        root = ET.fromstring(xml_data)
        for taxon in root.findall("Taxon"):
            tid = taxon.findtext("TaxId")
            phylum_name = "phylum_Not_Found"

            for ancestor in taxon.findall(".//LineageEx/Taxon"):
                if ancestor.findtext("Rank") == "phylum":
                    phylum_name = ancestor.findtext("ScientificName")
                    break

            taxon_phylum_map[tid] = phylum_name

        time.sleep(0.34)  # NCBI rate limit: keep <3 requests/sec

    return taxon_phylum_map

# === Example usage
# taxon_dict = {"562": "Escherichia coli", "1280": "Lactobacillus", ...}
output_dict = batch_get_phylum(taxon_dict, batch_size=100)


🔍 Processing batch 1 to 100 of 8909...
🔍 Processing batch 101 to 200 of 8909...
🔍 Processing batch 201 to 300 of 8909...
🔍 Processing batch 301 to 400 of 8909...
🔍 Processing batch 401 to 500 of 8909...
🔍 Processing batch 501 to 600 of 8909...
🔍 Processing batch 601 to 700 of 8909...
🔍 Processing batch 701 to 800 of 8909...
🔍 Processing batch 801 to 900 of 8909...
🔍 Processing batch 901 to 1000 of 8909...
🔍 Processing batch 1001 to 1100 of 8909...
🔍 Processing batch 1101 to 1200 of 8909...
🔍 Processing batch 1201 to 1300 of 8909...
🔍 Processing batch 1301 to 1400 of 8909...
🔍 Processing batch 1401 to 1500 of 8909...
🔍 Processing batch 1501 to 1600 of 8909...
🔍 Processing batch 1601 to 1700 of 8909...
🔍 Processing batch 1701 to 1800 of 8909...
🔍 Processing batch 1801 to 1900 of 8909...
🔍 Processing batch 1901 to 2000 of 8909...
🔍 Processing batch 2001 to 2100 of 8909...
🔍 Processing batch 2101 to 2200 of 8909...
🔍 Processing batch 2201 to 2300 of 8909...
🔍 Processing batch 2301 to 2400 

In [4]:
output_dict

{'-1': 'Unknown',
 '1678': 'Actinomycetota',
 '561': 'Pseudomonadota',
 '816': 'Bacteroidota',
 '1301': 'Bacillota',
 '1279': 'Bacillota',
 '375288': 'Bacteroidota',
 '32207': 'Actinomycetota',
 '102106': 'Actinomycetota',
 '1378': 'Bacillota',
 '572511': 'Bacillota',
 '53335': 'Pseudomonadota',
 '40544': 'Pseudomonadota',
 '292632': 'Bacillota',
 '216816': 'Actinomycetota',
 '1681': 'Actinomycetota',
 '562': 'Pseudomonadota',
 '1680': 'Actinomycetota',
 '1304': 'Bacillota',
 '821': 'Bacteroidota',
 '1282': 'Bacillota',
 '820': 'Bacteroidota',
 '43675': 'Actinomycetota',
 '1290': 'Bacillota',
 '74426': 'Actinomycetota',
 '47678': 'Bacteroidota',
 '823': 'Bacteroidota',
 '28037': 'Bacillota',
 '1379': 'Bacillota',
 '1280': 'Bacillota',
 '357276': 'Bacteroidota',
 '28116': 'Bacteroidota',
 '33038': 'Bacillota',
 '46506': 'Bacteroidota',
 '204516': 'Bacteroidota',
 '818': 'Bacteroidota',
 '40545': 'Pseudomonadota',
 '68892': 'Bacillota',
 '1685': 'Actinomycetota',
 '1318': 'Bacillota',
 '

In [5]:
unique_values = set(output_dict.values())
print(f"Number of unique values: {len(unique_values)}")


Number of unique values: 48


In [6]:
# === Save to JSON
with open("/mnt/iusers01/fatpou01/bmh01/msc-bioinf-2024-2025/h44063jg/gut_microbiome_project/resources/taxon_phyla_dict.json", "w") as f:
    json.dump(output_dict, f, indent=2)

print("✅ Done! Orders saved to 'taxon_to_phyla.json'")

✅ Done! Orders saved to 'taxon_to_phyla.json'
