In [56]:
import requests
import pandas as pd
from pandas import json_normalize
import random

merged_df = pd.DataFrame()
medianums= ['190', '680', '742', '888', '1']

#for num in range(1,10):
for num in medianums:
    url = f'https://mediadive.dsmz.de/rest/medium-composition/{num}'
    if requests.get(url).status_code==200: 
        df = json_normalize(requests.get(url).json().get('data'))[["name", "g_l"]]
        df = df.rename(columns={"g_l": f"{num}"})  # give each column a unique name
        if merged_df.empty:
            merged_df = df
        else:
            merged_df = pd.merge(merged_df, df, on="name", how="outer")
    else:
        print('error',requests.get(url).status_code, ' for ',num)

medianums = list(merged_df.columns[1:])
print('medianums available:', medianums)
merged_df.head()

medianums available: ['190', '680', '742', '888', '1']


Unnamed: 0,name,190,680,742,888,1
0,(DL)-alpha-Lipoic acid,,,4.9e-05,,
1,(NH4)2SO4,,,,1.0,
2,Agar,15.0,20.0,,0.5,15.0
3,AlK(SO4)2 x 12 H2O,,,0.000198,,
4,B12,,,,1e-06,


In [62]:
df_list=[]
for num in medianums:
    url = f'https://mediadive.dsmz.de/rest/medium-strains/{num}'
    if requests.get(url).status_code==200: 
        df = json_normalize(requests.get(url).json().get('data'))[["species","bacdive_id"]]
        df = df.rename(columns={"species": f"species_{num}","bacdive_id": f"id_{num}"})  # give each column a unique name
        df_list.append(df)
    else:
        print('error',requests.get(url).status_code, ' for ',num)
merged_dfstrains = pd.concat(df_list, axis=1)

merged_dfstrains.head()

bacdive_id_dict = {}

for num in medianums:
    url = f'https://mediadive.dsmz.de/rest/medium-strains/{num}'
    response = requests.get(url)
    
    if response.status_code == 200:
        data = response.json().get('data', [])
        if data:  # check if data list is not empty
            df = json_normalize(data)
            if "bacdive_id" in df.columns and not df.empty:
                first_id = df.iloc[0]["bacdive_id"]  # iloc needs integer index and column name as string
                bacdive_id_dict[num] = first_id
            else:
                print(f"No 'bacdive_id' found in data for {num}")
                bacdive_id_dict[num] = None
        else:
            print(f"No data found for {num}")
            bacdive_id_dict[num] = None
    else:
        print(f"Error {response.status_code} for {num}")
        bacdive_id_dict[num] = None

bacdive_id_dict


{'190': None, '680': 11053, '742': 4034, '888': 10442, '1': 654.0}

In [77]:
import bacdive
client = bacdive.BacdiveClient('tanliangxun2000@gmail.com', 'pass1234')

media_ncbi_dict = {}
for num in medianums:
    bacdive_id = bacdive_id_dict.get(num)
    print(f"medianum: {num} Strain bacdiveID: {bacdive_id}")
    
    if bacdive_id is not None:
        strain_data = client.search(id=int(bacdive_id))
        
        for strain in client.retrieve(['NCBI tax id']):
            strain_key = list(strain)[0]
            ncbi_list = strain[strain_key]  # this is always a list
            
            for entry in ncbi_list:
                ncbi_info = entry.get('NCBI tax id')
                
                if isinstance(ncbi_info, dict):
                    tax_id = ncbi_info.get('NCBI tax id')
                    level = ncbi_info.get('Matching level')
                    if level == 'species':
                        print(f"Tax ID: {tax_id}")

                elif isinstance(ncbi_info, list):
                    for subentry in ncbi_info:
                        tax_id = subentry.get('NCBI tax id')
                        level = subentry.get('Matching level')
                        if level == 'species':
                            print(f"Tax ID: {tax_id}")

                else:
                    # You can skip or print unexpected formats if needed
                    pass
        media_ncbi_dict[num]= tax_id
    else:
        print(f"No valid BacDive ID found for {num}")

media_ncbi_dict
# client.search(id='4437;4453')

# # for strain in client.retrieve():
# #     print(strain["General"])
# for strain in client.retrieve(['NCBI tax id']):
#     # print(strain)
#     # print(strain[list(strain)[0]])
#     print(strain[list(strain)[0]][0]['NCBI tax id']['NCBI tax id'])

-- Authentication successful --
medianum: 190 Strain bacdiveID: None
No valid BacDive ID found for 190
medianum: 680 Strain bacdiveID: 11053
Tax ID: 39122
medianum: 742 Strain bacdiveID: 4034
Tax ID: 53245
medianum: 888 Strain bacdiveID: 10442
Tax ID: 682713
medianum: 1 Strain bacdiveID: 654.0
Tax ID: 1398


{'680': 39122, '742': 53245, '888': 682713, '1': 1398}

In [83]:
import subprocess
import shlex   # for safe quoting if you ever need it

# Grab the *values* (index 1) from each (key, value) pair
taxid_list = [item[1] for item in media_ncbi_dict.items()]   # e.g. [1, 3, 4]

# Build the command as a list (avoid shell-string pitfalls)
cmd = [
    "/Users/liangxuntan/Code/fyp2025/scripts/run_taxid2fna.sh",
    *map(str, [562, 384])            # unpack TaxIDs as separate args
]

# Run it
subprocess.run(cmd, check=True)

→ Processing TaxID 562
→ Processing TaxID 384
✓ Finished. Full log: /Users/liangxuntan/Code/fyp2025/data/logs/taxid2fna_20250519_105046.log


CompletedProcess(args=['/Users/liangxuntan/Code/fyp2025/scripts/run_taxid2fna.sh', '562', '384'], returncode=0)

In [None]:
import subprocess
from pathlib import Path

fasta_dir = Path("/Users/liangxuntan/Code/fyp2025/data/labelled_genomes")
fna_files = sorted(fasta_dir.glob("*.fna"))

# --- or: run on ALL .fna files ---------------------------------------------
for fna in fna_files:
#for fna in [fna_files[0]]:
    subprocess.run(
        ["/Users/liangxuntan/Code/fyp2025/scripts/fna2prot.py", str(fna)],
        check=True
    )



Written 350 protein sequences to '/Users/liangxuntan/Code/fyp2025/data/prodigal_output/222_predictedproteins.prot'


In [102]:
from pathlib import Path
from fna2prot_parallel import run_all

fasta_dir = Path("/Users/liangxuntan/Code/fyp2025/data/labelled_genomes")
fna_files = sorted(fasta_dir.glob("*.fna"))
evalue = "1e-50"

results = run_all(fna_files, evalue, nproc=4)

print("Protein prediction output files:")
for r in results:
    print(r)



Written 350 protein sequences to '/Users/liangxuntan/Code/fyp2025/data/prodigal_output/222_predictedproteins.prot'




Written 210 protein sequences to '/Users/liangxuntan/Code/fyp2025/data/prodigal_output/562_predictedproteins.prot'
Written 346 protein sequences to '/Users/liangxuntan/Code/fyp2025/data/prodigal_output/382_predictedproteins.prot'
Written 321 protein sequences to '/Users/liangxuntan/Code/fyp2025/data/prodigal_output/384_predictedproteins.prot'
Protein prediction output files:
/Users/liangxuntan/Code/fyp2025/data/prodigal_output/222_predictedproteins.prot
/Users/liangxuntan/Code/fyp2025/data/prodigal_output/382_predictedproteins.prot
/Users/liangxuntan/Code/fyp2025/data/prodigal_output/384_predictedproteins.prot
/Users/liangxuntan/Code/fyp2025/data/prodigal_output/562_predictedproteins.prot
