In [4]:
import cobra
import requests
from Bio import SeqIO
import pandas as pd
import re

In [5]:

def fetch_pr(taxon_id, reviewed=False, format='fasta', limit=1000):
    base_url = "https://rest.uniprot.org/uniprotkb/stream"
    query = f"organism_id:{taxon_id}"
    if reviewed:
        query += " AND reviewed:true"

    params = {"query": query, "format": format, "size": limit}

    response = requests.get(base_url, params=params)
    if response.status_code == 200:
        with open(f"/home/aac/Mohammad/GNN/MetaBiomeX-main/data/fastas/uniprot_{taxon_id}.fasta", "w") as f:
            f.write(response.text)
        return response.text 
    else:
        return None

In [14]:
models_list= pd.read_csv('/home/aac/Mohammad/GNN/MetaBiomeX-main/data/ListofModelsforSpecies.csv')

In [16]:
xml_files = []

for entry in models_list['models'].dropna():
    found = re.findall(r'\b[\w\-\.]+\.xml\b', str(entry))
    xml_files.extend(found)

xml_files = sorted(set(xml_files))
xml_files[:5]

['Acidaminococcus_fermentans_DSM_20731.xml',
 'Acidaminococcus_intestini_RyC_MR95.xml',
 'Actinomyces_naeslundii_str_Howell_279.xml',
 'Actinomyces_odontolyticus_ATCC_17982.xml',
 'Actinomyces_oris_K20.xml']

In [18]:
# df['Strain_cleaned'] = df['Strain'].str.replace('.xml', '', regex=False)


base_url = "https://www.vmh.life/_api/microbes/?reconstruction="

def get_ncbiid(strain):
    response = requests.get(base_url + strain)
    if response.status_code == 200:
        data = response.json()
        if data.get("results"):
            return data["results"][0].get("ncbiid")

In [19]:
xml_df = pd.DataFrame(xml_files, columns=['XML File'])
xml_df['strain_name'] = xml_df['XML File'].str.replace('.xml', '', regex=False)


In [20]:
xml_df['strain_name'][0]

'Acidaminococcus_fermentans_DSM_20731'

In [21]:
a= get_ncbiid(xml_df['strain_name'][0])
print(a)

591001


In [22]:
xml_df.head()

Unnamed: 0,XML File,strain_name
0,Acidaminococcus_fermentans_DSM_20731.xml,Acidaminococcus_fermentans_DSM_20731
1,Acidaminococcus_intestini_RyC_MR95.xml,Acidaminococcus_intestini_RyC_MR95
2,Actinomyces_naeslundii_str_Howell_279.xml,Actinomyces_naeslundii_str_Howell_279
3,Actinomyces_odontolyticus_ATCC_17982.xml,Actinomyces_odontolyticus_ATCC_17982
4,Actinomyces_oris_K20.xml,Actinomyces_oris_K20


In [23]:
xml_df['ncbiid'] = xml_df['strain_name'].apply(get_ncbiid)

In [25]:
xml_df.head()

Unnamed: 0,XML File,strain_name,ncbiid
0,Acidaminococcus_fermentans_DSM_20731.xml,Acidaminococcus_fermentans_DSM_20731,591001
1,Acidaminococcus_intestini_RyC_MR95.xml,Acidaminococcus_intestini_RyC_MR95,568816
2,Actinomyces_naeslundii_str_Howell_279.xml,Actinomyces_naeslundii_str_Howell_279,1115803
3,Actinomyces_odontolyticus_ATCC_17982.xml,Actinomyces_odontolyticus_ATCC_17982,411466
4,Actinomyces_oris_K20.xml,Actinomyces_oris_K20,871541


In [26]:
path = "/home/aac/Mohammad/GNN/MetaBiomeX-main/data/strain_with_NCBIids.csv"
xml_df.to_csv(path, index=False)

# Fetch Pr. seqs

In [1]:
import pandas as pd
xml_df= pd.read_csv("/home/aac/Mohammad/GNN/MetaBiomeX-main/data/strain_with_NCBIids.csv")

In [2]:
ncbi_ids = xml_df['ncbiid']#.dropna().unique()

In [6]:
fetched_files = {}
from tqdm import tqdm
for taxon_id in tqdm(ncbi_ids[130:140]):
    # print(f"fetching: {taxon_id}")
    try:
        file_path = fetch_pr(taxon_id=int(taxon_id), reviewed=False)
        if file_path:
            fetched_files[taxon_id] = file_path
            # print('done')
    except Exception as e:
        fetched_files[taxon_id] = f"Error: {str(e)}"
        print('failed')



  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 10/10 [01:01<00:00,  6.13s/it]


In [3]:
ncbi_ids[130:140]

130    718252
131    411485
132    657322
133    546269
134    334413
135    411475
136    469605
137    469615
138    469616
139    469603
Name: ncbiid, dtype: int64

## Number of Downloaded sequences

In [17]:
import os
import glob
from Bio import SeqIO
from tqdm import tqdm
import multiprocessing as mp

empty_fasta = []
def process_fasta(file_path):
    """
    Process a single FASTA file:
    - Count sequences
    - Determine the min and max sequence lengths in the file
    """
    total = 0
    min_length = float('inf')
    max_length = 0
    
    # Parse each record in the FASTA file
    for record in SeqIO.parse(file_path, "fasta"):
        seq_len = len(record.seq)
        total += 1
        if seq_len < min_length:
            min_length = seq_len
        if seq_len > max_length:
            max_length = seq_len
            
    # Handle files that might be empty
    if total == 0:
        min_length = 0
        
    return total, min_length, max_length,file_path

if __name__ == "__main__":
    # Specify the directory containing the FASTA files
    directory = "/home/aac/Mohammad/GNN/MetaBiomeX-main/data/fastas"  # change this to your directory path
    # Gather all FASTA files (adjust the pattern if your file extension differs)
    fasta_files = glob.glob(os.path.join(directory, "*.fasta"))
    
    total_sequences = 0
    overall_min = float('inf')
    overall_max = 0

    # Use a multiprocessing Pool to process the FASTA files concurrently
    with mp.Pool() as pool:
        # Use tqdm to track progress of the multiprocessing jobs
        results = list(tqdm(pool.imap_unordered(process_fasta, fasta_files),
                            total=len(fasta_files),
                            desc="Processing FASTA files"))
    
    # Combine the results from each file
    for total, min_length, max_length,path in results:
        if total ==0:
            empty_fasta.append(path)
        total_sequences += total
        overall_min = min(overall_min, min_length)
        overall_max = max(overall_max, max_length)
    
    # If no sequences were found overall, adjust the minimum length
    if overall_min == float('inf'):
        overall_min = 0

    print("Total sequences:", total_sequences)
    print("Min sequence length:", overall_min)
    print("Max sequence length:", overall_max)


Processing FASTA files: 100%|██████████| 286/286 [00:00<00:00, 2091.02it/s]


Total sequences: 504004
Min sequence length: 0
Max sequence length: 9769


In [18]:
empty_ids = ([int(w.replace('.fasta','').split('_')[-1]) for w in empty_fasta])
len(empty_ids)

50

# Redownload empty ones

In [16]:
fetched_files = {}
from tqdm import tqdm
for taxon_id in tqdm(empty_ids):
    # print(f"fetching: {taxon_id}")
    try:
        file_path = fetch_pr(taxon_id=int(taxon_id), reviewed=False)
        if file_path:
            fetched_files[taxon_id] = file_path
            # print('done')
    except Exception as e:
        fetched_files[taxon_id] = f"Error: {str(e)}"
        print('failed')

100%|██████████| 50/50 [02:36<00:00,  3.12s/it]


In [19]:
empty_ids

[469617,
 273526,
 440497,
 702444,
 1235815,
 435837,
 888048,
 457424,
 563194,
 575595,
 1121102,
 1215915,
 1400136,
 742820,
 1328388,
 411901,
 1262650,
 525337,
 1328337,
 469599,
 888727,
 702443,
 1322347,
 411487,
 469602,
 536233,
 649757,
 1353979,
 663952,
 1121342,
 469618,
 657324,
 1035196,
 1122982,
 525279,
 871541,
 1440052,
 1316931,
 469616,
 469603,
 883166,
 657316,
 1121445,
 944562,
 1203540,
 562982,
 1437595,
 1074451,
 469615,
 1384484]