In [2]:
import glob
import numpy
import pandas
import seaborn
import matplotlib.pyplot as plt
from tqdm import tqdm
import multiprocessing as mp
import os
from build import build_model
import re

In [3]:
from IPython.display import display, HTML, Math, Markdown
# Inject CSS so the notebook's ".container" element spans 95% of the page width.
display(HTML("<style>.container { width:95% !important; }</style>"))

# Enable IPython's autoreload extension in mode 2 so edits to imported modules
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [4]:
import Bio
from Bio.SeqIO import parse,SeqRecord
from Bio.Seq import Seq
def load_genbank(org):
    """Parse the GenBank file for ``org`` and return its records as a list.

    The file path is built by plain string concatenation of the module-level
    ``directory``, ``org`` and the ``".gb"`` suffix, so ``directory`` must
    already be defined (and end with a path separator) when this is called.
    """
    genbank_path = directory + org + ".gb"
    return list(parse(genbank_path, "gb"))

def extract_genes(genbank_records, output_fasta, org):
    """Write every locus-tagged feature of ``genbank_records`` to a FASTA file.

    Args:
        genbank_records: iterable of parsed records; each must expose
            ``.features`` and ``.seq``.
        output_fasta: path of the FASTA file to create (overwritten if present).
        org: organism name stored for every locus tag that is written.

    Side effects:
        Updates the module-level ``gene_to_genome`` dict with
        ``locus_tag -> org`` for each feature written.

    Note:
        The filter is "not a ``source`` feature and has a ``locus_tag``
        qualifier" -- it is not restricted to ``gene`` features. A locus that
        is annotated by several feature types is therefore written once per
        feature, all under the same FASTA header.
    """
    with open(output_fasta, "w") as fasta_file:
        for record in genbank_records:
            # Lazily select the features that carry a usable locus tag.
            tagged_features = (
                feat
                for feat in record.features
                if feat.type != "source" and "locus_tag" in feat.qualifiers
            )
            for feature in tagged_features:
                # Nucleotide sequence of the feature, cut out of the record.
                gene_sequence = feature.extract(record.seq)
                locus_tag = feature.qualifiers["locus_tag"][0]
                fasta_file.write(f">{locus_tag}\n{gene_sequence}\n")
                gene_to_genome[locus_tag] = org

# Example usage: extract_genes takes parsed GenBank records, an output FASTA path
# and the organism name (replace 'output.fasta' with your own path):
# extract_genes(load_genbank(org), 'output.fasta', org)

In [5]:
# Tab-separated taxonomy table; its first column becomes the row index.
taxonomy = pandas.read_table("taxonomy.txt", index_col=0)

In [6]:
# Header-less file; the values of its first column form the set of names to keep.
survivors = set(
    pandas.read_csv("survivors.txt", header=None, index_col=0).index.tolist()
)

In [11]:
# Directory holding one GenBank file per organism, named "<org>.gb".
# load_genbank() reads this module-level name, so it must keep this name and
# its trailing slash.
directory = "./agora-models/genbank-mixed-ncbi-agora-sources/"
annotated_dir = "./ibdmdb/annotated_genomes/"
whole_dir = "./ibdmdb/whole_genomes/"

# Make sure the output locations exist so the first write cannot fail.
os.makedirs(annotated_dir, exist_ok=True)
os.makedirs(whole_dir, exist_ok=True)

lengths = {}         # str(NCBI Taxonomy ID) -> concatenated genome length
gene_to_genome = {}  # locus_tag -> organism name, filled in by extract_genes()
for filename in tqdm(os.listdir(directory)):
    # Match the real extension: a substring test would also accept names
    # such as "x.gbk" or "x.gb.bak" and then look for a non-existent "x.gb".
    if not filename.endswith(".gb"):
        continue
    org = filename[:-len(".gb")]
    if org not in survivors or org not in taxonomy.index:
        continue

    gbfile = load_genbank(org)
    # Single (row, column) lookup: avoids materialising the whole row, which
    # can upcast the ID and make the key disagree with later str() lookups.
    tax_id = str(taxonomy.loc[org, "NCBI Taxonomy ID"])
    # NOTE: output files are named after the taxonomy ID and opened in "w"
    # mode, so two organisms sharing an ID overwrite each other's files.
    extract_genes(gbfile, annotated_dir + tax_id + ".fna", org)

    # Concatenate all records of the organism into one pseudo-contig.
    seq = "".join(str(record.seq) for record in gbfile)
    lengths[tax_id] = len(seq)
    whole_genome = SeqRecord(seq=Seq(seq), id=tax_id, description=tax_id)
    with open(whole_dir + tax_id + ".fna", "w") as outfile:
        Bio.SeqIO.write(whole_genome, outfile, "fasta")

    # Prefix the in-memory record ids with the organism name. Nothing in this
    # cell writes the records again, so this only affects later uses of
    # `gbfile`.
    for record in gbfile:
        record.id = "{}|{}".format(org, record.id)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 757/757 [03:43<00:00,  3.39it/s]


In [12]:
# Persist the locus_tag -> organism mapping as a one-column ("genome") CSV
# whose row index holds the keys of gene_to_genome.
pandas.DataFrame.from_dict(
    {"genome": gene_to_genome},
    orient="columns",
).to_csv(path_or_buf="ibdmdb/gene_to_genome.csv")

In [69]:
# Build the metadata table from the taxonomy table: the index is moved back
# into a column, then three derived columns are appended in order.
# NOTE(review): the `lengths` lookup raises KeyError for any taxonomy row whose
# ID was not recorded in `lengths` -- confirm every row has a processed genome.
metadata = taxonomy.reset_index().assign(
    **{
        "#genome": lambda frame: frame["NCBI Taxonomy ID"],
        "unique_name": lambda frame: frame["MicrobeID"],
        "total_length": lambda frame: [
            lengths[str(tax_id)] for tax_id in frame["#genome"]
        ],
    }
)

In [70]:
# Save the metadata table as tab-separated text (the row index is written too).
metadata.to_csv(path_or_buf="ibdmdb/custom_metadata.tsv", sep="\t")