In [8]:
# This script generates GenBank files that serve as input to the domain visualization tool CLINKER. It iterates through each cluster and, for every genome
# in that cluster, creates a file named <genome identifier>_cluster_<cluster number>_domains_non_core.gb, e.g. NC_054935.2_cluster_1_domains_non_core.gb

# Finally, all the created GenBank files are moved into directories and subdirectories at a specific location on the computer.

In [9]:
# All the necessary imports to make the script work
import os
import shutil
import pandas as pd
from Bio import Entrez, SeqIO
from Bio.SeqFeature import SeqFeature, FeatureLocation
from Bio.SeqRecord import SeqRecord

In [10]:
# NCBI asks every Entrez client to identify itself with a contact email; an API key is optional.
# Both can be supplied through the NCBI_EMAIL / NCBI_API_KEY environment variables so that personal
# details and secrets do not have to be edited into the notebook. Without them the original email is used.
Entrez.email = os.environ.get("NCBI_EMAIL", "maksnecki@gmail.com")
if os.environ.get("NCBI_API_KEY"):
    Entrez.api_key = os.environ["NCBI_API_KEY"]

# Load data from a CSV file; this file is a table produced by the script “maniak_analiza.ipynb”, in which each row describes a domain found in a genome.
# The location defaults to the original path and can be overridden with the DOMAIN_DATA_CSV environment variable.
data_csv_path = os.environ.get("DOMAIN_DATA_CSV", 'C:/Users/maksn/Downloads/data_non_core.csv')
data = pd.read_csv(data_csv_path, sep=",", header=0)


# A function that retrieves the GenBank record (nucleotide sequence and annotations) for a given genome from the NCBI nucleotide database
def fetch_genome_sequence(genome_id):
    """Download and parse the GenBank record for ``genome_id``.

    Parameters
    ----------
    genome_id
        Identifier passed to ``Entrez.efetch`` as ``id`` (the notebook uses
        accessions such as ``NC_054935.2``).

    Returns
    -------
    The single record parsed by ``SeqIO.read`` from the returned GenBank text.

    Raises
    ------
    Any exception raised by ``Entrez.efetch`` or ``SeqIO.read`` propagates to
    the caller; the network handle is closed in either case.
    """
    handle = Entrez.efetch(db="nucleotide", id=genome_id, rettype="gb", retmode="text")
    try:
        # SeqIO.read can fail on an unexpected response; the finally clause
        # guarantees the connection is released even then.
        return SeqIO.read(handle, "genbank")
    finally:
        handle.close()


In [12]:
# Collect every distinct value of the '__mclCluster' column
unique_clusters = pd.unique(data['__mclCluster'])

# Order the collected cluster numbers in place (ascending)
unique_clusters.sort()

In [None]:
# The following code snippet retrieves the GenBank record for a given genome identifier from the RefSeq database.
# The ORF coordinates are then replaced with ECOD domain coordinates, the resulting file is saved, and this is repeated for every cluster.

In [14]:
# Iteration through cluster numbers.
# Each genome is downloaded from NCBI at most once: a genome that occurs in several clusters reuses the
# record fetched for an earlier cluster instead of triggering another network request.
downloaded_genomes = {}

for cluster_number in unique_clusters:

    # Filter data for a given cluster
    cluster_data = data[data['__mclCluster'] == cluster_number]

    # Iterate through the unique genome_id in the cluster
    for genome_id in cluster_data['genome_id'].unique():

        # Download the genome sequence record from NCBI RefSeq unless it was already fetched
        genome_record = downloaded_genomes.get(genome_id)
        if genome_record is None:
            try:
                genome_record = fetch_genome_sequence(genome_id)
            except Exception as e:
                # Best effort: report the failure and continue with the next genome.
                # Nothing is cached here, so a later cluster containing this genome retries the download.
                print(f"Failed to download the sequence for {genome_id}: {e}")
                continue
            downloaded_genomes[genome_id] = genome_record

        # Filter the data for a given genome_id in the cluster
        genome_data = cluster_data[cluster_data['genome_id'] == genome_id]

        # Create a fresh SeqRecord from the downloaded sequence; the original annotations are deliberately
        # not copied, and a new record per cluster keeps features from accumulating on the cached download
        record = SeqRecord(genome_record.seq, id=genome_record.id, name=genome_record.name, description=genome_record.description)

        # Add annotation on the type of sequence (the GenBank writer needs a molecule type)
        record.annotations["molecule_type"] = "DNA"

        # Add domains and their parameters as features to the record
        # NOTE(review): FeatureLocation uses 0-based, end-exclusive coordinates - confirm that
        # 'domain_start'/'domain_end' in the CSV follow this convention.
        for _, row in genome_data.iterrows():
            feature = SeqFeature(
                location=FeatureLocation(start=row['domain_start'], end=row['domain_end'], strand=row['strand']),
                type="CDS",
                qualifiers={
                    "domain_id": row['t_id'],
                    "domain_name": row['t_name'],
                    "domain_length": str(row['domain_length']),
                    "function_of_protein": str(row['func_annot']),
                    "category_of_function": str(row['func_category'])
                }
            )
            record.features.append(feature)

        # Save the record to a GenBank file (relative path, i.e. in the current working directory)
        output_file = f"{genome_id}_cluster_{cluster_number}_domains_non_core.gb"
        with open(output_file, "w") as output_handle:
            SeqIO.write(record, output_handle, "genbank")

        print(f"GenBank file saved as: {output_file}")

GenBank file saved as: NC_054935.2_cluster_1_domains_non_core.gb
GenBank file saved as: NC_054936.1_cluster_1_domains_non_core.gb
GenBank file saved as: NC_054908.1_cluster_1_domains_non_core.gb
GenBank file saved as: NC_054907.1_cluster_1_domains_non_core.gb
GenBank file saved as: NC_028780.1_cluster_1_domains_non_core.gb
GenBank file saved as: NC_054934.1_cluster_1_domains_non_core.gb
GenBank file saved as: NC_054905.1_cluster_1_domains_non_core.gb
GenBank file saved as: NC_054906.1_cluster_1_domains_non_core.gb
GenBank file saved as: NC_054904.1_cluster_1_domains_non_core.gb
GenBank file saved as: NC_054937.1_cluster_1_domains_non_core.gb
GenBank file saved as: NC_054910.1_cluster_1_domains_non_core.gb
GenBank file saved as: NC_054911.1_cluster_1_domains_non_core.gb
GenBank file saved as: NC_054909.1_cluster_1_domains_non_core.gb
GenBank file saved as: NC_054932.1_cluster_1_domains_non_core.gb
GenBank file saved as: NC_054931.1_cluster_1_domains_non_core.gb
GenBank file saved as: NC

In [None]:
# This code snippet moves all domain GenBank files for each genome from the default work folder to the folder given by the "destination_folder" variable.
# The moved files are organized as follows:

# Main_folder_with_clusters
#          -> Cluster_1
#                -> Genome_1_for_cluster_1
#                -> Genome_2_for_cluster_1
#                -> Genome_3_for_cluster_1
#          -> Cluster_2
#                -> Genome_1_for_cluster_2
#                -> Genome_2_for_cluster_2
#          . . . . .
#          -> Cluster_last
#                -> Genome_1_for_last_cluster

In [15]:
# Path to file folder
# NOTE(review): the generation loop writes its .gb files relative to the current working directory,
# so this cell only finds them when that directory is source_folder.
source_folder = 'C:/Users/maksn/Downloads'
# Path to new main folder
destination_folder = 'C:/Users/maksn/Downloads/proj_domain_clusters_non_core'

# Make sure that the destination folder exists (exist_ok makes a separate existence check unnecessary)
os.makedirs(destination_folder, exist_ok=True)

# Iterating through the entries in the source folder
for filename in os.listdir(source_folder):
    # Checking that the name has the right format
    if '_cluster_' not in filename or '_domains_non_core' not in filename:
        continue

    # Only regular files are moved; a directory whose name happens to match is left alone
    source_file_path = os.path.join(source_folder, filename)
    if not os.path.isfile(source_file_path):
        continue

    # Extracting cluster number: the text between '_cluster_' and '_domains'
    cluster_number = filename.split('_cluster_')[1].split('_domains')[0]
    cluster_folder_name = f'cluster_{cluster_number}'

    # Make sure that the cluster folder exists
    cluster_folder_path = os.path.join(destination_folder, cluster_folder_name)
    os.makedirs(cluster_folder_path, exist_ok=True)

    # File transfer
    destination_file_path = os.path.join(cluster_folder_path, filename)
    shutil.move(source_file_path, destination_file_path)

print("The files have been transferred.")

The files have been transferred.
