In [18]:
def convert_genbank_to_faa(genbank_file, output_directory):
    """Write the translated CDS features of a GenBank file to a protein FASTA file.

    The output is named ``<genome_name>.faa`` inside ``output_directory``, where
    ``genome_name`` is the input's base name up to the first ``.`` with
    everything from ``_double`` onwards removed.  Every CDS feature carrying a
    ``translation`` qualifier becomes one entry with the header
    ``><genome_name>_<protein_id> <product>``.

    Args:
        genbank_file: Path of the GenBank file to read.
        output_directory: Directory that receives the ``.faa`` file.

    Raises:
        KeyError: If a translated CDS has neither an ``ID`` nor a
            ``locus_tag`` qualifier.
    """
    # The name depends only on the path, so compute it once up front; this also
    # keeps `output_file` defined for the final message when there are no records.
    genome_name = os.path.basename(genbank_file).split('.')[0].split('_double')[0]
    output_file = os.path.join(output_directory, f"{genome_name}.faa")

    # Open the output a single time: reopening it in "w" mode for each record
    # would discard the proteins of every record but the last.
    with open(genbank_file, "r") as gb_fh, open(output_file, "w") as out_fh:
        for record in SeqIO.parse(gb_fh, "genbank"):
            for feature in record.features:
                qualifiers = feature.qualifiers
                if feature.type != "CDS" or "translation" not in qualifiers:
                    continue
                protein_seq = qualifiers["translation"][0]
                # Only look at locus_tag when ID is absent, so a CDS that has
                # an ID but no locus_tag is still exported.
                if "ID" in qualifiers:
                    protein_id = qualifiers["ID"][0]
                else:
                    protein_id = qualifiers["locus_tag"][0]
                product = qualifiers.get("product", ["Unknown"])[0]
                # Prefix the genome name so identifiers stay unique across genomes
                header = f">{genome_name}_{protein_id} {product}"
                out_fh.write(f"{header}\n")
                out_fh.write(f"{protein_seq}\n")
    print(f"Conversion complete: {genbank_file} -> {output_file}")

# GenBank inputs: one annotated genome per accession, all in the same directory
genbank_files = [
    f"/n/eddy_lab/users/lmerk/phage_groupII/data/script_output/updated_genomes/{accession}_double_annotated.gbff"
    for accession in (
        "NC_031039",
        "NC_043027",
        "MW749003",
        "MZ422438",
        "MW248466",
        "LC680885",
        "MN091626",
    )
]

# Destination for the converted .faa files; created on demand
output_directory = "./actual_genbanks/"
os.makedirs(output_directory, exist_ok=True)

# Convert every genome in turn
for genbank_file in genbank_files:
    convert_genbank_to_faa(genbank_file, output_directory)


Conversion complete: /n/eddy_lab/users/lmerk/phage_groupII/data/script_output/updated_genomes/NC_031039_double_annotated.gbff -> ./actual_genbanks/NC_031039.faa
Conversion complete: /n/eddy_lab/users/lmerk/phage_groupII/data/script_output/updated_genomes/NC_043027_double_annotated.gbff -> ./actual_genbanks/NC_043027.faa
Conversion complete: /n/eddy_lab/users/lmerk/phage_groupII/data/script_output/updated_genomes/MW749003_double_annotated.gbff -> ./actual_genbanks/MW749003.faa
Conversion complete: /n/eddy_lab/users/lmerk/phage_groupII/data/script_output/updated_genomes/MZ422438_double_annotated.gbff -> ./actual_genbanks/MZ422438.faa
Conversion complete: /n/eddy_lab/users/lmerk/phage_groupII/data/script_output/updated_genomes/MW248466_double_annotated.gbff -> ./actual_genbanks/MW248466.faa
Conversion complete: /n/eddy_lab/users/lmerk/phage_groupII/data/script_output/updated_genomes/LC680885_double_annotated.gbff -> ./actual_genbanks/LC680885.faa
Conversion complete: /n/eddy_lab/users/lme

In [15]:
import os
import subprocess

def crop_genbank_file(target_name, start, end, output_path):
    """Crop a region from the GenBank file in the working directory named after a target.

    The first regular file (in sorted order) in the current directory whose
    name starts with ``target_name`` is passed to ``Genbank_slicer.py`` with
    ``-s start-1`` and ``-e end+1``; the result is written to
    ``<output_path>/<target_name>.gb``.

    Args:
        target_name: File-name prefix identifying the genome to crop.
        start: Start coordinate of the region; the slicer receives ``start - 1``.
        end: End coordinate of the region; the slicer receives ``end + 1``.
        output_path: Directory for the cropped file; created if missing.

    Returns:
        None. A message is printed when no file matches or the slicer exits
        with a non-zero status.
    """
    # Sort the listing so the same file is picked on every run when several
    # names share the prefix, and skip directories, which cannot be inputs.
    genbank_file = next(
        (
            name
            for name in sorted(os.listdir("."))
            if name.startswith(target_name) and os.path.isfile(os.path.join(".", name))
        ),
        None,
    )
    if genbank_file is None:
        print(f"No GenBank file found matching the target name '{target_name}'.")
        return
    print(genbank_file)

    # Construct the paths for the input GenBank file and the output cropped file
    genbank_file_path = os.path.join(".", genbank_file)
    output_file_path = os.path.join(output_path, f"{target_name}.gb")
    if output_path:
        os.makedirs(output_path, exist_ok=True)

    # Pass the arguments as a list (no shell) so file names containing spaces
    # or shell metacharacters reach the slicer unchanged.
    cmd = [
        "python", "Genbank_slicer.py",
        "-g", genbank_file_path,
        "-s", str(start - 1),
        "-e", str(end + 1),
        "-o", output_file_path,
    ]

    # Run the command and surface failures instead of ignoring them
    result = subprocess.run(cmd)
    if result.returncode != 0:
        print(f"Genbank_slicer.py exited with status {result.returncode} for '{genbank_file}'.")

# Example usage: crop NC_031039 into ./cropped_new/
crop_genbank_file(
    target_name="NC_031039",
    start=118211,
    end=123199,
    output_path="./cropped_new/",
)


NC_031039_double_and_intron_and_defense.gbff


In [16]:
# Example usage: crop MW749003 into ./cropped_new/
crop_genbank_file(
    target_name="MW749003",
    start=22783,
    end=26778,
    output_path="./cropped_new/",
)


MW749003_double_and_intron_and_defense.gbff


In [19]:
# Crop MZ422438 into ./cropped_new/
crop_genbank_file(
    target_name="MZ422438",
    start=149389,
    end=150900,
    output_path="./cropped_new/",
)

MZ422438_double_and_intron_and_defense.gbff


In [23]:
# Slice ./MW248466.gb directly, with the start lowered by 1 and the end raised
# by 1. The arguments are passed as a list so no shell parses the command.
cmd = [
    "python", "Genbank_slicer.py",
    "-g", "./MW248466.gb",
    "-s", str(110055 - 1),
    "-e", str(113191 + 1),
    "-o", "./cropped_new/MW248466.gb",
]

# Run the command
subprocess.run(cmd)


CompletedProcess(args='python Genbank_slicer.py -g ./MW248466.gb -s 110054 -e 113192 -o ./cropped_new/MW248466.gb', returncode=0)

In [24]:
# Slice ./MW248466.gb directly, with the start lowered by 1 and the end raised
# by 1. The arguments are passed as a list so no shell parses the command.
cmd = [
    "python", "Genbank_slicer.py",
    "-g", "./MW248466.gb",
    "-s", str(236746 - 1),
    "-e", str(240125 + 1),
    "-o", "./cropped_rnapol/MW248466.gb",
]

# Run the command
subprocess.run(cmd)

CompletedProcess(args='python Genbank_slicer.py -g ./MW248466.gb -s 236745 -e 240126 -o ./cropped_rnapol/MW248466.gb', returncode=0)

In [29]:
# Slice ./ar9_with_intron.gb directly and store it as NC_031039.gb, with the
# start lowered by 1 and the end raised by 1. The arguments are passed as a
# list so no shell parses the command.
cmd = [
    "python", "Genbank_slicer.py",
    "-g", "./ar9_with_intron.gb",
    "-s", str(62476 - 1),
    "-e", str(65571 + 1),
    "-o", "./cropped_rnapol/NC_031039.gb",
]

# Run the command
subprocess.run(cmd)

CompletedProcess(args='python Genbank_slicer.py -g ./ar9_with_intron.gb -s 62475 -e 65572 -o ./cropped_rnapol/NC_031039.gb', returncode=0)

In [27]:
# Crop MW749003 into ./cropped_rnapol/
crop_genbank_file(
    target_name="MW749003",
    start=218305,
    end=221221,
    output_path="./cropped_rnapol/",
)

MW749003_double_and_intron_and_defense.gbff


In [28]:
# Crop MN091626 into ./cropped_rnapol/
crop_genbank_file(
    target_name="MN091626",
    start=48427,
    end=51053,
    output_path="./cropped_rnapol/",
)

MN091626_double_and_intron_and_defense.gbff


In [30]:
# Crop LC680885 into ./cropped_rnapol/
crop_genbank_file(
    target_name="LC680885",
    start=189934,
    end=193894,
    output_path="./cropped_rnapol/",
)

LC680885_double_and_intron_and_defense.gbff


In [31]:
# Crop MZ422438 into ./cropped_rnapol/
crop_genbank_file(
    target_name="MZ422438",
    start=162807,
    end=166178,
    output_path="./cropped_rnapol/",
)

MZ422438_double_and_intron_and_defense.gbff
