In [1]:
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.SeqFeature import SeqFeature, FeatureLocation

#Specify transposable element keywords/terms
te_keywords = ["transposase"]

#Specify the multi-GenBank file containing annotations
multigenbank_file = "/Users/xanderlee/Desktop/Honours/Database/Archive_Database/TnCentral_Database/TnCentral_Genbank_Files/TnCentral_Genbank.gbff"

#List to store relevant TE annotations
te_annotations = []

#Parse the multi-GenBank file and extract relevant TE annotations
for index, record in enumerate(SeqIO.parse(multigenbank_file, "genbank")):

    #Get the date or set default to "unknown"
    record_date = record.annotations.get("date", "unknown")

    #Get the topology or set default to "unknown"
    topology = record.annotations.get("topology", "unknown")

    #Get the taxonomy or set default to ["unknown"]
    taxonomy = record.annotations.get("taxonomy", "unknown")

    #Get the source or set default to "unknown"
    source = record.annotations.get("source", "unknown")

    #Get the data file division or set default to "unknown"
    data_file_division = record.annotations.get("data_file_division", "unknown")

    for feature in record.features:
        if feature.type == "CDS" and any(kw in feature.qualifiers.get("product", [""])[0].lower() for kw in te_keywords):
            #Extract the feature's sequence and create a new SeqRecord
            feature_seq = feature.extract(record)

            #Extract accession number
            accession = record.id.split(".")[0]

            #Get the 'product' information from the CDS feature
            product = feature.qualifiers.get("product", "unknown")[0]

            #Extract the 'start' and 'end' positions from the CDS feature
            start = feature.location.start
            end = feature.location.end

            #Check if the feature is on the complement strand
            if feature.location.strand == -1:
                #Create a complement SeqFeature for the CDS
                cds_feature = SeqFeature(
                    FeatureLocation(start=start, end=end),
                    type="CDS",
                    #Set strand to -1 (complement)
                    strand=-1,  
                    qualifiers={"product": product}
                )
            else:
                #Create a SeqFeature for the CDS
                cds_feature = SeqFeature(
                    FeatureLocation(start=start, end=end),
                    type="CDS",
                    qualifiers={"product": product}
                )

            #Create new record for each genbank entry
            new_record = SeqRecord(
                seq = feature_seq.seq,
                id = accession,
                description = product,
                annotations = {
                    "molecule_type": "DNA",
                    "accession": accession,
                    "date": record_date,
                    "topology": topology,
                    "taxonomy": taxonomy,
                    "source": source,
                    "data_file_division": data_file_division,
                }
            )

            #CDS feature to the SeqRecord
            new_record.features.append(cds_feature)
            te_annotations.append(new_record)

#Write the TE annotations to a new GenBank file
with open("/Users/xanderlee/Desktop/Honours/Database/updated_TnCentral_annotation_genbank.gbff", "w") as output_file:
    SeqIO.write(te_annotations, output_file, "genbank")


'LOCUS       Exported                8525 bp DNA     linear   BCT 01-JUN-2022\n'
Found locus 'Exported' size '8525' residue_type 'DNA'
Some fields may be wrong.
'LOCUS       Exported                6418 bp DNA     linear   BCT 01-JUN-2022\n'
Found locus 'Exported' size '6418' residue_type 'DNA'
Some fields may be wrong.
'LOCUS       Exported                8107 bp DNA     linear   BCT 08-MAY-2021\n'
Found locus 'Exported' size '8107' residue_type 'DNA'
Some fields may be wrong.
'LOCUS       Exported                8781 bp DNA     linear   BCT 08-MAY-2021\n'
Found locus 'Exported' size '8781' residue_type 'DNA'
Some fields may be wrong.
'LOCUS       Exported               11000 bp DNA     linear   BCT 01-JUN-2022\n'
Found locus 'Exported' size '11000' residue_type 'DNA'
Some fields may be wrong.
'LOCUS       Exported               17336 bp DNA     linear   BCT 01-JUN-2022\n'
Found locus 'Exported' size '17336' residue_type 'DNA'
Some fields may be wrong.
'LOCUS       Exported           

ValueError: Problem with 'misc_feature' feature:
1..441
/note="identical to sequence adjacent to 3'-CS of In3