In [1]:
from Bio import SeqIO
from Bio.Seq import Seq

In [9]:
def validate_and_convert_fasta(fasta_file, verbose=True):
    """Translate every DNA record of a FASTA file into a protein string.

    Each record is read with ``SeqIO.parse(fasta_file, "fasta")`` and checked
    before translation: it must be non-empty, its length must be divisible
    by 3, and it may contain only the bases A, T, G and C (case-insensitive).
    Records that pass are translated with ``Seq.translate()`` using its
    default arguments. Trailing ``*`` stop symbols are stripped from the
    result; ``*`` symbols inside the protein are kept as they are.

    Args:
        fasta_file: Passed unchanged to ``SeqIO.parse``.
        verbose: When true (the default), print each record id as it is
            processed. Pass ``False`` to keep large files from flooding the
            cell output.

    Returns:
        A ``(protein_sequences, invalid_sequences)`` tuple of dicts keyed by
        record id. ``protein_sequences`` maps to the translated protein
        string; ``invalid_sequences`` maps to a short message saying why the
        record was rejected. A later record with the same id overwrites the
        earlier entry in the dict it lands in.
    """
    valid_bases = frozenset("ATGC")
    protein_sequences = {}
    invalid_sequences = {}

    for record in SeqIO.parse(fasta_file, "fasta"):
        sequence_id = record.id
        if verbose:
            print("sequence_id: ", sequence_id)

        # Normalise case once so validation and translation see the same text.
        dna_sequence = str(record.seq).upper()

        # An empty record would otherwise pass both checks below and be
        # reported as a valid, empty protein.
        if not dna_sequence:
            invalid_sequences[sequence_id] = "Empty sequence"
            continue

        # The length must be a whole number of codons (divisible by 3).
        if len(dna_sequence) % 3 != 0:
            invalid_sequences[sequence_id] = "Length not divisible by 3"
            continue

        # Only the four unambiguous bases are accepted.
        if not valid_bases.issuperset(dna_sequence):
            invalid_sequences[sequence_id] = "Contains invalid bases"
            continue

        try:
            protein = str(Seq(dna_sequence).translate())
        except Exception as e:
            # Defensive: record whatever the translation reported instead of
            # aborting the whole file.
            invalid_sequences[sequence_id] = str(e)
        else:
            protein_sequences[sequence_id] = protein.rstrip("*")

    return protein_sequences, invalid_sequences

In [10]:
# Input: the Dataset-1_temperate FASTA file; the relative path is resolved
# against the notebook's current working directory.
fasta_file = "../../data/dee_phage_data/data/data/training/training/Dataset-1_temperate.fasta"
# valid_proteins: record id -> translated protein string.
# invalid_proteins: record id -> reason the record was rejected.
valid_proteins, invalid_proteins = validate_and_convert_fasta(fasta_file)

sequence_id:  Temp_gi|149|ref|NC_013055|
sequence_id:  Temp_gi|56|ref|NC_011976|
sequence_id:  Temp_gi|133|ref|NC_011613|
sequence_id:  Temp_gi|130|ref|NC_011611|
sequence_id:  Temp_gi|129|ref|NC_011357|
sequence_id:  Temp_gi|84|ref|NC_013059|
sequence_id:  Temp_gi|61|ref|NC_006949|
sequence_id:  Temp_gi|163|ref|NC_005056|
sequence_id:  Temp_gi|62|ref|NC_003356|
sequence_id:  Temp_gi|22|ref|NC_003288|
sequence_id:  Temp_gi|202|ref|NC_000929|
sequence_id:  Temp_gi|140|ref|NC_001895|
sequence_id:  Temp_gi|185|ref|NC_001609|
sequence_id:  Temp_gi|155|ref|AF503408|
sequence_id:  Temp_gi|115|ref|NC_001697|
sequence_id:  Temp_gi|184|ref|NC_005345|
sequence_id:  Temp_gi|79|ref|NC_005178|
sequence_id:  Temp_gi|122|ref|NC_001416|
sequence_id:  Temp_gi|223|ref|NC_001978|
sequence_id:  Temp_gi|52|ref|NC_000924|
sequence_id:  Temp_gi|69|ref|NC_002166|
sequence_id:  Temp_gi|64|ref|NC_002371|
sequence_id:  Temp_gi|67|ref|NC_005344|
sequence_id:  Temp_gi|112|ref|NC_002185|
sequence_id:  Temp_gi|89|re

In [14]:
# Show a preview (first 50 residues) of every translated protein. The ellipsis
# is appended only when the protein was actually cut off, so a short protein
# is not displayed as if it had been truncated.
print("Successfully converted sequences:")
for seq_id, protein in valid_proteins.items():
    preview = protein if len(protein) <= 50 else f"{protein[:50]}..."
    print(f"{seq_id}: {preview}")

Successfully converted sequences:
Temp_gi|130|ref|NC_011611|: CGEVTVLTHAKRGMDRLEPL*H*GSSRRGAVRVFGKYRSGLSKE*KILIV...
Temp_gi|202|ref|NC_000929|: CIDSLEVRKKPGGHWIIRDLMGLDLVGLASL*CKF*SLINETRKIVKNCF...
Temp_gi|115|ref|NC_001697|: SFPTEFTLKICVFSVRFQNKN*SNLLK*KRLLMQLNRDLNCKNFNLFHHF...
Temp_gi|79|ref|NC_005178|: CCGEVTVLTHAKRGRDRLEPL*H*DSSQTGSVRAFGKYRSGLSKDCKILI...
Temp_gi|64|ref|NC_002371|: VKYSIAN*T*QLWFPLQHQYRIRYYQIFSTHFQV*PLCHIVCNFWKTHFF...
Temp_gi|112|ref|NC_002185|: LYIVCFTNFQAA***NQKNFKKNQENC*H*I*FKL*YVCKLVREEEQNDR...
Temp_gi|89|ref|NC_003050|: ADEL*RYKATP*IINIFKGFCCVMKSKRGNQGAELKNSI*FIDQFILHIF...
Temp_gi|86|ref|NC_005822|: HEELHSKGYICFR*WVSS*FLMICCIFVVRKL*LQ*QLNYFILIFIF*LF...
Temp_gi|68|ref|NC_005340|: GVAGKALRASGGA*Y*KLSSERVVMALSWSLSVR**SGVVVRVWRV*GVM...
Temp_gi|119|ref|NC_000896|: EQ*IKPCSNSAIGLLCVIVKLEKGQKRDSFIQNFA*LHRILSVVYG*RVN...
Temp_gi|19|ref|NC_002486|: LPMDRFEFLF*ACYYAFYKLLNCYLIPI*YRLIKYAHSLMFFISI*MRVR...
Temp_gi|53|ref|NC_002661|: TITI**YVYFKKSFNASVLKRFIKLVKNIRAKKGRI*ANLECF

In [13]:
# List every rejected record together with the reason it was rejected.
print("\nInvalid sequences:")
for seq_id in invalid_proteins:
    error = invalid_proteins[seq_id]
    print(f"{seq_id}: {error}")


Invalid sequences:
Temp_gi|149|ref|NC_013055|: Length not divisible by 3
Temp_gi|56|ref|NC_011976|: Length not divisible by 3
Temp_gi|133|ref|NC_011613|: Length not divisible by 3
Temp_gi|129|ref|NC_011357|: Length not divisible by 3
Temp_gi|84|ref|NC_013059|: Length not divisible by 3
Temp_gi|61|ref|NC_006949|: Length not divisible by 3
Temp_gi|163|ref|NC_005056|: Length not divisible by 3
Temp_gi|62|ref|NC_003356|: Length not divisible by 3
Temp_gi|22|ref|NC_003288|: Length not divisible by 3
Temp_gi|140|ref|NC_001895|: Length not divisible by 3
Temp_gi|185|ref|NC_001609|: Length not divisible by 3
Temp_gi|155|ref|AF503408|: Length not divisible by 3
Temp_gi|184|ref|NC_005345|: Length not divisible by 3
Temp_gi|122|ref|NC_001416|: Length not divisible by 3
Temp_gi|223|ref|NC_001978|: Length not divisible by 3
Temp_gi|52|ref|NC_000924|: Length not divisible by 3
Temp_gi|69|ref|NC_002166|: Length not divisible by 3
Temp_gi|67|ref|NC_005344|: Length not divisible by 3
Temp_gi|9|ref|NC_

In [None]:
import os
# NOTE(review): `trim`, `data_type` and `mode` are not defined in any cell
# visible here — confirm they are set before this cell runs, otherwise it
# fails with NameError on a fresh kernel.
# Input directory — presumably holds protein-format data per trim/data type; verify.
data_dir = f"../../data/my_data/protein_format/{trim}/{data_type}"
# Output directory — presumably where the ProtBERT embeddings are written; verify.
result_dir = f"../../data/my_data/protbert_embedding/{mode}/{data_type}"
for folder in os.listdir(os.path.join(data_dir)):