In [None]:
# Import the needed libraries
from Bio.PDB import PDBParser, is_aa, Polypeptide
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Align.Applications import ClustalOmegaCommandline
from Bio import SeqIO
import os

In [None]:
# Get the total number of PDB files in the "PDB Data" folder.
# Only entries ending in ".pdb" are counted (the same filter used when the
# files are loaded later), instead of subtracting a fixed 1 from the raw
# directory listing.
files_count = sum(1 for name in os.listdir("PDB Data") if name.endswith(".pdb"))
print("The total number of PDB files is: ", files_count)

In [None]:
# Helpers and entry point that turn PDB structures into one-letter sequences
def _chain_to_sequence(chain):
    """Build the one-letter amino-acid sequence of a single chain.

    Returns a tuple ``(sequence, skipped_non_standard, skipped_disordered)``
    where the two counters report how many residues were left out of
    ``sequence`` and why.
    """
    letters = []
    skipped_non_standard = 0
    skipped_disordered = 0

    # Newer Biopython releases expose the three-letter -> one-letter table
    # directly and no longer ship three_to_one(); older releases only have
    # the function. Support both.
    lookup_table = getattr(Polypeptide, "protein_letters_3to1", None)

    for residue in chain:
        # standard=True restricts the check to the 20 standard amino acids,
        # so waters, ligands and modified residues are all counted here
        # rather than being dropped silently further down.
        if not is_aa(residue, standard=True):
            skipped_non_standard += 1
            continue

        # is_disordered() flags residues that contain disordered
        # (alternate-location) atoms; such residues are left out.
        if residue.is_disordered():
            skipped_disordered += 1
            continue

        # Convert to one-letter code
        resname = residue.get_resname()
        if lookup_table is not None:
            letter = lookup_table.get(resname)
        else:
            try:
                letter = Polypeptide.three_to_one(resname)
            except KeyError:
                letter = None

        # Defensive: skip anything the lookup does not know about
        if letter is not None:
            letters.append(letter)

    return "".join(letters), skipped_non_standard, skipped_disordered


def preprocess_sequence(pdb_files):
    """Extract one amino-acid sequence per chain from the given PDB files.

    Args:
        pdb_files: iterable of PDB base names (without the ".pdb" extension);
            each one is read from ``PDB Data/<name>.pdb``.

    Returns:
        A list of one-letter sequence strings, one per non-empty chain, in
        file / model / chain order. Every model of a structure is visited,
        so a multi-model file contributes its chains once per model.

    Side effects:
        Prints how many residues were skipped as non-standard and how many
        were skipped as disordered (the second message keeps its original
        "missing atoms" wording).
    """
    sequences = []
    non_standard_residue_count = 0
    missing_atom_count = 0

    for pdb_file in pdb_files:
        structure = PDBParser().get_structure(pdb_file, f"PDB Data/{pdb_file}.pdb")

        for model in structure:
            for chain in model:
                sequence, non_standard, disordered = _chain_to_sequence(chain)
                non_standard_residue_count += non_standard
                missing_atom_count += disordered

                # Add correctly formed (non-empty) sequences
                if sequence:
                    sequences.append(sequence)

    print(f"Found {non_standard_residue_count} non-standard residues")
    print(f"Found {missing_atom_count} missing atoms")

    return sequences

In [None]:
# Collect the base names (".pdb" extension removed) of the PDB files.
# os.listdir() returns entries in arbitrary order, so the names are sorted to
# keep the order of the extracted sequences reproducible from run to run.
pdb_files_with_extension = os.listdir("PDB Data")
pdb_files = sorted(
    file[:-4] for file in pdb_files_with_extension if file.endswith(".pdb")
)
sequences = preprocess_sequence(pdb_files)

In [None]:
# Write the sequences to a FASTA file.
# description="" keeps every header as just ">seqN"; without it SeqRecord's
# "<unknown description>" placeholder is written after each ID.
records = [
    SeqRecord(Seq(seq), id=f"seq{i}", description="")
    for i, seq in enumerate(sequences)
]
SeqIO.write(records, "sequences.fasta", "fasta")

In [None]:
# Align the sequences using Clustal Omega
in_file = "sequences.fasta"
out_file = "aligned_sequences.fasta"

# force=True allows this cell to be re-run: Clustal Omega otherwise refuses
# to overwrite an output file that already exists.
clustalomega_cline = ClustalOmegaCommandline(
    infile=in_file, outfile=out_file, verbose=True, auto=True, force=True
)

# Calling the wrapper object runs the command and raises an error when the
# program exits with a non-zero status, whereas os.system() only returned
# the status code and let a failed alignment go unnoticed.
stdout, stderr = clustalomega_cline()
print(stdout)