In [9]:
# Import the needed libraries
from Bio.PDB import PDBParser, is_aa, Polypeptide
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Align.Applications import ClustalOmegaCommandline
from Bio import SeqIO
import os
import subprocess

In [10]:
# Define a function that will preprocess the PDB file
def preprocess_sequence(pdb_files, pdb_dir="PDBData"):
    """Build de-duplicated per-chain sequence records from PDB structure files.

    For every name in ``pdb_files`` the file ``<pdb_dir>/<name>.pdb`` is parsed
    and each chain of each model is turned into a one-letter amino-acid string.
    A chain is kept only when its string is non-empty and has not already been
    produced by an earlier chain (across all files), so the result contains one
    record per distinct sequence.

    A residue is left out of its chain's sequence when it

    * is not one of the standard amino acids (waters, ligands, modified
      residues, ...), or
    * is flagged by ``residue.is_disordered()``.

    Both kinds of omission are tallied and printed after all files have been
    processed. Every model is visited, so a multi-model file contributes to
    the tallies once per model.

    Args:
        pdb_files: Iterable of PDB file names without the ``.pdb`` extension.
        pdb_dir: Directory containing the ``.pdb`` files. Defaults to
            ``"PDBData"``.

    Returns:
        list: One ``SeqRecord`` per unique sequence, with id
        ``"<name>_<chain id>"`` and a description naming the source file and
        chain.
    """
    sequence_records = []
    sequences_seen = set()
    non_standard_residue_count = 0
    missing_atom_count = 0

    # A single parser instance is reused for every file.
    parser = PDBParser()

    for pdb_file in pdb_files:
        pdb_path = os.path.join(pdb_dir, f"{pdb_file}.pdb")
        structure = parser.get_structure(pdb_file, pdb_path)

        for model in structure:
            for chain in model:
                letters = []
                for residue in chain:

                    # standard=True limits the check to the standard amino
                    # acids, so modified residues are counted here instead of
                    # being dropped uncounted by the KeyError handler below.
                    if not is_aa(residue, standard=True):
                        non_standard_residue_count += 1
                        continue

                    # NOTE(review): per the Bio.PDB docs is_disordered() flags
                    # residues containing disordered (alternate-location)
                    # atoms, not residues with missing atoms. The counter name
                    # and printed label are kept as they were; confirm that
                    # this is the filter actually wanted.
                    if residue.is_disordered():
                        missing_atom_count += 1
                        continue

                    # Convert to one-letter code
                    try:
                        letters.append(Polypeptide.three_to_one(residue.get_resname()))
                    except KeyError:
                        # Defensive: a name the lookup table does not know.
                        continue

                # Add correctly formed sequences (first occurrence wins)
                sequence = "".join(letters)
                if sequence and sequence not in sequences_seen:
                    sequence_records.append(SeqRecord(Seq(sequence), id=f"{pdb_file}_{chain.id}",
                                                      description=f"Source File: {pdb_file}, Chain: {chain.id}"))
                    sequences_seen.add(sequence)

    print(f"Found {non_standard_residue_count} non-standard residues")
    print(f"Found {missing_atom_count} missing atoms")

    return sequence_records


In [11]:
# Perform preprocessing on the PDB Data
# os.listdir() returns entries in arbitrary order. Sorting makes the run
# reproducible: only the first chain seen for each distinct sequence is kept,
# so the processing order decides which ID ends up representing it.
pdb_files_with_extension = sorted(os.listdir("PDBData"))
pdb_files = [os.path.splitext(file)[0] for file in pdb_files_with_extension if file.endswith(".pdb")]
sequences = preprocess_sequence(pdb_files)



Found 97375 non-standard residues
Found 2532 missing atoms




In [12]:
# Save every collected record to disk in FASTA format; the call's return
# value (the number of records written) is what the cell displays.
SeqIO.write(sequences, handle="sequences.fasta", format="fasta")

888

In [14]:
# Align the sequences using Clustal
in_file = "sequences.fasta"
out_file = "aligned_sequences.fasta"
# force=True passes --force so Clustal Omega overwrites an existing output
# file; without it, re-running this cell fails once out_file already exists.
clustal_cline = ClustalOmegaCommandline(infile=in_file, outfile=out_file, verbose=True, auto=True, force=True)

try:
    # The command line is built from the constants above only, so running it
    # through the shell does not expose it to untrusted input.
    subprocess.run(str(clustal_cline), check=True, shell=True)
except subprocess.CalledProcessError as e:
    # Output is not captured by this call, so e.output is always None; the
    # exception text already includes the command and its exit status.
    print(f"Error: {e}")

Using 8 threads
Read 888 sequences (type: Protein) from sequences.fasta
Setting options automatically based on input sequence characteristics (might overwrite some of your options).
Auto settings: Setting iteration to 1.
Using 95 seeds (chosen with constant stride from length sorted seqs) for mBed (from a total of 888 sequences)
Calculating pairwise ktuple-distances...
Ktuple-distance calculation progress done. CPU time: 26.30u 0.08s 00:00:26.38 Elapsed: 00:00:11
mBed created 43 cluster/s (with a minimum of 1 and a soft maximum of 100 sequences each)
Distance calculation within sub-clusters done. CPU time: 6.86u 0.06s 00:00:06.92 Elapsed: 00:00:03
Guide-tree computation (mBed) done.
Progressive alignment progress done. CPU time: 867.70u 3.90s 00:14:31.60 Elapsed: 00:02:17
Iteration step 1 out of 1
Computing new guide tree (iteration step 0)
Using 95 seeds (chosen with constant stride from length sorted seqs) for mBed (from a total of 888 sequences)
Calculating pairwise aligned identity