Import the needed libraries

In [1]:
from Bio.PDB import PDBParser, is_aa, Polypeptide
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Align.Applications import ClustalOmegaCommandline
from Bio import SeqIO
import os
import subprocess

Define a function that will preprocess the PDB files

In [2]:
def preprocess_sequence(pdb_files):
    sequence_records = []
    sequences_seen = set()
    
    # Count the number of missing atoms and non-standard residues 
    non_standard_residue_count = 0
    missing_atom_count = 0
    
    # Loop through all the PDB files
    for pdb_file in pdb_files:
        
        # Parse the PDB files 
        structure = PDBParser(QUIET=True).get_structure(pdb_file, f"PDB Data/{pdb_file}.pdb")

        for model in structure:
            for chain in model:
                sequence = ""
                for residue in chain:

                    # Check if residue is not a standard amino acid
                    if not is_aa(residue):
                        non_standard_residue_count += 1
                        continue

                    # Check if residue is missing atoms  
                    if residue.is_disordered():
                        missing_atom_count += 1
                        continue

                    # Convert to one-letter code
                    try:
                        sequence += Polypeptide.three_to_one(residue.get_resname())
                    except KeyError:
                        continue
                        
                # Add correctly formed sequences
                if sequence and sequence not in sequences_seen:
                    sequence_records.append(SeqRecord(Seq(sequence), id=f"{pdb_file}_{chain.id}",
                                                      description=f"Source File: {pdb_file}, Chain: {chain.id}"))
                    sequences_seen.add(sequence)

    print(f"Found {non_standard_residue_count} non-standard residues")
    print(f"Found {missing_atom_count} missing atoms")

    return sequence_records

Perform the preprocessing on the PDB Data

In [3]:
pdb_files_with_extension = os.listdir("PDB Data")
pdb_files = [file[:-4] for file in pdb_files_with_extension if file.endswith(".pdb")] 
sequences = preprocess_sequence(pdb_files)

Found 1343635 non-standard residues
Found 7206 missing atoms


Write the sequences to a FASTA file

In [4]:
SeqIO.write(sequences, "FASTA Data/Sequences.fasta", "fasta")

7753

Align the sequences using Clustal Omega

In [2]:
in_file = "FASTA Data/Sequences.fasta"
out_file = "FASTA Data/Aligned_Sequences.fasta"
clustal_cline = ClustalOmegaCommandline(infile=in_file, outfile=out_file, verbose=True, auto=True)

try:
    subprocess.run(str(clustal_cline), check=True, shell=True)
except subprocess.CalledProcessError as e:
    print(f"Error: {e}, {e.output}")

Using 8 threads
Read 7753 sequences (type: Protein) from FASTAData/Sequences.fasta
Setting options automatically based on input sequence characteristics (might overwrite some of your options).
Using 166 seeds (chosen with constant stride from length sorted seqs) for mBed (from a total of 7753 sequences)
Calculating pairwise ktuple-distances...
Ktuple-distance calculation progress done. CPU time: 189.17u 0.51s 00:03:09.67 Elapsed: 00:01:23
mBed created 248 cluster/s (with a minimum of 1 and a soft maximum of 100 sequences each)
Distance calculation within sub-clusters done. CPU time: 52.48u 0.07s 00:00:52.55 Elapsed: 00:00:22
Guide-tree computation (mBed) done.
Progressive alignment progress done. CPU time: 60241.44u 214.81s 16:47:36.25 Elapsed: 03:16:31
Alignment written to FASTAData/Aligned_Sequences.fasta
