In [18]:
import os
import pandas as pd

# Function to read CSV files from each folder
def read_csv_from_folders(base_path):
    data_frames = []
    for folder_name in os.listdir(base_path):
        folder_path = os.path.join(base_path, folder_name)
        if os.path.isdir(folder_path):
            csv_path = os.path.join(folder_path, 'final_design_stats.csv')
            if os.path.exists(csv_path):
                df = pd.read_csv(csv_path)
                data_frames.append(df)
    return data_frames

# Example usage
base_path = './../out/bindcraft/'
data_frames = read_csv_from_folders(base_path)

# Display the first dataframe as an example
if data_frames:
    data_frames[0].head()
else:
    print("No CSV files found.")

In [6]:
from Bio.PDB import PDBParser, PPBuilder

# Initialize the parser
parser = PDBParser(QUIET=True)

# Load the PDB file (replace 'example.pdb' with your file)
structure = parser.get_structure("Protein", "1yi5_l86_s832794_mpnn20_model2.pdb")

# Use the Polypeptide builder to extract sequences
ppb = PPBuilder()

print("Amino Acid Sequences by Chain:")
for model in structure:
    for chain in model:
        print(f"Chain {chain.id}:")
        peptides = ppb.build_peptides(chain)
        if peptides:
            for i, peptide in enumerate(peptides):
                print(f"{peptide.get_sequence()}")
        else:
            print("  No sequence found for this chain.")

# print("Amino Acid Sequences:")
# for pp in ppb.build_peptides(structure):
#     print(pp.get_sequence())

Amino Acid Sequences by Chain:
Chain A:
IRCFITPDITSKDCPNGHVCYTKTWCDAFCSIRGKRVDLGCAATCPTVKTGVDIQCCSTDNCNPFPTR
Chain B:
SPEEAYKELMEKVKEGGGEELAKEIEEVFRRYIPDDPAALPDDWARWEELGRELEPLIKRLPEEYQKEIVDLYWRWVWAVREAHAA


In [7]:
import glob
from Bio.PDB import PDBParser, PPBuilder

# Specify the file pattern (e.g., '*.pdb' for all PDB files in the current directory)
file_pattern = "*.pdb"

# Initialize the PDB parser
parser = PDBParser(QUIET=True)

# Use glob to get a list of matching files
pdb_files = glob.glob(file_pattern)

# Loop through each PDB file
for pdb_file in pdb_files:
    print(f"Processing file: {pdb_file}")
    
    # Parse the PDB structure
    structure = parser.get_structure("Protein", pdb_file)
    
    # Use the Polypeptide builder to extract sequences for each chain
    ppb = PPBuilder()
    
    print("\nAmino Acid Sequences by Chain:")
    for model in structure:
        for chain in model:
            print(f"Chain {chain.id}:")
            peptides = ppb.build_peptides(chain)
            if peptides:
                for i, peptide in enumerate(peptides):
                    print(f"  Sequence {i + 1}: {peptide.get_sequence()}")
            else:
                print("  No sequence found for this chain.")
    print("-" * 50)


Processing file: 1yi5_l86_s832794_mpnn20_model2.pdb

Amino Acid Sequences by Chain:
Chain A:
  Sequence 1: IRCFITPDITSKDCPNGHVCYTKTWCDAFCSIRGKRVDLGCAATCPTVKTGVDIQCCSTDNCNPFPTR
Chain B:
  Sequence 1: SPEEAYKELMEKVKEGGGEELAKEIEEVFRRYIPDDPAALPDDWARWEELGRELEPLIKRLPEEYQKEIVDLYWRWVWAVREAHAA
--------------------------------------------------


In [17]:
import os
from Bio.PDB import PDBParser, PPBuilder

def extract_sequences_from_pdb(parent_folder):
    """
    Recursively searches for PDB files in the given parent folder and extracts sequences.
    
    Args:
        parent_folder (str): The path to the parent folder to search for PDB files.
        
    Returns:
        dict: A dictionary where keys are filenames and values are sequences by chain.
    """
    # Initialize the PDB parser
    parser = PDBParser(QUIET=True)
    
    # Dictionary to store sequences by file
    sequences = {}
    
    # Walk through the parent folder and all its subdirectories
    for root, _, files in os.walk(parent_folder):
        for file in files:
            if file.endswith(".pdb"):  # Check if the file is a PDB file
                file_path = os.path.join(root, file)
                print(f"Processing file: {file_path}")
                
                try:
                    # Parse the PDB structure
                    structure = parser.get_structure("Protein", file_path)
                    
                    # Use the Polypeptide builder to extract sequences for each chain
                    ppb = PPBuilder()
                    file_sequences = {}
                    
                    for model in structure:
                        for chain in model:
                            chain_id = chain.id
                            peptides = ppb.build_peptides(chain)
                            chain_sequences = [
                                str(peptide.get_sequence()) for peptide in peptides
                            ]
                            if chain_sequences:
                                file_sequences[chain_id] = chain_sequences
                            else:
                                file_sequences[chain_id] = ["No sequence found"]
                    
                    # Add the file's sequences to the result dictionary
                    sequences[file_path] = file_sequences
                
                except Exception as e:
                    print(f"Error processing file {file_path}: {e}")
    
    return sequences

# Example usage
parent_folder = "./../out/bindcraft"  # Replace with your parent folder path
sequences = extract_sequences_from_pdb(parent_folder)

# Print the results
for file, chains in sequences.items():
    print(f"\nFile: {file}")
    for chain, seq_list in chains.items():
        print(f"  Chain {chain}:")
        for i, seq in enumerate(seq_list):
            print(f"    Sequence {i + 1}: {seq}")


Processing file: ./../out/bindcraft/2501180951/Accepted/1yi5_l159_s712800_mpnn6_model1.pdb
Processing file: ./../out/bindcraft/2501180951/Accepted/1yi5_l159_s712800_mpnn15_model1.pdb
Processing file: ./../out/bindcraft/2501180951/Accepted/1yi5_l153_s269018_mpnn8_model2.pdb
Processing file: ./../out/bindcraft/2501180951/Accepted/Ranked/2_1yi5_l153_s269018_mpnn8_model2.pdb
Processing file: ./../out/bindcraft/2501180951/Accepted/Ranked/1_1yi5_l159_s712800_mpnn15_model1.pdb
Processing file: ./../out/bindcraft/2501180951/Accepted/Ranked/3_1yi5_l159_s712800_mpnn6_model1.pdb
Processing file: ./../out/bindcraft/2501180951/Trajectory/Clashing/1yi5_l154_s286566.pdb
Processing file: ./../out/bindcraft/2501180951/Trajectory/Relaxed/1yi5_l153_s269018.pdb
Processing file: ./../out/bindcraft/2501180951/Trajectory/Relaxed/1yi5_l160_s99436.pdb
Processing file: ./../out/bindcraft/2501180951/Trajectory/Relaxed/1yi5_l150_s209750.pdb
Processing file: ./../out/bindcraft/2501180951/Trajectory/Relaxed/1yi5_l1

In [35]:
import os
from Bio.PDB import PDBParser, PPBuilder

def extract_sequences_from_accepted_folders(parent_folder):
    """
    Searches for PDB files directly in 'Accepted' folders within the given parent folder and extracts sequences.
    Ignores subdirectories inside 'Accepted' folders.
    
    Args:
        parent_folder (str): The path to the parent folder to search for PDB files.
        
    Returns:
        dict: A dictionary where keys are filenames and values are sequences by chain.
    """
    # Initialize the PDB parser
    parser = PDBParser(QUIET=True)
    
    # Dictionary to store sequences by file
    sequences = {}
    
    # Walk through the parent folder and all its subdirectories
    for root, dirs, files in os.walk(parent_folder):
        # Check if the current directory is an 'Accepted' folder
        if os.path.basename(root) == "Accepted":
            for file in files:
                if file.endswith(".pdb"):  # Check if the file is a PDB file
                    file_path = os.path.join(root, file)
                    print(f"Processing file: {file_path}")
                    
                    try:
                        # Parse the PDB structure
                        structure = parser.get_structure("Protein", file_path)
                        
                        # Use the Polypeptide builder to extract sequences for each chain
                        ppb = PPBuilder()
                        file_sequences = {}
                        
                        for model in structure:
                            for chain in model:
                                chain_id = chain.id
                                peptides = ppb.build_peptides(chain)
                                chain_sequences = [
                                    str(peptide.get_sequence()) for peptide in peptides
                                ]
                                if chain_sequences:
                                    file_sequences[chain_id] = chain_sequences
                                else:
                                    file_sequences[chain_id] = ["No sequence found"]
                        
                        # Add the file's sequences to the result dictionary
                        sequences[file_path] = file_sequences
                    
                    except Exception as e:
                        print(f"Error processing file {file_path}: {e}")
    
    return sequences


import pandas as pd
def extract_sequences_to_dataframe(sequences):
    """
    Extracts File ID, Sequence 1, and Sequence 2 from the sequences dictionary into a DataFrame.
    
    Args:
        sequences (dict): Dictionary with file paths as keys and chain sequences as values.
        
    Returns:
        pd.DataFrame: DataFrame containing File ID, Sequence 1, and Sequence 2.
    """
    # List to store results
    data = []
    
    for file, chains in sequences.items():
        # Extract the File ID from the file path
        file_id = os.path.splitext(os.path.basename(file))[0]
        
        # Initialize placeholders for Sequence 1 and Sequence 2
        sequence1 = None
        sequence2 = None
        
        # Iterate through chains to get sequences
        for chain, seq_list in chains.items():
            if chain == "A" and seq_list:  # Assume Sequence 1 corresponds to Chain A
                sequence1 = seq_list[0]  # Take the first sequence from Chain A
            elif chain == "B" and seq_list:  # Assume Sequence 2 corresponds to Chain B
                sequence2 = seq_list[0]  # Take the first sequence from Chain B
        
        # Append to the results
        data.append([file_id, sequence1, sequence2])
    
    # Create a DataFrame
    df = pd.DataFrame(data, columns=["DesignModel", "TargetSequence", "Sequence"])
    return df

# Example usage
parent_folder = "./../out/bindcraft"  # Replace with your parent folder path
sequences = extract_sequences_from_accepted_folders(parent_folder)
df = extract_sequences_to_dataframe(sequences)
df.head()
# # Print the results
# for file, chains in sequences.items():
#     print(f"\nFile: {file}")
#     for chain, seq_list in chains.items():
#         print(f"  Chain {chain}:")
#         for i, seq in enumerate(seq_list):
#             print(f"    Sequence {i + 1}: {seq}")


Processing file: ./../out/bindcraft/2501180951/Accepted/1yi5_l159_s712800_mpnn6_model1.pdb
Processing file: ./../out/bindcraft/2501180951/Accepted/1yi5_l159_s712800_mpnn15_model1.pdb
Processing file: ./../out/bindcraft/2501180951/Accepted/1yi5_l153_s269018_mpnn8_model2.pdb
Processing file: ./../out/bindcraft/2501171452/Accepted/5nq4_l120_s808237_mpnn8_model2.pdb
Processing file: ./../out/bindcraft/2501171452/Accepted/5nq4_l120_s808237_mpnn5_model2.pdb
Processing file: ./../out/bindcraft/2501162231/Accepted/5nq4_l86_s533102_mpnn8_model1.pdb
Processing file: ./../out/bindcraft/2501162231/Accepted/5nq4_l88_s194736_mpnn6_model2.pdb
Processing file: ./../out/bindcraft/2501162231/Accepted/5nq4_l88_s194736_mpnn5_model2.pdb
Processing file: ./../out/bindcraft/2501162231/Accepted/5nq4_l86_s533102_mpnn6_model1.pdb
Processing file: ./../out/bindcraft/2501181208/Accepted/1yi5_l96_s23541_mpnn2_model2.pdb
Processing file: ./../out/bindcraft/2501181208/Accepted/1yi5_l96_s23541_mpnn13_model1.pdb
Proce

Unnamed: 0,DesignModel,TargetSequence,Sequence
0,1yi5_l159_s712800_mpnn6_model1,IRCFITPDITSKDCPNGHVCYTKTWCDAFCSIRGKRVDLGCAATCP...,SAMVEELLEEAEKMHQAMLDKAPPEVLKPLQTKHLSKFDEVFAGWA...
1,1yi5_l159_s712800_mpnn15_model1,IRCFITPDITSKDCPNGHVCYTKTWCDAFCSIRGKRVDLGCAATCP...,SAEVEELLKEAEKMHEAMLAGAPEEVLKPLQTAHLESFDKVMAGLF...
2,1yi5_l153_s269018_mpnn8_model2,IRCFITPDITSKDCPNGHVCYTKTWCDAFCSIRGKRVDLGCAATCP...,MLENIKPWLQKTIAPSDWSTPYELLWALYWLLDDIRYIVEELKKKI...
3,5nq4_l120_s808237_mpnn8_model2,LKCNKLVPIAYKTCPEGKNLCYKMFMMSDLTIPVKRGCIDVCPKNS...,MDVNLTEEKIREAAEKDPREAMILFMKAFHALRGDDKGIVKVFDIA...
4,5nq4_l120_s808237_mpnn5_model2,LKCNKLVPIAYKTCPEGKNLCYKMFMMSDLTIPVKRGCIDVCPKNS...,MNVDLTEEKIREALEKDPRDAMILFMKAFHALRGDDEKIVEVFEKF...


In [36]:
df.tail()

Unnamed: 0,DesignModel,TargetSequence,Sequence
23,1yi5_l153_s230218_mpnn1_model2,IRCFITPDITSKDCPNGHVCYTKTWCDAFCSIRGKRVDLGCAATCP...,PLSPEELELRYKFEDVVDFIIDMYYMFELFKELVENGKSKYPLSEI...
24,1yi5_l89_s121962_mpnn16_model2,IRCFITPDITSKDCPNGHVCYTKTWCDAFCSIRGKRVDLGCAATCP...,LDPWHRVPWEIWEQLEPLMMEFLKEISKETGKSYKEVVRAFFEVYE...
25,1yi5_l86_s832794_mpnn20_model2,IRCFITPDITSKDCPNGHVCYTKTWCDAFCSIRGKRVDLGCAATCP...,SPEEAYKELMEKVKEGGGEELAKEIEEVFRRYIPDDPAALPDDWAR...
26,1yi5_l96_s315414_mpnn5_model1,IRCFITPDITSKDCPNGHVCYTKTWCDAFCSIRGKRVDLGCAATCP...,MPIKLSRHEQIWMVDDMEWMRQEYIKKYGELPKDFKELFEYYKKVL...
27,1yi5_l96_s315414_mpnn6_model2,IRCFITPDITSKDCPNGHVCYTKTWCDAFCSIRGKRVDLGCAATCP...,EPIKLSRHEQIWMLDDMEWMREEYIKEHGELPKDFEEQFEYFKEVL...


In [37]:
final_design_stats_df = pd.concat(data_frames, ignore_index=True)
final_design_stats_df.head()

Unnamed: 0,Rank,Design,Protocol,Length,Seed,Helicity,Target_Hotspot,Sequence,InterfaceResidues,MPNN_score,...,1_Binder_RMSD,2_Binder_RMSD,3_Binder_RMSD,4_Binder_RMSD,5_Binder_RMSD,DesignTime,Notes,TargetSettings,Filters,AdvancedSettings
0,1,1yi5_l159_s712800_mpnn15,4stage,159,712800,-0.3,"H6,H7,H26-H37,H50,H51",SAEVEELLKEAEKMHEAMLAGAPEEVLKPLQTAHLESFDKVMAGLF...,"B14,B18,B24,B28,B31,B32,B34,B35,B77,B78,B80,B8...",1.2,...,2.96,2.97,,,,"0 hours, 1 minutes, 20 seconds",,1yi5,default_filters,default_4stage_multimer
1,2,1yi5_l153_s269018_mpnn8,4stage,153,269018,-0.3,"H6,H7,H26-H37,H50,H51",MLENIKPWLQKTIAPSDWSTPYELLWALYWLLDDIRYIVEELKKKI...,"B12,B13,B14,B15,B16,B17,B26,B29,B30,B33,B34,B3...",0.89,...,1.88,1.94,,,,"0 hours, 1 minutes, 54 seconds",,1yi5,default_filters,default_4stage_multimer
2,3,1yi5_l159_s712800_mpnn6,4stage,159,712800,-0.3,"H6,H7,H26-H37,H50,H51",SAMVEELLEEAEKMHQAMLDKAPPEVLKPLQTKHLSKFDEVFAGWA...,"B14,B18,B28,B31,B32,B34,B35,B77,B78,B81,B84,B8...",1.17,...,2.66,2.79,,,,"0 hours, 1 minutes, 47 seconds",,1yi5,default_filters,default_4stage_multimer
3,1,5nq4_l120_s808237_mpnn5,4stage,120,808237,-0.3,,MNVDLTEEKIREALEKDPRDAMILFMKAFHALRGDDEKIVEVFEKF...,"B22,B23,B26,B27,B29,B30,B33,B71,B74,B75,B77,B7...",0.96,...,1.99,1.89,,,,"0 hours, 1 minutes, 11 seconds",,5nq4,default_filters,default_4stage_multimer
4,2,5nq4_l120_s808237_mpnn8,4stage,120,808237,-0.3,,MDVNLTEEKIREAAEKDPREAMILFMKAFHALRGDDKGIVKVFDIA...,"B22,B23,B25,B26,B27,B29,B30,B33,B71,B74,B75,B7...",0.97,...,2.56,2.18,,,,"0 hours, 1 minutes, 14 seconds",,5nq4,default_filters,default_4stage_multimer


In [38]:
final_design_stats_df.shape

(28, 232)

In [39]:
df.shape

(28, 3)

In [40]:
combined_df = pd.merge(final_design_stats_df, df, on='Sequence')
combined_df['TargetSequenceLength'] = df['TargetSequence'].apply(len)
combined_df.head()

Unnamed: 0,Rank,Design,Protocol,Length,Seed,Helicity,Target_Hotspot,Sequence,InterfaceResidues,MPNN_score,...,4_Binder_RMSD,5_Binder_RMSD,DesignTime,Notes,TargetSettings,Filters,AdvancedSettings,DesignModel,TargetSequence,TargetSequenceLength
0,1,1yi5_l159_s712800_mpnn15,4stage,159,712800,-0.3,"H6,H7,H26-H37,H50,H51",SAEVEELLKEAEKMHEAMLAGAPEEVLKPLQTAHLESFDKVMAGLF...,"B14,B18,B24,B28,B31,B32,B34,B35,B77,B78,B80,B8...",1.2,...,,,"0 hours, 1 minutes, 20 seconds",,1yi5,default_filters,default_4stage_multimer,1yi5_l159_s712800_mpnn15_model1,IRCFITPDITSKDCPNGHVCYTKTWCDAFCSIRGKRVDLGCAATCP...,68
1,2,1yi5_l153_s269018_mpnn8,4stage,153,269018,-0.3,"H6,H7,H26-H37,H50,H51",MLENIKPWLQKTIAPSDWSTPYELLWALYWLLDDIRYIVEELKKKI...,"B12,B13,B14,B15,B16,B17,B26,B29,B30,B33,B34,B3...",0.89,...,,,"0 hours, 1 minutes, 54 seconds",,1yi5,default_filters,default_4stage_multimer,1yi5_l153_s269018_mpnn8_model2,IRCFITPDITSKDCPNGHVCYTKTWCDAFCSIRGKRVDLGCAATCP...,68
2,3,1yi5_l159_s712800_mpnn6,4stage,159,712800,-0.3,"H6,H7,H26-H37,H50,H51",SAMVEELLEEAEKMHQAMLDKAPPEVLKPLQTKHLSKFDEVFAGWA...,"B14,B18,B28,B31,B32,B34,B35,B77,B78,B81,B84,B8...",1.17,...,,,"0 hours, 1 minutes, 47 seconds",,1yi5,default_filters,default_4stage_multimer,1yi5_l159_s712800_mpnn6_model1,IRCFITPDITSKDCPNGHVCYTKTWCDAFCSIRGKRVDLGCAATCP...,68
3,1,5nq4_l120_s808237_mpnn5,4stage,120,808237,-0.3,,MNVDLTEEKIREALEKDPRDAMILFMKAFHALRGDDEKIVEVFEKF...,"B22,B23,B26,B27,B29,B30,B33,B71,B74,B75,B77,B7...",0.96,...,,,"0 hours, 1 minutes, 11 seconds",,5nq4,default_filters,default_4stage_multimer,5nq4_l120_s808237_mpnn5_model2,LKCNKLVPIAYKTCPEGKNLCYKMFMMSDLTIPVKRGCIDVCPKNS...,60
4,2,5nq4_l120_s808237_mpnn8,4stage,120,808237,-0.3,,MDVNLTEEKIREAAEKDPREAMILFMKAFHALRGDDKGIVKVFDIA...,"B22,B23,B25,B26,B27,B29,B30,B33,B71,B74,B75,B7...",0.97,...,,,"0 hours, 1 minutes, 14 seconds",,5nq4,default_filters,default_4stage_multimer,5nq4_l120_s808237_mpnn8_model2,LKCNKLVPIAYKTCPEGKNLCYKMFMMSDLTIPVKRGCIDVCPKNS...,60


In [41]:
combined_df.tail()

Unnamed: 0,Rank,Design,Protocol,Length,Seed,Helicity,Target_Hotspot,Sequence,InterfaceResidues,MPNN_score,...,4_Binder_RMSD,5_Binder_RMSD,DesignTime,Notes,TargetSettings,Filters,AdvancedSettings,DesignModel,TargetSequence,TargetSequenceLength
23,4,1yi5_l146_s61041_mpnn2,4stage,146,61041,-0.3,"H26-H37,H50,H51",MSKEEKLRKKFYEVVSKVTRWYVDDILWAISLGEEELSKISMKEVA...,"B21,B22,B24,B25,B26,B28,B29,B32,B40,B42,B44,B4...",0.94,...,,,"0 hours, 1 minutes, 15 seconds",,1yi5,default_filters,default_4stage_multimer,1yi5_l146_s61041_mpnn2_model2,IRCFITPDITSKDCPNGHVCYTKTWCDAFCSIRGKRVDLGCAATCP...,68
24,1,1yi5_l86_s832794_mpnn20,4stage,86,832794,-0.3,,SPEEAYKELMEKVKEGGGEELAKEIEEVFRRYIPDDPAALPDDWAR...,"B44,B48,B51,B52,B54,B55,B58,B59,B66,B67,B70,B7...",0.96,...,,,"0 hours, 0 minutes, 40 seconds",,1yi5,default_filters,default_4stage_multimer,1yi5_l86_s832794_mpnn20_model2,IRCFITPDITSKDCPNGHVCYTKTWCDAFCSIRGKRVDLGCAATCP...,68
25,2,1yi5_l89_s121962_mpnn16,4stage,89,121962,-0.3,,LDPWHRVPWEIWEQLEPLMMEFLKEISKETGKSYKEVVRAFFEVYE...,"B1,B4,B5,B6,B7,B9,B12,B13,B16,B19,B20,B23,B34,...",0.86,...,,,"0 hours, 0 minutes, 49 seconds",,1yi5,default_filters,default_4stage_multimer,1yi5_l89_s121962_mpnn16_model2,IRCFITPDITSKDCPNGHVCYTKTWCDAFCSIRGKRVDLGCAATCP...,68
26,1,1yi5_l96_s315414_mpnn6,4stage,96,315414,-0.3,H26-H37,EPIKLSRHEQIWMLDDMEWMREEYIKEHGELPKDFEEQFEYFKEVL...,"B7,B8,B11,B12,B15,B16,B19,B46,B49,B50,B53,B55,...",1.07,...,,,"0 hours, 0 minutes, 57 seconds",,1yi5,default_filters,default_4stage_multimer,1yi5_l96_s315414_mpnn6_model2,IRCFITPDITSKDCPNGHVCYTKTWCDAFCSIRGKRVDLGCAATCP...,68
27,2,1yi5_l96_s315414_mpnn5,4stage,96,315414,-0.3,H26-H37,MPIKLSRHEQIWMVDDMEWMRQEYIKKYGELPKDFKELFEYYKKVL...,"B7,B8,B9,B11,B12,B13,B15,B16,B19,B20,B45,B46,B...",1.06,...,,,"0 hours, 1 minutes, 25 seconds",,1yi5,default_filters,default_4stage_multimer,1yi5_l96_s315414_mpnn5_model1,IRCFITPDITSKDCPNGHVCYTKTWCDAFCSIRGKRVDLGCAATCP...,68


In [7]:
combined_df.to_csv('combined_data.csv', index=False)