In [1]:
import os
import pandas as pd

# Function to read CSV files from each folder and combine them into a single DataFrame
def read_csv_from_folders(base_path):
    data_frames = []
    
    # Iterate through each folder in the base path
    for folder_name in os.listdir(base_path):
        folder_path = os.path.join(base_path, folder_name)
        
        # Check if it's a directory
        if os.path.isdir(folder_path):
            # Define the path to the CSV file
            csv_path = os.path.join(folder_path, 'final_design_stats.csv')
            
            # If the CSV file exists, read it
            if os.path.exists(csv_path):
                df = pd.read_csv(csv_path)
                # Optionally, add a column for folder name to track from which folder the data came
                df['Folder'] = folder_name
                data_frames.append(df)
    
    # Combine all the DataFrames into a single DataFrame (if any)
    if data_frames:
        combined_df = pd.concat(data_frames, ignore_index=True)
        return combined_df
    else:
        return None

# Example usage
base_path = './../out/bindcraft/'
final_design_stats_df = read_csv_from_folders(base_path)
final_design_stats_df.head()
final_design_stats_df.shape

(34, 233)

In [2]:
import os
from Bio.PDB import PDBParser, PPBuilder

def extract_sequences_from_accepted_folders(parent_folder):
    """
    Searches for PDB files directly in 'Accepted' folders within the given parent folder and extracts sequences.
    Ignores subdirectories inside 'Accepted' folders.
    
    Args:
        parent_folder (str): The path to the parent folder to search for PDB files.
        
    Returns:
        dict: A dictionary where keys are filenames and values are sequences by chain.
    """
    # Initialize the PDB parser
    parser = PDBParser(QUIET=True)
    
    # Dictionary to store sequences by file
    sequences = {}
    
    # Walk through the parent folder and all its subdirectories
    for root, dirs, files in os.walk(parent_folder):
        # Check if the current directory is an 'Accepted' folder
        if os.path.basename(root) == "Accepted":
            for file in files:
                if file.endswith(".pdb"):  # Check if the file is a PDB file
                    file_path = os.path.join(root, file)
                    print(f"Processing file: {file_path}")
                    
                    try:
                        # Parse the PDB structure
                        structure = parser.get_structure("Protein", file_path)
                        
                        # Use the Polypeptide builder to extract sequences for each chain
                        ppb = PPBuilder()
                        file_sequences = {}
                        
                        for model in structure:
                            for chain in model:
                                chain_id = chain.id
                                peptides = ppb.build_peptides(chain)
                                chain_sequences = [
                                    str(peptide.get_sequence()) for peptide in peptides
                                ]
                                if chain_sequences:
                                    file_sequences[chain_id] = chain_sequences
                                else:
                                    file_sequences[chain_id] = ["No sequence found"]
                        
                        # Add the file's sequences to the result dictionary
                        sequences[file_path] = file_sequences
                    
                    except Exception as e:
                        print(f"Error processing file {file_path}: {e}")
    
    return sequences


import pandas as pd
def extract_sequences_to_dataframe(sequences):
    """
    Extracts File ID, Sequence 1, and Sequence 2 from the sequences dictionary into a DataFrame.
    
    Args:
        sequences (dict): Dictionary with file paths as keys and chain sequences as values.
        
    Returns:
        pd.DataFrame: DataFrame containing File ID, Sequence 1, and Sequence 2.
    """
    # List to store results
    data = []
    
    for file, chains in sequences.items():
        # Extract the File ID from the file path
        file_id = os.path.splitext(os.path.basename(file))[0]
        
        # Initialize placeholders for Sequence 1 and Sequence 2
        sequence1 = None
        sequence2 = None
        
        # Iterate through chains to get sequences
        for chain, seq_list in chains.items():
            if chain == "A" and seq_list:  # Assume Sequence 1 corresponds to Chain A
                sequence1 = seq_list[0]  # Take the first sequence from Chain A
            elif chain == "B" and seq_list:  # Assume Sequence 2 corresponds to Chain B
                sequence2 = seq_list[0]  # Take the first sequence from Chain B
        
        # Append to the results
        data.append([file_id, sequence1, sequence2])
    
    # Create a DataFrame
    df = pd.DataFrame(data, columns=["DesignModel", "TargetSequence", "Sequence"])
    return df

# Example usage
parent_folder = "./../out/bindcraft"  # Replace with your parent folder path
sequences = extract_sequences_from_accepted_folders(parent_folder)
accepted_df = extract_sequences_to_dataframe(sequences)
accepted_df.head()

Processing file: ./../out/bindcraft/2501180951/Accepted/1yi5_l159_s712800_mpnn6_model1.pdb
Processing file: ./../out/bindcraft/2501180951/Accepted/1yi5_l159_s712800_mpnn15_model1.pdb
Processing file: ./../out/bindcraft/2501180951/Accepted/1yi5_l153_s269018_mpnn8_model2.pdb
Processing file: ./../out/bindcraft/2501171452/Accepted/5nq4_l120_s808237_mpnn8_model2.pdb
Processing file: ./../out/bindcraft/2501171452/Accepted/5nq4_l120_s808237_mpnn5_model2.pdb
Processing file: ./../out/bindcraft/2501162231/Accepted/5nq4_l86_s533102_mpnn8_model1.pdb
Processing file: ./../out/bindcraft/2501162231/Accepted/5nq4_l88_s194736_mpnn6_model2.pdb
Processing file: ./../out/bindcraft/2501162231/Accepted/5nq4_l88_s194736_mpnn5_model2.pdb
Processing file: ./../out/bindcraft/2501162231/Accepted/5nq4_l86_s533102_mpnn6_model1.pdb
Processing file: ./../out/bindcraft/2501181208/Accepted/1yi5_l96_s23541_mpnn2_model2.pdb
Processing file: ./../out/bindcraft/2501181208/Accepted/1yi5_l96_s23541_mpnn13_model1.pdb
Proce

Unnamed: 0,DesignModel,TargetSequence,Sequence
0,1yi5_l159_s712800_mpnn6_model1,IRCFITPDITSKDCPNGHVCYTKTWCDAFCSIRGKRVDLGCAATCP...,SAMVEELLEEAEKMHQAMLDKAPPEVLKPLQTKHLSKFDEVFAGWA...
1,1yi5_l159_s712800_mpnn15_model1,IRCFITPDITSKDCPNGHVCYTKTWCDAFCSIRGKRVDLGCAATCP...,SAEVEELLKEAEKMHEAMLAGAPEEVLKPLQTAHLESFDKVMAGLF...
2,1yi5_l153_s269018_mpnn8_model2,IRCFITPDITSKDCPNGHVCYTKTWCDAFCSIRGKRVDLGCAATCP...,MLENIKPWLQKTIAPSDWSTPYELLWALYWLLDDIRYIVEELKKKI...
3,5nq4_l120_s808237_mpnn8_model2,LKCNKLVPIAYKTCPEGKNLCYKMFMMSDLTIPVKRGCIDVCPKNS...,MDVNLTEEKIREAAEKDPREAMILFMKAFHALRGDDKGIVKVFDIA...
4,5nq4_l120_s808237_mpnn5_model2,LKCNKLVPIAYKTCPEGKNLCYKMFMMSDLTIPVKRGCIDVCPKNS...,MNVDLTEEKIREALEKDPRDAMILFMKAFHALRGDDEKIVEVFEKF...


In [3]:
final_design_stats_df.head()

Unnamed: 0,Rank,Design,Protocol,Length,Seed,Helicity,Target_Hotspot,Sequence,InterfaceResidues,MPNN_score,...,2_Binder_RMSD,3_Binder_RMSD,4_Binder_RMSD,5_Binder_RMSD,DesignTime,Notes,TargetSettings,Filters,AdvancedSettings,Folder
0,1,1yi5_l159_s712800_mpnn15,4stage,159,712800,-0.3,"H6,H7,H26-H37,H50,H51",SAEVEELLKEAEKMHEAMLAGAPEEVLKPLQTAHLESFDKVMAGLF...,"B14,B18,B24,B28,B31,B32,B34,B35,B77,B78,B80,B8...",1.2,...,2.97,,,,"0 hours, 1 minutes, 20 seconds",,1yi5,default_filters,default_4stage_multimer,2501180951
1,2,1yi5_l153_s269018_mpnn8,4stage,153,269018,-0.3,"H6,H7,H26-H37,H50,H51",MLENIKPWLQKTIAPSDWSTPYELLWALYWLLDDIRYIVEELKKKI...,"B12,B13,B14,B15,B16,B17,B26,B29,B30,B33,B34,B3...",0.89,...,1.94,,,,"0 hours, 1 minutes, 54 seconds",,1yi5,default_filters,default_4stage_multimer,2501180951
2,3,1yi5_l159_s712800_mpnn6,4stage,159,712800,-0.3,"H6,H7,H26-H37,H50,H51",SAMVEELLEEAEKMHQAMLDKAPPEVLKPLQTKHLSKFDEVFAGWA...,"B14,B18,B28,B31,B32,B34,B35,B77,B78,B81,B84,B8...",1.17,...,2.79,,,,"0 hours, 1 minutes, 47 seconds",,1yi5,default_filters,default_4stage_multimer,2501180951
3,1,5nq4_l120_s808237_mpnn5,4stage,120,808237,-0.3,,MNVDLTEEKIREALEKDPRDAMILFMKAFHALRGDDEKIVEVFEKF...,"B22,B23,B26,B27,B29,B30,B33,B71,B74,B75,B77,B7...",0.96,...,1.89,,,,"0 hours, 1 minutes, 11 seconds",,5nq4,default_filters,default_4stage_multimer,2501171452
4,2,5nq4_l120_s808237_mpnn8,4stage,120,808237,-0.3,,MDVNLTEEKIREAAEKDPREAMILFMKAFHALRGDDKGIVKVFDIA...,"B22,B23,B25,B26,B27,B29,B30,B33,B71,B74,B75,B7...",0.97,...,2.18,,,,"0 hours, 1 minutes, 14 seconds",,5nq4,default_filters,default_4stage_multimer,2501171452


In [4]:
combined_df = pd.merge(final_design_stats_df, accepted_df, on='Sequence')
combined_df['TargetSequenceLength'] = combined_df['TargetSequence'].apply(len)
combined_df.head()

Unnamed: 0,Rank,Design,Protocol,Length,Seed,Helicity,Target_Hotspot,Sequence,InterfaceResidues,MPNN_score,...,5_Binder_RMSD,DesignTime,Notes,TargetSettings,Filters,AdvancedSettings,Folder,DesignModel,TargetSequence,TargetSequenceLength
0,1,1yi5_l159_s712800_mpnn15,4stage,159,712800,-0.3,"H6,H7,H26-H37,H50,H51",SAEVEELLKEAEKMHEAMLAGAPEEVLKPLQTAHLESFDKVMAGLF...,"B14,B18,B24,B28,B31,B32,B34,B35,B77,B78,B80,B8...",1.2,...,,"0 hours, 1 minutes, 20 seconds",,1yi5,default_filters,default_4stage_multimer,2501180951,1yi5_l159_s712800_mpnn15_model1,IRCFITPDITSKDCPNGHVCYTKTWCDAFCSIRGKRVDLGCAATCP...,68
1,2,1yi5_l153_s269018_mpnn8,4stage,153,269018,-0.3,"H6,H7,H26-H37,H50,H51",MLENIKPWLQKTIAPSDWSTPYELLWALYWLLDDIRYIVEELKKKI...,"B12,B13,B14,B15,B16,B17,B26,B29,B30,B33,B34,B3...",0.89,...,,"0 hours, 1 minutes, 54 seconds",,1yi5,default_filters,default_4stage_multimer,2501180951,1yi5_l153_s269018_mpnn8_model2,IRCFITPDITSKDCPNGHVCYTKTWCDAFCSIRGKRVDLGCAATCP...,68
2,3,1yi5_l159_s712800_mpnn6,4stage,159,712800,-0.3,"H6,H7,H26-H37,H50,H51",SAMVEELLEEAEKMHQAMLDKAPPEVLKPLQTKHLSKFDEVFAGWA...,"B14,B18,B28,B31,B32,B34,B35,B77,B78,B81,B84,B8...",1.17,...,,"0 hours, 1 minutes, 47 seconds",,1yi5,default_filters,default_4stage_multimer,2501180951,1yi5_l159_s712800_mpnn6_model1,IRCFITPDITSKDCPNGHVCYTKTWCDAFCSIRGKRVDLGCAATCP...,68
3,1,5nq4_l120_s808237_mpnn5,4stage,120,808237,-0.3,,MNVDLTEEKIREALEKDPRDAMILFMKAFHALRGDDEKIVEVFEKF...,"B22,B23,B26,B27,B29,B30,B33,B71,B74,B75,B77,B7...",0.96,...,,"0 hours, 1 minutes, 11 seconds",,5nq4,default_filters,default_4stage_multimer,2501171452,5nq4_l120_s808237_mpnn5_model2,LKCNKLVPIAYKTCPEGKNLCYKMFMMSDLTIPVKRGCIDVCPKNS...,60
4,2,5nq4_l120_s808237_mpnn8,4stage,120,808237,-0.3,,MDVNLTEEKIREAAEKDPREAMILFMKAFHALRGDDKGIVKVFDIA...,"B22,B23,B25,B26,B27,B29,B30,B33,B71,B74,B75,B7...",0.97,...,,"0 hours, 1 minutes, 14 seconds",,5nq4,default_filters,default_4stage_multimer,2501171452,5nq4_l120_s808237_mpnn8_model2,LKCNKLVPIAYKTCPEGKNLCYKMFMMSDLTIPVKRGCIDVCPKNS...,60


In [5]:
combined_df.tail()

Unnamed: 0,Rank,Design,Protocol,Length,Seed,Helicity,Target_Hotspot,Sequence,InterfaceResidues,MPNN_score,...,5_Binder_RMSD,DesignTime,Notes,TargetSettings,Filters,AdvancedSettings,Folder,DesignModel,TargetSequence,TargetSequenceLength
29,4,1yi5_l146_s61041_mpnn2,4stage,146,61041,-0.3,"H26-H37,H50,H51",MSKEEKLRKKFYEVVSKVTRWYVDDILWAISLGEEELSKISMKEVA...,"B21,B22,B24,B25,B26,B28,B29,B32,B40,B42,B44,B4...",0.94,...,,"0 hours, 1 minutes, 15 seconds",,1yi5,default_filters,default_4stage_multimer,2501181203,1yi5_l146_s61041_mpnn2_model2,IRCFITPDITSKDCPNGHVCYTKTWCDAFCSIRGKRVDLGCAATCP...,68
30,1,1yi5_l86_s832794_mpnn20,4stage,86,832794,-0.3,,SPEEAYKELMEKVKEGGGEELAKEIEEVFRRYIPDDPAALPDDWAR...,"B44,B48,B51,B52,B54,B55,B58,B59,B66,B67,B70,B7...",0.96,...,,"0 hours, 0 minutes, 40 seconds",,1yi5,default_filters,default_4stage_multimer,2501192248,1yi5_l86_s832794_mpnn20_model2,IRCFITPDITSKDCPNGHVCYTKTWCDAFCSIRGKRVDLGCAATCP...,68
31,2,1yi5_l89_s121962_mpnn16,4stage,89,121962,-0.3,,LDPWHRVPWEIWEQLEPLMMEFLKEISKETGKSYKEVVRAFFEVYE...,"B1,B4,B5,B6,B7,B9,B12,B13,B16,B19,B20,B23,B34,...",0.86,...,,"0 hours, 0 minutes, 49 seconds",,1yi5,default_filters,default_4stage_multimer,2501192248,1yi5_l89_s121962_mpnn16_model2,IRCFITPDITSKDCPNGHVCYTKTWCDAFCSIRGKRVDLGCAATCP...,68
32,1,1yi5_l96_s315414_mpnn6,4stage,96,315414,-0.3,H26-H37,EPIKLSRHEQIWMLDDMEWMREEYIKEHGELPKDFEEQFEYFKEVL...,"B7,B8,B11,B12,B15,B16,B19,B46,B49,B50,B53,B55,...",1.07,...,,"0 hours, 0 minutes, 57 seconds",,1yi5,default_filters,default_4stage_multimer,2501180802,1yi5_l96_s315414_mpnn6_model2,IRCFITPDITSKDCPNGHVCYTKTWCDAFCSIRGKRVDLGCAATCP...,68
33,2,1yi5_l96_s315414_mpnn5,4stage,96,315414,-0.3,H26-H37,MPIKLSRHEQIWMVDDMEWMRQEYIKKYGELPKDFKELFEYYKKVL...,"B7,B8,B9,B11,B12,B13,B15,B16,B19,B20,B45,B46,B...",1.06,...,,"0 hours, 1 minutes, 25 seconds",,1yi5,default_filters,default_4stage_multimer,2501180802,1yi5_l96_s315414_mpnn5_model1,IRCFITPDITSKDCPNGHVCYTKTWCDAFCSIRGKRVDLGCAATCP...,68


In [6]:
combined_df.to_csv('combined_data.csv', index=False)

In [7]:
combined_df.shape

(34, 236)