In [1]:
from Bio.PDB import PDBList
import os
import requests
import pandas as pd
import subprocess
import time
import tempfile

In [2]:
df_test = pd.read_csv('./pair_sample.csv', index_col=0)
df_test

Unnamed: 0,meso_pid,thermo_pid,meso_pdb,thermo_pdb
0,P9WJA3,A0A1M6N9Z6,1Y5H,
1,I6XFS7,A0A1M6WSV2,6M1C,
2,Q65EQ1,A0A521F3Z2,6NKG,
3,F5HRS7,A0A2T0LBQ2,7QH4,
4,P9WHM1,C7MUW2,3LP6,
...,...,...,...,...
95,A0A4Z0GXN3,A0A1G7W5M9,,
96,G4H893,A0A1W6VMF1,,
97,A0A120GMI5,Q5L0I9,,
98,A0A4V2YRI4,A0A3N2GW27,,


In [10]:
def download_structures(df, pdb_column, u_column, pdb_dir):
    start_time = time.time()  # Start measuring time
    pdbl = PDBList()
    if not os.path.exists(pdb_dir):
        os.makedirs(pdb_dir)
        
    for i, row in df.iterrows():
        pdb_id = row[pdb_column]
        uniprot_id = row[u_column]
        if not pd.isna(pdb_id):  # check for NaN value in PDB IDs column
            pdbl.retrieve_pdb_file(pdb_id, pdir=pdb_dir, file_format='pdb')
            file_path = os.path.join(pdb_dir, f'pdb{pdb_id.lower()}.ent')
            if os.path.exists(file_path):
                os.rename(os.path.join(file_path), os.path.join(pdb_dir, f'{pdb_id}.pdb'))
            else:
                pass
        elif isinstance(uniprot_id, str):  # download structure using UniProt ID
            url = f'https://alphafold.ebi.ac.uk/files/AF-{uniprot_id}-F1-model_v4.pdb'
            response = requests.get(url)
            if response.ok:
                filename = f'{pdb_dir}/{uniprot_id}.pdb'
                with open(filename, 'wb') as f:
                    f.write(response.content)
                print(f"Downloaded file for {uniprot_id} to {filename}")
            else:
                print(f"Failed to download file for {uniprot_id}: {response.status_code} - {response.reason}")
        else:
            print(f"No PDB ID or UniProt ID available for index {i}")
        end_time = time.time()  # Stop measuring time
    execution_time = end_time - start_time
    print(f"Execution time: {execution_time} seconds")
    pass

In [11]:
download_structures(df=df_test, pdb_column='meso_pdb', u_column='meso_pid', pdb_dir='checking')

Structure exists: 'checking/pdb1y5h.ent' 
Structure exists: 'checking/pdb6m1c.ent' 
Structure exists: 'checking/pdb6nkg.ent' 
Downloading PDB structure '7qh4'...
Desired structure doesn't exist
Structure exists: 'checking/pdb3lp6.ent' 
Structure exists: 'checking/pdb3tfx.ent' 
Structure exists: 'checking/pdb8a63.ent' 
Structure exists: 'checking/pdb5y63.ent' 
Structure exists: 'checking/pdb3eul.ent' 
Structure exists: 'checking/pdb4qwq.ent' 
Downloading PDB structure '7oii'...
Desired structure doesn't exist
Downloading PDB structure '7nhn'...
Desired structure doesn't exist
Downloading PDB structure '7nhm'...
Desired structure doesn't exist
Structure exists: 'checking/pdb3on1.ent' 
Structure exists: 'checking/pdb6ujk.ent' 
Downloading PDB structure '7qh4'...
Desired structure doesn't exist
Structure exists: 'checking/pdb2g3b.ent' 
Structure exists: 'checking/pdb6wsh.ent' 
Downloading PDB structure '7qh4'...
Desired structure doesn't exist
Structure exists: 'checking/pdb5zul.ent' 
Stru

In [24]:
download_structures(df=df_test, pdb_column='thermo_pdb', u_column='thermo_pid', pdb_dir='checking')

Downloaded file for A0A1M6N9Z6 to checking/A0A1M6N9Z6.pdb
Downloaded file for A0A1M6WSV2 to checking/A0A1M6WSV2.pdb
Downloaded file for A0A521F3Z2 to checking/A0A521F3Z2.pdb
Downloaded file for A0A2T0LBQ2 to checking/A0A2T0LBQ2.pdb
Downloaded file for C7MUW2 to checking/C7MUW2.pdb
Downloaded file for A0A7W9YPC6 to checking/A0A7W9YPC6.pdb
Downloaded file for A0A087LCG0 to checking/A0A087LCG0.pdb
Downloaded file for I3DYT5 to checking/I3DYT5.pdb
Downloaded file for A0A853ALZ0 to checking/A0A853ALZ0.pdb
Downloaded file for A0A521CL06 to checking/A0A521CL06.pdb
Downloaded file for A0A3N2H419 to checking/A0A3N2H419.pdb
Downloaded file for A0A7V9Z9X5 to checking/A0A7V9Z9X5.pdb
Downloaded file for A0A1G6PFE9 to checking/A0A1G6PFE9.pdb
Downloaded file for A0A4R8LSA2 to checking/A0A4R8LSA2.pdb
Downloaded file for A0A7W3RA64 to checking/A0A7W3RA64.pdb
Downloaded file for A0A540V080 to checking/A0A540V080.pdb
Downloaded file for A0A2G8B7Q0 to checking/A0A2G8B7Q0.pdb
Downloaded file for A0A7W8MUD5

In [3]:
df_cat = df_test.tail(3)
df_cat

Unnamed: 0,meso_pid,thermo_pid,meso_pdb,thermo_pdb
97,A0A120GMI5,Q5L0I9,,
98,A0A4V2YRI4,A0A3N2GW27,,
99,A0A3D9URX8,A0A3N2GYN3,,


In [26]:
def run_fatcat(df):
    p_values = []  # List to store the extracted p-values
    rows_to_drop = []  # List to store the indices of rows to be dropped

    for index, row in df.iterrows():
        if not pd.isna(row['meso_pdb']):
            p1 = row['meso_pdb']
        else:
            p1 = row['meso_pid']
        
        if not pd.isna(row['thermo_pdb']):
            p2 = row['thermo_pdb']
        else:
            p2 = row['thermo_pid']
        
        # Check if the structure files exist in the 'checking' folder
        p1_file = f'{p1}.pdb'
        p2_file = f'{p2}.pdb'
        if not os.path.exists(os.path.join('./checking/', p1_file)) or not os.path.exists(os.path.join('./checking/', p2_file)):
            # Append the index of the row to the list of rows to be dropped
            rows_to_drop.append(index)
            continue

        # Set the FATCAT command and its arguments
        cmd = ['FATCAT', '-p1', p1_file, '-p2', p2_file, '-i', './checking/', '-o', f'outfile_{p1}_{p2}', '-m']
        
        # Run the FATCAT command
        subprocess.run(cmd)
        
        # Read the output from the temporary file
        with open(f'outfile_{p1}_{p2}.aln', 'r') as result_file:
            for line in result_file:
                if line.startswith('P-value'):
                    p_value = line.split()[1]
                    p_values.append(p_value)
                    break  # Break the loop if p-value is found
    
    # Drop the rows with missing structure files from the dataframe
    df = df.drop(rows_to_drop)
    
    df.loc[:, 'p_value'] = p_values  # Use .loc to set the 'p_value' column
    return df


In [28]:
df_result = run_fatcat(df_test)

In [30]:
df_result

Unnamed: 0,meso_pid,thermo_pid,meso_pdb,thermo_pdb,p_value
0,P9WJA3,A0A1M6N9Z6,1Y5H,,7.18e-13
1,I6XFS7,A0A1M6WSV2,6M1C,,0.00e+00
2,Q65EQ1,A0A521F3Z2,6NKG,,9.50e-12
4,P9WHM1,C7MUW2,3LP6,,0.00e+00
5,Q5FJB3,A0A7W9YPC6,3TFX,,0.00e+00
...,...,...,...,...,...
95,A0A4Z0GXN3,A0A1G7W5M9,,,0.00e+00
96,G4H893,A0A1W6VMF1,,,0.00e+00
97,A0A120GMI5,Q5L0I9,,,0.00e+00
98,A0A4V2YRI4,A0A3N2GW27,,,2.47e-12


In [35]:
df_result['p_value'] = pd.to_numeric(df_result['p_value'], errors='coerce')

# Create a new dataframe with rows where p-value > 0.05
new_df = df_result[df_result['p_value'] > 0.05]
new_df


Unnamed: 0,meso_pid,thermo_pid,meso_pdb,thermo_pdb,p_value
22,P66054,A0A4R3N1Y2,8A57,,0.0871
25,P66125,A0A540V080,8A63,,0.101


In [None]:
# fatcat command: 
# FATCATQue.pl timeused pair.list -q >pair.aln (only work when files are in the same directory)
# FATCAT -p1 A0JNW5.pdb -p2 A1L1K8.pdb -o A0JNW5_A1L1K8 -m
# FATCAT -p1 P0A9P0.pdb -p2 Q9H3D4.pdb -i ./checking -o test -m (-i directory of PDB files), result file is in ./

In [None]:
def download_structures(df, pdb_dir='checking'):   
    for i, row in df.iterrows():
        pdb_id = row['PDB IDs']
        pdbl.retrieve_pdb_file(pdb_id, pdir=pdb_dir, file_format='pdb')
        pass