In [28]:
from Bio.PDB import PDBList
import os
import requests
import pandas as pd
import subprocess
import time
import tempfile

In [2]:
df_test = pd.read_csv('./pair_sample.csv', index_col=0)
df_test

Unnamed: 0,meso_pid,thermo_pid,meso_pdb,thermo_pdb
0,P9WJA3,A0A1M6N9Z6,1Y5H,
1,I6XFS7,A0A1M6WSV2,6M1C,
2,Q65EQ1,A0A521F3Z2,6NKG,
3,F5HRS7,A0A2T0LBQ2,7QH4,
4,P9WHM1,C7MUW2,3LP6,
...,...,...,...,...
95,A0A4Z0GXN3,A0A1G7W5M9,,
96,G4H893,A0A1W6VMF1,,
97,A0A120GMI5,Q5L0I9,,
98,A0A4V2YRI4,A0A3N2GW27,,


In [79]:
pdbl = PDBList()
pdbl.retrieve_pdb_file('2JWS', file_format='pdb')
pdbl.retrieve_pdb_file('2JWU', file_format='pdb')

Structure exists: '/home/chau_vuong/ValidProt/notebooks/jw/pdb2jws.ent' 
Downloading PDB structure '2jwu'...


'/home/chau_vuong/ValidProt/notebooks/jw/pdb2jwu.ent'

In [10]:
def download_structures(df, pdb_column, u_column, pdb_dir):
    start_time = time.time()  # Start measuring time
    pdbl = PDBList()
    if not os.path.exists(pdb_dir):
        os.makedirs(pdb_dir)
        
    for i, row in df.iterrows():
        pdb_id = row[pdb_column]
        uniprot_id = row[u_column]
        if not pd.isna(pdb_id):  # check for NaN value in PDB IDs column
            pdbl.retrieve_pdb_file(pdb_id, pdir=pdb_dir, file_format='pdb')
            file_path = os.path.join(pdb_dir, f'pdb{pdb_id.lower()}.ent')
            if os.path.exists(file_path):
                os.rename(os.path.join(file_path), os.path.join(pdb_dir, f'{pdb_id}.pdb'))
            else:
                pass
        elif isinstance(uniprot_id, str):  # download structure using UniProt ID
            url = f'https://alphafold.ebi.ac.uk/files/AF-{uniprot_id}-F1-model_v4.pdb'
            response = requests.get(url)
            if response.ok:
                filename = f'{pdb_dir}/{uniprot_id}.pdb'
                with open(filename, 'wb') as f:
                    f.write(response.content)
                print(f"Downloaded file for {uniprot_id} to {filename}")
            else:
                print(f"Failed to download file for {uniprot_id}: {response.status_code} - {response.reason}")
        else:
            print(f"No PDB ID or UniProt ID available for index {i}")
        end_time = time.time()  # Stop measuring time
    execution_time = end_time - start_time
    print(f"Execution time: {execution_time} seconds")
    pass

In [11]:
download_structures(df=df_test, pdb_column='meso_pdb', u_column='meso_pid', pdb_dir='checking')

Structure exists: 'checking/pdb1y5h.ent' 
Structure exists: 'checking/pdb6m1c.ent' 
Structure exists: 'checking/pdb6nkg.ent' 
Downloading PDB structure '7qh4'...
Desired structure doesn't exist
Structure exists: 'checking/pdb3lp6.ent' 
Structure exists: 'checking/pdb3tfx.ent' 
Structure exists: 'checking/pdb8a63.ent' 
Structure exists: 'checking/pdb5y63.ent' 
Structure exists: 'checking/pdb3eul.ent' 
Structure exists: 'checking/pdb4qwq.ent' 
Downloading PDB structure '7oii'...
Desired structure doesn't exist
Downloading PDB structure '7nhn'...
Desired structure doesn't exist
Downloading PDB structure '7nhm'...
Desired structure doesn't exist
Structure exists: 'checking/pdb3on1.ent' 
Structure exists: 'checking/pdb6ujk.ent' 
Downloading PDB structure '7qh4'...
Desired structure doesn't exist
Structure exists: 'checking/pdb2g3b.ent' 
Structure exists: 'checking/pdb6wsh.ent' 
Downloading PDB structure '7qh4'...
Desired structure doesn't exist
Structure exists: 'checking/pdb5zul.ent' 
Stru

In [24]:
download_structures(df=df_test, pdb_column='thermo_pdb', u_column='thermo_pid', pdb_dir='checking')

Downloaded file for A0A1M6N9Z6 to checking/A0A1M6N9Z6.pdb
Downloaded file for A0A1M6WSV2 to checking/A0A1M6WSV2.pdb
Downloaded file for A0A521F3Z2 to checking/A0A521F3Z2.pdb
Downloaded file for A0A2T0LBQ2 to checking/A0A2T0LBQ2.pdb
Downloaded file for C7MUW2 to checking/C7MUW2.pdb
Downloaded file for A0A7W9YPC6 to checking/A0A7W9YPC6.pdb
Downloaded file for A0A087LCG0 to checking/A0A087LCG0.pdb
Downloaded file for I3DYT5 to checking/I3DYT5.pdb
Downloaded file for A0A853ALZ0 to checking/A0A853ALZ0.pdb
Downloaded file for A0A521CL06 to checking/A0A521CL06.pdb
Downloaded file for A0A3N2H419 to checking/A0A3N2H419.pdb
Downloaded file for A0A7V9Z9X5 to checking/A0A7V9Z9X5.pdb
Downloaded file for A0A1G6PFE9 to checking/A0A1G6PFE9.pdb
Downloaded file for A0A4R8LSA2 to checking/A0A4R8LSA2.pdb
Downloaded file for A0A7W3RA64 to checking/A0A7W3RA64.pdb
Downloaded file for A0A540V080 to checking/A0A540V080.pdb
Downloaded file for A0A2G8B7Q0 to checking/A0A2G8B7Q0.pdb
Downloaded file for A0A7W8MUD5

In [57]:
df_cat = df_test.tail(3)
df_cat

Unnamed: 0,meso_pid,thermo_pid,meso_pdb,thermo_pdb
97,A0A120GMI5,Q5L0I9,,
98,A0A4V2YRI4,A0A3N2GW27,,
99,A0A3D9URX8,A0A3N2GYN3,,


In [69]:
def run_fatcat(df):
    p_values = []  # List to store the extracted p-values
    for _, row in df.iterrows():
        if not pd.isna(row['meso_pdb']):
            p1 = row['meso_pdb']
        else:
            p1 = row['meso_pid']
        
        if not pd.isna(row['thermo_pdb']):
            p2 = row['thermo_pdb']
        else:
            p2 = row['thermo_pid']

        # Set the FATCAT command and its arguments
        cmd = ['FATCAT', '-p1', f'{p1}.pdb', '-p2', f'{p2}.pdb', '-i', './checking/', '-o', f'outfile_{p1}_{p2}', '-m']
        
        # Run the FATCAT command
        subprocess.run(cmd)
        
        # Read the output from the temporary file
        with open(f'outfile_{p1}_{p2}.aln', 'r') as result_file:
            for line in result_file:
                if line.startswith('P-value'):
                    p_value = line.split()[1]
                    p_values.append(p_value)
                    break  # Break the loop if p-value is found
    df.loc[:, ('p_value')] = p_values  # Use .loc to set the 'p_value' column
    return df

In [67]:
def run_fatcat(df):
    p_values = []  # List to store the extracted p-values
        # Set the FATCAT command and its arguments
        cmd = ['FATCAT', '-p1', f'{p1}.pdb', '-p2', f'{p2}.pdb', '-i', './checking/', '-o', f'outfile_{p1}_{p2}', '-m']
        
        # Create a temporary file to store the output
        with tempfile.NamedTemporaryFile() as temp_file:
            temp_filename = temp_file.name

            # Run the FATCAT command and redirect the output to the temporary file
            subprocess.run(cmd, stdout=temp_file)
        
        # Read the output from the temporary file
        with open(temp_filename, 'r') as result_file:
            for line in result_file:
                if line.startswith('P-value'):
                    p_value = line.split()[1]
                    p_values.append(p_value)
                    break  # Break the loop if p-value is found
        
        # Delete the temporary file
        os.remove(temp_filename)
    
    df.loc[:, ('p_value')] = p_values  # Use .loc to set the 'p_value' column
    return df


In [80]:
cmd = ['FATCAT', '-p1', f'A0A3D9URX8.pdb', '-p2', f'A0A3N2GYN3.pdb', '-i', './checking/', '-o', f'outfile', '-m']
subprocess.run(cmd)

CompletedProcess(args=['FATCAT', '-p1', 'A0A3D9URX8.pdb', '-p2', 'A0A3N2GYN3.pdb', '-i', './checking/', '-o', 'outfile', '-m'], returncode=1)

In [None]:
# Create a temporary file for the output
with tempfile.NamedTemporaryFile(mode='w', delete=False) as temp_file:
    output_filename = temp_file.name

# Run the FATCAT program
cmd = ['FATCAT', '-p1', 'A0A3D9URX8.pdb', '-p2', 'A0A3N2GYN3.pdb', '-i', './checking/', '-o', 'outfile', '-m']
subprocess.run(cmd)

# Open the temporary file and parse the p-value
p_values = []
with open(output_filename, 'r') as result_file:
    for line in result_file:
        if line.startswith('P-value'):
            p_value = line.split()[1]
            p_values.append(p_value)

# Remove the temporary file
# Note: If you want to keep the file for further use, you can omit this step
os.remove(output_filename)


In [70]:
run_fatcat(df_cat)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, ('p_value')] = p_values  # Use .loc to set the 'p_value' column


Unnamed: 0,meso_pid,thermo_pid,meso_pdb,thermo_pdb,p_value
97,A0A120GMI5,Q5L0I9,,,0.0
98,A0A4V2YRI4,A0A3N2GW27,,,2.47e-12
99,A0A3D9URX8,A0A3N2GYN3,,,0.0


In [None]:
def perform_flexible_alignment(protein_pairs, structures, chunk_size):
    # Implement performing flexible alignment on the 3D structures of protein pairs in chunks
    alignment_metrics = []

    for index, row in protein_pairs.iterrows():
        # TODO: Replace this with the actual subprocess call for structural alignment
        # Example: result = subprocess.run(["your_alignment_tool", "arg1", "arg2"])
        result = None

        alignment_metrics.append({"protein_id_1": row["protein_id_1"],
                                  "protein_id_2": row["protein_id_2"],
                                  "alignment_metric": result})

    return pd.DataFrame(alignment_metrics)

In [3]:
file_directory = './checking'
files_to_rename = ['pdb2abc.ent', 'pdb1y5h.ent']

for filename in files_to_rename:
        file_path = os.path.join(file_directory, filename)
        if os.path.exists(file_path):
            new_file_path = os.path.join('./checking', '1Y5H.pdb')
            os.rename(file_path, new_file_path)
        else:
            print(f"The file {file_path} does not exist.")

The file ./checking/pdb2abc.ent does not exist.


In [None]:
# fatcat command: 
# FATCATQue.pl timeused pair.list -q >pair.aln (only work when files are in the same directory)
# FATCAT -p1 A0JNW5.pdb -p2 A1L1K8.pdb -o A0JNW5_A1L1K8 -m
# FATCAT -p1 P0A9P0.pdb -p2 Q9H3D4.pdb -i ./checking -o test -m (-i directory of PDB files), result file is in ./

In [None]:
def download_structures(df, pdb_dir='checking'):   
    for i, row in df.iterrows():
        pdb_id = row['PDB IDs']
        pdbl.retrieve_pdb_file(pdb_id, pdir=pdb_dir, file_format='pdb')
        pass