In [1]:
from Bio.PDB import PDBList
import os
import requests
import pandas as pd
import subprocess
import time
import tempfile

import asyncio
import httpx
import nest_asyncio

import duckdb as db
import numpy as np

Unnamed: 0,thermo_pid,meso_pid,local_gap_compressed_percent_id,scaled_local_query_percent_id,scaled_local_symmetric_percent_id,local_E_value,query_align_start,query_align_end,subject_align_end,subject_align_start,...,bit_score,thermo_taxid,meso_taxid,m_ogt,t_ogt,ogt_difference,m_protein_seq,t_protein_seq,meso_pdb,thermo_pdb
0,A0A2W7RQ16,A0A327VJ32,0.303279,0.291339,0.290196,2.17e-12,7,127,128,7,...,132.0,1004304,1539050,25.0,45.0,20.0,MTRINYSSGATWEDKVGYSRAVRTGNIIEVSGTVAEDEGKVVAEGN...,MHYQRISSGSVYEEQMCYSRAVVAAPFIFVSGTTGFNYATMTIADD...,,
1,A0A2W7RTL2,A0A327VMA9,0.644068,0.567164,0.584615,4.18e-24,6,64,61,3,...,199.0,1004304,1539050,25.0,45.0,20.0,MQTGVVKFFNETKGFGFIKIEGTNQEIFVHVSGIKESIGENDRVVF...,MSTKITGTVKFFNEEKGFGFIKHDDSNKETFVHANGLIDQIEANDK...,,
2,A0A2W7RYG8,A0A327W1Z1,0.538462,0.477273,0.473684,4.34e-40,12,128,130,14,...,317.0,1004304,1539050,25.0,45.0,20.0,MARKGNKNKRRNLEKLSHKYRLVIMNDETYDEVTSFKLSRMSVYIA...,MAKQIFETNTFKRLRNQYRLVIINDDTYEELVTFKLSRLSVYVAFS...,,
3,A0A2W7RR29,A0A327W5V0,0.660714,0.59919,0.609053,7.72e-113,24,247,239,16,...,818.0,1004304,1539050,25.0,45.0,20.0,MMSSMSKNINTHLVEIKDYFKVAISVDCVIFGFNNDELKVLLIESD...,MKPKSVQTEPAAQRPLITDVKALVNSYPRVPITVDCVIFGFDGEEL...,,
4,A0A2W7RR29,A0A327VM41,0.414747,0.364372,0.376569,4.44e-58,28,243,227,11,...,456.0,1004304,1539050,25.0,45.0,20.0,MNAASFYSKAPRHLVAVDCIIFGFDEGKLKLLVIKRKVAPMAGEWS...,MKPKSVQTEPAAQRPLITDVKALVNSYPRVPITVDCVIFGFDGEEL...,,


In [3]:
# Load the sample of meso/thermo protein pairs; the first CSV column is the row index.
df_test = pd.read_csv(filepath_or_buffer='./pair_sample.csv', index_col=0)
df_test

Unnamed: 0,meso_pid,thermo_pid,meso_pdb,thermo_pdb
0,P9WJA3,A0A1M6N9Z6,1Y5H,
1,I6XFS7,A0A1M6WSV2,6M1C,
2,Q65EQ1,A0A521F3Z2,6NKG,
3,F5HRS7,A0A2T0LBQ2,7QH4,
4,P9WHM1,C7MUW2,3LP6,
...,...,...,...,...
95,A0A4Z0GXN3,A0A1G7W5M9,,
96,G4H893,A0A1W6VMF1,,
97,A0A120GMI5,Q5L0I9,,
98,A0A4V2YRI4,A0A3N2GW27,,


In [9]:
async def download_aff(session, url, filename):
    """Fetch ``url`` through ``session`` and save the response body to ``filename``.

    Args:
        session: Object with an awaitable ``get(url)`` method; ``download_af``
            passes an ``httpx.AsyncClient``.
        url: Address of the file to fetch.
        filename: Destination path, opened in binary write mode.

    Returns:
        True if the response status was 200 and the body was written,
        False for any other status code or when ``httpx.RequestError`` is raised.
    """
    try:
        reply = await session.get(url)
        status = reply.status_code

        # Anything other than 200 is reported and nothing is written to disk.
        if status != 200:
            print(f"Failed to download file: {filename}. Status code: {status}")
            return False

        with open(filename, 'wb') as out:
            out.write(reply.content)
        print(f"Downloaded file: {filename}")
        return True
    except httpx.RequestError as err:
        print(f"Error while downloading file: {filename}. Exception: {str(err)}")
        return False

async def download_af(row, u_column, pdb_dir):
    """Download the AlphaFold model for the UniProt accession held in ``row``.

    Args:
        row: Record whose attribute named ``u_column`` holds the accession
            (``run_download_af_all`` passes tuples from ``DataFrame.itertuples``).
        u_column: Name of the attribute to read the accession from.
        pdb_dir: Directory the file is written to, as ``<accession>.pdb``.

    Returns:
        The boolean result of ``download_aff`` for this accession.
    """
    accession = getattr(row, u_column)
    target = f'{pdb_dir}/{accession}.pdb'
    source = f'https://alphafold.ebi.ac.uk/files/AF-{accession}-F1-model_v4.pdb'

    # A dedicated client is opened for this single request and closed afterwards.
    async with httpx.AsyncClient() as client:
        return await download_aff(client, source, target)

def run_download_af_all(df, u_column, pdb_dir, max_concurrent=20):
    """Download AlphaFold models for every row of ``df`` with bounded concurrency.

    One ``download_af`` coroutine is scheduled per row, but at most
    ``max_concurrent`` of them run at any moment. Without that bound every row
    opens its own HTTP client simultaneously, which does not scale to tables
    with tens of thousands of rows.

    Args:
        df: DataFrame holding the accessions to download.
        u_column: Name of the column with the UniProt accessions.
        pdb_dir: Output directory; created if it does not exist.
        max_concurrent: Maximum number of downloads in flight at once (>= 1).

    Raises:
        ValueError: If ``max_concurrent`` is smaller than 1.
    """
    if max_concurrent < 1:
        raise ValueError("max_concurrent must be at least 1")

    # Allows asyncio.run() to be called from inside an already running loop
    # (the notebook kernel).
    nest_asyncio.apply()

    async def download_af_all():
        os.makedirs(pdb_dir, exist_ok=True)

        # Created inside the coroutine so it belongs to the loop that runs it.
        gate = asyncio.Semaphore(max_concurrent)

        async def bounded_download(row):
            async with gate:
                return await download_af(row, u_column, pdb_dir)

        results = await asyncio.gather(
            *(bounded_download(row) for row in df.itertuples(index=False))
        )
        # download_af yields True/False, so the sum is the number of successes.
        success_count = sum(results)

        print(f"Successfully downloaded {success_count} files out of {len(df)}")

    asyncio.run(download_af_all())

In [10]:
def download_pdb(df, pdb_column, pdb_dir):
    """Download the PDB entries listed in ``df[pdb_column]`` into ``pdb_dir``.

    Each entry ends up as ``<pdb_id>.pdb`` (ID spelled as in the DataFrame).
    Missing values are ignored, repeated IDs are handled once, and IDs whose
    ``<pdb_id>.pdb`` file already exists are not fetched again.

    Args:
        df: DataFrame holding the PDB IDs.
        pdb_column: Name of the column with the PDB IDs (NaN/None = no entry).
        pdb_dir: Directory the structure files are stored in.

    Returns:
        None. IDs for which no file appears after the retrieval attempt are
        skipped without raising.
    """
    pdbl = None  # created lazily: not needed when there is nothing to fetch

    for pdb_id in df[pdb_column].dropna().unique():
        target = os.path.join(pdb_dir, f'{pdb_id}.pdb')
        if os.path.exists(target):
            # Already downloaded (earlier run or earlier call); keep the cached copy.
            continue

        if pdbl is None:
            pdbl = PDBList()
        pdbl.retrieve_pdb_file(pdb_id, pdir=pdb_dir, file_format='pdb')

        # The retrieved file is expected at pdb<id>.ent; rename it to <ID>.pdb.
        fetched = os.path.join(pdb_dir, f'pdb{pdb_id.lower()}.ent')
        if os.path.exists(fetched):
            os.replace(fetched, target)

In [11]:
def download_structure(df, pdb_column, u_column, pdb_dir):
    """Download PDB entries and AlphaFold models for ``df`` and report the time taken.

    Args:
        df: DataFrame with the identifiers.
        pdb_column: Column with PDB IDs, forwarded to ``download_pdb``.
        u_column: Column with UniProt accessions, forwarded to
            ``run_download_af_all``.
        pdb_dir: Directory both downloaders write into; created if absent.
    """
    started = time.time()

    pdb_dir_missing = not os.path.exists(pdb_dir)
    if pdb_dir_missing:
        os.makedirs(pdb_dir)

    # PDB entries are fetched first, then the AlphaFold models.
    download_pdb(df, pdb_column, pdb_dir)
    run_download_af_all(df, u_column, pdb_dir)

    elapsed = time.time() - started
    print(f"Execution time: {elapsed} seconds")

In [28]:
# Fetch the structures of the thermophilic partner of every pair into ./af
download_structure(df_test, pdb_column='thermo_pdb', u_column='thermo_pid', pdb_dir='af')

Downloaded file: af/A0A1M6N9Z6.pdb
Downloaded file: af/A0A1M6WSV2.pdb
Downloaded file: af/A0A521F3Z2.pdb
Downloaded file: af/A0A2T0LBQ2.pdb
Downloaded file: af/C7MUW2.pdb
Downloaded file: af/A0A087LCG0.pdb
Downloaded file: af/I3DYT5.pdb
Downloaded file: af/A0A853ALZ0.pdb
Downloaded file: af/A0A7W9YPC6.pdb
Downloaded file: af/A0A1G6PFE9.pdb
Downloaded file: af/A0A521CL06.pdb
Downloaded file: af/A0A3N2H419.pdb
Downloaded file: af/A0A7V9Z9X5.pdb
Downloaded file: af/A0A7W3RA64.pdb
Downloaded file: af/A0A540V080.pdb
Downloaded file: af/A0A4R8LSA2.pdb
Downloaded file: af/A0A2G8B7Q0.pdb
Downloaded file: af/A0A7W8MUD5.pdb
Downloaded file: af/A0A7R7TZJ8.pdb
Downloaded file: af/A0A3N2GXT6.pdb
Downloaded file: af/A0A4R3N1Y2.pdb
Downloaded file: af/A0A2T0YIG1.pdb
Downloaded file: af/A0A2T0LAM4.pdb
Downloaded file: af/D4YUW7.pdb
Downloaded file: af/A0A2G8BIW3.pdb
Downloaded file: af/I1D3S8.pdb
Downloaded file: af/A0A7W0C158.pdb
Downloaded file: af/A0A090J3Q8.pdb
Downloaded file: af/A0A540V080.pdb
D

In [29]:
# Fetch the structures of the mesophilic partner of every pair into ./af
download_structure(df_test, pdb_column='meso_pdb', u_column='meso_pid', pdb_dir='af')

Downloading PDB structure '1y5h'...
Downloading PDB structure '6m1c'...
Downloading PDB structure '6nkg'...
Downloading PDB structure '7qh4'...
Desired structure doesn't exist
Downloading PDB structure '3lp6'...
Downloading PDB structure '3tfx'...
Downloading PDB structure '8a63'...
Downloading PDB structure '5y63'...
Downloading PDB structure '3eul'...
Downloading PDB structure '4qwq'...
Downloading PDB structure '7oii'...
Desired structure doesn't exist
Downloading PDB structure '7nhn'...
Desired structure doesn't exist
Downloading PDB structure '7nhm'...
Desired structure doesn't exist
Downloading PDB structure '3on1'...
Downloading PDB structure '6ujk'...
Downloading PDB structure '7qh4'...
Desired structure doesn't exist
Downloading PDB structure '2g3b'...
Downloading PDB structure '6wsh'...
Downloading PDB structure '7qh4'...
Desired structure doesn't exist
Downloading PDB structure '5zul'...
Downloading PDB structure '7lza'...
Downloading PDB structure '3ge6'...
Downloading PDB 

In [8]:
def run_fatcat(df, pdb_dir):
    """Run FATCAT on every meso/thermo pair and store a coded result in ``df['p_value']``.

    For each protein the PDB ID is used as the file stem when present,
    otherwise the UniProt accession; the structure file is ``<stem>.pdb``
    inside ``pdb_dir``.

    The ``p_value`` column holds a code, not the raw P-value:
        0 -- FATCAT P-value < 0.05
        1 -- FATCAT P-value >= 0.05
        2 -- at least one structure file is missing; FATCAT was not run

    NOTE: a second ``run_fatcat`` is defined further down in this notebook and
    replaces this one once its cell has been executed.

    Args:
        df: DataFrame with columns ``meso_pdb``, ``meso_pid``, ``thermo_pdb``
            and ``thermo_pid``. It is modified in place.
        pdb_dir: Directory containing the structure files.

    Returns:
        The same DataFrame, with the ``p_value`` column set.

    Raises:
        RuntimeError: If FATCAT's output contains no line starting with "P-value".
    """
    p_values = []  # one code per row, in row order

    for _, row in df.iterrows():
        # Prefer the experimental PDB entry, fall back to the UniProt accession.
        p1 = row['meso_pid'] if pd.isna(row['meso_pdb']) else row['meso_pdb']
        p2 = row['thermo_pid'] if pd.isna(row['thermo_pdb']) else row['thermo_pdb']

        p1_file = f'{p1}.pdb'
        p2_file = f'{p2}.pdb'
        have_both = (os.path.exists(os.path.join(pdb_dir, p1_file))
                     and os.path.exists(os.path.join(pdb_dir, p2_file)))
        if not have_both:
            # Keep the row but mark it as "no structure available".
            p_values.append(2)
            continue

        # -i: data directory for both structures, -q: print alignment to stdout
        cmd = ['FATCAT', '-p1', p1_file, '-p2', p2_file, '-i', pdb_dir, '-q']
        result = subprocess.run(cmd, capture_output=True, text=True)

        p_value_line = next(
            (line for line in result.stdout.splitlines() if line.startswith("P-value")),
            None,
        )
        if p_value_line is None:
            # Without a default, next() would leak a bare StopIteration here.
            raise RuntimeError(
                f"FATCAT produced no P-value for {p1_file} vs {p2_file} "
                f"(exit code {result.returncode}): {result.stderr.strip()}"
            )

        # The P-value is the second whitespace-separated token of that line.
        p_value = float(p_value_line.split()[1])
        p_values.append(0 if p_value < 0.05 else 1)

    df.loc[:, 'p_value'] = p_values  # Use .loc to set the 'p_value' column
    return df

In [None]:
conn = db.connect("./pairpro_50k.db")
df = conn.execute("""SELECT thermo_pid, meso_pid FROM pairpro.final""").df()

# The query returns only the UniProt accessions, but download_structure and
# run_fatcat both read 'meso_pdb' / 'thermo_pdb'. Add them as empty columns
# (no PDB entry known) so those lookups do not raise KeyError.
df = df.assign(meso_pdb=np.nan, thermo_pdb=np.nan)

download_structure(df, 'thermo_pdb','thermo_pid', 'af')
download_structure(df, 'meso_pdb','meso_pid', 'af')
run_fatcat(df, 'af')

In [49]:
# Fixed seed so the same five pairs are drawn on every Restart & Run All.
df_sample = df_test.sample(5, random_state=42)

In [50]:
# Structural comparison of the sampled pairs, reading structures from ./af
df_result_new = run_fatcat(df_sample, 'af')

In [51]:
# Display the sampled pairs together with the 'p_value' column set by run_fatcat.
df_result_new

Unnamed: 0,meso_pid,thermo_pid,meso_pdb,thermo_pdb,p_value
1,I6XFS7,A0A1M6WSV2,6M1C,,0
86,A0A0D5NPS8,A0A1I0T8W6,,,0
53,A0A366XZE1,A0A178TY03,,,0
73,A0A4R6DG32,A0A3N2H011,,,0
5,Q5FJB3,A0A7W9YPC6,3TFX,,0


In [None]:
# Quick fatcat command: 
# FATCATQue.pl timeused pair.list -q >pair.aln (only works when the files are in the same directory)
# FATCAT -p1 A0JNW5.pdb -p2 A1L1K8.pdb -o A0JNW5_A1L1K8 -m
# FATCAT -p1 P0A9P0.pdb -p2 Q9H3D4.pdb -i ./checking -o test -m (-i directory of PDB files), result file is in ./

# FATCAT command help:
# FATCAT <-p1 file> <-p2 file> (the input pdb files)
#   [-o output-initial] (default tmp)
#   [-i string] (data directory for both structures, default ./)
#   [-i1 string] (data directory for 1st structures, default ./)
#   [-i2 string] (data directory for 2st structures, default ./)
#   [-s1 num] (read start position of protein 1, default from the begin)
#   [-s2 num] (read start position of protein 2, default from the begin)
#   [-l1 num] (read length of protein 1, default whole protein)
#   [-l2 num] (read length of protein 2, default whole protein)
#   [-r] (force program run rigid structural alignment, default off)
#   [-filter probcut] (filter the alignment quickly, set a big probcut, eg 0.2, useful in database searching, default off)
#   [-sparse number[0-3]] (sparsely fragment sampling, for speeding up the calculation, default off)
#   [-b] (print a basic report to stdout)
#   [-f] (print a full report to stdout. When -b or -f is on, following options are all automatically off)
#   [-m] (print alignment to a file)
#   [-q] (print alignment to stdout, useful in database-search in queue)
#   [-ab] (print the postscript graph of all AFPs and final AFP chain in black-white to a file)
#   [-ac] (print the postscript graph of all AFPs and final AFP chain in color to a file)
#   [-c] (print AFP chaining result to file.chain.txt)
#   [-t] (print the files of transformed pdbs and corresponding rasmol scripts)
#   [-s] (print the files of superimposed pdbs and corresponding rasmol scripts)
#   [-time] (print the total running time, default off)

In [24]:
def run_fatcat(df, pdb_dir, con=None):
    """Run FATCAT on every meso/thermo pair and write the P-values to ``pairpro.final``.

    For each protein the PDB ID is used as the file stem when present,
    otherwise the UniProt accession; the structure file is ``<stem>.pdb``
    inside ``pdb_dir``. Rows for which either file is missing are dropped.
    The remaining rows get the raw FATCAT P-value in a ``p_value`` column,
    are copied to the temp table ``tmp`` and matched back onto
    ``pairpro.final`` by (meso_pid, thermo_pid).

    NOTE: this redefines the ``run_fatcat`` declared earlier in the notebook;
    that version stores a 0/1/2 code instead of the raw P-value and does not
    touch the database.

    Args:
        df: DataFrame with columns ``meso_pdb``, ``meso_pid``, ``thermo_pdb``
            and ``thermo_pid``. The caller's object is not modified.
        pdb_dir: Directory containing the structure files.
        con: DuckDB connection to write through. Defaults to the notebook-level
            ``conn`` variable, which the original implementation always used.

    Returns:
        The filtered DataFrame with the ``p_value`` column added.

    Raises:
        RuntimeError: If FATCAT's output contains no line starting with "P-value".
    """
    if con is None:
        con = conn  # notebook-level connection; NameError if its cell was not run

    p_values = []  # raw P-values of the rows that are kept
    rows_to_drop = []  # index labels of rows without both structure files

    for index, row in df.iterrows():
        # Prefer the experimental PDB entry, fall back to the UniProt accession.
        p1 = row['meso_pid'] if pd.isna(row['meso_pdb']) else row['meso_pdb']
        p2 = row['thermo_pid'] if pd.isna(row['thermo_pdb']) else row['thermo_pdb']

        p1_file = f'{p1}.pdb'
        p2_file = f'{p2}.pdb'
        have_both = (os.path.exists(os.path.join(pdb_dir, p1_file))
                     and os.path.exists(os.path.join(pdb_dir, p2_file)))
        if not have_both:
            rows_to_drop.append(index)
            continue

        # -i: data directory for both structures, -q: print alignment to stdout
        cmd = ['FATCAT', '-p1', p1_file, '-p2', p2_file, '-i', pdb_dir, '-q']
        result = subprocess.run(cmd, capture_output=True, text=True)

        p_value_line = next(
            (line for line in result.stdout.splitlines() if line.startswith("P-value")),
            None,
        )
        if p_value_line is None:
            # Without a default, next() would leak a bare StopIteration here.
            raise RuntimeError(
                f"FATCAT produced no P-value for {p1_file} vs {p2_file} "
                f"(exit code {result.returncode}): {result.stderr.strip()}"
            )

        # The P-value is the second whitespace-separated token of that line.
        p_values.append(float(p_value_line.split()[1]))

    # Drop the rows with missing structure files (returns a new frame).
    df = df.drop(rows_to_drop)
    df.loc[:, 'p_value'] = p_values  # Use .loc to set the 'p_value' column

    # `df` in the query is the local DataFrame above, so the variable must
    # keep this name.
    cmd_2 = """CREATE OR REPLACE TEMP TABLE tmp
                AS SELECT * FROM df"""
    # IF NOT EXISTS keeps the function re-runnable once the column is there.
    cmd_3 = """ALTER TABLE pairpro.final ADD COLUMN IF NOT EXISTS p_value FLOAT"""
    # `tmp` is a temp table, not part of the pairpro schema, and the target
    # table must not be joined to itself: correlate it with tmp in WHERE.
    cmd_4 = """UPDATE pairpro.final AS f SET p_value = tmp.p_value
                FROM tmp
                WHERE f.meso_pid = tmp.meso_pid
                AND f.thermo_pid = tmp.thermo_pid"""
    con.sql(cmd_2)
    con.execute(cmd_3)
    con.execute(cmd_4)
    con.commit()

    return df

In [None]:
# Pull a 100-row preview of the final table and blank out both PDB-ID
# columns (created if absent), so every row falls back to its UniProt accession.
conn = db.connect("./pairpro_50k.db")
con_df = (
    conn.execute("""SELECT * FROM pairpro.final LIMIT 100""")
    .fetch_df()
    .assign(meso_pdb=np.nan, thermo_pdb=np.nan)
)
con_df