In [1]:
from Bio.PDB import PDBList
import os
import requests
import pandas as pd
import subprocess
import time
import tempfile

import asyncio
import httpx
import nest_asyncio

import duckdb as db
import numpy as np
import concurrent.futures

In [2]:
# Load the sample of protein pairs from CSV, using the first CSV column as the
# index, then show the frame as the cell output.
df_test = pd.read_csv('./pair_sample.csv', index_col=0)
df_test

Unnamed: 0,meso_pid,thermo_pid,meso_pdb,thermo_pdb
0,P9WJA3,A0A1M6N9Z6,1Y5H,
1,I6XFS7,A0A1M6WSV2,6M1C,
2,Q65EQ1,A0A521F3Z2,6NKG,
3,F5HRS7,A0A2T0LBQ2,7QH4,
4,P9WHM1,C7MUW2,3LP6,
...,...,...,...,...
95,A0A4Z0GXN3,A0A1G7W5M9,,
96,G4H893,A0A1W6VMF1,,
97,A0A120GMI5,Q5L0I9,,
98,A0A4V2YRI4,A0A3N2GW27,,


In [3]:
async def download_aff(session, url, filename):
    """Fetch ``url`` through ``session`` and save the response body to ``filename``.

    Args:
        session: Object exposing an awaitable ``get(url)``; in this notebook an
            ``httpx.AsyncClient``.
        url: Address to request.
        filename: Destination path. It is opened in binary write mode only when
            the response status code is 200.

    Returns:
        bool: True when the status code was 200 and the content was written;
        False for any other status code or when ``httpx.RequestError`` is raised.
    """
    try:
        reply = await session.get(url)
        status = reply.status_code

        # Guard clause: anything other than 200 is reported and counted as a failure.
        if status != 200:
            print(f"Failed to download file: {filename}. Status code: {status}")
            return False

        with open(filename, 'wb') as out_file:
            out_file.write(reply.content)
        print(f"Downloaded file: {filename}")
        return True
    except httpx.RequestError as exc:
        print(f"Error while downloading file: {filename}. Exception: {str(exc)}")
        return False

async def download_af(row, u_column, pdb_dir):
    """Download the AlphaFold model for the ID stored on ``row``.

    Args:
        row: Record exposing the ID as the attribute named by ``u_column``.
        u_column: Name of the attribute on ``row`` that holds the UniProt ID.
        pdb_dir: Directory prefix for the output file ``<pdb_dir>/<id>.pdb``.

    Returns:
        The value returned by ``download_aff`` for this request.
    """
    accession = getattr(row, u_column)
    target_path = f'{pdb_dir}/{accession}.pdb'
    source_url = f'https://alphafold.ebi.ac.uk/files/AF-{accession}-F1-model_v4.pdb'

    # A fresh client is opened for this single request and closed on exit.
    async with httpx.AsyncClient() as client:
        return await download_aff(client, source_url, target_path)

def run_download_af_all(df, pdb_column, u_column, pdb_dir):
    """Download AlphaFold models for every row of ``df`` that has no PDB ID.

    Rows whose ``pdb_column`` value is missing (``pd.isna``) are passed to
    ``download_af``; all of those downloads are awaited together and a summary
    line is printed.

    Args:
        df: DataFrame holding the ``pdb_column`` and ``u_column`` columns.
        pdb_column: Column with PDB IDs; a missing value triggers a download.
        u_column: Column with the ID handed to ``download_af``.
        pdb_dir: Output directory; created if it does not exist.

    Returns:
        None. Progress and the final summary are printed.
    """
    # Patch asyncio so asyncio.run() below can be used even if an event loop
    # is already running (presumably the Jupyter kernel's loop).
    nest_asyncio.apply()

    async def download_af_all():
        os.makedirs(pdb_dir, exist_ok=True)

        # NOTE(review): every download is started at once; consider bounding
        # concurrency if this is run on very large frames.
        tasks = [
            asyncio.create_task(download_af(row, u_column, pdb_dir))
            for row in df.itertuples(index=False)
            if pd.isna(getattr(row, pdb_column))
        ]

        results = await asyncio.gather(*tasks)
        success_count = sum(results)

        # Report against the number of downloads attempted rather than the
        # frame length: rows that already have a PDB ID are never requested.
        print(f"Successfully downloaded {success_count} files out of {len(tasks)}")

    asyncio.run(download_af_all())

def download_pdb(df, pdb_column, pdb_dir):
    """Download the PDB entry for every non-missing ID in ``df[pdb_column]``.

    Each entry is fetched with ``PDBList.retrieve_pdb_file`` in ``pdb`` format
    and the resulting ``pdb<id>.ent`` file is renamed to ``<ID>.pdb``.

    Args:
        df: DataFrame containing ``pdb_column``.
        pdb_column: Name of the column holding PDB IDs (missing values are skipped).
        pdb_dir: Directory the structure files are stored in.

    Returns:
        None. A message is printed for each entry that could not be fetched.
    """
    pdbl = PDBList()
    for pdb_id in df[pdb_column]:
        if pd.isna(pdb_id):  # rows without a PDB ID have nothing to fetch
            continue

        target_path = os.path.join(pdb_dir, f'{pdb_id}.pdb')
        # The rename below hides the file from PDBList's own "already
        # downloaded" check, so skip entries fetched by an earlier run here.
        if os.path.exists(target_path):
            continue

        pdbl.retrieve_pdb_file(pdb_id, pdir=pdb_dir, file_format='pdb')
        fetched_path = os.path.join(pdb_dir, f'pdb{pdb_id.lower()}.ent')
        if os.path.exists(fetched_path):
            os.rename(fetched_path, target_path)
        else:
            # Surface the failure instead of dropping it silently.
            print(f"Failed to download PDB entry: {pdb_id}")

def download_structure(df, pdb_column, u_column, pdb_dir):
    """Fetch all structures for ``df`` into ``pdb_dir`` and print the elapsed time.

    Args:
        df: DataFrame forwarded to ``download_pdb`` and ``run_download_af_all``.
        pdb_column: Name of the column holding PDB IDs.
        u_column: Name of the column holding the IDs used for AlphaFold downloads.
        pdb_dir: Output directory; created when it does not exist yet.

    Returns:
        None.
    """
    started = time.time()

    directory_missing = not os.path.exists(pdb_dir)
    if directory_missing:
        os.makedirs(pdb_dir)

    # PDB entries first, then AlphaFold models for rows without a PDB ID.
    download_pdb(df, pdb_column, pdb_dir)
    run_download_af_all(df, pdb_column, u_column, pdb_dir)

    print(f"Execution time: {time.time() - started} seconds")

In [3]:
def run_fatcat_dict(df, pdb_dir):
    """Run FATCAT on every pair in ``df`` and flag significant alignments.

    For each side of a pair the structure file is ``<meso_pdb>.pdb`` /
    ``<thermo_pdb>.pdb`` when that column is set, otherwise ``<meso_pid>.pdb`` /
    ``<thermo_pid>.pdb``. Both files must exist in ``pdb_dir``.

    Args:
        df: DataFrame with ``meso_pid``, ``thermo_pid``, ``meso_pdb``,
            ``thermo_pdb`` and ``pair_id`` columns.
        pdb_dir: Directory holding the structure files; passed to FATCAT via ``-i``.

    Returns:
        list[dict]: One ``{'pair_id': ..., 'p_value': ...}`` record per row, in
        row order. Despite the key name, ``p_value`` is a flag: 1 when FATCAT's
        P-value is below 0.05, 0 otherwise, and NaN when a structure file is
        missing or FATCAT printed no parsable P-value.
    """
    p_values = []

    for _, row in df.iterrows():
        # Prefer the PDB ID; fall back to the protein ID when it is missing.
        p1 = row['meso_pid'] if pd.isna(row['meso_pdb']) else row['meso_pdb']
        p2 = row['thermo_pid'] if pd.isna(row['thermo_pdb']) else row['thermo_pdb']
        pair_id = row['pair_id']

        p1_file = f'{p1}.pdb'
        p2_file = f'{p2}.pdb'
        have_both = (
            os.path.exists(os.path.join(pdb_dir, p1_file))
            and os.path.exists(os.path.join(pdb_dir, p2_file))
        )
        if not have_both:
            # Keep the row but mark it as not comparable.
            p_values.append({'pair_id': pair_id, 'p_value': np.nan})
            continue

        cmd = ['FATCAT', '-p1', p1_file, '-p2', p2_file, '-i', pdb_dir, '-q']
        result = subprocess.run(cmd, capture_output=True, text=True)

        # A default of None keeps a failed FATCAT run from raising
        # StopIteration and discarding every result collected so far.
        p_value_line = next(
            (line for line in result.stdout.split('\n') if line.startswith("P-value")),
            None,
        )
        p_value = None
        if p_value_line is not None:
            try:
                p_value = float(p_value_line.split()[1])
            except (IndexError, ValueError):
                p_value = None

        if p_value is None:
            print(f"Error processing pair {pair_id}: no P-value in FATCAT output")
            p_values.append({'pair_id': pair_id, 'p_value': np.nan})
            continue

        # Both outcomes use the same 'pair_id' key so the records are uniform.
        p_values.append({'pair_id': pair_id, 'p_value': 1 if p_value < 0.05 else 0})

    return p_values

In [12]:
# Open the DuckDB database file and load the ID columns of pairpro.final
# (at most 10,000 rows) into a DataFrame via the result's .df() method.
# NOTE(review): LIMIT without ORDER BY — which rows are returned is presumably
# engine-dependent; confirm if a stable sample is needed.
conn = db.connect("./pairpro_50k.db")
df = conn.execute("""SELECT thermo_pid, meso_pid, thermo_pdb, meso_pdb, pair_id FROM pairpro.final LIMIT 10000""").df()

In [8]:
def compare_fatcat(p1_file, p2_file, pdb_dir):
    """Run FATCAT on two structure files and return the reported P-value.

    Args:
        p1_file: File name of the first structure, relative to ``pdb_dir``.
        p2_file: File name of the second structure, relative to ``pdb_dir``.
        pdb_dir: Directory passed to FATCAT via ``-i``.

    Returns:
        float: Second whitespace-separated field of the first output line that
        starts with ``P-value``.

    Raises:
        RuntimeError: FATCAT's output contains no ``P-value`` line. The message
            carries the exit code and stderr so the caller can report why.
    """
    cmd = ['FATCAT', '-p1', p1_file, '-p2', p2_file, '-i', pdb_dir, '-q']
    result = subprocess.run(cmd, capture_output=True, text=True)

    for line in result.stdout.split('\n'):
        if line.startswith("P-value"):
            return float(line.split()[1])

    # Raise a descriptive error instead of letting a bare StopIteration
    # (whose str() is empty) escape to the caller.
    stderr_text = (result.stderr or '').strip()
    raise RuntimeError(
        f"FATCAT produced no P-value for {p1_file} vs {p2_file} "
        f"(exit code {result.returncode}): {stderr_text}"
    )

def run_fatcat_dict_2(df, pdb_dir):
    """Run FATCAT on every pair in ``df`` concurrently and flag significant ones.

    Each pair with both structure files present is submitted to
    ``compare_fatcat`` on a worker pool; pairs with a missing file are recorded
    as NaN without running FATCAT.

    NOTE: a later cell in this notebook defines another ``run_fatcat_dict_2``
    with the signature ``(df, pdb_dir, conn)``; whichever cell ran last wins.

    Args:
        df: DataFrame with ``meso_pid``, ``thermo_pid``, ``meso_pdb``,
            ``thermo_pdb`` and ``pair_id`` columns.
        pdb_dir: Directory holding the ``<id>.pdb`` structure files.

    Returns:
        list[dict]: ``{'pair_id': ..., 'p_value': ...}`` records. Records for
        pairs with missing files come first (in row order), followed by the
        compared pairs in submission order. ``p_value`` is 1 when the FATCAT
        P-value is below 0.05, 0 otherwise, and NaN when the files are missing
        or ``compare_fatcat`` raised.
    """
    p_values = []

    # Threads rather than processes: every task only waits on an external
    # FATCAT process, and a process pool cannot run functions defined in an
    # interactive session (see the ProcessPoolExecutor documentation).
    with concurrent.futures.ThreadPoolExecutor(max_workers=os.cpu_count()) as executor:
        futures = []
        for _, row in df.iterrows():
            # Prefer the PDB ID; fall back to the protein ID when it is missing.
            p1 = row['meso_pid'] if pd.isna(row['meso_pdb']) else row['meso_pdb']
            p2 = row['thermo_pid'] if pd.isna(row['thermo_pdb']) else row['thermo_pdb']

            p1_file = f'{p1}.pdb'
            p2_file = f'{p2}.pdb'
            have_both = (
                os.path.exists(os.path.join(pdb_dir, p1_file))
                and os.path.exists(os.path.join(pdb_dir, p2_file))
            )
            if not have_both:
                # Keep the row but mark it as not comparable.
                p_values.append({'pair_id': row['pair_id'], 'p_value': np.nan})
                continue

            future = executor.submit(compare_fatcat, p1_file, p2_file, pdb_dir)
            futures.append((future, row['pair_id']))

        # Collect results in submission order.
        for future, pair_id in futures:
            try:
                p_value = future.result()
            except Exception as e:
                # One failed comparison must not lose the other results.
                p_values.append({'pair_id': pair_id, 'p_value': np.nan})
                print(f"Error processing pair {pair_id}: {str(e)}")
                continue
            p_values.append({'pair_id': pair_id, 'p_value': 1 if p_value < 0.05 else 0})

    return p_values

In [13]:
# Write the queried pairs to test.csv without the index column.
df.to_csv('test.csv', index=False)

In [14]:
# Close the database connection opened for the query.
conn.close()

In [None]:
def run_fatcat(df, pdb_dir):
    """Run FATCAT on every pair in ``df`` and store a significance flag per row.

    For each side of a pair the structure file is ``<meso_pdb>.pdb`` /
    ``<thermo_pdb>.pdb`` when that column is set, otherwise ``<meso_pid>.pdb`` /
    ``<thermo_pid>.pdb``. Both files must exist in ``pdb_dir``.

    Args:
        df: DataFrame with ``meso_pid``, ``thermo_pid``, ``meso_pdb`` and
            ``thermo_pdb`` columns. It is modified in place.
        pdb_dir: Directory holding the structure files; passed to FATCAT via ``-i``.

    Returns:
        The same ``df`` object with a ``p_value`` column: the string ``'1'``
        when FATCAT's P-value is below 0.05, the string ``'0'`` otherwise, and
        NaN when a structure file is missing or FATCAT printed no parsable
        P-value.
    """
    p_values = []

    for _, row in df.iterrows():
        # Prefer the PDB ID; fall back to the protein ID when it is missing.
        p1 = row['meso_pid'] if pd.isna(row['meso_pdb']) else row['meso_pdb']
        p2 = row['thermo_pid'] if pd.isna(row['thermo_pdb']) else row['thermo_pdb']

        p1_file = f'{p1}.pdb'
        p2_file = f'{p2}.pdb'
        have_both = (
            os.path.exists(os.path.join(pdb_dir, p1_file))
            and os.path.exists(os.path.join(pdb_dir, p2_file))
        )
        if not have_both:
            # Keep the row but mark it as not comparable.
            p_values.append(np.nan)
            continue

        cmd = ['FATCAT', '-p1', p1_file, '-p2', p2_file, '-i', pdb_dir, '-q']
        result = subprocess.run(cmd, capture_output=True, text=True)

        # A default of None keeps a failed FATCAT run from raising
        # StopIteration and discarding every result collected so far.
        p_value_line = next(
            (line for line in result.stdout.split('\n') if line.startswith("P-value")),
            None,
        )
        p_value = None
        if p_value_line is not None:
            try:
                p_value = float(p_value_line.split()[1])
            except (IndexError, ValueError):
                p_value = None

        if p_value is None:
            print(f"Error running FATCAT on {p1_file} and {p2_file}: no P-value in output")
            p_values.append(np.nan)
            continue

        # Flags are stored as strings, as callers of this function may expect.
        p_values.append(str(1) if p_value < 0.05 else str(0))

    df.loc[:, 'p_value'] = p_values  # one entry per row, in row order
    return df

In [None]:
def run_fatcat_dict_2(df, pdb_dir, conn):
    """Run FATCAT on every pair in ``df`` and write the flag to ``pairpro.final``.

    NOTE: this reuses the name of the earlier executor-based
    ``run_fatcat_dict_2(df, pdb_dir)`` in this notebook and replaces it once
    this cell is run.

    Args:
        df: DataFrame with ``meso_pid``, ``thermo_pid``, ``meso_pdb``,
            ``thermo_pdb`` and ``pair_id`` columns.
        pdb_dir: Directory holding the ``<id>.pdb`` structure files; passed to
            FATCAT via ``-i``.
        conn: Open database connection providing ``cursor()`` and ``commit()``.

    Returns:
        None. For each row, ``pairpro.final.p_value`` is set to 1 when FATCAT's
        P-value is below 0.05, 0 otherwise, and NULL when a structure file is
        missing or FATCAT printed no parsable P-value.
    """
    cursor = conn.cursor()

    # table_info rows are (cid, name, type, ...): compare against the name
    # field, otherwise the check never matches and ALTER TABLE runs (and
    # fails) on every call after the first.
    cursor.execute("PRAGMA table_info(pairpro.final)")
    columns = cursor.fetchall()
    if not any(len(col) > 1 and col[1] == "p_value" for col in columns):
        cursor.execute("ALTER TABLE pairpro.final ADD COLUMN p_value REAL")
        conn.commit()

    for _, row in df.iterrows():
        # Prefer the PDB ID; fall back to the protein ID when it is missing.
        p1 = row['meso_pid'] if pd.isna(row['meso_pdb']) else row['meso_pdb']
        p2 = row['thermo_pid'] if pd.isna(row['thermo_pdb']) else row['thermo_pdb']

        p1_file = f'{p1}.pdb'
        p2_file = f'{p2}.pdb'
        pair_id = row['pair_id']
        if isinstance(pair_id, np.generic):
            # Hand the driver a plain Python scalar for parameter binding.
            pair_id = pair_id.item()

        p_value = None
        have_both = (
            os.path.exists(os.path.join(pdb_dir, p1_file))
            and os.path.exists(os.path.join(pdb_dir, p2_file))
        )
        if have_both:
            cmd = ['FATCAT', '-p1', p1_file, '-p2', p2_file, '-i', pdb_dir, '-q']
            result = subprocess.run(cmd, capture_output=True, text=True)

            p_value_line = next(
                (line for line in result.stdout.split('\n') if line.startswith("P-value")),
                None,
            )
            if p_value_line is not None:
                try:
                    p_value = float(p_value_line.split()[1])
                except (IndexError, ValueError):
                    p_value = None
            if p_value is None:
                print(f"Error processing pair {pair_id}: no P-value in FATCAT output")

        # An unknown result stays NULL; comparing NaN with 0.05 would have
        # silently recorded it as 0 ("not significant").
        if p_value is None:
            flag = None
        else:
            flag = 1 if p_value < 0.05 else 0

        # Bound parameters instead of string formatting: handles NULL and
        # non-numeric pair IDs and avoids SQL injection.
        cursor.execute(
            "UPDATE pairpro.final SET p_value = ? WHERE pair_id = ?",
            [flag, pair_id],
        )

    conn.commit()