In [1]:
from Bio.PDB import PDBList
import os
import pandas as pd
import subprocess
import time
import ssl

import asyncio
import httpx
import nest_asyncio

import duckdb as db
import numpy as np
from joblib import Parallel, delayed
import csv

In [2]:
async def download_aff(session, url, filename):
    """
    Downloads a file asynchronously using an HTTP session.

    Args:
        session (httpx.AsyncClient): An HTTP session for making requests.
        url (str): The URL of the file to download.
        filename (str): The name of the file to save.

    Returns:
        bool: True if the file is successfully downloaded, False otherwise.
    """
    try:
        response = await session.get(url)
        if response.status_code == 200:
            with open(filename, 'wb') as f:
                f.write(response.content)
            print(f"Downloaded file: {filename}")
            return True
        else:
            print(f"Failed to download file: {filename}. Status code: {response.status_code}")
            return False
    except httpx.RequestError as e:
        print(f"Error while downloading file: {filename}. Exception: {str(e)}")
        return False

async def download_af(row, u_column, pdb_dir):
    """
    Downloads AlphaFold files for a given row asynchronously.

    Args:
        row (pd.Series): The row containing the data for the download.
        u_column (str): The column name for the UniProt ID.
        pdb_dir (str): The directory to save the downloaded files.

    Returns:
        bool: True if the download is successful, False otherwise.
    """
    uniprot_id = getattr(row, u_column)
    url = f'https://alphafold.ebi.ac.uk/files/AF-{uniprot_id}-F1-model_v4.pdb'
    filename = f'{pdb_dir}/{uniprot_id}.pdb'

    async with httpx.AsyncClient(verify=False) as client:  # Disable SSL certificate verification
        success = await download_aff(client, url, filename)
        return success

def run_download_af_all(df, pdb_column, u_column, pdb_dir):
    """
    Runs the asynchronous download of AlphaFold files for all rows in a DataFrame.

    Args:
        df (pd.DataFrame): The DataFrame containing the data for the downloads.
        pdb_column (str): The column name for the PDB ID.
        u_column (str): The column name for the UniProt ID.
        pdb_dir (str): The directory to save the downloaded files.
    
    Returns:
        files: pdb files containing structural information.
    """
    nest_asyncio.apply()

    async def download_af_all():
        tasks = []
        success_count = 0

        if not os.path.exists(pdb_dir):
            os.makedirs(pdb_dir)

        for row in df.itertuples(index=False):
            if pd.isna(getattr(row, pdb_column)):
                task = asyncio.create_task(download_af(row, u_column, pdb_dir))
                tasks.append(task)

        results = await asyncio.gather(*tasks)
        success_count = sum(results)

        print(f"Successfully downloaded {success_count} files out of {len(df)}")

    asyncio.run(download_af_all())

def download_pdb(df, pdb_column, pdb_dir):
    """
    Downloads PDB files for the given DataFrame based on PDB IDs.

    Args:
        df (pd.DataFrame): The DataFrame containing the PDB IDs.
        pdb_column (str): The column name for the PDB ID.
        pdb_dir (str): The directory to save the downloaded files.
    
    Returns: pdb files containing structural information.
    """
    pdbl = PDBList()
    pdbs = df[pdb_column].dropna().unique()
    for p in pdbs:
        pdbl.retrieve_pdb_file(p, pdir=pdb_dir, file_format='pdb')
        file_path = os.path.join(pdb_dir, f'pdb{p.lower()}.ent')
        if os.path.exists(file_path):
            os.rename(file_path, os.path.join(pdb_dir, f'{p}.pdb'))
        else:
            pass

def download_structure(df, pdb_column, u_column, pdb_dir):
    """
    Downloads structure files for a DataFrame using AlphaFold and PDB.

    Args:
        df (pd.DataFrame): The DataFrame containing the data for the downloads.
        pdb_column (str): The column name for the PDB ID.
        u_column (str): The column name for the UniProt ID.
        pdb_dir (str): The directory to save the downloaded files.

    Returns: pdb files containing structural information.
    """
    start_time = time.time()  # Start measuring time
    if not os.path.exists(pdb_dir):
        os.makedirs(pdb_dir)
    download_pdb(df, pdb_column, pdb_dir)    
    run_download_af_all(df, pdb_column, u_column, pdb_dir)
    end_time = time.time()  # Stop measuring time
    execution_time = end_time - start_time
    print(f"Execution time: {execution_time} seconds")
    pass

In [15]:
def compare_fatcat(p1_file, p2_file, pdb_dir, pair_id):
    # Set the FATCAT command and its arguments
    cmd = ['FATCAT', '-p1', p1_file, '-p2', p2_file, '-i', pdb_dir, '-q']

    # Run the FATCAT command and capture the output
    result = subprocess.run(cmd, capture_output=True, text=True)
    output = result.stdout

    # Find the line containing the p-value
    p_value_line = next(line for line in output.split('\n') if line.startswith("P-value"))

    # Extract the p-value and convert it to a numeric value
    p_value = float(p_value_line.split()[1])

    # Check if p-value is less than 0.05 and assign 1 or 0 accordingly
    if p_value < 0.05:
        return {'pair_id': pair_id, 'p_value': 1}
    else:
        return {'pair_id': pair_id, 'p_value': 0}

def process_row(row, pdb_dir):
    if not pd.isna(row['meso_pdb']):
        p1 = row['meso_pdb']
    else:
        p1 = row['meso_pid']

    if not pd.isna(row['thermo_pdb']):
        p2 = row['thermo_pdb']
    else:
        p2 = row['thermo_pid']

    # Check if the structure files exist in the 'checking' folder
    p1_file = f'{p1}.pdb'
    p2_file = f'{p2}.pdb'
    if not os.path.exists(os.path.join(pdb_dir, p1_file)) or not os.path.exists(os.path.join(pdb_dir, p2_file)):
        # Assign NaN as the p-value instead of dropping the row
        return None

    return compare_fatcat(p1_file, p2_file, pdb_dir, row['pair_id'])

def run_fatcat_dict_job(df, pdb_dir):
    p_values = []  # List to store the extracted p-values

    # Parallelize the execution of the function using joblib
    p_values = Parallel(n_jobs=-1)(delayed(process_row)(row, pdb_dir) for _, row in df.iterrows())

    # Filter out None values
    p_values = [p_value for p_value in p_values if p_value is not None]

    with open('output.csv', 'w') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=['pair_id', 'p_value'])
        writer.writeheader()
        writer.writerows(p_values)
    return 'output.csv'

In [13]:
df_test = pd.read_csv('pair_sample.csv')
df_sample = df_test.sample(5)
df_sample = df_sample.drop(columns=['Unnamed: 0'])
df_sample['pair_id'] = np.random.randint(1000, 9999, size=len(df_sample))
df_sample

Unnamed: 0,meso_pid,thermo_pid,meso_pdb,thermo_pdb,pair_id
29,Q81IG4,A0A1I3UXR9,1YQH,,7521
6,Q8Y6Y9,A0A087LCG0,8A63,,1025
34,Q9I5C9,A0A840UQH5,3UMC,,2418
27,A0A063XEI0,A0A090J3Q8,7QGU,,4792
58,A0A7W6BCG0,A0A2S5JGT3,,,2308


In [16]:
download_structure(df_sample, 'meso_pdb', 'meso_pid', 'af')
download_structure(df_sample, 'thermo_pdb', 'thermo_pid', 'af')

Downloading PDB structure '1yqh'...
Downloading PDB structure '8a63'...
Downloading PDB structure '3umc'...
Downloading PDB structure '7qgu'...
Desired structure doesn't exist
Downloaded file: af/A0A7W6BCG0.pdb
Successfully downloaded 1 files out of 5
Execution time: 2.5884745121002197 seconds
Downloaded file: af/A0A087LCG0.pdb
Downloaded file: af/A0A1I3UXR9.pdb
Downloaded file: af/A0A090J3Q8.pdb
Downloaded file: af/A0A840UQH5.pdb
Downloaded file: af/A0A2S5JGT3.pdb
Successfully downloaded 5 files out of 5
Execution time: 0.5814106464385986 seconds


In [17]:
run_fatcat_dict_job(df_sample, 'af')

'output.csv'

In [3]:
conn = db.connect('pairpro_50k.db')
df = conn.execute('SELECT pair_id, thermo_pid, thermo_pdb, meso_pid, meso_pdb FROM pairpro.final USING SAMPLE 2').fetchdf()

In [6]:
download_structure(df, 'meso_pdb', 'meso_pid', 'af')
download_structure(df, 'thermo_pdb', 'thermo_pid', 'af')
run_fatcat_dict_job(df, 'af')

Downloaded file: af/I3CKA5.pdb
Downloaded file: af/A0A4R2LDN2.pdb
Successfully downloaded 2 files out of 2
Execution time: 0.748706579208374 seconds
Downloaded file: af/A0A1I2ISF5.pdb
Downloaded file: af/A0A1I2JES9.pdb
Successfully downloaded 2 files out of 2
Execution time: 0.16399455070495605 seconds


'output.csv'

In [9]:
db_name = 'pairpro'
conn.execute("""CREATE OR REPLACE TEMP TABLE structure_results AS SELECT * 
               FROM read_csv_auto('output.csv', HEADER=TRUE)""")
conn.execute(f"""ALTER TABLE pairpro.final ADD COLUMN structure_match INT""")
conn.execute(f"""UPDATE pairpro.final AS f
SET structure_match = structure.p_value::INT
FROM structure_results AS structure
WHERE structure.pair_id = f.pair_id
""")

<duckdb.DuckDBPyConnection at 0x7f5a58bc1430>

In [11]:
conn.execute('SELECT * FROM pairpro.final WHERE structure_match IS NOT NULL').fetchdf()

Unnamed: 0,thermo_pid,meso_pid,local_gap_compressed_percent_id,scaled_local_query_percent_id,scaled_local_symmetric_percent_id,local_E_value,query_align_start,query_align_end,subject_align_end,subject_align_start,...,meso_taxid,m_ogt,t_ogt,ogt_difference,m_protein_seq,t_protein_seq,meso_pdb,thermo_pdb,pair_id,structure_match
0,A0A1I2JES9,I3CKA5,0.573333,0.554839,0.562092,7.55e-59,1,150,150,1,...,1022,29.0,50.0,21.0,MFHCVLHQPEIPPNTGNVIRLCANTQVQLHLIHPLGFSLDDKRMRR...,MFHVALYQPEIPPNTGNIIRLCANTGAQLHLIHPLGFQLTDKALRR...,,,14029,1
1,A0A1I2ISF5,A0A4R2LDN2,0.294118,0.269231,0.275591,2.85e-12,6,124,121,4,...,1133106,30.0,50.0,20.0,MASKILVVDDEPNILLSLEFLMKHAGFQVRTAGDGDAALAAVATEV...,MDKNMRILIVDDFSTMRRIVKNQLADLGYTNTVEADDGKAAWPILQ...,,,9028,1


In [4]:
conn.close()

In [3]:
def run_fatcat_dict(df, pdb_dir):
    p_values = []  # List to store the extracted p-values

    for index, row in df.iterrows():
        if not pd.isna(row['meso_pdb']):
            p1 = row['meso_pdb']
        else:
            p1 = row['meso_pid']
        
        if not pd.isna(row['thermo_pdb']):
            p2 = row['thermo_pdb']
        else:
            p2 = row['thermo_pid']
        
        # Check if the structure files exist in the 'checking' folder
        p1_file = f'{p1}.pdb'
        p2_file = f'{p2}.pdb'
        if not os.path.exists(os.path.join(pdb_dir, p1_file)) or not os.path.exists(os.path.join(pdb_dir, p2_file)):
            # Assign NaN as the p-value instead of dropping the row
            p_values.append({'pair_id': row['pair_id'], 'p_value': np.nan})
            continue

        # Set the FATCAT command and its arguments
        cmd = ['FATCAT', '-p1', p1_file, '-p2', p2_file, '-i', pdb_dir, '-q']
        
        # Run the FATCAT command and capture the output
        result = subprocess.run(cmd, capture_output=True, text=True)
        output = result.stdout

        # Find the line containing the p-value
        p_value_line = next(line for line in output.split('\n') if line.startswith("P-value"))

        # Extract the p-value and convert it to numeric value
        p_value = float(p_value_line.split()[1])
        
        # Check if p-value is less than 0.05 and assign 1 or 0 accordingly
        if p_value < 0.05:
            p_values.append({'pair_id': row['pair_id'], 'p_value': 1})
        else:
            p_values.append({'pair_pid': row['pair_pid'], 'p_value': 0})

    return p_values

In [1]:
def compare_fatcat(p1_file, p2_file, pdb_dir):
    # Set the FATCAT command and its arguments
    cmd = ['FATCAT', '-p1', p1_file, '-p2', p2_file, '-i', pdb_dir, '-q']

    # Run the FATCAT command and capture the output
    result = subprocess.run(cmd, capture_output=True, text=True)
    output = result.stdout

    # Find the line containing the p-value
    p_value_line = next(line for line in output.split('\n') if line.startswith("P-value"))

    # Extract the p-value and convert it to a numeric value
    p_value = float(p_value_line.split()[1])

    return p_value

def run_fatcat_dict_fut(df, pdb_dir):
    p_values = []  # List to store the extracted p-values

    with concurrent.futures.ProcessPoolExecutor() as executor:
        futures = []
        for index, row in df.iterrows():
            if not pd.isna(row['meso_pdb']):
                p1 = row['meso_pdb']
            else:
                p1 = row['meso_pid']

            if not pd.isna(row['thermo_pdb']):
                p2 = row['thermo_pdb']
            else:
                p2 = row['thermo_pid']

            # Check if the structure files exist in the 'checking' folder
            p1_file = f'{p1}.pdb'
            p2_file = f'{p2}.pdb'
            if not os.path.exists(os.path.join(pdb_dir, p1_file)) or not os.path.exists(os.path.join(pdb_dir, p2_file)):
                # Assign NaN as the p-value instead of dropping the row
                p_values.append({'pair_id': row['pair_id'], 'p_value': np.nan})
                continue

            # Submit the comparison task to the executor
            future = executor.submit(compare_fatcat, p1_file, p2_file, pdb_dir)
            futures.append((future, row['pair_id']))

        # Process the completed tasks and extract the p-values
        for future, pair_id in futures:
            try:
                p_value = future.result()
                # Check if p-value is less than 0.05 and assign 1 or 0 accordingly
                if p_value < 0.05:
                    p_values.append({'pair_id': pair_id, 'p_value': 1})
                else:
                    p_values.append({'pair_id': pair_id, 'p_value': 0})
            except Exception as e:
                # Handle exceptions raised during execution
                p_values.append({'pair_id': pair_id, 'p_value': np.nan})
                print(f"Error processing pair {pair_id}: {str(e)}")

    return p_values

In [None]:
def run_fatcat(df, pdb_dir):
    p_values = []  # List to store the extracted p-values

    for index, row in df.iterrows():
        if not pd.isna(row['meso_pdb']):
            p1 = row['meso_pdb']
        else:
            p1 = row['meso_pid']
        
        if not pd.isna(row['thermo_pdb']):
            p2 = row['thermo_pdb']
        else:
            p2 = row['thermo_pid']
        
        # Check if the structure files exist in the 'checking' folder
        p1_file = f'{p1}.pdb'
        p2_file = f'{p2}.pdb'
        if not os.path.exists(os.path.join(pdb_dir, p1_file)) or not os.path.exists(os.path.join(pdb_dir, p2_file)):
            # Assign NaN as the p-value instead of dropping the row
            p_values.append(np.nan)
            continue

        # Set the FATCAT command and its arguments
        cmd = ['FATCAT', '-p1', p1_file, '-p2', p2_file, '-i', pdb_dir, '-q']
        
        # Run the FATCAT command and capture the output
        result = subprocess.run(cmd, capture_output=True, text=True)
        output = result.stdout

        # Find the line containing the p-value
        p_value_line = next(line for line in output.split('\n') if line.startswith("P-value"))

        # Extract the p-value and convert it to numeric value
        p_value = float(p_value_line.split()[1])
        
        # Check if p-value is less than 0.05 and assign 1 or 0 accordingly
        if p_value < 0.05:
            p_values.append(str(1))
        else:
            p_values.append(str(0))

    df.loc[:, 'p_value'] = p_values  # Use .loc to set the 'p_value' column
    return df

In [None]:
def run_fatcat_dict_2(df, pdb_dir, conn):
    cursor = conn.cursor()

    # Check if the 'p_value' column exists in the table
    cursor.execute("PRAGMA table_info(pairpro.final)")
    columns = cursor.fetchall()
    if ("p_value",) not in columns:
        # Add the 'p_value' column to the table
        cursor.execute("ALTER TABLE pairpro.final ADD COLUMN p_value REAL")
        conn.commit()

    for index, row in df.iterrows():
        if not pd.isna(row['meso_pdb']):
            p1 = row['meso_pdb']
        else:
            p1 = row['meso_pid']
        
        if not pd.isna(row['thermo_pdb']):
            p2 = row['thermo_pdb']
        else:
            p2 = row['thermo_pid']
        
        # Check if the structure files exist in the 'checking' folder
        p1_file = f'{p1}.pdb'
        p2_file = f'{p2}.pdb'
        if not os.path.exists(os.path.join(pdb_dir, p1_file)) or not os.path.exists(os.path.join(pdb_dir, p2_file)):
            # Assign NaN as the p-value instead of dropping the row
            p_value = np.nan
        else:
            # Set the FATCAT command and its arguments
            cmd = ['FATCAT', '-p1', p1_file, '-p2', p2_file, '-i', pdb_dir, '-q']
            
            # Run the FATCAT command and capture the output
            result = subprocess.run(cmd, capture_output=True, text=True)
            output = result.stdout

            # Find the line containing the p-value
            p_value_line = next(line for line in output.split('\n') if line.startswith("P-value"))

            # Extract the p-value and convert it to a numeric value
            p_value = float(p_value_line.split()[1])
        
        # Check if the p-value is less than 0.05 and assign 1 or 0 accordingly
        if p_value < 0.05:
            p_value = 1
        else:
            p_value = 0
        
        pair_id = row['pair_id']

        # Update the database with the extracted p-value
        cursor.execute(f"UPDATE pairpro.final SET p_value = {p_value} WHERE pair_id = {pair_id}")

    # Commit the changes
    conn.commit()