In [1]:
import asyncio
import concurrent.futures
import csv
import os
import subprocess
import time

import duckdb as db
import httpx
import nest_asyncio
import numpy as np
import pandas as pd
from Bio.PDB import PDBList
from joblib import Parallel, delayed

In [7]:
# Load the 10k-pair test table and keep its first 500 rows as a working sample.
df_new = pd.read_csv('chau_test_10k.csv')
df_sample = df_new.head(500)

In [12]:
async def download_aff(session, url, filename, semaphore):
    """Fetch ``url`` with ``session`` and store the response body in ``filename``.

    The request and the file write both happen while ``semaphore`` is held,
    so the semaphore bounds how many downloads are in flight at once.

    Args:
        session: Object exposing an awaitable ``get(url)`` whose result has
            ``status_code`` and ``content`` attributes.
        url: Address to download from.
        filename: Destination path; opened in binary write mode.
        semaphore: Async context manager limiting concurrency.

    Returns:
        True if the server answered 200 and the file was written, else False
        (non-200 status or an ``httpx.RequestError``).
    """
    try:
        async with semaphore:
            reply = await session.get(url)
            # Guard clause: anything other than 200 is reported and skipped.
            if reply.status_code != 200:
                print(f"Failed to download file: {filename}. Status code: {reply.status_code}")
                return False

            with open(filename, 'wb') as out:
                out.write(reply.content)
            print(f"Downloaded file: {filename}")
            return True
    except httpx.RequestError as exc:
        print(f"Error while downloading file: {filename}. Exception: {str(exc)}")
        return False

async def download_af(row, u_column, pdb_dir, semaphore):
    """Download the AlphaFold model for the UniProt ID held in ``row``.

    Args:
        row: Record (e.g. an ``itertuples`` row) with an attribute named
            ``u_column``.
        u_column: Name of the attribute holding the UniProt accession.
        pdb_dir: Directory the ``<accession>.pdb`` file is written into.
        semaphore: Concurrency limiter forwarded to ``download_aff``.

    Returns:
        The boolean result of ``download_aff``.
    """
    accession = getattr(row, u_column)
    target = f'{pdb_dir}/{accession}.pdb'
    source = f'https://alphafold.ebi.ac.uk/files/AF-{accession}-F1-model_v4.pdb'

    # NOTE(review): verify=False disables TLS certificate verification, and a
    # fresh client is built for every file -- confirm both are intended.
    async with httpx.AsyncClient(verify=False) as http:
        return await download_aff(http, source, target, semaphore)

def run_download_af_all(df, pdb_column, u_column, pdb_dir):
    """Download AlphaFold models for every row that has no PDB ID.

    Rows whose ``pdb_column`` value is missing are looked up by the UniProt
    accession in ``u_column``. Each distinct accession is downloaded once, so
    two tasks never write the same output file concurrently.

    Args:
        df: DataFrame containing ``pdb_column`` and ``u_column``.
        pdb_column: Column with PDB IDs; a missing value triggers a download.
        u_column: Column with UniProt accessions.
        pdb_dir: Output directory (created if needed).

    Returns:
        None. Progress and a final summary are printed.
    """
    # Applied so asyncio.run can be called from an environment that already
    # has a running event loop (presumably the Jupyter kernel).
    nest_asyncio.apply()

    async def download_af_all():
        semaphore = asyncio.Semaphore(500)  # maximum number of concurrent requests
        os.makedirs(pdb_dir, exist_ok=True)

        seen_ids = set()
        tasks = []
        for row in df.itertuples(index=False):
            if not pd.isna(getattr(row, pdb_column)):
                continue  # this row is covered by an experimental PDB entry
            uniprot_id = getattr(row, u_column)
            # Skip missing accessions and ones already scheduled.
            if pd.isna(uniprot_id) or uniprot_id in seen_ids:
                continue
            seen_ids.add(uniprot_id)
            tasks.append(asyncio.create_task(download_af(row, u_column, pdb_dir, semaphore)))

        # return_exceptions=True: one unexpected failure (e.g. a disk error)
        # must not cancel the reporting of every other download.
        results = await asyncio.gather(*tasks, return_exceptions=True)
        for outcome in results:
            if isinstance(outcome, Exception):
                print(f"Unexpected error during download: {outcome!r}")
        success_count = sum(1 for outcome in results if outcome is True)

        # Report against the number of attempted downloads, not len(df):
        # rows with a PDB ID or a repeated accession are never attempted.
        print(f"Successfully downloaded {success_count} files out of {len(tasks)}")

    asyncio.run(download_af_all())

def download_pdb(df, pdb_column, pdb_dir):
    """Fetch every distinct PDB entry listed in ``df[pdb_column]``.

    Each entry is retrieved with ``PDBList.retrieve_pdb_file`` in ``'pdb'``
    format; if the expected ``pdb<id>.ent`` file then exists in ``pdb_dir``
    it is renamed to ``<id>.pdb``. Entries with no such file are left alone.

    Args:
        df: DataFrame holding the PDB IDs.
        pdb_column: Name of the column with PDB IDs (missing values ignored).
        pdb_dir: Directory to download into.
    """
    fetcher = PDBList()
    for pdb_id in df[pdb_column].dropna().unique():
        fetcher.retrieve_pdb_file(pdb_id, pdir=pdb_dir, file_format='pdb')
        downloaded = os.path.join(pdb_dir, f'pdb{pdb_id.lower()}.ent')
        if not os.path.exists(downloaded):
            continue  # nothing to rename for this ID
        os.rename(downloaded, os.path.join(pdb_dir, f'{pdb_id}.pdb'))

def download_structure(df, pdb_column, u_column, pdb_dir):
    """Download all structures for ``df`` into ``pdb_dir`` and print the time taken.

    PDB entries are fetched first (``download_pdb``), then AlphaFold models
    for rows without a PDB ID (``run_download_af_all``).

    Args:
        df: DataFrame with the ID columns.
        pdb_column: Column holding PDB IDs.
        u_column: Column holding UniProt accessions.
        pdb_dir: Output directory; created when it does not exist.
    """
    started = time.time()

    if not os.path.exists(pdb_dir):
        os.makedirs(pdb_dir)

    download_pdb(df, pdb_column, pdb_dir)
    run_download_af_all(df, pdb_column, u_column, pdb_dir)

    execution_time = time.time() - started
    print(f"Execution time: {execution_time} seconds")

In [13]:
# Download structures for the mesophilic side of each pair into ./af2.
download_structure(df_sample, 'meso_pdb', 'meso_pid', 'af2')

Downloaded file: af2/A0A327VMA9.pdb
Downloaded file: af2/A0A327W7S0.pdb
Downloaded file: af2/A0A327W6H3.pdb
Downloaded file: af2/A0A327W612.pdb
Downloaded file: af2/A0A327W5V0.pdb
Downloaded file: af2/A0A327VJ32.pdb
Downloaded file: af2/A0A327W1Z1.pdb
Downloaded file: af2/A0A327WD81.pdb
Downloaded file: af2/A0A327VW66.pdb
Downloaded file: af2/A0A327VIS7.pdb
Downloaded file: af2/A0A327W3Z3.pdb
Downloaded file: af2/A0A327VM41.pdb
Downloaded file: af2/A0A327WDI7.pdb
Downloaded file: af2/A0A327VWK7.pdb
Downloaded file: af2/A0A327WJQ9.pdb
Downloaded file: af2/A0A327VUP4.pdb
Downloaded file: af2/A0A327VIW4.pdb
Downloaded file: af2/A0A327WF66.pdb
Downloaded file: af2/A0A327VT31.pdb
Downloaded file: af2/A0A327VKG7.pdb
Downloaded file: af2/A0A327VME8.pdb
Downloaded file: af2/A0A327WJU0.pdb
Downloaded file: af2/A0A327VK41.pdb
Downloaded file: af2/A0A327VXU2.pdb
Downloaded file: af2/A0A327VVG6.pdb
Downloaded file: af2/A0A327WEU9.pdb
Downloaded file: af2/A0A327WCX5.pdb
Downloaded file: af2/A0A327V

In [4]:
def compare_fatcat(p1_file, p2_file, pdb_dir, pair_id, alpha=0.05):
    """Run FATCAT on two structure files and flag a significant alignment.

    Args:
        p1_file: File name of the first structure (passed to ``-p1``).
        p2_file: File name of the second structure (passed to ``-p2``).
        pdb_dir: Directory passed to FATCAT's ``-i`` option.
        pair_id: Identifier copied into the result.
        alpha: Significance threshold; P-values strictly below it give 1.

    Returns:
        ``{'pair_id': pair_id, 'p_value': flag}`` where ``flag`` is 1 when the
        reported P-value is below ``alpha`` and 0 otherwise. Note that the
        ``'p_value'`` key holds this 0/1 flag, not the raw P-value.

    Raises:
        RuntimeError: If FATCAT's stdout has no line starting with "P-value"
            (for example because the executable failed).
    """
    cmd = ['FATCAT', '-p1', p1_file, '-p2', p2_file, '-i', pdb_dir, '-q']
    result = subprocess.run(cmd, capture_output=True, text=True)

    # A default of None avoids the opaque StopIteration a bare next() raises.
    p_value_line = next(
        (line for line in result.stdout.splitlines() if line.startswith("P-value")),
        None,
    )
    if p_value_line is None:
        raise RuntimeError(
            f"FATCAT produced no P-value for {p1_file} vs {p2_file} "
            f"(exit code {result.returncode}): {result.stderr}"
        )

    # The numeric value is the second whitespace-separated token on the line.
    p_value = float(p_value_line.split()[1])
    return {'pair_id': pair_id, 'p_value': int(p_value < alpha)}

def process_row(row, pdb_dir):
    """Compare the two structures of one protein pair with FATCAT.

    For each side the PDB ID is used when present, otherwise the UniProt ID.

    Args:
        row: Mapping with ``meso_pdb``, ``meso_pid``, ``thermo_pdb``,
            ``thermo_pid`` and ``pair_id`` entries.
        pdb_dir: Directory expected to hold ``<id>.pdb`` files.

    Returns:
        The result of ``compare_fatcat``, or None when either structure file
        is absent from ``pdb_dir`` (the pair is skipped).
    """
    meso_id = row['meso_pid'] if pd.isna(row['meso_pdb']) else row['meso_pdb']
    thermo_id = row['thermo_pid'] if pd.isna(row['thermo_pdb']) else row['thermo_pdb']

    p1_file, p2_file = f'{meso_id}.pdb', f'{thermo_id}.pdb'

    # Both structure files must be on disk before FATCAT is invoked.
    have_both = all(
        os.path.exists(os.path.join(pdb_dir, name)) for name in (p1_file, p2_file)
    )
    if not have_both:
        return None

    return compare_fatcat(p1_file, p2_file, pdb_dir, row['pair_id'])

def run_fatcat_dict_job(df, pdb_dir, file='output.csv'):
    """Run FATCAT on every pair in ``df`` in parallel and write the flags to CSV.

    Args:
        df: DataFrame with the columns ``process_row`` reads.
        pdb_dir: Directory containing the structure files.
        file: Path of the CSV to write. Defaults to ``'output.csv'`` so the
            two-argument call used elsewhere in this notebook keeps working.

    Returns:
        The path that was written (``file``).
    """
    # Bind joblib's helpers locally: a later cell rebinds the global name
    # `delayed` to dask's, which joblib.Parallel cannot consume.
    from joblib import Parallel, delayed as joblib_delayed

    outcomes = Parallel(n_jobs=-1)(
        joblib_delayed(process_row)(row, pdb_dir) for _, row in df.iterrows()
    )
    # process_row returns None for pairs with a missing structure file.
    records = [outcome for outcome in outcomes if outcome is not None]

    # newline='' is what the csv module expects for files it writes to.
    with open(file, 'w', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=['pair_id', 'p_value'])
        writer.writeheader()
        writer.writerows(records)
    return file

In [22]:
# Reload the 10k-pair test table and preview its first rows.
df_new = pd.read_csv('chau_test_10k.csv')
df_new.head()

Unnamed: 0,thermo_pid,meso_pid,thermo_pdb,meso_pdb,pair_id
0,A0A2W7RQ16,A0A327VJ32,,,6767
1,A0A2W7RTL2,A0A327VMA9,,,6880
2,A0A2W7RYG8,A0A327W1Z1,,,7492
3,A0A2W7RR29,A0A327W5V0,,,7728
4,A0A2W7RR29,A0A327VM41,,,6868


In [24]:
# Work on the first 100 pairs; the shape is displayed as a sanity check.
df_sample = df_new.head(100)
df_sample.shape

(100, 5)

In [25]:
# Download both sides of each pair (mesophilic, then thermophilic) into ./af.
download_structure(df_sample, 'meso_pdb', 'meso_pid', 'af')
download_structure(df_sample, 'thermo_pdb', 'thermo_pid', 'af')

Downloaded file: af/A0A327VIS7.pdb
Downloaded file: af/A0A327W6H3.pdb
Downloaded file: af/A0A327VW66.pdb
Downloaded file: af/A0A327VKG7.pdb
Downloaded file: af/A0A327W5V0.pdb
Downloaded file: af/A0A327WJQ9.pdb
Downloaded file: af/A0A327VK41.pdb
Downloaded file: af/A0A327WDI7.pdb
Downloaded file: af/A0A327VUP4.pdb
Downloaded file: af/A0A327WJU0.pdb
Downloaded file: af/A0A327W750.pdb
Downloaded file: af/A0A327VK41.pdb
Downloaded file: af/A0A327WEK7.pdb
Downloaded file: af/A0A327VJ32.pdb
Downloaded file: af/A0A327VMA9.pdb
Downloaded file: af/A0A327WFA9.pdb
Downloaded file: af/A0A327VIW4.pdb
Downloaded file: af/A0A327W2Z8.pdb
Downloaded file: af/A0A327VP40.pdb
Downloaded file: af/A0A327VSK6.pdb
Downloaded file: af/A0A327W2Q8.pdb
Downloaded file: af/A0A327W1Z1.pdb
Downloaded file: af/A0A327WDJ9.pdb
Downloaded file: af/A0A327VSL5.pdb
Downloaded file: af/A0A327W4W4.pdb
Downloaded file: af/A0A327W587.pdb
Downloaded file: af/A0A327WCK1.pdb
Downloaded file: af/A0A327VUM4.pdb
Downloaded file: af/

In [26]:
# run_fatcat_dict_job takes the output path as its third argument; pass the
# file name this cell's recorded output shows ('output.csv').
run_fatcat_dict_job(df_sample, 'af', 'output.csv')

'output.csv'

In [27]:
import dask
from dask import delayed, compute

In [30]:
def process_row_2(row, pdb_dir):
    """Build a lazy Dask task that compares one pair's structures with FATCAT.

    For each side the PDB ID is used when present, otherwise the UniProt ID.

    Args:
        row: Mapping with ``meso_pdb``, ``meso_pid``, ``thermo_pdb``,
            ``thermo_pid`` and ``pair_id`` entries.
        pdb_dir: Directory expected to hold ``<id>.pdb`` files.

    Returns:
        A Dask delayed object wrapping the ``compare_fatcat`` call, or None
        when either structure file is absent from ``pdb_dir``.
    """
    p1 = row['meso_pid'] if pd.isna(row['meso_pdb']) else row['meso_pdb']
    p2 = row['thermo_pid'] if pd.isna(row['thermo_pdb']) else row['thermo_pdb']

    p1_file = f'{p1}.pdb'
    p2_file = f'{p2}.pdb'
    for name in (p1_file, p2_file):
        if not os.path.exists(os.path.join(pdb_dir, name)):
            return None  # the pair is skipped, not given a NaN value

    # Use dask.delayed explicitly: the bare name `delayed` is bound to
    # joblib's helper by the first import cell and to dask's by a later one,
    # so relying on it makes the result depend on cell execution order.
    return dask.delayed(compare_fatcat)(p1_file, p2_file, pdb_dir, row['pair_id'])

def run_fatcat_dict_job_2(df, pdb_dir, file='output_2.csv'):
    """Run FATCAT on every pair in ``df`` via Dask and write the flags to CSV.

    Args:
        df: DataFrame with the columns ``process_row_2`` reads.
        pdb_dir: Directory containing the structure files.
        file: Path of the CSV to write; defaults to the previously
            hard-coded ``'output_2.csv'``.

    Returns:
        The path that was written (``file``).
    """
    # One lazy task per pair; pairs with a missing structure file yield None.
    pending = [process_row_2(row, pdb_dir) for _, row in df.iterrows()]
    pending = [task for task in pending if task is not None]

    # compute() evaluates all delayed tasks and returns their results as a tuple.
    records = list(compute(*pending))

    # newline='' is what the csv module expects for files it writes to.
    with open(file, 'w', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=['pair_id', 'p_value'])
        writer.writeheader()
        writer.writerows(records)
    return file


In [31]:
# Same comparison as above, scheduled through Dask instead of joblib.
run_fatcat_dict_job_2(df_sample, 'af')

'output_2.csv'

In [14]:
# Open the 50k-pair DuckDB file and pull the ID columns into a DataFrame.
conn = db.connect('pairpro_50k.db')
df = conn.execute('SELECT pair_id, thermo_pid, thermo_pdb, meso_pid, meso_pdb FROM pairpro.final').fetchdf()

In [15]:
# Export the ID table to CSV.
# NOTE(review): the DataFrame index is written too (index=False is not
# passed), so reading the file back yields an extra "Unnamed: 0" column.
df.to_csv('chau_test_50k.csv')

In [9]:
db_name = 'pairpro'

# Stage the FATCAT results CSV as a temporary table.
conn.execute("""CREATE OR REPLACE TEMP TABLE structure_results AS SELECT *
               FROM read_csv_auto('output.csv', HEADER=TRUE)""")

# IF NOT EXISTS keeps this cell re-runnable: a plain ADD COLUMN raises once
# structure_match has been created by an earlier run.
conn.execute("""ALTER TABLE pairpro.final ADD COLUMN IF NOT EXISTS structure_match INT""")

# Copy the 0/1 flag onto the matching pair; unmatched pairs stay NULL.
conn.execute("""UPDATE pairpro.final AS f
SET structure_match = structure.p_value::INT
FROM structure_results AS structure
WHERE structure.pair_id = f.pair_id
""")

<duckdb.DuckDBPyConnection at 0x7f5a58bc1430>

In [11]:
# Show the pairs that received a structure_match value.
conn.execute('SELECT * FROM pairpro.final WHERE structure_match IS NOT NULL').fetchdf()

Unnamed: 0,thermo_pid,meso_pid,local_gap_compressed_percent_id,scaled_local_query_percent_id,scaled_local_symmetric_percent_id,local_E_value,query_align_start,query_align_end,subject_align_end,subject_align_start,...,meso_taxid,m_ogt,t_ogt,ogt_difference,m_protein_seq,t_protein_seq,meso_pdb,thermo_pdb,pair_id,structure_match
0,A0A1I2JES9,I3CKA5,0.573333,0.554839,0.562092,7.55e-59,1,150,150,1,...,1022,29.0,50.0,21.0,MFHCVLHQPEIPPNTGNVIRLCANTQVQLHLIHPLGFSLDDKRMRR...,MFHVALYQPEIPPNTGNIIRLCANTGAQLHLIHPLGFQLTDKALRR...,,,14029,1
1,A0A1I2ISF5,A0A4R2LDN2,0.294118,0.269231,0.275591,2.85e-12,6,124,121,4,...,1133106,30.0,50.0,20.0,MASKILVVDDEPNILLSLEFLMKHAGFQVRTAGDGDAALAAVATEV...,MDKNMRILIVDDFSTMRRIVKNQLADLGYTNTVEADDGKAAWPILQ...,,,9028,1


In [16]:
# Close the DuckDB connection; `conn` cannot be used after this cell runs.
conn.close()

In [3]:
def run_fatcat_dict(df, pdb_dir):
    """Run FATCAT sequentially on every pair in ``df``.

    For each side of a pair the PDB ID is used when present, otherwise the
    UniProt ID.

    Args:
        df: DataFrame with ``pair_id``, ``meso_pdb``, ``meso_pid``,
            ``thermo_pdb`` and ``thermo_pid`` columns.
        pdb_dir: Directory holding ``<id>.pdb`` files; also passed to
            FATCAT's ``-i`` option.

    Returns:
        A list with one ``{'pair_id': ..., 'p_value': ...}`` dict per row.
        ``p_value`` is 1 when FATCAT's P-value is below 0.05, 0 when it is
        not, and NaN when a structure file is missing or FATCAT printed no
        P-value line.
    """
    p_values = []

    for _, row in df.iterrows():
        pair_id = row['pair_id']
        p1 = row['meso_pid'] if pd.isna(row['meso_pdb']) else row['meso_pdb']
        p2 = row['thermo_pid'] if pd.isna(row['thermo_pdb']) else row['thermo_pdb']

        p1_file = f'{p1}.pdb'
        p2_file = f'{p2}.pdb'
        if not (os.path.exists(os.path.join(pdb_dir, p1_file))
                and os.path.exists(os.path.join(pdb_dir, p2_file))):
            # Keep the row, marked as not computed.
            p_values.append({'pair_id': pair_id, 'p_value': np.nan})
            continue

        cmd = ['FATCAT', '-p1', p1_file, '-p2', p2_file, '-i', pdb_dir, '-q']
        result = subprocess.run(cmd, capture_output=True, text=True)

        # A default of None avoids an opaque StopIteration aborting the whole
        # loop when FATCAT fails for a single pair.
        p_value_line = next(
            (line for line in result.stdout.splitlines() if line.startswith("P-value")),
            None,
        )
        if p_value_line is None:
            print(f"Error processing pair {pair_id}: no P-value line in FATCAT output")
            p_values.append({'pair_id': pair_id, 'p_value': np.nan})
            continue

        # The numeric value is the second whitespace-separated token.
        p_value = float(p_value_line.split()[1])
        # Both branches must use the 'pair_id' key; the original else-branch
        # used 'pair_pid', which raised KeyError on the row lookup.
        p_values.append({'pair_id': pair_id, 'p_value': 1 if p_value < 0.05 else 0})

    return p_values

In [1]:
def compare_fatcat(p1_file, p2_file, pdb_dir):
    """Run FATCAT on two structure files and return the reported P-value.

    Args:
        p1_file: File name of the first structure (passed to ``-p1``).
        p2_file: File name of the second structure (passed to ``-p2``).
        pdb_dir: Directory passed to FATCAT's ``-i`` option.

    Returns:
        The P-value as a float.

    Raises:
        RuntimeError: If FATCAT's stdout has no line starting with "P-value"
            (for example because the executable failed).
    """
    cmd = ['FATCAT', '-p1', p1_file, '-p2', p2_file, '-i', pdb_dir, '-q']
    result = subprocess.run(cmd, capture_output=True, text=True)

    # A default of None avoids the opaque StopIteration a bare next() raises.
    p_value_line = next(
        (line for line in result.stdout.splitlines() if line.startswith("P-value")),
        None,
    )
    if p_value_line is None:
        raise RuntimeError(
            f"FATCAT produced no P-value for {p1_file} vs {p2_file} "
            f"(exit code {result.returncode}): {result.stderr}"
        )

    # The numeric value is the second whitespace-separated token on the line.
    return float(p_value_line.split()[1])

def run_fatcat_dict_fut(df, pdb_dir):
    """Run FATCAT on every pair in ``df`` using a process pool.

    For each side of a pair the PDB ID is used when present, otherwise the
    UniProt ID. Requires ``concurrent.futures`` to be imported at the top of
    the notebook.

    Args:
        df: DataFrame with ``pair_id``, ``meso_pdb``, ``meso_pid``,
            ``thermo_pdb`` and ``thermo_pid`` columns.
        pdb_dir: Directory holding ``<id>.pdb`` files.

    Returns:
        A list of ``{'pair_id': ..., 'p_value': ...}`` dicts. Pairs with a
        missing structure file come first (``p_value`` NaN), followed by the
        computed pairs in row order with ``p_value`` 1 (P-value < 0.05),
        0 (otherwise) or NaN (the comparison raised).
    """
    p_values = []
    pending = []  # (pair_id, p1_file, p2_file) for pairs that can be compared

    for _, row in df.iterrows():
        p1 = row['meso_pid'] if pd.isna(row['meso_pdb']) else row['meso_pdb']
        p2 = row['thermo_pid'] if pd.isna(row['thermo_pdb']) else row['thermo_pdb']

        p1_file = f'{p1}.pdb'
        p2_file = f'{p2}.pdb'
        if not (os.path.exists(os.path.join(pdb_dir, p1_file))
                and os.path.exists(os.path.join(pdb_dir, p2_file))):
            # Keep the row, marked as not computed.
            p_values.append({'pair_id': row['pair_id'], 'p_value': np.nan})
            continue

        pending.append((row['pair_id'], p1_file, p2_file))

    # Do not start a process pool when there is nothing to run.
    if not pending:
        return p_values

    with concurrent.futures.ProcessPoolExecutor() as executor:
        futures = [
            (executor.submit(compare_fatcat, p1_file, p2_file, pdb_dir), pair_id)
            for pair_id, p1_file, p2_file in pending
        ]

        # Collect in submission order so the output follows the row order.
        for future, pair_id in futures:
            try:
                p_value = future.result()
                p_values.append({'pair_id': pair_id, 'p_value': 1 if p_value < 0.05 else 0})
            except Exception as e:
                # One failed comparison must not discard the other results.
                p_values.append({'pair_id': pair_id, 'p_value': np.nan})
                print(f"Error processing pair {pair_id}: {str(e)}")

    return p_values

In [None]:
def run_fatcat(df, pdb_dir):
    """Run FATCAT on every pair in ``df`` and store the flag in a new column.

    For each side of a pair the PDB ID is used when present, otherwise the
    UniProt ID.

    Args:
        df: DataFrame with ``meso_pdb``, ``meso_pid``, ``thermo_pdb`` and
            ``thermo_pid`` columns. It is modified in place.
        pdb_dir: Directory holding ``<id>.pdb`` files; also passed to
            FATCAT's ``-i`` option.

    Returns:
        The same ``df`` with a ``p_value`` column: the string ``'1'`` when
        FATCAT's P-value is below 0.05, ``'0'`` when it is not, and NaN when
        a structure file is missing or FATCAT printed no P-value line.
    """
    p_values = []

    for _, row in df.iterrows():
        p1 = row['meso_pid'] if pd.isna(row['meso_pdb']) else row['meso_pdb']
        p2 = row['thermo_pid'] if pd.isna(row['thermo_pdb']) else row['thermo_pdb']

        p1_file = f'{p1}.pdb'
        p2_file = f'{p2}.pdb'
        if not (os.path.exists(os.path.join(pdb_dir, p1_file))
                and os.path.exists(os.path.join(pdb_dir, p2_file))):
            # Keep the row, marked as not computed.
            p_values.append(np.nan)
            continue

        cmd = ['FATCAT', '-p1', p1_file, '-p2', p2_file, '-i', pdb_dir, '-q']
        result = subprocess.run(cmd, capture_output=True, text=True)

        # A default of None avoids an opaque StopIteration aborting the whole
        # loop when FATCAT fails for a single pair.
        p_value_line = next(
            (line for line in result.stdout.splitlines() if line.startswith("P-value")),
            None,
        )
        if p_value_line is None:
            print(f"Error running FATCAT on {p1_file} vs {p2_file}: no P-value line in output")
            p_values.append(np.nan)
            continue

        # The numeric value is the second whitespace-separated token.
        p_value = float(p_value_line.split()[1])
        # Flags are stored as strings, as before, so existing consumers of
        # this column keep seeing '1'/'0'.
        p_values.append(str(1) if p_value < 0.05 else str(0))

    df.loc[:, 'p_value'] = p_values  # Use .loc to set the 'p_value' column
    return df

In [None]:
def run_fatcat_dict_2(df, pdb_dir, conn):
    """Run FATCAT on every pair in ``df`` and write the flag to ``pairpro.final``.

    For each side of a pair the PDB ID is used when present, otherwise the
    UniProt ID.

    Args:
        df: DataFrame with ``pair_id``, ``meso_pdb``, ``meso_pid``,
            ``thermo_pdb`` and ``thermo_pid`` columns.
        pdb_dir: Directory holding ``<id>.pdb`` files; also passed to
            FATCAT's ``-i`` option.
        conn: Database connection exposing ``cursor()`` and ``commit()``;
            the cursor must accept ``?`` placeholders.

    Returns:
        None. ``pairpro.final.p_value`` is set to 1 (P-value < 0.05),
        0 (otherwise) or NULL (structure file missing, or FATCAT printed no
        P-value line) for each ``pair_id`` in ``df``.
    """
    cursor = conn.cursor()

    # table_info rows are (cid, name, type, ...): the column name is the
    # second field, so compare against that rather than the whole tuple.
    cursor.execute("PRAGMA table_info(pairpro.final)")
    columns = cursor.fetchall()
    if not any(col[1] == "p_value" for col in columns):
        cursor.execute("ALTER TABLE pairpro.final ADD COLUMN p_value REAL")
        conn.commit()

    for _, row in df.iterrows():
        p1 = row['meso_pid'] if pd.isna(row['meso_pdb']) else row['meso_pdb']
        p2 = row['thermo_pid'] if pd.isna(row['thermo_pdb']) else row['thermo_pdb']

        p1_file = f'{p1}.pdb'
        p2_file = f'{p2}.pdb'

        # None is sent to the database as NULL. The original compared NaN
        # with 0.05, which is always False, so missing structures were
        # silently stored as 0 ("not similar").
        flag = None
        if (os.path.exists(os.path.join(pdb_dir, p1_file))
                and os.path.exists(os.path.join(pdb_dir, p2_file))):
            cmd = ['FATCAT', '-p1', p1_file, '-p2', p2_file, '-i', pdb_dir, '-q']
            result = subprocess.run(cmd, capture_output=True, text=True)

            # A default of None avoids an opaque StopIteration aborting the
            # loop when FATCAT fails for a single pair.
            p_value_line = next(
                (line for line in result.stdout.splitlines() if line.startswith("P-value")),
                None,
            )
            if p_value_line is None:
                print(f"Error processing pair {row['pair_id']}: no P-value line in FATCAT output")
            else:
                flag = 1 if float(p_value_line.split()[1]) < 0.05 else 0

        pair_id = row['pair_id']
        # Convert NumPy scalars to plain Python values for parameter binding.
        if hasattr(pair_id, 'item'):
            pair_id = pair_id.item()

        # Bind values as parameters instead of formatting them into the SQL.
        cursor.execute(
            "UPDATE pairpro.final SET p_value = ? WHERE pair_id = ?",
            [flag, pair_id],
        )

    conn.commit()

In [None]:
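# Original version without a concurrency limit; it causes issues with 50k samples.
# Kept for reference only: running this cell redefines the download_* functions.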
async def download_aff(session, url, filename):
    """Fetch ``url`` with ``session`` and store the response body in ``filename``.

    Args:
        session: Object exposing an awaitable ``get(url)`` whose result has
            ``status_code`` and ``content`` attributes.
        url: Address to download from.
        filename: Destination path; opened in binary write mode.

    Returns:
        True if the server answered 200 and the file was written, else False
        (non-200 status or an ``httpx.RequestError``).
    """
    try:
        reply = await session.get(url)
        # Guard clause: anything other than 200 is reported and skipped.
        if reply.status_code != 200:
            print(f"Failed to download file: {filename}. Status code: {reply.status_code}")
            return False

        with open(filename, 'wb') as out:
            out.write(reply.content)
        print(f"Downloaded file: {filename}")
        return True
    except httpx.RequestError as exc:
        print(f"Error while downloading file: {filename}. Exception: {str(exc)}")
        return False

async def download_af(row, u_column, pdb_dir):
    """Download the AlphaFold model for the UniProt ID held in ``row``.

    Args:
        row: Record (e.g. an ``itertuples`` row) with an attribute named
            ``u_column``.
        u_column: Name of the attribute holding the UniProt accession.
        pdb_dir: Directory the ``<accession>.pdb`` file is written into.

    Returns:
        The boolean result of ``download_aff``.
    """
    accession = getattr(row, u_column)
    target = f'{pdb_dir}/{accession}.pdb'
    source = f'https://alphafold.ebi.ac.uk/files/AF-{accession}-F1-model_v4.pdb'

    # NOTE(review): verify=False disables TLS certificate verification, and a
    # fresh client is built for every file -- confirm both are intended.
    async with httpx.AsyncClient(verify=False) as http:
        return await download_aff(http, source, target)

def run_download_af_all(df, pdb_column, u_column, pdb_dir):
    """Download AlphaFold models for every row that has no PDB ID.

    One task is created per row whose ``pdb_column`` value is missing, and
    all tasks are awaited together with no limit on concurrency.

    Args:
        df: DataFrame containing ``pdb_column`` and ``u_column``.
        pdb_column: Column with PDB IDs; a missing value triggers a download.
        u_column: Column with UniProt accessions.
        pdb_dir: Output directory; created when it does not exist.

    Returns:
        None. A summary line is printed when all tasks have finished.
    """
    nest_asyncio.apply()

    async def download_af_all():
        if not os.path.exists(pdb_dir):
            os.makedirs(pdb_dir)

        # Schedule a download for each row lacking a PDB ID.
        tasks = [
            asyncio.create_task(download_af(row, u_column, pdb_dir))
            for row in df.itertuples(index=False)
            if pd.isna(getattr(row, pdb_column))
        ]

        results = await asyncio.gather(*tasks)
        success_count = sum(results)

        print(f"Successfully downloaded {success_count} files out of {len(df)}")

    asyncio.run(download_af_all())

def download_pdb(df, pdb_column, pdb_dir):
    """Fetch every distinct PDB entry listed in ``df[pdb_column]``.

    Each entry is retrieved with ``PDBList.retrieve_pdb_file`` in ``'pdb'``
    format; if the expected ``pdb<id>.ent`` file then exists in ``pdb_dir``
    it is renamed to ``<id>.pdb``. Entries with no such file are left alone.

    Args:
        df: DataFrame holding the PDB IDs.
        pdb_column: Name of the column with PDB IDs (missing values ignored).
        pdb_dir: Directory to download into.
    """
    fetcher = PDBList()
    unique_ids = df[pdb_column].dropna().unique()
    for pdb_id in unique_ids:
        fetcher.retrieve_pdb_file(pdb_id, pdir=pdb_dir, file_format='pdb')
        ent_path = os.path.join(pdb_dir, f'pdb{pdb_id.lower()}.ent')
        pdb_path = os.path.join(pdb_dir, f'{pdb_id}.pdb')
        if os.path.exists(ent_path):
            os.rename(ent_path, pdb_path)

def download_structure(df, pdb_column, u_column, pdb_dir):
    """Download all structures for ``df`` into ``pdb_dir`` and print the time taken.

    PDB entries are fetched first (``download_pdb``), then AlphaFold models
    for rows without a PDB ID (``run_download_af_all``).

    Args:
        df: DataFrame with the ID columns.
        pdb_column: Column holding PDB IDs.
        u_column: Column holding UniProt accessions.
        pdb_dir: Output directory; created when it does not exist.
    """
    started = time.time()

    if not os.path.exists(pdb_dir):
        os.makedirs(pdb_dir)

    download_pdb(df, pdb_column, pdb_dir)
    run_download_af_all(df, pdb_column, u_column, pdb_dir)

    execution_time = time.time() - started
    print(f"Execution time: {execution_time} seconds")