In [8]:
from Bio.PDB import PDBList
import os
import requests
import pandas as pd
import subprocess
import time
import tempfile

import asyncio
import httpx
import nest_asyncio

import duckdb as db
import numpy as np

In [2]:
conn = db.connect("./pairpro_50k.db")
con_df = conn.execute("""SELECT * FROM pairpro.final LIMIT 100""").fetch_df()

In [9]:
con_df[['meso_pdb', 'thermo_pdb']]= np.nan
con_df

Unnamed: 0,thermo_pid,meso_pid,local_gap_compressed_percent_id,scaled_local_query_percent_id,scaled_local_symmetric_percent_id,local_E_value,query_align_start,query_align_end,subject_align_end,subject_align_start,...,bit_score,thermo_taxid,meso_taxid,m_ogt,t_ogt,ogt_difference,m_protein_seq,t_protein_seq,meso_pdb,thermo_pdb
0,A0A2W7RQ16,A0A327VJ32,0.303279,0.291339,0.290196,2.170000e-12,7,127,128,7,...,132.0,1004304,1539050,25.0,45.0,20.0,MTRINYSSGATWEDKVGYSRAVRTGNIIEVSGTVAEDEGKVVAEGN...,MHYQRISSGSVYEEQMCYSRAVVAAPFIFVSGTTGFNYATMTIADD...,,
1,A0A2W7RTL2,A0A327VMA9,0.644068,0.567164,0.584615,4.180000e-24,6,64,61,3,...,199.0,1004304,1539050,25.0,45.0,20.0,MQTGVVKFFNETKGFGFIKIEGTNQEIFVHVSGIKESIGENDRVVF...,MSTKITGTVKFFNEEKGFGFIKHDDSNKETFVHANGLIDQIEANDK...,,
2,A0A2W7RYG8,A0A327W1Z1,0.538462,0.477273,0.473684,4.340000e-40,12,128,130,14,...,317.0,1004304,1539050,25.0,45.0,20.0,MARKGNKNKRRNLEKLSHKYRLVIMNDETYDEVTSFKLSRMSVYIA...,MAKQIFETNTFKRLRNQYRLVIINDDTYEELVTFKLSRLSVYVAFS...,,
3,A0A2W7RR29,A0A327W5V0,0.660714,0.599190,0.609053,7.720000e-113,24,247,239,16,...,818.0,1004304,1539050,25.0,45.0,20.0,MMSSMSKNINTHLVEIKDYFKVAISVDCVIFGFNNDELKVLLIESD...,MKPKSVQTEPAAQRPLITDVKALVNSYPRVPITVDCVIFGFDGEEL...,,
4,A0A2W7RR29,A0A327VM41,0.414747,0.364372,0.376569,4.440000e-58,28,243,227,11,...,456.0,1004304,1539050,25.0,45.0,20.0,MNAASFYSKAPRHLVAVDCIIFGFDEGKLKLLVIKRKVAPMAGEWS...,MKPKSVQTEPAAQRPLITDVKALVNSYPRVPITVDCVIFGFDGEEL...,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,A0A2W7RQL8,A0A327VKE6,0.311927,0.246377,0.232877,7.010000e-09,33,138,149,24,...,110.0,1004304,1539050,25.0,45.0,20.0,MLAIVMLTGITNVFAQEDDAGKKLKSIQVAFMARNLKLTPEEADRF...,MKKFVIIGLTMVFVFIGFTLIAQPANNNRYANIQGLKVAYITKQLN...,,
96,A0A2W7RN43,A0A327WBR1,0.526316,0.505051,0.507614,3.640000e-62,1,189,190,1,...,475.0,1004304,1539050,25.0,45.0,20.0,MNKARLEAFSDGVLAIIITIMVLEIKVPHGAEWSDLLKQYPVFFSY...,MRKGRLEAFSDGVLAIIITIMVLELKVPHMVSWEALLEIWPVFLSY...,,
97,A0A2W7RZJ0,A0A327W4Z3,0.500000,0.463855,0.478261,2.760000e-59,12,165,155,2,...,449.0,1004304,1539050,25.0,45.0,20.0,METPLVLDEKDMNILKLLQQDAKMTMRDIAAQLNMSTTPVYERIRK...,MPKQKGQSTDYEIPNGLDAVDLEILRLLETNARLTIKEIADKVHLS...,,
98,A0A2W7RZJ0,A0A327VTV7,0.441379,0.385542,0.400000,3.300000e-40,17,161,148,4,...,323.0,1004304,1539050,25.0,45.0,20.0,MDTLDKTDRHILQVLQQDAKLNTKEIAYRIGLSVTPTYERLKKIEK...,MPKQKGQSTDYEIPNGLDAVDLEILRLLETNARLTIKEIADKVHLS...,,


In [2]:
df_test = pd.read_csv('./pair_sample.csv', index_col=0)
df_test

Unnamed: 0,meso_pid,thermo_pid,meso_pdb,thermo_pdb
0,P9WJA3,A0A1M6N9Z6,1Y5H,
1,I6XFS7,A0A1M6WSV2,6M1C,
2,Q65EQ1,A0A521F3Z2,6NKG,
3,F5HRS7,A0A2T0LBQ2,7QH4,
4,P9WHM1,C7MUW2,3LP6,
...,...,...,...,...
95,A0A4Z0GXN3,A0A1G7W5M9,,
96,G4H893,A0A1W6VMF1,,
97,A0A120GMI5,Q5L0I9,,
98,A0A4V2YRI4,A0A3N2GW27,,


In [4]:
def download_structures(df, pdb_column, u_column, pdb_dir):
    start_time = time.time()  # Start measuring time
    pdbl = PDBList()
    if not os.path.exists(pdb_dir):
        os.makedirs(pdb_dir)
        
    for i, row in df.iterrows():
        pdb_id = row[pdb_column]
        uniprot_id = row[u_column]
        if not pd.isna(pdb_id):  # check for NaN value in PDB IDs column
            pdbl.retrieve_pdb_file(pdb_id, pdir=pdb_dir, file_format='pdb')
            file_path = os.path.join(pdb_dir, f'pdb{pdb_id.lower()}.ent')
            if os.path.exists(file_path):
                os.rename(os.path.join(file_path), os.path.join(pdb_dir, f'{pdb_id}.pdb'))
            else:
                pass
        elif isinstance(uniprot_id, str):  # download structure using UniProt ID
            url = f'https://alphafold.ebi.ac.uk/files/AF-{uniprot_id}-F1-model_v4.pdb'
            response = requests.get(url)
            if response.ok:
                filename = f'{pdb_dir}/{uniprot_id}.pdb'
                with open(filename, 'wb') as f:
                    f.write(response.content)
                print(f"Downloaded file for {uniprot_id} to {filename}")
            else:
                print(f"Failed to download file for {uniprot_id}: {response.status_code} - {response.reason}")
        else:
            print(f"No PDB ID or UniProt ID available for index {i}")
        end_time = time.time()  # Stop measuring time
    execution_time = end_time - start_time
    print(f"Execution time: {execution_time} seconds")
    pass

In [5]:
def download_af(df, u_column, pdb_dir):
    if not os.path.exists(pdb_dir):
        os.makedirs(pdb_dir)
    for i, row in df.iterrows():
        uniprot_id = row[u_column]
        url = f'https://alphafold.ebi.ac.uk/files/AF-{uniprot_id}-F1-model_v4.pdb'
        response = requests.get(url)
        if response.ok:
            filename = f'{pdb_dir}/{uniprot_id}.pdb'
            with open(filename, 'wb') as f:
                f.write(response.content)
            print(f"Downloaded file for {uniprot_id} to {filename}")
        else:
            print(f"Failed to download file for {uniprot_id}: {response.status_code} - {response.reason}")
    pass

In [6]:
download_af(con_df, 'meso_pid', './af3')

Downloaded file for A0A327VJ32 to ./af3/A0A327VJ32.pdb
Downloaded file for A0A327VMA9 to ./af3/A0A327VMA9.pdb
Downloaded file for A0A327W1Z1 to ./af3/A0A327W1Z1.pdb
Downloaded file for A0A327W5V0 to ./af3/A0A327W5V0.pdb
Downloaded file for A0A327VM41 to ./af3/A0A327VM41.pdb
Downloaded file for A0A327W612 to ./af3/A0A327W612.pdb
Downloaded file for A0A327WJQ9 to ./af3/A0A327WJQ9.pdb
Downloaded file for A0A327W7S0 to ./af3/A0A327W7S0.pdb
Downloaded file for A0A327VIS7 to ./af3/A0A327VIS7.pdb
Downloaded file for A0A327WD81 to ./af3/A0A327WD81.pdb
Downloaded file for A0A327WEK7 to ./af3/A0A327WEK7.pdb
Downloaded file for A0A327VIW4 to ./af3/A0A327VIW4.pdb
Downloaded file for A0A327W6H3 to ./af3/A0A327W6H3.pdb
Downloaded file for A0A327VW66 to ./af3/A0A327VW66.pdb
Downloaded file for A0A327WDI7 to ./af3/A0A327WDI7.pdb
Downloaded file for A0A327VKG7 to ./af3/A0A327VKG7.pdb
Downloaded file for A0A327VWK7 to ./af3/A0A327VWK7.pdb
Downloaded file for A0A327VUP4 to ./af3/A0A327VUP4.pdb
Downloaded

In [7]:
download_af(con_df, 'thermo_pid', './af3')

Downloaded file for A0A2W7RQ16 to ./af3/A0A2W7RQ16.pdb
Downloaded file for A0A2W7RTL2 to ./af3/A0A2W7RTL2.pdb
Downloaded file for A0A2W7RYG8 to ./af3/A0A2W7RYG8.pdb
Downloaded file for A0A2W7RR29 to ./af3/A0A2W7RR29.pdb
Downloaded file for A0A2W7RR29 to ./af3/A0A2W7RR29.pdb
Downloaded file for A0A2W7RR29 to ./af3/A0A2W7RR29.pdb
Downloaded file for A0A2W7RR29 to ./af3/A0A2W7RR29.pdb
Downloaded file for A0A2W7RVP4 to ./af3/A0A2W7RVP4.pdb
Downloaded file for A0A2W7RI91 to ./af3/A0A2W7RI91.pdb
Downloaded file for A0A2W7RVM8 to ./af3/A0A2W7RVM8.pdb
Downloaded file for A0A2W7SNW2 to ./af3/A0A2W7SNW2.pdb
Downloaded file for A0A2W7SNW2 to ./af3/A0A2W7SNW2.pdb
Downloaded file for A0A2W7RPX2 to ./af3/A0A2W7RPX2.pdb
Downloaded file for A0A2W7TRM4 to ./af3/A0A2W7TRM4.pdb
Downloaded file for A0A2W7RYN1 to ./af3/A0A2W7RYN1.pdb
Downloaded file for A0A2W7S251 to ./af3/A0A2W7S251.pdb
Downloaded file for A0A2W7RQ18 to ./af3/A0A2W7RQ18.pdb
Downloaded file for A0A2W7RQ18 to ./af3/A0A2W7RQ18.pdb
Downloaded

In [5]:
%time download_af(df_test, 'meso_pid', 'af')

Downloaded file for P9WJA3 to af/P9WJA3.pdb
Downloaded file for I6XFS7 to af/I6XFS7.pdb
Downloaded file for Q65EQ1 to af/Q65EQ1.pdb
Downloaded file for F5HRS7 to af/F5HRS7.pdb
Downloaded file for P9WHM1 to af/P9WHM1.pdb
Downloaded file for Q5FJB3 to af/Q5FJB3.pdb
Downloaded file for Q8Y6Y9 to af/Q8Y6Y9.pdb
Downloaded file for H7C7A0 to af/H7C7A0.pdb
Downloaded file for P9WGM5 to af/P9WGM5.pdb
Downloaded file for Q2G2G2 to af/Q2G2G2.pdb
Downloaded file for A0A1H2D2H5 to af/A0A1H2D2H5.pdb
Downloaded file for P66372 to af/P66372.pdb
Downloaded file for Q2FW38 to af/Q2FW38.pdb
Downloaded file for Q9KA76 to af/Q9KA76.pdb
Downloaded file for P9WGQ7 to af/P9WGQ7.pdb
Downloaded file for A0A063XF22 to af/A0A063XF22.pdb
Downloaded file for Q0S8G3 to af/Q0S8G3.pdb
Downloaded file for Q834L5 to af/Q834L5.pdb
Downloaded file for A0A063X745 to af/A0A063X745.pdb
Downloaded file for B2HF11 to af/B2HF11.pdb
Downloaded file for Q8CJW1 to af/Q8CJW1.pdb
Downloaded file for B1YG32 to af/B1YG32.pdb
Download

In [6]:
async def download_structure(session, url, filename):
    response = await session.get(url)
    if response.status_code == 200:
        with open(filename, 'wb') as f:
            f.write(response.content)
        print(f"Downloaded file: {filename}")
    else:
        print(f"Failed to download file: {filename}. Status code: {response.status_code}")

async def download_af(df, u_column, pdb_dir):
    tasks = []
    
    if not os.path.exists(pdb_dir):
        os.makedirs(pdb_dir)

    async with httpx.AsyncClient() as client:
        for row in df.itertuples(index=False):
            uniprot_id = getattr(row, u_column)
            url = f'https://alphafold.ebi.ac.uk/files/AF-{uniprot_id}-F1-model_v4.pdb'
            filename = f'{pdb_dir}/{uniprot_id}.pdb'
            task = asyncio.create_task(download_structure(client, url, filename))
            tasks.append(task)

        await asyncio.gather(*tasks)

In [7]:
async def main():
    await download_af(df_test, u_column="meso_pid", pdb_dir="af2")

nest_asyncio.apply()
loop = asyncio.get_event_loop()
loop.run_until_complete(main()) #Runtime is 4 seconds

Downloaded file: af2/P9WHM1.pdb
Downloaded file: af2/A0A1H2D2H5.pdb
Downloaded file: af2/H7C7A0.pdb
Downloaded file: af2/Q8Y6Y9.pdb
Downloaded file: af2/P9WJA3.pdb
Downloaded file: af2/P9WGM5.pdb
Downloaded file: af2/Q65EQ1.pdb
Downloaded file: af2/P66372.pdb
Downloaded file: af2/F5HRS7.pdb
Downloaded file: af2/I6XFS7.pdb
Downloaded file: af2/Q2G2G2.pdb
Downloaded file: af2/P9WGQ7.pdb
Downloaded file: af2/Q2FW38.pdb
Downloaded file: af2/Q9KA76.pdb
Downloaded file: af2/A0A063XF22.pdb
Downloaded file: af2/Q5FJB3.pdb
Downloaded file: af2/Q818P3.pdb
Downloaded file: af2/P66125.pdb
Downloaded file: af2/Q81IG4.pdb
Downloaded file: af2/Q92C58.pdb
Downloaded file: af2/B1YG32.pdb
Downloaded file: af2/Q8CJW1.pdb
Downloaded file: af2/A0A077UGA7.pdb
Downloaded file: af2/Q0S8G3.pdb
Downloaded file: af2/Q9K8A4.pdb
Downloaded file: af2/Q834L5.pdb
Downloaded file: af2/Q9KJN4.pdb
Downloaded file: af2/Q9KJN4.pdb
Downloaded file: af2/Q9RK42.pdb
Downloaded file: af2/Q2G0R9.pdb
Downloaded file: af2/Q0SHW9.

In [26]:
# Check which rows have the value "AF3456" in the "meso_pid" column
rows_with_value = df_test[df_test['meso_pid'] == 'A0A178MPG3']

# Print the rows that match the condition
print(rows_with_value)

      meso_pid  thermo_pid meso_pdb  thermo_pdb
84  A0A178MPG3  A0A2S5JEM5      NaN         NaN


In [6]:
download_structures(df=df_test, pdb_column='meso_pdb', u_column='meso_pid', pdb_dir='checking')

Downloading PDB structure '1y5h'...
Downloading PDB structure '6m1c'...
Downloading PDB structure '6nkg'...
Downloading PDB structure '7qh4'...
Desired structure doesn't exist
Downloading PDB structure '3lp6'...
Downloading PDB structure '3tfx'...
Downloading PDB structure '8a63'...
Downloading PDB structure '5y63'...
Downloading PDB structure '3eul'...
Downloading PDB structure '4qwq'...
Downloading PDB structure '7oii'...
Desired structure doesn't exist
Downloading PDB structure '7nhn'...
Desired structure doesn't exist
Downloading PDB structure '7nhm'...
Desired structure doesn't exist
Downloading PDB structure '3on1'...
Downloading PDB structure '6ujk'...
Downloading PDB structure '7qh4'...
Desired structure doesn't exist
Downloading PDB structure '2g3b'...
Downloading PDB structure '6wsh'...
Downloading PDB structure '7qh4'...
Desired structure doesn't exist
Downloading PDB structure '5zul'...
Downloading PDB structure '7lza'...
Downloading PDB structure '3ge6'...
Downloading PDB 

In [7]:
download_structures(df=df_test, pdb_column='thermo_pdb', u_column='thermo_pid', pdb_dir='checking')

Downloaded file for A0A1M6N9Z6 to checking/A0A1M6N9Z6.pdb
Downloaded file for A0A1M6WSV2 to checking/A0A1M6WSV2.pdb
Downloaded file for A0A521F3Z2 to checking/A0A521F3Z2.pdb
Downloaded file for A0A2T0LBQ2 to checking/A0A2T0LBQ2.pdb
Downloaded file for C7MUW2 to checking/C7MUW2.pdb
Downloaded file for A0A7W9YPC6 to checking/A0A7W9YPC6.pdb
Downloaded file for A0A087LCG0 to checking/A0A087LCG0.pdb
Downloaded file for I3DYT5 to checking/I3DYT5.pdb
Downloaded file for A0A853ALZ0 to checking/A0A853ALZ0.pdb
Downloaded file for A0A521CL06 to checking/A0A521CL06.pdb
Downloaded file for A0A3N2H419 to checking/A0A3N2H419.pdb
Downloaded file for A0A7V9Z9X5 to checking/A0A7V9Z9X5.pdb
Downloaded file for A0A1G6PFE9 to checking/A0A1G6PFE9.pdb
Downloaded file for A0A4R8LSA2 to checking/A0A4R8LSA2.pdb
Downloaded file for A0A7W3RA64 to checking/A0A7W3RA64.pdb
Downloaded file for A0A540V080 to checking/A0A540V080.pdb
Downloaded file for A0A2G8B7Q0 to checking/A0A2G8B7Q0.pdb
Downloaded file for A0A7W8MUD5

In [1]:
def run_fatcat(df, pdb_dir):
    p_values = []  # List to store the extracted p-values
    rows_to_drop = []  # List to store the indices of rows to be dropped

    for index, row in df.iterrows():
        if not pd.isna(row['meso_pdb']):
            p1 = row['meso_pdb']
        else:
            p1 = row['meso_pid']
        
        if not pd.isna(row['thermo_pdb']):
            p2 = row['thermo_pdb']
        else:
            p2 = row['thermo_pid']
        
        # Check if the structure files exist in the 'checking' folder
        p1_file = f'{p1}.pdb'
        p2_file = f'{p2}.pdb'
        if not os.path.exists(os.path.join(pdb_dir, p1_file)) or not os.path.exists(os.path.join(pdb_dir, p2_file)):
            # Append the index of the row to the list of rows to be dropped
            rows_to_drop.append(index)
            continue

        # Set the FATCAT command and its arguments
        cmd = ['FATCAT', '-p1', p1_file, '-p2', p2_file, '-i', pdb_dir, '-q']
        
        # Run the FATCAT command and capture the output
        result = subprocess.run(cmd, capture_output=True, text=True)
        output = result.stdout

        # Find the line containing the p-value
        p_value_line = next(line for line in output.split('\n') if line.startswith("P-value"))

        # Extract the p-value
        p_value = float(p_value_line.split()[1])
        
        # Append the p-value to the list
        p_values.append(p_value)

    # Drop the rows with missing structure files from the dataframe
    df = df.drop(rows_to_drop)
    
    df.loc[:, 'p_value'] = p_values  # Use .loc to set the 'p_value' column
    return df

In [4]:
df_result_new = run_fatcat(df_test, pdb_dir='checking')

In [5]:
df_result_new

Unnamed: 0,meso_pid,thermo_pid,meso_pdb,thermo_pdb,p_value
0,P9WJA3,A0A1M6N9Z6,1Y5H,,7.180000e-13
1,I6XFS7,A0A1M6WSV2,6M1C,,0.000000e+00
2,Q65EQ1,A0A521F3Z2,6NKG,,9.500000e-12
4,P9WHM1,C7MUW2,3LP6,,0.000000e+00
5,Q5FJB3,A0A7W9YPC6,3TFX,,0.000000e+00
...,...,...,...,...,...
95,A0A4Z0GXN3,A0A1G7W5M9,,,0.000000e+00
96,G4H893,A0A1W6VMF1,,,0.000000e+00
97,A0A120GMI5,Q5L0I9,,,0.000000e+00
98,A0A4V2YRI4,A0A3N2GW27,,,2.470000e-12


In [16]:
df_result_new['p_value'] = pd.to_numeric(df_result_new['p_value'], errors='coerce')

# Create a new dataframe with rows where p-value > 0.05
new_df_new = df_result_new[df_result_new['p_value'] > 0.05]
new_df_new


Unnamed: 0,meso_pid,thermo_pid,meso_pdb,thermo_pdb,p_value
22,P66054,A0A4R3N1Y2,8A57,,0.0871
25,P66125,A0A540V080,8A63,,0.101


In [None]:
# Quick fatcat command: 
# FATCATQue.pl timeused pair.list -q >pair.aln (only work when files are in the same directory)
# FATCAT -p1 A0JNW5.pdb -p2 A1L1K8.pdb -o A0JNW5_A1L1K8 -m
# FATCAT -p1 P0A9P0.pdb -p2 Q9H3D4.pdb -i ./checking -o test -m (-i directory of PDB files), result file is in ./

#FATCAT command help: 
FATCAT <-p1 file> <-p2 file> (the input pdb files)
  [-o output-initial] (default tmp)
  [-i string] (data directory for both structures, default ./)
  [-i1 string] (data directory for 1st structures, default ./)
  [-i2 string] (data directory for 2st structures, default ./)
  [-s1 num] (read start position of protein 1, default from the begin)
  [-s2 num] (read start position of protein 2, default from the begin)
  [-l1 num] (read length of protein 1, default whole protein)
  [-l2 num] (read length of protein 2, default whole protein)
  [-r] (force program run rigid structural alignment, default off)
  [-filter probcut] (filter the alignment quickly, set a big probcut, eg 0.2, useful in database searching, default off)
  [-sparse number[0-3]] (sparsely fragment sampling, for speeding up the calculation, default off)
  [-b] (print a basic report to stdout)
  [-f] (print a full report to stdout. When -b or -f is on, following options are all automatically off)
  [-m] (print alignment to a file)
  [-q] (print alignment to stdout, useful in database-search in queue)
  [-ab] (print the postscript graph of all AFPs and final AFP chain in black-white to a file)
  [-ac] (print the postscript graph of all AFPs and final AFP chain in color to a file)
  [-c] (print AFP chaining result to file.chain.txt)
  [-t] (print the files of transformed pdbs and corresponding rasmol scripts)
  [-s] (print the files of superimposed pdbs and corresponding rasmol scripts)
  [-time] (print the total running time, default off)

In [24]:
def run_fatcat(df, pdb_dir):
    p_values = []  # List to store the extracted p-values
    rows_to_drop = []  # List to store the indices of rows to be dropped

    for index, row in df.iterrows():
        if not pd.isna(row['meso_pdb']):
            p1 = row['meso_pdb']
        else:
            p1 = row['meso_pid']
        
        if not pd.isna(row['thermo_pdb']):
            p2 = row['thermo_pdb']
        else:
            p2 = row['thermo_pid']
        
        # Check if the structure files exist in the 'checking' folder
        p1_file = f'{p1}.pdb'
        p2_file = f'{p2}.pdb'
        if not os.path.exists(os.path.join(pdb_dir, p1_file)) or not os.path.exists(os.path.join(pdb_dir, p2_file)):
            # Append the index of the row to the list of rows to be dropped
            rows_to_drop.append(index)
            continue

        # Set the FATCAT command and its arguments
        cmd = ['FATCAT', '-p1', p1_file, '-p2', p2_file, '-i', pdb_dir, '-q']
        
        # Run the FATCAT command and capture the output
        result = subprocess.run(cmd, capture_output=True, text=True)
        output = result.stdout

        # Find the line containing the p-value
        p_value_line = next(line for line in output.split('\n') if line.startswith("P-value"))

        # Extract the p-value
        p_value = float(p_value_line.split()[1])
        
        # Append the p-value to the list
        p_values.append(p_value)

    # Drop the rows with missing structure files from the dataframe
    df = df.drop(rows_to_drop)
    
    df.loc[:, 'p_value'] = p_values  # Use .loc to set the 'p_value' column

    cmd_2 = """CREATE OR REPLACE TEMP TABLE tmp
                AS SELECT * FROM df"""
    cmd_3 = """ALTER TABLE pairpro.final ADD COLUMN p_value FLOAT"""
    cmd_4 = """UPDATE pairpro.final SET p_value = tmp.p_value
                FROM pairpro.final
                INNER JOIN tmp ON (pairpro.final.meso_pid = pairpro.tmp.meso_pid)
                AND (pairpro.final.thermo_pid = pairpro.tmp.thermo_pid)"""
    conn.sql(cmd_2)
    conn.execute(cmd_3)
    conn.execute(cmd_4)
    conn.commit()

In [23]:
conn.execute("""SELECT * FROM tmp LIMIT 1""").df()

CatalogException: Catalog Error: Table with name tmp does not exist!
Did you mean "pg_type"?
LINE 1: SELECT * FROM tmp LIMIT 1
                      ^

In [25]:
result = conn.execute("""SELECT * FROM pairpro.final LIMIT 1""").df()

In [26]:
result.columns

Index(['thermo_pid', 'meso_pid', 'local_gap_compressed_percent_id',
       'scaled_local_query_percent_id', 'scaled_local_symmetric_percent_id',
       'local_E_value', 'query_align_start', 'query_align_end',
       'subject_align_end', 'subject_align_start', 'query_align_len',
       'query_align_cov', 'subject_align_len', 'subject_align_cov',
       'bit_score', 'thermo_taxid', 'meso_taxid', 'm_ogt', 't_ogt',
       'ogt_difference', 'm_protein_seq', 't_protein_seq'],
      dtype='object')