### Overview

This notebook contains cells for intake of new l2t database 5/2023 as well as edits to the original preprocessing component for building FAFSA.

In [1]:
import time

import duckdb
import pandas as pd

import os

import numpy as np

In [2]:
con = duckdb.connect('/mnt/s/PairProphet/l2t_50k.db')

In [9]:
con.execute("""SELECT ROW_NUMBER() OVER(ORDER BY meso_pid, thermo_pid) AS pair_id, * FROM protein_pairs LIMIT 100""").df()

Unnamed: 0,pair_id,thermo_pid,meso_pid,local_gap_compressed_percent_id,scaled_local_query_percent_id,scaled_local_symmetric_percent_id,local_E_value,query_align_start,query_align_end,subject_align_end,subject_align_start,query_align_len,query_align_cov,subject_align_len,subject_align_cov,bit_score,thermo_taxid,meso_taxid
0,1,A0A1I2I6J5,A0A059WC85,0.312925,0.277108,0.289308,2.180000e-22,11,155,146,1,145,0.873494,146,0.960526,187.0,1076937,1194168
1,2,A0A1I2I6J5,A0A059WID5,0.303226,0.283133,0.291022,9.540000e-22,8,164,156,3,157,0.945783,154,0.980892,183.0,1076937,1194168
2,3,A0A1I2IU30,A0A069N935,0.469388,0.414414,0.386555,7.360000e-32,4,101,114,17,98,0.882883,98,0.771654,260.0,1076937,1071679
3,4,A0A1I2GX53,A0A069N9C8,0.500000,0.436620,0.439716,2.030000e-15,1,62,62,1,62,0.873239,62,0.885714,143.0,1076937,1071679
4,5,A0A1I2HRU4,A0A069N9R7,0.373913,0.300699,0.294521,1.200000e-18,24,137,147,25,114,0.797203,123,0.825503,177.0,1076937,1071679
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,96,A0A1I2HXZ6,A0A069NHU0,0.301802,0.297778,0.283898,6.860000e-27,1,219,222,1,219,0.973333,222,0.898785,246.0,1076937,1071679
96,97,A0A1I2HYV0,A0A069NHU0,0.325792,0.313043,0.301887,1.380000e-33,6,225,222,3,220,0.956522,220,0.890688,292.0,1076937,1071679
97,98,A0A1I2K7J4,A0A069NHU0,0.281250,0.261411,0.258197,2.160000e-24,4,231,224,2,228,0.946058,223,0.902834,230.0,1076937,1071679
98,99,A0A1I2KCV8,A0A069NHU0,0.295964,0.284483,0.275574,8.910000e-25,4,225,223,2,222,0.956897,222,0.898785,232.0,1076937,1071679


In [2]:
# Paths for parquet files from extracted database
folder = '/mnt/s/FAFSA'

prot_pair_path = folder + '/protein_pairs'
prot_path = folder + '/proteins'
taxa_pair_al_path = folder + '/taxa_pairs/alignment'
taxa_pair_lab_path = folder + '/taxa_pairs/pair_labels'

### Example of protein pairs parquet

In [61]:
duckdb.read_parquet(prot_pair_path + '/align_taxa_128-406548.parquet').df().head()

Unnamed: 0,thermo_pid,meso_pid,local_gap_compressed_percent_id,scaled_local_query_percent_id,scaled_local_symmetric_percent_id,local_E_value,query_align_start,query_align_end,subject_align_end,subject_align_start,query_align_len,query_align_cov,subject_align_len,subject_align_cov,bit_score,thermo_taxid,meso_taxid
0,E8R444,A0A5B9W4X6,0.54,0.457627,0.482143,2.29e-34,15,114,101,2,100,0.847458,100,0.943396,276.0,128,406548
1,E8R3P9,A0A5B9W2H7,0.569444,0.445652,0.465909,5.51e-29,18,92,84,14,75,0.815217,71,0.845238,236.0,128,406548
2,E8R0N8,A0A5B9W0K1,0.469231,0.308081,0.346591,3.14e-24,11,184,141,15,174,0.878788,127,0.824675,220.0,128,406548
3,E8R1F9,A0A5B9W1N8,0.71519,0.624309,0.631285,1.29e-81,15,172,166,9,158,0.872928,158,0.892655,600.0,128,406548
4,E8QWA5,A0A5B9W5Q2,0.58,0.471545,0.505447,3.9600000000000005e-69,44,245,207,7,202,0.821138,201,0.943662,528.0,128,406548


### Example of proteins parquet

In [60]:
duckdb.read_parquet(prot_path + '/uniprot_chunk_0.parquet').df().head()

Unnamed: 0,pid,taxid,pdb_id,alphafold_id,proteome,protein_seq
0,A0A2A5R0G1,373386,,A0A2A5R0G1,UP000219689,MLADLLSESYATDLEESWENERTATPVRAFAVRLHQTGCSLRETTT...
1,A0A2A5R046,373386,,A0A2A5R046,UP000219689,MPENDRLNGCLDEINLEFVEREATPRLLMKLSIQLHLAGLSLSNTV...
2,A0A1H7MLA0,302484,,A0A1H7MLA0,UP000183894,MGIVSSKTQALQEVASVDDFLNVAATETVPLFEHLEFEFLLEYDVF...
3,E1REH3,54120,,,UP000006565,MVKGFDHSYAFFLGCIAPNRYPGCEASAIRTSAKLGIELLPLKGAS...
4,E1REH4,54120,,E1REH4,UP000006565,MAVEKNYGNPDLEKKLADRNYYTSDSHKDFSKRVEKISGTMSHMCF...


### Example of taxa pairs alignment parquet

In [62]:
duckdb.read_parquet(taxa_pair_al_path + '/taxa_pair_blast_chunk_0.parquet').df().head()

Unnamed: 0,query_id,subject_id,local_gap_compressed_percent_id,scaled_local_query_percent_id,scaled_local_symmetric_percent_id,local_E_value,query_align_start,query_align_end,subject_align_end,subject_align_start,query_align_len,query_align_cov,subject_align_len,subject_align_cov,bit_score,__index_level_0__
0,1281767,1413211,0.971045,0.865324,0.892278,0.0,178,1589,1486,73,1412,0.888609,1414,0.947086,1283.0,0
1,1281767,545502,0.93084,0.830082,0.867763,0.0,178,1589,1437,29,1412,0.888609,1409,0.971054,1104.0,1
2,1281767,157733,0.927311,0.826935,0.847469,0.0,178,1589,1493,80,1412,0.888609,1414,0.935185,1096.0,2
3,1281767,86665,0.926606,0.826306,0.847644,0.0,178,1589,1491,80,1412,0.888609,1412,0.935719,1095.0,3
4,1281767,79885,0.926606,0.826306,0.848465,0.0,178,1589,1486,73,1412,0.888609,1414,0.938911,1091.0,4


### Example of taxa pairs label parquet

In [63]:
duckdb.read_parquet(taxa_pair_lab_path + '/taxa_pair_blast_chunk_0.parquet').df().head()

Unnamed: 0,is_pair,__index_level_0__
0,False,0
1,False,1
2,False,2
3,False,3
4,False,4


### Uniprot IDs file

In [11]:
uniprot = pd.read_csv(folder + '/uniprot/proteome_metadata.csv')

In [12]:
uniprot.head()

Unnamed: 0.1,Unnamed: 0,pid,species_taxid,strain_taxid,qualifier,completeness,num_proteins
0,0,UP000464621,9,9,Other proteome,Outlier (low value),2
1,1,UP000325048,14,14,Redundant proteome,Unknown,1778
2,2,UP000321374,17,17,Other proteome,Close to standard (low value),1125
3,3,UP000095230,23,23,Other proteome,Standard,3900
4,4,UP000252468,24,24,Other proteome,Close to standard (high value),4310


### Example of taxa parquet

In [64]:
duckdb.read_parquet(folder + '/taxa.parquet').df().head()

Unnamed: 0,taxid,16s_seq,16s_len,temperature,superkingdom,phylum,class,order,family,genus,__index_level_0__
0,617123,GATGAACGCTGGCGGCGTGCTTAACACATGCAAGTCGAACGAAGCT...,1594,37.0,2.0,1239.0,186801.0,186802.0,186803.0,1164882.0,0
1,1281767,GCTCAGGACGAACGCTGGCGGCGTGCCTAATACATGCAAGTCGAAT...,1589,50.0,2.0,1239.0,91061.0,1385.0,186817.0,1494427.0,1
2,1329262,CAGAGTTTGATCCTGGCTCAGGATGAACGCTGGCGGCGTGCTTAAC...,1589,30.0,2.0,1239.0,186801.0,186802.0,186803.0,1780122.0,2
3,402877,GACGAACGCTGGCGGCGTGCCTAACACATGCAAGTCGAACGGACTA...,1589,37.0,2.0,1239.0,186801.0,186802.0,68298.0,862.0,3
4,1485586,GCTGCCCTTCAGAGTTTGATCCTGGCTCAGGACGAACGCTGGCGGC...,1588,28.0,2.0,201174.0,1760.0,85011.0,2062.0,1883.0,4


### Build learn2therm database

This code took about 1.5 hr to run on my home desktop with 8c/16t and 32 Gb of memory.

In [5]:
con = duckdb.connect(folder+'/FAFSA.db')

In [19]:
# Protein pairs
all_pairs = prot_pair_path + '/*.parquet'

t1 = time.monotonic()

con.execute(f"""CREATE OR REPLACE TABLE protein_pairs AS SELECT * FROM '{all_pairs}'""")

elapsed = time.monotonic() - t1

print(f'{elapsed} seconds')

FloatProgress(value=0.0, layout=Layout(width='100%'), style=ProgressStyle(bar_color='black'))

6043.670356133 seconds


In [20]:
# Proteins
all_prot = prot_path + '/*.parquet'

t1 = time.monotonic()

con.execute(f"""CREATE OR REPLACE TABLE proteins AS SELECT * FROM '{all_prot}'""")

elapsed = time.monotonic() - t1

print(f'{elapsed} seconds')

FloatProgress(value=0.0, layout=Layout(width='100%'), style=ProgressStyle(bar_color='black'))

64.46746825300033 seconds


In [21]:
# Taxa pair al

all_taxa_pair_al = taxa_pair_al_path + '/*.parquet'

t1 = time.monotonic()

con.execute(f"""CREATE OR REPLACE TABLE taxa_pairs_al AS SELECT * FROM '{all_taxa_pair_al}'""")

elapsed = time.monotonic() - t1

print(f'{elapsed} seconds')

FloatProgress(value=0.0, layout=Layout(width='100%'), style=ProgressStyle(bar_color='black'))

5.828336216000025 seconds


In [22]:
# Taxa pair lab

all_taxa_pair_lab = taxa_pair_lab_path + '/*.parquet'

t1 = time.monotonic()

con.execute(f"""CREATE OR REPLACE TABLE taxa_pairs_lab AS SELECT * FROM '{all_taxa_pair_lab}'""")

elapsed = time.monotonic() - t1

print(f'{elapsed} seconds')

0.7024612730001536 seconds


In [23]:
# Taxa 

all_taxa = folder + '/taxa.parquet'

t1 = time.monotonic()

con.execute(f"""CREATE OR REPLACE TABLE taxa AS SELECT * FROM '{all_taxa}'""")

elapsed = time.monotonic() - t1

print(f'{elapsed} seconds')

0.6066281989988056 seconds


In [25]:
con.execute("""SELECT COUNT(thermo_pid) FROM protein_pairs""").df()

Unnamed: 0,count(thermo_pid)
0,82174541


*Total pairs in l2t is now 82 million* 

In [27]:
# Build uniprot table (not necessary)
uniprot_path = folder + '/uniprot/proteome_metadata.csv'
con.execute(f"""CREATE OR REPLACE TABLE uniprot AS SELECT * FROM '{uniprot_path}'""")

<duckdb.DuckDBPyConnection at 0x7fdba99fc870>

In [6]:
con.close()

### Build FAFSA database

In [4]:
# Updated version of original function in preprocessing
def connect_db(path: str):
    '''
    Runs duckdb.connect() function on database path. Returns a
    duckdb.DuckDBPyConnection object and prints execution time.

    Args:
        path (str): Path to DuckDB database file containing learn2therm.

    Returns:
        con (duckdb.DuckDBPyConnection): A DuckDB connection object linking script to
        learn2therm database.

    Raises:
        VersionError: DuckDB installation is not one of 0.7.0 or 0.7.1.
        AttributeError: Input database contains no tables.
    '''
    s_time = time.time()

    version = duckdb.__version__
    print(f'DuckDB version {version} detected.')

    # Checks for compatible installation of duckdb.
    if not version in ['0.7.0', '0.7.1']:
        raise VersionError("""learn2therm was generated using DuckDB storage version 39. It is only
                           compatible with duckdb versions 0.6.0 and 0.6.1. Please check your
                           installation. Refer to https://duckdb.org/internals/storage.html for more
                           details.""")

        e_time = time.time()
        elapsed_time = e_time - s_time
        print(f'Finished with VersionError. Execution time: {elapsed_time} seconds')

    print('Connecting to database...')
    con = duckdb.connect(path)

    tables = con.execute("""SELECT TABLE_NAME
                            FROM INFORMATION_SCHEMA.TABLES
                            WHERE TABLE_TYPE='BASE TABLE'""").df()

    if tables.shape[0] < 1:
        raise AttributeError('Input database is empty.')

    e_time = time.time()
    elapsed_time = e_time - s_time
    print(f'Connection established! Execution time: {elapsed_time} seconds')
    return con, tables

In [5]:
con, tables = connect_db('/mnt/s/FAFSA/FAFSA.db')

DuckDB version 0.7.1 detected.
Connecting to database...
Connection established! Execution time: 0.5428915023803711 seconds


In [12]:
# Should include fafsa tables now that construction has been done once
tables

Unnamed: 0,table_name
0,fafsa_final
1,fafsa_proteins
2,fafsa_protein_pairs
3,fafsa_ogt_taxa_pairs
4,fafsa_taxa
5,fafsa_taxa_pairs
6,taxa
7,taxa_pairs_lab
8,taxa_pairs
9,proteins


In [36]:
# Fix table name for consistency with original database
con.execute("""ALTER TABLE taxa_pairs_al RENAME TO taxa_pairs""")

<duckdb.DuckDBPyConnection at 0x7fbaeccd8ab0>

In [38]:
# Updated from original preprocessing function
def build_fafsa(con, min_ogt_diff: int = 20, min_16s: int = 1300,
                    plots: bool = False):
    '''
    Converts learn2therm DuckDB database into a DuckDB database for FAFSA by adding filtered and
    constructed tables.

    Args:
        con (duckdb.DuckDBPyConnection): DuckDB connection object. Links script to DuckDB SQL 
        database.
        min_ogt_diff (int): Cutoff for minimum difference in optimal growth temperature between 
        thermophile and mesophile pairs. Default 20 deg C.
        min_16s (int): Cutoff for minimum 16S read length for taxa. Default 1300 bp. Filters out 
        organisms with poor or incomplete 16S sequencing.
        plots (bool): Boolean to determine whether the user wants Sankey plots diagramming the fate
        of learn2therm samples during FAFSA construction to be saved in ./plots.

    Returns:
        None. Database object is modified in place.

    Raises:
        ValueError: Optimal growth temperature difference must be positive.
        ValueError: Minimum 16S sequence read is 1 bp.
        AttributeError: Database must be in the learn2therm format.
    '''

    if min_ogt_diff < 0:
        raise ValueError('Optimal growth temperature difference must be positive.')

    if min_16s < 1:
        raise ValueError('16S must have at least 1 bp read.')

    tables = con.execute("""SELECT TABLE_NAME
                            FROM INFORMATION_SCHEMA.TABLES
                            WHERE TABLE_TYPE='BASE TABLE'""").df()

    # Check if proper tables exist in database. If they do not, raise an error.
    if (item in tables for item in ['proteins', 'protein_pairs', 'taxa', 'taxa_pairs']):
        pass

    else:
        raise AttributeError('Database is not formatted for learn2therm.')

    s_time = time.time()
    print('Constructing fafsa_taxa_pairs...')

    # Builds FAFSA taxa pair table using only paired taxa from learn2therm
    taxa_pairs_cmd = """CREATE OR REPLACE TABLE fafsa_taxa_pairs AS
                        SELECT *
                        FROM taxa_pairs
                        INNER JOIN taxa_pairs_lab ON (taxa_pairs.__index_level_0__ = taxa_pairs_lab.__index_level_0__)
                        WHERE taxa_pairs_lab.is_pair = True"""
    con.execute(taxa_pairs_cmd)

    e_time = time.time()
    elapsed_time = e_time - s_time
    print(f'Finished constructing fafsa_taxa_pairs. Execution time: {elapsed_time} seconds')
    print('Constructing fafsa_taxa...')

    # Builds FAFSA taxa table using only paired taxa from learn2therm.
    taxa_cmd = f"""CREATE OR REPLACE TABLE fafsa_taxa AS
                   SELECT *
                   FROM taxa
                   WHERE taxid IN 
                   (SELECT DISTINCT query_id FROM fafsa_taxa_pairs)
                   OR taxid IN
                   (SELECT DISTINCT subject_id FROM fafsa_taxa_pairs)"""
    con.execute(taxa_cmd)

    e_time2 = time.time()
    elapsed_time = e_time2 - e_time
    print(f'Finished constructing fafsa_taxa. Execution time: {elapsed_time} seconds')
    print('Filtering on ogt and 16S sequence parameters...')

    # Builds FAFSA table containing taxa pairs and their associated optimal growth temperatures
    # (ogt). Excludes 16S sequences and ogt difference below cutoff values from function input.
    ogt_pairs_cmd = f"""CREATE OR REPLACE TABLE fafsa_ogt_taxa_pairs AS SELECT fafsa_taxa_pairs.*,
                        taxa_m.temperature AS meso_ogt,
                        taxa_t.temperature AS thermo_ogt,
                        taxa_t.temperature - taxa_m.temperature AS ogt_diff,
                        taxa_m."16s_len" AS meso_16s_len,
                        taxa_t."16s_len" AS thermo_16s_len
                        FROM fafsa_taxa_pairs
                        JOIN fafsa_taxa AS taxa_m ON (fafsa_taxa_pairs.subject_id = taxa_m.taxid)
                        JOIN fafsa_taxa AS taxa_t ON (fafsa_taxa_pairs.query_id = taxa_t.taxid)
                        WHERE ogt_diff >= {min_ogt_diff}
                        AND meso_16s_len >= {min_16s}
                        AND thermo_16s_len >= {min_16s}"""
    con.execute(ogt_pairs_cmd)

    e_time3 = time.time()
    elapsed_time = e_time3 - e_time2
    print(f'Finished filtering. Execution time: {elapsed_time} seconds')
    print('Constructing fafsa_protein_pairs...')

    # Builds FAFSA table containing protein pairs
    protein_pair_cmd = """CREATE OR REPLACE TABLE fafsa_protein_pairs AS
                          SELECT 
                          protein_pairs.meso_pid,
                          protein_pairs.thermo_pid,
                          protein_pairs.bit_score,
                          protein_pairs.local_gap_compressed_percent_id,
                          protein_pairs.scaled_local_query_percent_id,
                          protein_pairs.scaled_local_symmetric_percent_id,
                          protein_pairs.query_align_len,
                          protein_pairs.query_align_cov,
                          protein_pairs.subject_align_len,
                          protein_pairs.subject_align_cov,
                          otp.meso_ogt AS m_ogt,
                          otp.thermo_ogt AS t_ogt,
                          otp.ogt_diff AS ogt_difference
                          FROM protein_pairs
                          INNER JOIN fafsa_ogt_taxa_pairs AS otp
                          ON (protein_pairs.thermo_taxid = otp.query_id)
                          AND (protein_pairs.meso_taxid = otp.subject_id)"""
    con.execute(protein_pair_cmd)

    e_time4 = time.time()
    elapsed_time = e_time4 - e_time3
    print(f'Finished constructing fafsa_protein_pairs. Execution time: {elapsed_time} seconds')
    print('Constructing fafsa_proteins...')

    # Builds FAFSA table containing proteins that belong to taxa from fafsa_taxa_pairs.
    prot_filt_cmd = """CREATE OR REPLACE TABLE fafsa_proteins AS SELECT *
                       FROM proteins
                       WHERE pid IN (SELECT DISTINCT meso_pid FROM fafsa_protein_pairs) OR
                       pid IN (SELECT DISTINCT thermo_pid FROM fafsa_protein_pairs)
                    """
    con.execute(prot_filt_cmd)

    e_time5 = time.time()
    elapsed_time = e_time5 - e_time4
    print(f'Finished constructing fafsa_proteins. Execution time: {elapsed_time} seconds')
    print('Constructing final dataset...')

    # Builds final FAFSA data table for downstream sampling.
    big_table_cmd = """CREATE OR REPLACE TABLE fafsa_final AS
                       SELECT 
                       proteins_m.protein_seq AS m_protein_seq,
                       proteins_t.protein_seq AS t_protein_seq,
                       proteins_m.alphafold_id AS meso_alphafold_id,
                       proteins_t.alphafold_id AS thermo_alphafold_id,
                       fafsa_protein_pairs.*,
                       LENGTH(proteins_m.protein_seq) AS m_protein_len,
                       LENGTH(proteins_t.protein_seq) AS t_protein_len
                       FROM fafsa_protein_pairs
                       JOIN fafsa_proteins AS proteins_m
                       ON (fafsa_protein_pairs.meso_pid = proteins_m.pid)
                       JOIN fafsa_proteins AS proteins_t
                       ON (fafsa_protein_pairs.thermo_pid =
                           proteins_t.pid)"""
    con.execute(big_table_cmd)
    print('Finishing up...')
    con.commit()
    con.close()

    et_final = time.time()
    elapsed_time = et_final - e_time5
    print(f'Finished. Total execution time: {elapsed_time} seconds')

In [39]:
build_fafsa(con)

Constructing fafsa_taxa_pairs...
Finished constructing fafsa_taxa_pairs. Execution time: 0.4976813793182373 seconds
Constructing fafsa_taxa...
Finished constructing fafsa_taxa. Execution time: 0.3858222961425781 seconds
Filtering on ogt and 16S sequence parameters...
Finished filtering. Execution time: 1.0931081771850586 seconds
Constructing fafsa_protein_pairs...


FloatProgress(value=0.0, layout=Layout(width='100%'), style=ProgressStyle(bar_color='black'))

Finished constructing fafsa_protein_pairs. Execution time: 10.692678689956665 seconds
Constructing fafsa_proteins...


FloatProgress(value=0.0, layout=Layout(width='100%'), style=ProgressStyle(bar_color='black'))

Finished constructing fafsa_proteins. Execution time: 12.06689977645874 seconds
Constructing final dataset...


FloatProgress(value=0.0, layout=Layout(width='100%'), style=ProgressStyle(bar_color='black'))

Finishing up...
Finished. Total execution time: 65.12362670898438 seconds


In [40]:
con, tables = connect_db('/mnt/s/FAFSA/FAFSA.db')

con.execute("""SELECT * FROM fafsa_final LIMIT 1""").df()

DuckDB version 0.7.1 detected.
Connecting to database...
Connection established! Execution time: 0.27312803268432617 seconds


Unnamed: 0,m_protein_seq,t_protein_seq,meso_alphafold_id,thermo_alphafold_id,meso_pid,thermo_pid,bit_score,local_gap_compressed_percent_id,scaled_local_query_percent_id,scaled_local_symmetric_percent_id,query_align_len,query_align_cov,subject_align_len,subject_align_cov,m_ogt,t_ogt,ogt_difference,m_protein_len,t_protein_len
0,MQKRILIVDDEPAIRDMVAFALRKGEYEPVHAGDALEAQTAIADRV...,MRILLVEDEAPLRETLAARLKREGFAVDAAQDGEEGLYMGREVPFD...,A0A0R0CEE8,A0A562DKC7,A0A0R0CEE8,A0A562DKC7,320.0,0.349776,0.343612,0.342105,220,0.969163,223,0.973799,28.0,49.0,21.0,229,227


**Looks good**

### Build FAFSA_lite, a version of FAFSA with only 1 million pairs

In [45]:
fafsa_lite = con.execute("""SELECT * FROM fafsa_final USING SAMPLE 1000000""").df()

FloatProgress(value=0.0, layout=Layout(width='100%'), style=ProgressStyle(bar_color='black'))

In [51]:
fafsa_lite.to_parquet('/mnt/s/FAFSA/fafsa_lite.parquet')
con.close()

In [53]:
# Connect to new database and create fafsa_final table using FAFSA_lite data
con = duckdb.connect('/mnt/s/FAFSA/FAFSA_lite.db')

con.execute("""CREATE OR REPLACE TABLE fafsa_final AS SELECT * FROM '/mnt/s/FAFSA/fafsa_lite.parquet'""")

In [15]:


# Create table of taxa pairs
cmd = """CREATE OR REPLACE TEMP TABLE pids_in_pairs AS
        SELECT pid,
        FROM proteins
        WHERE pid IN (SELECT meso_pid FROM protein_pairs) OR
        pid IN (SELECT thermo_pid FROM protein_pairs)"""
con.execute(cmd)

FloatProgress(value=0.0, layout=Layout(width='100%'), style=ProgressStyle(bar_color='black'))

<duckdb.DuckDBPyConnection at 0x7fb789b380b0>

In [17]:
pids = con.execute("""SELECT * FROM pids_in_pairs""").df()

In [19]:
pids.shape

(4934810, 1)