In [40]:
import duckdb

In [16]:
def build_sample_l2t(db_in, db_out, size):
    '''
    Generates a sample l2t relational database of given size. Note that the final
    size will about 30% of 'size' due to pair filtering.

    Args:
        db_in (str): Path to full size l2t database
        db_out (int): Path to sample l2t database to be created
        size (str): Number of pairs to sample for test database. 
                    Final size will be about 30% of this.

    Returns:
        None. Database file is saved at db_out.

    Raises:
        None.
    '''
    con = duckdb.connect(db_in)    
    
    # Make sample protein pairs table
    cmd1 = f"""CREATE TEMP TABLE samp_protein_pairs AS
             SELECT * FROM protein_pairs 
             USING SAMPLE {size}"""

    con.execute(cmd1)
    
    # Make sample proteins table
    cmd2 = """CREATE TEMP TABLE samp_proteins AS
             SELECT * FROM proteins
             WHERE proteins.pid IN 
             (SELECT DISTINCT meso_pid FROM samp_protein_pairs)
             OR proteins.pid IN
             (SELECT DISTINCT thermo_pid FROM samp_protein_pairs)"""

    con.execute(cmd2)
    
    # Make sample taxa table
    cmd3 = """CREATE TEMP TABLE samp_taxa AS
             SELECT * FROM taxa
             WHERE taxa.taxid IN 
             (SELECT DISTINCT meso_taxid FROM samp_protein_pairs)
             OR taxa.taxid IN
             (SELECT DISTINCT thermo_taxid FROM samp_protein_pairs)"""

    con.execute(cmd3)
    
    # Make sample taxa_pairs table
    cmd4 = """CREATE TEMP TABLE samp_taxa_pairs AS
             SELECT * FROM taxa_pairs
             WHERE taxa_pairs.query_id IN 
             (SELECT DISTINCT taxid FROM samp_taxa)
             AND taxa_pairs.subject_id IN 
             (SELECT DISTINCT taxid FROM samp_taxa)"""

    con.execute(cmd4)
    
    # Make sample taxa_lab table
    cmd5 = """CREATE TEMP TABLE samp_taxa_pairs_lab AS
             SELECT * FROM taxa_pairs_lab
             WHERE taxa_pairs_lab.__index_level_0__ IN 
             (SELECT __index_level_0__ FROM samp_taxa_pairs)
          """

    con.execute(cmd5)
    
    # Grab new tables as df and close connection to large database
    samp_protein_pairs = con.execute("""SELECT * FROM samp_protein_pairs""").df()
    samp_proteins = con.execute("""SELECT * FROM samp_proteins""").df()
    samp_taxa = con.execute("""SELECT * FROM samp_taxa""").df()
    samp_taxa_pairs = con.execute("""SELECT * FROM samp_taxa_pairs""").df()
    samp_taxa_pairs_lab = con.execute("""SELECT * FROM samp_taxa_pairs_lab""").df()
    con.close()
    
    con2 = duckdb.connect(db_out)
    con2.execute("""CREATE OR REPLACE TABLE protein_pairs AS SELECT * FROM samp_protein_pairs""")
    con2.execute("""CREATE OR REPLACE TABLE proteins AS SELECT * FROM samp_proteins""")
    con2.execute("""CREATE OR REPLACE TABLE taxa AS SELECT * FROM samp_taxa""")
    con2.execute("""CREATE OR REPLACE TABLE taxa_pairs AS SELECT * FROM samp_taxa_pairs""")
    con2.execute("""CREATE OR REPLACE TABLE taxa_pairs_lab AS SELECT * FROM samp_taxa_pairs_lab""")
    del samp_protein_pairs, samp_proteins, samp_taxa, samp_taxa_pairs, samp_taxa_pairs_lab
    con2.close()

In [17]:
db_in = '/mnt/s/PairProphet/PairProphet.db'

for s in range(10):
    size = 1000*(s+1)
    db_out = f'/mnt/s/PairProphet/l2t_samples/l2t_{s+1}k.db'
    build_sample_l2t(db_in, db_out, size)

FloatProgress(value=0.0, layout=Layout(width='100%'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='100%'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='100%'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='100%'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='100%'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='100%'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='100%'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='100%'), style=ProgressStyle(bar_color='black'))

InvalidInputException: Invalid Input Error: Failed to cast value: Unimplemented type for cast (VARCHAR -> NULL)

In [36]:
con = duckdb.connect('/mnt/s/PairProphet/l2t_samples/l2t_2k.db')

print(con.execute("""SELECT COUNT(*) FROM proteins""").df())


   count_star()
0          3839


In [22]:
con.execute("""SELECT * FROM proteins LIMIT 10""").df()

Unnamed: 0,pid,taxid,pdb_id,alphafold_id,proteome,protein_seq
0,L0AJ51,44930,,,UP000010468,MVSRPAKYCPHCGTGLERITFDGHDRSHCPSCERVVWHNPVPCAGV...
1,A0A0K1IT22,35746,,,UP000663064,MELPTPQDLRERRNSLELTQSKLADMAGVSQPLIARIEGGDVDPRL...
2,M0B459,129789,,M0B459,UP000011591,MVENDSTTRRTVLKISGAVGAAGLLAGCADNGGGGGNGGGNGGGNG...
3,A0A8I1AET8,2024,,,UP000633619,MGATEQRAVLDHQAHDHEHHEDQDRYVLGFWVFLASDLVLFASIIA...
4,A0A8I1DDH1,2024,,,UP000633619,MSQNKEVYLTEEGLEKVKEELEYLRTEKRQQVAQRLKEAIAQGDLS...
5,A0A3B0B641,400777,,A0A3B0B641,UP000282311,MPQKLLLVDDERKVLEFMEPFLRSEGYETLTAESGLEALAKARTYR...
6,A0A8I1AFY4,2024,,,UP000633619,MARIRISRVGEQIKKELSQLIQQELKDPRIGFVTVTGVEMSGDLQI...
7,A0A8I1A8H8,2024,,,UP000633619,MSKKILVVDDEPSIVKLVQFNLKKEGYQVEVAYDGEMALEMADQFH...
8,A0A8I1DG37,2024,,,UP000633619,MIQGLYEAHLPVSDLDRSIEFYENLGLELAHRSENLAFFWIQKGKS...
9,A0A846LYF9,678932,,A0A846LYF9,UP000564817,MNPIKIVLADDQALVRQGLAALLDLEQDLQVVAQLPDGASVVEAIA...


In [38]:
con.close()

In [8]:
con, _ = connect_db('/mnt/s/PairProphet/l2t_50k.db')

Connecting to database...
Connection established! Execution time: 0.030165433883666992 seconds


In [9]:
build_pairpro(con)

Constructing pairpro_taxa_pairs...
Finished constructing pairpro_taxa_pairs. Execution time:
          0.013725519180297852 seconds
Constructing pairpro_taxa...
Finished constructing pairpro_taxa. Execution time:
          0.011551618576049805 seconds
Filtering on ogt and 16S sequence parameters...
Finished filtering. Execution time: 0.008179664611816406 seconds
Constructing pairpro_protein_pairs...
Finished constructing pairpro_protein_pairs. Execution time:
          0.06876039505004883 seconds
Constructing pairpro_proteins...
Finished constructing pairpro_proteins. Execution time:
          0.08545327186584473 seconds
Constructing final dataset...
Finishing up...
Finished. Total execution time: 0.3300788402557373 seconds


In [45]:
con = duckdb.connect('/mnt/s/PairProphet/PairProphet_lite.db')

In [71]:
df = con.execute("""SELECT m_protein_seq, t_protein_seq FROM fafsa_final LIMIT 5""").df()
df

Unnamed: 0,m_protein_seq,t_protein_seq
0,MGIALIFKSFFLALSQLGDPRFRRVLGLGIILTFALLIASYAGLLW...,MIADALAALSDVVSAPFRRVLLRSLGLTIAVLVGLWLLLVSVIGSY...
1,MEGKVKWFNAEKGYGFIETSEGGDVFVHFSAIQTDGFKTLDEGQSV...,MVGKVKWFNSEKGFGFIECEDGNDVFVHYTAINENGFKSLEEGQSV...
2,MGFPILETERLKLRELTLLDAETMFYYFEKASVIRYFGMDSFQNME...,MAVLETKRLILRQYEDEDIIPLHCIFSDPETMKFYPSPFSIQQTQD...
3,MSARILVVDNYDSFVFNLVQYLYQLGAECEVLRNDEVALSHAQDGF...,MALAKRVVILDYGSGNLRSAERAIARAGAEVEVTSDFDAAVEADGL...
4,MRINKYLAETGVVSRRGADAWIEAGRITINDELATLGSKVEDGDVV...,MERLQKVIAQAGIASRRKAEQLILEGKVKVNGEVVKALGTKVSRSD...


In [72]:
df.to_csv('../data/make_blast_df_test.csv', index = False)

In [50]:
con.execute("""SELECT * FROM fafsa_final LIMIT 5""").df()

Unnamed: 0,m_protein_seq,t_protein_seq,meso_alphafold_id,thermo_alphafold_id,meso_pid,thermo_pid,bit_score,local_gap_compressed_percent_id,scaled_local_query_percent_id,scaled_local_symmetric_percent_id,query_align_len,query_align_cov,subject_align_len,subject_align_cov,m_ogt,t_ogt,ogt_difference,m_protein_len,t_protein_len
0,MGIALIFKSFFLALSQLGDPRFRRVLGLGIILTFALLIASYAGLLW...,MIADALAALSDVVSAPFRRVLLRSLGLTIAVLVGLWLLLVSVIGSY...,A0A4U7N8C1,A0A4R3MEU4,A0A4U7N8C1,A0A4R3MEU4,246.0,0.368664,0.349345,0.338983,218,0.951965,226,0.930041,28.0,50.0,22.0,243,229
1,MEGKVKWFNAEKGYGFIETSEGGDVFVHFSAIQTDGFKTLDEGQSV...,MVGKVKWFNSEKGFGFIECEDGNDVFVHYTAINENGFKSLEEGQSV...,A0A098MCX1,A0A419SWC1,A0A098MCX1,A0A419SWC1,264.0,0.723077,0.723077,0.723077,65,1.0,65,1.0,22.0,65.0,43.0,65,65
2,MGFPILETERLKLRELTLLDAETMFYYFEKASVIRYFGMDSFQNME...,MAVLETKRLILRQYEDEDIIPLHCIFSDPETMKFYPSPFSIQQTQD...,A0A2B6IPT3,A0A178TR74,A0A2B6IPT3,A0A178TR74,190.0,0.310811,0.275449,0.26513,147,0.88024,148,0.822222,30.0,55.0,25.0,180,167
3,MSARILVVDNYDSFVFNLVQYLYQLGAECEVLRNDEVALSHAQDGF...,MALAKRVVILDYGSGNLRSAERAIARAGAEVEVTSDFDAAVEADGL...,E2PWL5,D6YA12,E2PWL5,D6YA12,105.0,0.31383,0.278302,0.278302,207,0.976415,188,0.886792,28.0,52.5,24.5,212,212
4,MRINKYLAETGVVSRRGADAWIEAGRITINDELATLGSKVEDGDVV...,MERLQKVIAQAGIASRRKAEQLILEGKVKVNGEVVKALGTKVSRSD...,A0A1C0YLC9,A0A4P6URQ6,A0A1C0YLC9,A0A4P6URQ6,307.0,0.381579,0.356557,0.36478,232,0.95082,223,0.957082,29.0,50.0,21.0,233,244


In [73]:
pd.read_csv('../data/make_blast_df_test.csv')

Unnamed: 0,m_protein_seq,t_protein_seq
0,MGIALIFKSFFLALSQLGDPRFRRVLGLGIILTFALLIASYAGLLW...,MIADALAALSDVVSAPFRRVLLRSLGLTIAVLVGLWLLLVSVIGSY...
1,MEGKVKWFNAEKGYGFIETSEGGDVFVHFSAIQTDGFKTLDEGQSV...,MVGKVKWFNSEKGFGFIECEDGNDVFVHYTAINENGFKSLEEGQSV...
2,MGFPILETERLKLRELTLLDAETMFYYFEKASVIRYFGMDSFQNME...,MAVLETKRLILRQYEDEDIIPLHCIFSDPETMKFYPSPFSIQQTQD...
3,MSARILVVDNYDSFVFNLVQYLYQLGAECEVLRNDEVALSHAQDGF...,MALAKRVVILDYGSGNLRSAERAIARAGAEVEVTSDFDAAVEADGL...
4,MRINKYLAETGVVSRRGADAWIEAGRITINDELATLGSKVEDGDVV...,MERLQKVIAQAGIASRRKAEQLILEGKVKVNGEVVKALGTKVSRSD...


In [74]:
con.close()