In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import duckdb

con = duckdb.connect('/mnt/s/Projects/Learn2Therm/database')
con.execute("""SET memory_limit='14GB'""")
con.execute("""SELECT * FROM duckdb_settings()""").df()

Unnamed: 0,name,value,description,input_type
0,access_mode,automatic,"Access mode of the database (AUTOMATIC, READ_O...",VARCHAR
1,checkpoint_threshold,16.7MB,The WAL size threshold at which to automatical...,VARCHAR
2,debug_checkpoint_abort,,DEBUG SETTING: trigger an abort while checkpoi...,VARCHAR
3,debug_force_external,false,DEBUG SETTING: force out-of-core computation f...,BOOLEAN
4,debug_force_no_cross_product,false,DEBUG SETTING: Force disable cross product gen...,BOOLEAN
5,debug_window_mode,,DEBUG SETTING: switch window mode to use,VARCHAR
6,default_collation,,The collation setting used when none is specified,VARCHAR
7,default_order,asc,The order type used when none is specified (AS...,VARCHAR
8,default_null_order,nulls_first,Null ordering used when none is specified (NUL...,VARCHAR
9,disabled_optimizers,,DEBUG SETTING: disable a specific set of optim...,VARCHAR


In [3]:
tables = ['protein_pairs','proteins','taxa']

for t in tables:
    d = con.execute(f"""SELECT * FROM {t} LIMIT 1""").df()
    print(f'{t} has {d.columns}')

protein_pairs has Index(['local_gap_compressed_percent_id', 'scaled_local_query_percent_id',
       'scaled_local_symmetric_percent_id', 'local_E_value',
       'query_align_start', 'query_align_end', 'subject_align_end',
       'subject_align_start', 'query_align_len', 'query_align_cov',
       'subject_align_len', 'subject_align_cov', 'bit_score',
       'prot_pair_index', 'meso_protein_int_index', 'thermo_protein_int_index',
       'taxa_pair_index'],
      dtype='object')
proteins has Index(['protein_seq', 'protein_desc', 'protein_len', 'protein_int_index'], dtype='object')
taxa has Index(['taxa_index', 'ncbi_taxid', 'taxonomy', 'organism', 'bacdive_id',
       'seq_16srRNA', 'len_16s', 'ogt'],
      dtype='object')


# Useful columns from each table in relational database
taxa_imp = ['ncbi_taxid','organism','seq_16srRNA']
taxa_pairs_imp = ['thermo_index','meso_index','local_gap_compressed_percent_id','scaled_local_query_percent_id','local_E_value','query_align_start','query_align_end','subject_align_start','subject_align_end','query_align_len','query_align_cov','subject_align_len','subject_align_cov','bit_score','taxa_pair_index']
proteins_imp = ['taxa_index','protein_index','protein_seq','protein_desc']
protein_pairs_imp = ['thermo_protein_index','meso_protein_index','local_gap_compressed_percent_id','scaled_local_query_percent_id','scaled_local_symmetric_percent_id','']

In [3]:
con.execute("""SELECT DISTINCT meso_protein_int_index FROM protein_pairs LIMIT 10""").df()

Unnamed: 0,meso_protein_int_index
0,36007426
1,36011201
2,36010750
3,36009464
4,36009047
5,36008744
6,36008872
7,36011799
8,36007401
9,36008658


In [26]:
con.execute("""CREATE OR REPLACE TABLE mpoi AS 
    (SELECT protein_int_index 
     FROM proteins 
     WHERE protein_int_index IN 
         (SELECT DISTINCT meso_protein_int_index FROM protein_pairs))""")  

con.execute("""CREATE OR REPLACE TABLE tpoi AS 
    (SELECT protein_int_index 
     FROM proteins 
     WHERE protein_int_index IN 
         (SELECT DISTINCT thermo_protein_int_index FROM protein_pairs))""")

Unnamed: 0,Count
0,301597


In [41]:
con.execute("""CREATE OR REPLACE TABLE validprot AS
    SELECT prot_pair_index,

    (SELECT protein_seq AS meso_seq,
    FROM proteins
    WHERE protein_int_index IN
        (SELECT protein_int_index
         FROM mpoi)),
         
    (SELECT protein_seq AS thermo_seq,
    FROM proteins
    WHERE protein_int_index IN
        (SELECT protein_int_index
         FROM tpoi))
    FROM protein_pairs
    LIMIT 10""")


con.execute("""SELECT * FROM validprot LIMIT 1""").df()

Unnamed: 0,prot_pair_index,(SELECT protein_seq AS meso_seq FROM proteins WHERE (protein_int_index = ANY(SELECT protein_int_index FROM mpoi))),(SELECT protein_seq AS thermo_seq FROM proteins WHERE (protein_int_index = ANY(SELECT protein_int_index FROM tpoi)))
0,0,MAKQTSMATMVANLELRSTQYKREMAQAAARNKQLTREMKSTSSAG...,MVALDVHEDMQRLADSGVVAVMRGADADTIIDVADALYEGGITAYE...


In [45]:
con.execute("""SELECT * FROM protein_pairs WHERE prot_pair_index = 6""").df()

Unnamed: 0,local_gap_compressed_percent_id,scaled_local_query_percent_id,scaled_local_symmetric_percent_id,local_E_value,query_align_start,query_align_end,subject_align_end,subject_align_start,query_align_len,query_align_cov,subject_align_len,subject_align_cov,bit_score,prot_pair_index,meso_protein_int_index,thermo_protein_int_index,taxa_pair_index
0,0.393393,0.341146,0.352625,0.0,29,360,344,8,332,0.864583,337,0.938719,504,6,17227683,28678179,652803


In [46]:
con.execute("""SELECT * FROM proteins WHERE protein_int_index = 17227683""").df()

Unnamed: 0,protein_seq,protein_desc,protein_len,protein_int_index
0,MFESVATLQAEHDDLQRQLSDPELHGDAARSKRVNRRYAELSRIVA...,peptide chain release factor 1,359,17227683


In [None]:
con.execute("""WITH protein_ids AS (SELECT protein_int_index
    FROM proteins
    WHERE)""")

In [None]:
con.execute("""CREATE OR REPLACE TABLE validprot_data 
    AS (
    SELECT
        proteins_m.protein_seq AS meso_seq,
        proteins_t.protein_seq AS thermo_seq,
        protein_pairs.scaled_local_symmetric_percent_id
    FROM protein_pairs
    INNER JOIN proteins AS proteins_m ON (protein_pairs.meso_protein_int_index=proteins_m.protein_int_index)
    INNER JOIN proteins AS proteins_t ON (protein_pairs.thermo_protein_int_index=proteins_t.protein_int_index)
    LIMIT 10)""")

In [67]:
df = con.execute("""SELECT * FROM validprot_data""").df()
df.head()

Unnamed: 0,meso_ogt,thermo_ogt,scaled_local_symmetric_percent_id
0,28.0,52.5,0.470016
1,28.0,52.5,0.285246
2,28.0,52.5,0.449918
3,28.0,52.5,0.140845
4,28.0,52.5,0.230937


In [1]:
con.execute("""CREATE INDEX pidx ON proteins (protein_int_index)""")
#con.execute("""CREATE UNIQUE INDEX midx ON protein_pairs (meso_protein_int_index)""")
#con.execute("""CREATE UNIQUE INDEX tidx ON protein_pairs (thermo_protein_int_index)""")

NameError: name 'con' is not defined

In [None]:
con.execute("""CREATE INDEX pairs ON protein_pairs ()""")

In [None]:
con.execute("""CREATE INDEX pidx ON proteins (protein_int_index)""")
con.execute("""CREATE OR REPLACE TABLE validprot AS
    SELECT * FROM protein_pairs USING SAMPLE 1000""")

In [2]:
# Example table
con.execute("""
    SELECT 
        proteins_m.protein_seq AS meso_seq,
        proteins_t.protein_seq AS thermo_seq,
        validprot.scaled_local_symmetric_percent_id
    FROM validprot
    INNER JOIN proteins AS proteins_m ON (validprot.meso_protein_int_index=proteins_m.protein_int_index)
    INNER JOIN proteins AS proteins_t ON (validprot.thermo_protein_int_index=proteins_t.protein_int_index)
    LIMIT 10
""").df()

Unnamed: 0,explain_key,explain_value
0,physical_plan,┌───────────────────────────┐ ...
