In [1]:
# dependencies
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import os
import scipy.stats

# database (make sure to install this; I will make a virtual environment for all the dependencies later)
import duckdb

In [2]:
# Open a connection to the DuckDB database file in the local repo.
# The location can be overridden through the LEARN2THERM_DB environment
# variable so the notebook also runs on other machines; when the variable is
# unset, the original local path is used unchanged.
con = duckdb.connect(
    os.environ.get('LEARN2THERM_DB', '/Users/humoodalanzi/learn2therm/data/database')
)

In [3]:
# Echo the connection object so its repr is shown as the cell output.
con

<duckdb.DuckDBPyConnection at 0x7feb5209b7f0>

In [16]:
# Query for what we would need for ML: the meso/thermo protein sequences and
# the ogt values of their taxa, plus the alignment metrics of each pair.
# Rows are kept only if they pass the 16S length, E-value, alignment coverage
# and percent-identity thresholds in the WHERE clause.
pair_query_sql = """
    SELECT 
        protein_pairs.prot_pair_index,
        proteins_m.protein_seq AS meso_seq,
        proteins_t.protein_seq AS thermo_seq,
        taxa_m.ogt AS meso_ogt,
        taxa_t.ogt AS thermo_ogt,
        protein_pairs.scaled_local_symmetric_percent_id,
        protein_pairs.local_E_value,
        protein_pairs.scaled_local_query_percent_id,
        protein_pairs.local_gap_compressed_percent_id,
    FROM protein_pairs
    INNER JOIN taxa AS taxa_m ON (protein_pairs.meso_index=taxa_m.taxa_index)
    INNER JOIN taxa AS taxa_t ON (protein_pairs.thermo_index=taxa_t.taxa_index)
    INNER JOIN proteins AS proteins_m ON (protein_pairs.meso_protein_int_index=proteins_m.protein_int_index)
    INNER JOIN proteins AS proteins_t ON (protein_pairs.thermo_protein_int_index=proteins_t.protein_int_index)
    WHERE
        taxa_m.len_16s>1300
        AND taxa_t.len_16s>1300
        AND protein_pairs.local_E_Value< 1e-46
        AND protein_pairs.query_align_cov > 0.95
        AND protein_pairs.subject_align_cov > 0.95
        AND protein_pairs.scaled_local_symmetric_percent_id > 0.7
        AND protein_pairs.scaled_local_query_percent_id > 0.7
        AND protein_pairs.local_gap_compressed_percent_id > 0.7
"""

# Run the query on the open connection and fetch the result as a DataFrame.
test = con.execute(pair_query_sql).df()

In [17]:
# Draw a reproducible random subset of the query result (fixed random_state).
# The requested size is capped at the number of available rows because
# DataFrame.sample raises ValueError when n exceeds the frame length and
# sampling is done without replacement (the default).
data = test.sample(n=min(10000, len(test)), random_state=420)

In [18]:
# Display the sampled frame as the cell output.
data

Unnamed: 0,prot_pair_index,meso_seq,thermo_seq,meso_ogt,thermo_ogt,scaled_local_symmetric_percent_id,local_E_value,scaled_local_query_percent_id,local_gap_compressed_percent_id
1256842,126227630,MLLSDRDLVSEIKSGDLSLEPFEPALLQPSSIDVRLDRFFRVFNNH...,MLLSDRDLRKELESGRLELDPFDPAMLQPSSIDVRLDRFFRVFDNT...,27.5,45.0,0.777202,0.0,0.773196,0.802139
1456567,169784592,MRFEGTSGYVATDDLKVAVNAAIALERPLLVKGEPGTGKTVLAVEV...,MKFTGSDSYVATEDLMIAVNAAVTLERPLLVKGEPGTGKTELARQV...,30.0,54.0,0.782143,0.0,0.784946,0.784946
874464,31933768,MAYETINVDVQDHVCLIKLHRPEALNALNAALVSELCTALEEADAS...,MAYKTIIVEIEDHVALIKLNRPEALNALNSELLGELAQAVTEADAN...,19.5,54.0,0.775194,0.0,0.775194,0.775194
560201,32409414,MAIRKYKPTTPGRRGSSVADFAEITRSTPEKSLLRPLSKTGGRNNQ...,MGIRKYKPTTPGRRGASVADFVELTRREPEKSLLRPLPKKGGRNNR...,28.0,52.5,0.787770,0.0,0.790614,0.802198
33257,175862226,MLQRLQDRVAVVTGGGSGIGLATVRRFAAEGAKVVVADIDAAAGEA...,MSEDIICRRLTGRTAVVTGAGSGIGLASARRLASEGANVVCADVDE...,28.0,45.0,0.788350,0.0,0.780769,0.802372
...,...,...,...,...,...,...,...,...,...
458544,42061208,MRFVIARCQVDYVGRLTAHLPMANRLVMVKSDGSVLVHSDGGSYKP...,MRLVIARCQVDYVGRLTAHLPMAQRLLLIKADGSVSVHSDDRAYKP...,28.0,45.0,0.872146,0.0,0.872146,0.872146
469612,165123678,MKPIVGSIVALITPMHEDGSVDYPALRKLIDWHIAEGTDCIGVVGT...,MTSSRVTLTGSIVALVTPMHEDGSVDYPTLRKLIDWHIAQGTDCIS...,30.0,48.0,0.764805,0.0,0.758389,0.776632
226964,71308777,MSFFAPKTVVSAHCDLPCGVYDPAQARIEAESIKAVAEKYQANTDP...,MLSRLFAPTVEVSAHCDLPCGVYDPAQARIEAQSIKAIIEKYHASD...,30.0,52.5,0.777358,0.0,0.768657,0.804688
123183,137720058,MRLVIARCSVDYVGRLTAHLPMATRLLLVKADGSVSVHADDRAYKP...,MRLVIARCQVDYHGRLTAHLPMATRLVLIKADGSVSIHSDDRAYKP...,32.0,50.0,0.737557,0.0,0.744292,0.740909


In [19]:
# Save the sampled pairs to Sample.csv in the current working directory.
# index=True is the pandas default, spelled out here: the frame's index is
# written as the first, header-less column of the file.
data.to_csv(r'Sample.csv', index=True)