## Section 1
#### Demonstrates use of the make_blast_df function from user_blast
*make_blast_df assumes that a paired sequence set already exists. If one needs to be created, refer to Section 2.*

In [15]:
import pandas as pd
import sys
import os
sys.path.append('../')

from pairpro import user_blast, utils

In [2]:
# The sequence .csv file should be saved in the data folder of the local repo.
# Creates a sample dataframe containing 100 randomly selected rows of paired
# sequences (columns m_protein_seq and t_protein_seq).
sequences = (
    pd.read_csv('../data/50k_paired_seq.csv')
    .sample(n=100)
)

In [3]:
# Preview the first rows of the sampled sequence pairs
sequences.head()

Unnamed: 0,m_protein_seq,t_protein_seq
2603,MARQSFDDNHYLQLFISGSEDAFDAIFKRYYEGLLQFAKVLLPYPT...,MANDAHILNCNIAELQYQIASYEDEVAYKKLFYCVFPSLQNHAFAI...
7019,MQLKGSKTEQSLIQAFIAESQANRRYLYFAAKADIEGYGEIAMLFR...,MANLKGSKTEANLKAAFAGESQANRRYLYFAQKADVEGYNDVAAVF...
12151,MSLTALYVQCYSRVSSFIKDREAASGIEYALVAAMVAVALVAFVPG...,MLKKCLELFRAFAKDEEGATAIEYGLIVGLIAIALIAVLLLVGGEQ...
5549,MTLKTIEGTFIAPKGRYALVVGRFNSFVVESLVSGAVDALVRHGVS...,MKKSKTANPAAVPADAKTGYAAPQIPDRTELAGVRIAVLATRWNVG...
10873,MQRKIPEALNLHRQSQQLELKYSSEETYQLSSEMLRVHSPSAEVRG...,MSTPAPTSIRLHRQSRLLEVSWPDGVRHALPCEYLRVFSPSAELQG...


In [4]:
# Check the dimensions of the sample as (rows, columns)
sequences.shape

(100, 2)

In [26]:
# Temporary directory for storing the output database of the make_blast_df function.
# makedirs(exist_ok=True) creates the directory only when it is missing, so the cell
# can be re-run safely without a separate (race-prone) existence check.
os.makedirs('tmp', exist_ok=True)

In [10]:
# This function calculates all alignment metrics for the paired sequences and returns two objects:
#   blast_df - a dataframe of the results (likely deprecated in a future release)
#   con      - a DuckDB connection object
# The path argument points into the tmp directory used for the output database.
blast_df, con = user_blast.make_blast_df(sequences, path='./tmp/example_blast_db.db')

In [11]:
# Outputs include the original paired sequences (query, subject), the alignment metrics,
# and indexing at the sequence level (query_id, subject_id) and the pair level (pair_id).
blast_df.head()

Unnamed: 0,query,subject,query_id,subject_id,pair_id,local_gap_compressed_percent_id,scaled_local_query_percent_id,scaled_local_symmetric_percent_id,query_align_len,query_align_cov,subject_align_len,subject_align_cov,bit_score
0,MARQSFDDNHYLQLFISGSEDAFDAIFKRYYEGLLQFAKVLLPYPT...,MANDAHILNCNIAELQYQIASYEDEVAYKKLFYCVFPSLQNHAFAI...,6,4,2603,0.284211,0.278351,0.283465,194,0.96,187,0.97,188.0
1,MQLKGSKTEQSLIQAFIAESQANRRYLYFAAKADIEGYGEIAMLFR...,MANLKGSKTEANLKAAFAGESQANRRYLYFAQKADVEGYNDVAAVF...,52,6,7019,0.657143,0.661871,0.661871,139,1.0,139,1.0,483.0
2,MSLTALYVQCYSRVSSFIKDREAASGIEYALVAAMVAVALVAFVPG...,MLKKCLELFRAFAKDEEGATAIEYGLIVGLIAIALIAVLLLVGGEQ...,72,31,12151,0.253521,0.268657,0.276923,67,0.972222,63,1.0,88.0
3,MTLKTIEGTFIAPKGRYALVVGRFNSFVVESLVSGAVDALVRHGVS...,MKKSKTANPAAVPADAKTGYAAPQIPDRTELAGVRIAVLATRWNVG...,86,22,5549,0.40678,0.411429,0.432432,175,1.0,158,1.0,343.0
4,MQRKIPEALNLHRQSQQLELKYSSEETYQLSSEMLRVHSPSAEVRG...,MSTPAPTSIRLHRQSRLLEVSWPDGVRHALPCEYLRVFSPSAELQG...,58,72,10873,0.408451,0.408451,0.431227,142,0.986111,127,0.986111,272.0


In [12]:
# The same results can be queried with SQL through the connection object: select the
# first five rows of the protein_pairs table and return them as a dataframe.
con.execute("""SELECT * FROM protein_pairs LIMIT 5""").df()

Unnamed: 0,query,subject,query_id,subject_id,pair_id,local_gap_compressed_percent_id,scaled_local_query_percent_id,scaled_local_symmetric_percent_id,query_align_len,query_align_cov,subject_align_len,subject_align_cov,bit_score
0,MARQSFDDNHYLQLFISGSEDAFDAIFKRYYEGLLQFAKVLLPYPT...,MANDAHILNCNIAELQYQIASYEDEVAYKKLFYCVFPSLQNHAFAI...,6,4,2603,0.284211,0.278351,0.283465,194,0.96,187,0.97,188.0
1,MQLKGSKTEQSLIQAFIAESQANRRYLYFAAKADIEGYGEIAMLFR...,MANLKGSKTEANLKAAFAGESQANRRYLYFAQKADVEGYNDVAAVF...,52,6,7019,0.657143,0.661871,0.661871,139,1.0,139,1.0,483.0
2,MSLTALYVQCYSRVSSFIKDREAASGIEYALVAAMVAVALVAFVPG...,MLKKCLELFRAFAKDEEGATAIEYGLIVGLIAIALIAVLLLVGGEQ...,72,31,12151,0.253521,0.268657,0.276923,67,0.972222,63,1.0,88.0
3,MTLKTIEGTFIAPKGRYALVVGRFNSFVVESLVSGAVDALVRHGVS...,MKKSKTANPAAVPADAKTGYAAPQIPDRTELAGVRIAVLATRWNVG...,86,22,5549,0.40678,0.411429,0.432432,175,1.0,158,1.0,343.0
4,MQRKIPEALNLHRQSQQLELKYSSEETYQLSSEMLRVHSPSAEVRG...,MSTPAPTSIRLHRQSRLLEVSWPDGVRHALPCEYLRVFSPSAELQG...,58,72,10873,0.408451,0.408451,0.431227,142,0.986111,127,0.986111,272.0


**Execute the cell below to remove the temporary database file and the tmp directory**

In [28]:
# Close the database connection before deleting the file it was opened on.
con.close()

# Guarded removal keeps this cell safe to re-run: a second execution finds nothing
# left to delete instead of raising FileNotFoundError.
if os.path.exists('tmp/example_blast_db.db'):
    os.remove('tmp/example_blast_db.db')

# os.rmdir only removes empty directories, so leave tmp in place when it still
# holds other files rather than failing with OSError.
if os.path.isdir('tmp') and not os.listdir('tmp'):
    os.rmdir('tmp')

## Section 2
#### Demonstrates use of the make_pairs function to build a paired sequence set from two 1D lists of sequences
*To do: update this section once filtering criteria are implemented*

In [16]:
# Draw 10 random rows of paired sequences, then pull each sequence column out
# as its own 1D collection of values.
sequences = pd.read_csv('../data/50k_paired_seq.csv').sample(n=10)
seq1_list, seq2_list = (
    sequences[column].values
    for column in ('m_protein_seq', 't_protein_seq')
)

In [18]:
# Combine the two sequence collections into a single dataframe of sequence pairs.
# NOTE(review): save=False presumably skips writing the result to disk — confirm against utils.make_pairs.
combined_df = utils.make_pairs(seq1_list, seq2_list, save=False)

In [19]:
# This output (columns seq1 and seq2) is ready to be given to make_blast_df for alignment calculations
combined_df.head()

Unnamed: 0,seq1,seq2
0,MLCGKVKWFNNVKGYGFIVADEGSEDLFAHYSAIQMDGYRTLKAGQ...,MGNRETGRVKWFDNAKGYGFIQRGAGQEDVFVHFRQIVGEGYRTLQ...
1,MLCGKVKWFNNVKGYGFIVADEGSEDLFAHYSAIQMDGYRTLKAGQ...,MKAKVLLVDDDQRLRQMVAEYLRRHGIDSEGVGTAAAARRCLQRGH...
2,MLCGKVKWFNNVKGYGFIVADEGSEDLFAHYSAIQMDGYRTLKAGQ...,MRALVIEDDPALREQVVRFLTADGFVVDAASDGNQGAYMAQEYPAD...
3,MLCGKVKWFNNVKGYGFIVADEGSEDLFAHYSAIQMDGYRTLKAGQ...,MLVAKNIIKNYDQLKVLKGVNLEIKQGEIVSIVGSSGAGKSTLLHI...
4,MLCGKVKWFNNVKGYGFIVADEGSEDLFAHYSAIQMDGYRTLKAGQ...,MLTEEDIKAMPDEDYMNDAQLEFFRRRLLQMRQEVLQREMDVKERL...


In [20]:
# A combinatorial set of sequence pairs has been generated: 10 x 10 input sequences give 100 rows
combined_df.shape

(100, 2)