### Notes

- Evan used Smith-Waterman for original library generation. Let's stick with this for now.
- How much of an issue is speed? If Biopython turns out to be too slow for millions of pairs, we will need to benchmark several alignment packages.
- Adapted code from https://www.biostars.org/p/42687/ (old Python 2 code; may not actually use it).
- https://lh3.github.io/2018/11/25/on-the-definition-of-sequence-identity gives a useful description of gap-compressed identity.
- https://www.youtube.com/watch?v=oxHuUdYLA7E&t=587s&ab_channel=LanaCaldarevic

**Found Evan's BLAST code. Turns out all of the metrics were computed by hand from alignments. This squares with my reading about BioPython. Looks like we can just adapt some of his code, then just use BioPython to obtain the alignments themselves.**

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import io

In [2]:
from Bio.Blast.Applications import NcbiblastpCommandline
from Bio.Blast import NCBIXML
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO

# https://biopython.org/docs/1.75/api/Bio.pairwise2.html
from Bio import pairwise2
from Bio.Align import substitution_matrices

In [3]:
# Load the 50k-pair exploration sample and print its first record.
# NOTE(review): the printed row contains an 'Unnamed: 0' column, which suggests
# the CSV was saved with its index as a column — consider `index_col=0` if no
# later cell depends on that column.
s50k = pd.read_csv(filepath_or_buffer='../learn2therm_sample_50k_exploration.csv')
print(s50k.iloc[0, :])

Unnamed: 0                                                                               0
local_gap_compressed_percent_id                                                   0.287582
scaled_local_query_percent_id                                                     0.217822
scaled_local_symmetric_percent_id                                                 0.215686
query_align_len                                                                        160
query_align_cov                                                                   0.792079
subject_align_len                                                                      152
subject_align_cov                                                                 0.737864
bit_score                                                                              131
thermo_index                                                                           875
meso_index                                                                           12897

In [4]:
# Names of the alignment-metric columns inspected in this notebook.
metrics = [
    'local_gap_compressed_percent_id',
    'scaled_local_query_percent_id',
    'scaled_local_symmetric_percent_id',
    'query_align_len',
    'query_align_cov',
    'subject_align_len',
    'subject_align_cov',
    'bit_score',
]

In [5]:
# Show only the metric columns of the first record.
print(s50k.iloc[0].loc[metrics])

local_gap_compressed_percent_id      0.287582
scaled_local_query_percent_id        0.217822
scaled_local_symmetric_percent_id    0.215686
query_align_len                           160
query_align_cov                      0.792079
subject_align_len                         152
subject_align_cov                    0.737864
bit_score                                 131
Name: 0, dtype: object


In [6]:
# Sequence pair of the first record, in the order
# ['m_protein_seq', 't_protein_seq'].
seq_s1 = s50k.iloc[0].loc[
    ['m_protein_seq', 't_protein_seq']
]

In [7]:
# Display the first sequence of the pair (the 'm_protein_seq' entry).
# `seq_s1` is indexed by column-name strings, so an integer key must go through
# `.iloc`: positional fallback in `Series.__getitem__` is deprecated in pandas
# and integer keys will be treated as labels in future versions.
seq_s1.iloc[0]

'MAESGTSRRADHLVPVPGPDAEPPAVADELLRAVGRGDEQAFGRLYDLLAPRVYGLIRRVLRDPALAEEVTQEVLVEVWRRAARFDPAQGSANAWVFTIAHRRAVDRVRAEQKAADRTVRAGAAALDSPYDSVADEVSGRLERRQVRHCLDALTGLQREVVTLAYYQGHSYPQVAELLKTPLGTVKTRMRDGLIRLRDCLGVEATA'

In [8]:
# # Create two sequence files
# seq1 = SeqRecord(Seq(seq_s1[0]), id="seq1")
# seq2 = SeqRecord(Seq(seq_s1[1]), id="seq2")

# SeqIO.write(seq1, "seq1.fasta", "fasta")
# SeqIO.write(seq2, "seq2.fasta", "fasta")

# # Run BLAST and parse the output as XML
# output = NcbiblastpCommandline(query="seq1.fasta", subject="seq2.fasta", outfmt=5)()[0]
# blast_result_record = NCBIXML.read(io.StringIO(output))

# # Print some information on the result
# for alignment in blast_result_record.alignments:
#     for hsp in alignment.hsps:
#         print('****Alignment****')
#         print('sequence:', alignment.title)
#         print('length:', alignment.length)
#         print('e value:', hsp.expect)
#         print(hsp.query)
#         print(hsp.match)
#         print(hsp.sbjct)

In [9]:
# Local pairwise alignment of the first sequence pair, scored with BLOSUM62.
# Gap penalties: open = -11, extend = -1
# (presumably chosen to mirror BLASTP's defaults — TODO confirm).
matrix = substitution_matrices.load('BLOSUM62')

# `seq_s1` is labelled by column name ('m_protein_seq', 't_protein_seq'), so the
# two sequences are fetched positionally with `.iloc`; plain `seq_s1[0]` relies
# on the deprecated positional fallback of `Series.__getitem__`.
alignments = pairwise2.align.localds(
    seq_s1.iloc[0],
    seq_s1.iloc[1],
    match_dict = matrix,
    open = -11,
    extend = -1,
)

In [10]:
# Inspect the first alignment returned by the call above.
alignments[0]

Alignment(seqA='MAESGTSRRADHLVPVPGPDAEPPAVADELLRAVGRGDEQAFGRLYDLLAPRVYGLIRRVLRDPALAEEVTQEVLVEVWRRAARFDPAQGSANAWVFTIAHRRAVDRVRAEQK---------AADRTVRAGAAALDSPYDSVADEVSGRLERRQVRHCLDALTGLQREVVTLAYYQGHSYPQVAELLKTPLGTVKTRMRDGLIRLRDCLGVEATA-------------', seqB='-------------------------MPSQITESERIELAERFERDALPLLDQLYSAALRMTRNPADAEDLVQETYLKAYNAFASFKEGT-NLKAWLYRILTNTYINSYRKKQRQPLQQPAEDITDWQLAQAESHTSRGLRSAEVEALEQLPDADVKDALQRLPEDFRLAVYLADVEGFSYKEIAEIMGTPIGTVMSRLHRGRGRLRELLADVARERGFIKQGVEVAKR', score=142.0, start=48, end=209)

In [11]:
# Number of alignments returned for this sequence pair.
len(alignments)

3