In [1]:
from Bio import pairwise2, Align
import re
import duckdb



In [107]:
def get_matches_gaps(query, subject):
    
    n_columns = len(query)
    
    n_gaps = sum(i == j and i == '-' for i,j in zip(query, subject))
    print('n_gaps:',n_gaps)
    n_matches = sum(i == j and i != '-' for i,j in zip(query, subject))
    print('n_match:',n_matches)
    pattern = r'(-{1,})'
    instances = re.findall(pattern, query) + re.findall(pattern, subject)
    n_comp_gaps = len(instances)
    print('n_comp_gaps:',n_comp_gaps)
    return n_matches, n_gaps, n_columns, n_comp_gaps

def gap_compressed_percent_id(n_matches, n_gaps, n_columns, n_comp_gaps):
    """Percent matches in sequence, including but compressing gaps.

    Parameters
    ----------
    n_matches : int, number of matches in match columns
    n_gaps : number of gaps in match columns
    n_columns : total number of alignment match columns
    n_compressed_gaps : number of compressed gaps in match columns
    """
    return n_matches / (n_columns - n_gaps + n_comp_gaps)

In [2]:
path = '/mnt/s/FAFSA/FAFSA_lite.db'

con = duckdb.connect(path)

In [3]:
con.execute("""SELECT TABLE_NAME
FROM INFORMATION_SCHEMA.TABLES
WHERE TABLE_TYPE='BASE TABLE'""").df()

Unnamed: 0,table_name
0,fafsa_final


In [13]:
con.execute("""SELECT * FROM fafsa_final LIMIT 5""").df()


Unnamed: 0,m_protein_seq,t_protein_seq,meso_alphafold_id,thermo_alphafold_id,meso_pid,thermo_pid,bit_score,local_gap_compressed_percent_id,scaled_local_query_percent_id,scaled_local_symmetric_percent_id,query_align_len,query_align_cov,subject_align_len,subject_align_cov,m_ogt,t_ogt,ogt_difference,m_protein_len,t_protein_len
0,MGIALIFKSFFLALSQLGDPRFRRVLGLGIILTFALLIASYAGLLW...,MIADALAALSDVVSAPFRRVLLRSLGLTIAVLVGLWLLLVSVIGSY...,A0A4U7N8C1,A0A4R3MEU4,A0A4U7N8C1,A0A4R3MEU4,246.0,0.368664,0.349345,0.338983,218,0.951965,226,0.930041,28.0,50.0,22.0,243,229
1,MEGKVKWFNAEKGYGFIETSEGGDVFVHFSAIQTDGFKTLDEGQSV...,MVGKVKWFNSEKGFGFIECEDGNDVFVHYTAINENGFKSLEEGQSV...,A0A098MCX1,A0A419SWC1,A0A098MCX1,A0A419SWC1,264.0,0.723077,0.723077,0.723077,65,1.0,65,1.0,22.0,65.0,43.0,65,65
2,MGFPILETERLKLRELTLLDAETMFYYFEKASVIRYFGMDSFQNME...,MAVLETKRLILRQYEDEDIIPLHCIFSDPETMKFYPSPFSIQQTQD...,A0A2B6IPT3,A0A178TR74,A0A2B6IPT3,A0A178TR74,190.0,0.310811,0.275449,0.26513,147,0.88024,148,0.822222,30.0,55.0,25.0,180,167
3,MSARILVVDNYDSFVFNLVQYLYQLGAECEVLRNDEVALSHAQDGF...,MALAKRVVILDYGSGNLRSAERAIARAGAEVEVTSDFDAAVEADGL...,E2PWL5,D6YA12,E2PWL5,D6YA12,105.0,0.31383,0.278302,0.278302,207,0.976415,188,0.886792,28.0,52.5,24.5,212,212
4,MRINKYLAETGVVSRRGADAWIEAGRITINDELATLGSKVEDGDVV...,MERLQKVIAQAGIASRRKAEQLILEGKVKVNGEVVKALGTKVSRSD...,A0A1C0YLC9,A0A4P6URQ6,A0A1C0YLC9,A0A4P6URQ6,307.0,0.381579,0.356557,0.36478,232,0.95082,223,0.957082,29.0,50.0,21.0,233,244


In [80]:
sequences1 = con.execute("""SELECT * FROM fafsa_final LIMIT 5""").df()['m_protein_seq']
sequences2 = con.execute("""SELECT * FROM fafsa_final LIMIT 5""").df()['t_protein_seq']

In [116]:
# Set the gap penalties
# query is sequence 2 in this code
gap_open = -11
gap_extend = -1

# Perform pairwise local sequence alignment
alignments = []
for i, seq1 in enumerate(sequences1):
    #for seq2 in sequences2:
    alignment = pairwise2.align.localds(seq1, sequences2[i], substitution_matrix, gap_open, gap_extend)
    best_alignment = max(alignment, key=lambda x: x[2])
    alignments.append(best_alignment)

# Calculate alignment coverage
for i, alignment in enumerate(alignments):
    seq1_aligned = alignment[0]
    seq2_aligned = alignment[1]

    # Calculate coverage for sequence 1
    coverage1 = sum(c != '-' for c in seq1_aligned) / len(seq1_aligned) * 100

    # Calculate coverage for sequence 2
    coverage2 = sum(c != '-' for c in seq2_aligned) / len(seq2_aligned) * 100
    matches = sum(c1 == c2 and c1 != '-' for c1, c2 in zip(seq1_aligned, seq2_aligned))
    length = len(seq1_aligned.replace('-', ''))
    
    print(seq1_aligned)
    print(seq2_aligned)
    percent_identity = matches / length * 100
    n_matches, n_gaps, n_columns, n_comp_gaps = get_matches_gaps(seq2_aligned, seq1_aligned)
    gap_comp_pct_id = gap_compressed_percent_id(n_matches, n_gaps, n_columns, n_comp_gaps)
    
    scaled_local_symmetric_percent_id = 2 * n_matches / (len(sequences1[i]) + len(sequences2[i]))
    scaled_local_query_percent_id = n_matches / len(sequences2[i])
    
    print('Local gap comp id:', gap_comp_pct_id)
    print('Scaled sym id:', scaled_local_symmetric_percent_id)
    print('Scaled query id:', scaled_local_query_percent_id)
    print("Seq1 alignment coverage:", coverage1)
    print("Seq2 alignment coverage:", coverage2)
    #print('Pct ID:', percent_identity)

MGIALIFKSFFLALSQLGDPRFRRVLGLGIILTFALLIASYAGLLWVLDLLVGEDAYVPVIGQVTWLDDLLSFSSFIFMFILSIFLMVPVASAITSMFLDDVAQAVEDRHYP-NLPAVDPVPFSDALRDTLYFLGVLIIANLLAFILYAFFSALSVFIFWGLNGFLLGREYFQLAAMRRLGRQGAAALRKKNKGTIWLAGILMAVPLSIPLVNLLIPIVGAATFTHLFHQI-QAADDPSSQSRVQ
-----MIADALAALSDVVSAPFRRVLLRSLGLTIAVLV----GLWLLLVSVIGSYLVLP----WGWLETLVDWLAGAGLLVGMVFLVAPVTSLVAGLHLDEIAETVETTAFPGDRPGV-ALPIGQSVVLSLKFSGLVILANLIALVL-LLVPGVNLVAFYLANAYLLGREYFELAALRYRSYEDARNLRRANGGRVFLAGLLVALMVSIPIVNLLTPLFATSLMVRLHRRIGRAADLPQTVLSG-
n_gaps: 0
n_match: 80
n_comp_gaps: 8
Local gap comp id: 0.31620553359683795
Scaled sym id: 0.3389830508474576
Scaled query id: 0.34934497816593885
Seq1 alignment coverage: 99.18367346938776
Seq2 alignment coverage: 93.46938775510203
MEGKVKWFNAEKGYGFIETSEGGDVFVHFSAIQTDGFKTLDEGQSVEFDIVEGARGPQAANVIKL
MVGKVKWFNSEKGFGFIECEDGNDVFVHYTAINENGFKSLEEGQSVEFDVVEHAKGPQAANVVKL
n_gaps: 0
n_match: 47
n_comp_gaps: 0
Local gap comp id: 0.7230769230769231
Scaled sym id: 0.7230769230769231
Scaled query id: 0.723076923076923

In [87]:

# Count the number of groups of '-'
groups_count = count_dash_groups(text)

print("Number of groups of '-':", groups_count)


Number of groups of '-': 7


In [85]:
alignment

Alignment(seqA='-MRINKYLAETGVVSRRGADAWIEAGRITINDELA-TLGSKVEDGDVVRVDGNIVTREQQLVYIALNKPVGITST-TEQHIKGNVVD-FVNH-PLRIFHIGRLDKDSEGLLLLTNDGDIVNEILRAENHHEKEYIVQVDKPIDEAFLNKMSSGVEILDTTTLPCKVE------KISSNVFRIILEQGLNRQIRRMCSACGYNVKRLQRIRIMNIQLGNLKVGQWRDLTDKERNELFQLLNYKQN-', seqB='MERLQKVIAQAGIASRRKAEQLILEGKVKVNGEVVKALGTKVSRSDVVEVNGVKVEREKK-VYYLFYKPRGVVSTVSDDKGRKTVMDYFKNHVEERIFPVGRLDYDTSGLLLLTNDGEFANLMTHPKYKIEKTYIARLKGIPQFEDIKRLRRGIMLEDGMTAPAKVELKKIDRKANKAICEITIHEGRNRQVRRMFEAIGTPVVKLKRERFAFLDLRGLNAGEFRKLTPHEVKQLRVLAETGKIG', score=335.0, start=2, end=235)