<a href="https://colab.research.google.com/github/iammuhammad41/Feature-Integration-and-Data-Alignment/blob/main/biosequence-alignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

import numpy as np
import pandas as pd
import itertools
from itertools import islice
from re import sub, search
import plotly.express as px
import plotly.graph_objects as go
from collections import Counter
import os


def pprint(dlist):
    """Show ``dlist`` as a pandas DataFrame in the notebook output.

    Parameters
    ----------
    dlist
        Any object accepted by the ``pd.DataFrame`` constructor.

    Returns
    -------
    None
        The frame is handed to ``display`` and not returned.

    Notes
    -----
    ``display`` is not imported in this file; it is presumably the helper
    that IPython/Jupyter kernels expose implicitly.
    """
    display(pd.DataFrame(dlist))

In [None]:
!pip install /kaggle/input/biopylib/biopylib-0.0.7-py3-none-any.whl

Processing /kaggle/input/biopylib/biopylib-0.0.7-py3-none-any.whl
Installing collected packages: biopylib
Successfully installed biopylib-0.0.7


In [None]:
# import sequence related classes
from biopylib.sequence import SQ
from biopylib.read_sequence import readSeq
from biopylib.sequence_alignment import SUBM,ALI,PSA,MSA

In [None]:
# Read the BLOSUM62 substitution matrix from file.
# The path uses the /kaggle/input mount, matching the other cells of this
# notebook that load the same matrix.
subm = SUBM()
subm.read("/kaggle/input/bioinformatics/substitution_matrices/blosum62.mat")

# Get all unique characters
print(f'alphabet: {subm.abc}')

# Get character scores (labels name the pair that is actually scored)
print(f"score pair W & N : {subm.score_pair('W','N')}") # Two characters aren't identical
print(f"score pair W & W : {subm.score_pair('W','W')}") # Two characters are identical
print(f"direct score pair W & W : {subm['W','W']}")     # used in some class functions

# Make our own substitution matrix
subm = SUBM()
subm.make('ATCG',2,-1)   # Constant match score of 2, mismatch score of -1 (all alphabet)
subm.head()

alphabet: CSTPAGNDEQHRKMILVFYW
score pair W & N : -4
score pair W & N : 11
direct score pair W & N : 11


[('AA', 2), ('AT', -1), ('AC', -1), ('AG', -1), ('TA', -1)]

In [None]:
# Three protein sequences, already aligned
seq1 = "MHQAIFIYQIGYPLKSGYIQSIRSPEYDNW"
seq2 = "MH--IFIYQIGYALKSGYIQSIRSPEY-NW"
seq3 = "MHQAIFI-QIGYALKSGY-QSIRSPEYDNW"

# Define an alignment object by hand (psa/msa normally do this themselves)
alig2 = ALI(lst_seqs=[seq1, seq2, seq3], al_type='aa')

# Index access returns one row; col() returns one column
print(f'Return one of the alignments: {alig2[0]}\n')
print(f'Return alignmnent column {alig2.col(0)}\n')

# Printing the object shows the whole alignment
print(alig2)

print(f'Consensus sequence: {alig2.consensus()}')

print('\nAlignment visualisation w/ bokeh')
alig2.view()

Return one of the alignments: MHQAIFIYQIGYPLKSGYIQSIRSPEYDNW

Return alignmnent column ['M', 'M', 'M']

Alignment with 3 rows and 30 columns.
MHQAIFIYQIGYPLKSGYIQSIRSPEYDNW
MH--IFIYQIGYALKSGYIQSIRSPEY-NW
MHQAIFI-QIGYALKSGY-QSIRSPEYDNW

Consensus sequence: MHQAIFIYQIGYALKSGYIQSIRSPEYDNW

Alignment visualisation w/ bokeh


In [None]:
seq3 = SQ(seq="KPFKRTCYKPF",seq_type='aa')
seq4 = SQ(seq="AKPKKPYI",seq_type='aa')

# Path to the BLOSUM62 substitution matrix (reused by later cells)
blosum62 = '/kaggle/input/bioinformatics/substitution_matrices/blosum62.mat'

# Load the matrix into its own object before handing it to PSA. Chaining
# SUBM().read(...) would pass whatever read() returns rather than the matrix
# object itself; the rest of this notebook always calls read() as a separate
# statement. NOTE(review): confirm read()'s return value against biopylib.
sm = SUBM()
sm.read(blosum62)

# Global Sequence Alignment
gpsa_prot = PSA(seqs=[seq3,seq4],
                subm=sm,
                g=-2)

gpsa_prot.initial_condition(ids='nW') # by default included in .nW method

In [None]:
# Load BLOSUM62 into a fresh substitution-matrix object
sm = SUBM()
sm.read(blosum62)

# Pairwise alignment problem for the two protein sequences (g is the gap penalty)
gpsa_prot = PSA(seqs=[seq3, seq4], subm=sm, g=-2)

# Run the nW method on the alignment problem
gpsa_prot.nW()
print('Finalised Score Matrix')

Finalised Score Matrix


In [None]:
# Report the alignment score stored on the PSA object
print(f'global alignment score: {gpsa_prot.score}')

global alignment score: 7


In [None]:
# Show the 'TM' (traceback) matrix of the global alignment.
# Traceback movement interpretation/mapping:
# 0 - done, 1 - diagonal, 2 - up, 3 - left
gpsa_prot.pprint(ids='TM')

Unnamed: 0,gap,A,K,P,K.1,K.2,P.1,Y,I
gap,0,3,3,3,3,3,3,3,3
K,2,1,1,3,3,3,3,3,3
P,2,2,2,1,3,3,3,3,3
F,2,2,2,2,1,3,3,1,3
K,2,2,1,3,1,1,3,3,3
R,2,2,2,1,2,1,3,3,3
T,2,1,2,1,2,2,1,3,3
C,2,2,2,2,2,2,2,1,1
Y,2,2,2,2,2,2,2,1,3
K,2,2,1,2,1,1,3,2,1


In [None]:
# Recover the alignment itself and print it
realign = gpsa_prot.realign() # deduce alignment from TM matrix
print(realign)

Alignment with 2 rows and 12 columns.
-KPFKRTCYKPF
AKP-KKP-Y--I



In [None]:
# Two short protein sequences for the 'sW' example
seq3 = SQ(seq="HGWAG", seq_type='aa')
seq4 = SQ(seq="PHSWG", seq_type='aa')

# BLOSUM62 substitution matrix
sm = SUBM()
sm.read(blosum62)

# Pairwise alignment problem (g is the gap penalty)
lpsa_prot = PSA(seqs=[seq3, seq4], subm=sm, g=-2)

# Apply the 'sW' initial condition, then show the score matrix ('SM')
lpsa_prot.initial_condition(ids='sW')
lpsa_prot.pprint(ids='SM')

Unnamed: 0,gap,P,H,S,W,G
gap,0,0.0,0.0,0.0,0.0,0.0
H,0,,,,,
G,0,,,,,
W,0,,,,,
A,0,,,,,
G,0,,,,,


In [None]:
seq3 = SQ(seq="KPFKRTCYKPF",seq_type='aa')
seq4 = SQ(seq="AKPKKPYI",seq_type='aa')

sm = SUBM()
sm.read(blosum62)

# Local Sequence Alignment (sW), in contrast to the global nW cells
lpsa_prot = PSA(seqs=[seq3,seq4],
                subm=sm,
                g=-8)
lpsa_prot.sW()
print('Finalised Score Matrix')
lpsa_prot.pprint(ids='SM')
# SM is the score matrix shown above, so the label says "score matrix"
print(f'\nlength of score matrix {len(lpsa_prot.SM)}')
print(f'best alignment score: {lpsa_prot.score}')

Finalised Score Matrix


Unnamed: 0,gap,A,K,P,K.1,K.2,P.1,Y,I
gap,0,0,0,0,0,0,0,0,0
K,0,0,5,0,5,5,0,0,0
P,0,0,0,4,0,3,4,0,0
F,0,0,0,0,1,0,0,7,0
K,0,0,5,0,5,6,0,0,4
R,0,0,2,3,2,7,4,0,0
T,0,0,0,1,2,1,6,2,0
C,0,0,0,0,0,0,0,4,1
Y,0,0,0,0,0,0,0,7,3
K,0,0,5,0,5,5,0,0,4



length of symmetric matrix 12
best alignment score: 7


In [None]:
# Show the 'TM' (traceback) matrix of the sW alignment.
# Traceback movement interpretation/mapping:
# 0 - done, 1 - diagonal, 2 - up, 3 - left
lpsa_prot.pprint(ids='TM')

Unnamed: 0,gap,A,K,P,K.1,K.2,P.1,Y,I
gap,0,0,0,0,0,0,0,0,0
K,0,0,1,0,1,1,0,0,0
P,0,0,0,1,0,1,1,0,0
F,0,0,0,0,1,0,0,1,0
K,0,0,1,0,1,1,0,0,1
R,0,0,1,1,1,1,1,0,0
T,0,0,0,1,1,1,1,1,0
C,0,0,0,0,0,0,0,1,1
Y,0,0,0,0,0,0,0,1,1
K,0,0,1,0,1,1,0,0,1


In [None]:
# Recover the alignment for the 'sW' run and print it
realign = lpsa_prot.realign(ids='sW') # deduce alignment from TM matrix
print(realign) # print the returned alignment object

Alignment with 2 rows and 3 columns.
KPF
KPY



In [None]:
# Nucleotide sequence example (the two sequences need not be the same length)
seq1 = SQ(seq="tccCAGATATGTCAGGGGACACGAGcatgcagagac",seq_type='dna')
seq2 = SQ(seq="CATCATCATCATCATCATCATCATCATCAT",seq_type='dna')

# Create a substitution matrix (nucleotides).
# The mismatch score is -1, as this cell documents and as the other
# nucleotide examples in this notebook use.
sm = SUBM()
sm.make("ACGT",1,-1)   # match +1 / mismatch -1
g = -2                 # gap penalty -2

# Global PSA alignment
gpsa_nucl = PSA(seqs=[seq1,seq2],
                subm=sm,
                g=g)

gpsa_nucl.nW() # calculate score, SM & TM
realign = gpsa_nucl.realign() # deduce alignment from TM matrix

In [None]:
# Render the nucleotide alignment held by the PSA object
gpsa_nucl.view() # visualise alignment

In [None]:
# FASTA files of the human and mouse protein sequences.
# Paths use the /kaggle/input mount, matching the other data paths in this notebook.
human = "/kaggle/input/bioinformatics/sequences/AAH12844.1.faa"
mouse = "/kaggle/input/bioinformatics/sequences/NP_001265185.1.faa"

# Read sequences from FASTA files and report their lengths
human_faa = readSeq(human).store()
print(f'length of sequence: {len(human_faa)}')
mouse_faa = readSeq(mouse).store()
print(f'length of sequence: {len(mouse_faa)}')

[note] read -> FASTA [amino acid] | #seq: 1
length of sequence: 253
[note] read -> FASTA [amino acid] | #seq: 1
length of sequence: 254


In [None]:
# Instantiate substitution matrix.
# The path uses the /kaggle/input mount, matching the other cells that load BLOSUM62.
subm62 = SUBM()
subm62.read("/kaggle/input/bioinformatics/substitution_matrices/blosum62.mat")

alin = PSA(seqs=[human_faa,mouse_faa], # define sequences
           subm=subm62,g=-3,         # use BLOSUM62 substitution matrix, gap penalty -3
           colcod = 'charge_aa') # use colourcoding in view

alin.nW() # calculate SM, TM matrices
alin.realign(ids='nW') # get the alignment from TM matrix
alin.view() # visualise alignment

### <b><span style='color:#E888BB'> 6.3 |</span> Examples </b>

In [None]:
# Five sequences to be aligned together
s1 = SQ("ATAGCACTCATCCGGCCAG")
s2 = SQ("AACCCTGCAACATAGAGCA")
s3 = SQ("ATGACCATGGAGACCCTGA")
s4 = SQ("ATGGCCCATGGGCGTACTG")
s5 = SQ("ATGGCAACTACGGGACACC")

# As for PSA, a substitution matrix is required
sm = SUBM()
sm.make("ACGT", 1, -1)

# Set up the MSA problem: sequences, substitution matrix, gap penalty
ma = MSA(seqs=[s1, s2, s3, s4, s5], subm=sm, g=-1)

# Iteratively align the sequences, then visualise the result
al = ma.align_consensus()
al.view()

In [None]:
from Bio.pairwise2 import format_alignment  # formatted pairwise display
from Bio import pairwise2 # pairwise sequece alignment
from Bio.Seq import Seq # basic sequence class
from Bio.SeqRecord import SeqRecord # more detailed sequence class

# Basic string format: wrap a plain Python string in a Biopython Seq object
str_seq = 'ACAAATTCAATTCCATAACATTTATCCCAT'
basic_seq = Seq(str_seq)
print(basic_seq,'\n')

ACAAATTCAATTCCATAACATTTATCCCAT 



In [None]:
# Advanced format: wrap the Seq in a SeqRecord, which carries an id alongside
# the sequence, and compare the two types
print(type(basic_seq))
adv_seq = SeqRecord(basic_seq,id='basic_seq')
print(adv_seq)
print(type(adv_seq))

<class 'Bio.Seq.Seq'>
ID: basic_seq
Name: <unknown name>
Description: <unknown description>
Number of features: 0
Seq('ACAAATTCAATTCCATAACATTTATCCCAT')
<class 'Bio.SeqRecord.SeqRecord'>


In [None]:
''' Global Alignment of two DNA sequences'''

# NOTE(review): Bio.pairwise2 is deprecated in recent Biopython releases in
# favour of Bio.Align.PairwiseAligner - confirm against the installed version.

# define two Seq instances to align
cseq1 = Seq('ATGGCAGATAGA')
cseq2 = Seq('ATAGAGAATAGATGGCAGATAGA')

# globalxx: match score = 1, mismatch = 0, gap penalties = 0
GDNA = pairwise2.align.globalxx(cseq1,cseq2)
print(f'PSA class: {type(GDNA[0])}')
print (f'# Alternative optimal alignments: {len(GDNA)}\n')
print('Aligned Sequence #0')
print(GDNA[0])

print('\nShowing all score ties:')

# print all the alignments (including ties)
for i in GDNA:
    print(format_alignment(*i,full_sequences=True))

PSA class: <class 'Bio.pairwise2.Alignment'>
# Alternative optimal alignments: 11

Aligned Sequence #0
Alignment(seqA='AT-G-G---------CAGATAGA', seqB='ATAGAGAATAGATGGCAGATAGA', score=12.0, start=0, end=23)

Showing all score ties:
AT-G-G---------CAGATAGA
|| | |         ||||||||
ATAGAGAATAGATGGCAGATAGA
  Score=12

AT-G------G----CAGATAGA
|| |      |    ||||||||
ATAGAGAATAGATGGCAGATAGA
  Score=12

AT---G----G----CAGATAGA
||   |    |    ||||||||
ATAGAGAATAGATGGCAGATAGA
  Score=12

AT-G---------G-CAGATAGA
|| |         | ||||||||
ATAGAGAATAGATGGCAGATAGA
  Score=12

AT---G-------G-CAGATAGA
||   |       | ||||||||
ATAGAGAATAGATGGCAGATAGA
  Score=12

AT--------G--G-CAGATAGA
||        |  | ||||||||
ATAGAGAATAGATGGCAGATAGA
  Score=12

A-------T-G--G-CAGATAGA
|       | |  | ||||||||
ATAGAGAATAGATGGCAGATAGA
  Score=12

--A-----T-G--G-CAGATAGA
  |     | |  | ||||||||
ATAGAGAATAGATGGCAGATAGA
  Score=12

----A---T-G--G-CAGATAGA
    |   | |  | ||||||||
ATAGAGAATAGATGGCAGATAGA
  Score=12

------A-T-G--

In [None]:
from Bio.SeqRecord import SeqRecord
from Bio.Align import MultipleSeqAlignment as MSABIO

# Pre-aligned amino-acid strings.
# NOTE(review): gaps are written as '_' here, while the earlier ALI example
# uses '-' - confirm which character downstream tools expect.
aa1 = "MHQAIFIYQIGYPLKSGYIQSIRSPEYDNW"
aa2 = "MH__IFIYQIGYALKSGYIQSIRSPEY_NW"
aa3 = "MHQAIFI_QIGYALKSGY_QSIRSPEYDNW"
aa4 = "MHQAMHI_KSGYA__SGY_QSIRSPEYDNW"

# One Seq instance per string
seq_aa1 = Seq(aa1)
seq_aa2 = Seq(aa2)
seq_aa3 = Seq(aa3)
seq_aa4 = Seq(aa4)

# One SeqRecord per Seq, each with its own ID
seqr1 = SeqRecord(seq_aa1, id="seq1")
seqr2 = SeqRecord(seq_aa2, id="seq2")
seqr3 = SeqRecord(seq_aa3, id="seq3")
seqr4 = SeqRecord(seq_aa4, id="seq4")
lst_seq = [seqr1, seqr2, seqr3, seqr4]

# Bundle the records into a multiple sequence alignment and print it
alin = MSABIO(lst_seq)
print(alin)

Alignment with 4 rows and 30 columns
MHQAIFIYQIGYPLKSGYIQSIRSPEYDNW seq1
MH__IFIYQIGYALKSGYIQSIRSPEY_NW seq2
MHQAIFI_QIGYALKSGY_QSIRSPEYDNW seq3
MHQAMHI_KSGYA__SGY_QSIRSPEYDNW seq4


In [None]:
''' ALIGN TWO SEQUENCES '''

# FASTA files holding the two amino acid chains
human = "/kaggle/input/bioinformatics/sequences/AAH12844.1.faa"
mouse = "/kaggle/input/bioinformatics/sequences/NP_001265185.1.faa"

# Load each sequence and report its length
human_faa = readSeq(human).store()
print(f'length of sequence: {len(human_faa)}')
mouse_faa = readSeq(mouse).store()
print(f'length of sequence: {len(mouse_faa)}')

# BLOSUM62 substitution matrix
subm62 = SUBM()
subm62.read("/kaggle/input/bioinformatics/substitution_matrices/blosum62.mat")

# Pairwise alignment problem: both sequences, BLOSUM62, gap penalty -3,
# with the 'charge_aa' colour coding for the view
alin = PSA(seqs=[human_faa, mouse_faa],
           subm=subm62,
           g=-3,
           colcod='charge_aa')

# Needleman Wunsch global alignment
alin.nW()                # calculate SM, TM matrices
alin.realign(ids='nW')   # get the alignment from the TM matrix
alin.view()              # visualise alignment

[note] read -> FASTA [amino acid] | #seq: 1
length of sequence: 253
[note] read -> FASTA [amino acid] | #seq: 1
length of sequence: 254


In [None]:
''' ALIGN MULTIPLE SEQUENCES '''

# Five sequences to be aligned together
s1 = SQ("ATAGCACTCATCCGGCCAG")
s2 = SQ("AACCCTGCAACATAGAGCA")
s3 = SQ("ATGACCATGGAGACCCTGA")
s4 = SQ("ATGGCCCATGGGCGTACTG")
s5 = SQ("ATGGCAACTACGGGACACC")

# As for PSA, a substitution matrix is required
sm = SUBM()
sm.make("ACGT", 1, -1)

# Set up the MSA problem: sequences, substitution matrix, gap penalty
ma = MSA(seqs=[s1, s2, s3, s4, s5], subm=sm, g=-1)

# Iteratively align the sequences, then visualise the result
al = ma.align_consensus()
al.view()