
# 🧬 Lab 4 — Pairwise Alignment & Homology Search


## Exercise 1 — Implement Needleman–Wunsch and Smith–Waterman (Code)


In [None]:

# Needleman-Wunsch and Smith-Waterman implementations (reference)
import numpy as np
from typing import Tuple, List

def needleman_wunsch(seq1: str, seq2: str, match: int=1, mismatch: int=-1, gap: int=-2) -> Tuple[str, str, np.ndarray]:
    """Globally align two sequences with the Needleman-Wunsch algorithm.

    Uses a simple scoring scheme: ``match`` for identical characters,
    ``mismatch`` for different characters and a linear ``gap`` penalty
    (each gap position costs ``gap``).

    Parameters
    ----------
    seq1, seq2 : str
        Sequences to align. Either may be empty.
    match, mismatch, gap : int
        Scores for a match, a mismatch and a single gap position.

    Returns
    -------
    aligned1, aligned2 : str
        The two aligned sequences, equal in length, with ``'-'`` marking gaps.
    score_matrix : np.ndarray
        The ``(len(seq1)+1, len(seq2)+1)`` float DP matrix; the optimal global
        score is ``score_matrix[-1, -1]``.

    Notes
    -----
    When several optimal alignments exist, the traceback prefers the diagonal
    move, then a gap in ``seq2``, then a gap in ``seq1``.
    """
    n, m = len(seq1), len(seq2)
    score_matrix = np.zeros((n+1, m+1), dtype=float)

    # First column / row: a prefix aligned entirely against gaps.
    # Accumulating (instead of multiplying) keeps the traceback equality
    # checks below exact even for non-integer penalties.
    for i in range(1, n+1):
        score_matrix[i, 0] = score_matrix[i-1, 0] + gap
    for j in range(1, m+1):
        score_matrix[0, j] = score_matrix[0, j-1] + gap

    # Fill: each cell is the best of diagonal (substitution), up (gap in
    # seq2) and left (gap in seq1).
    for i in range(1, n+1):
        for j in range(1, m+1):
            sub = match if seq1[i-1] == seq2[j-1] else mismatch
            score_matrix[i, j] = max(
                score_matrix[i-1, j-1] + sub,
                score_matrix[i-1, j] + gap,
                score_matrix[i, j-1] + gap,
            )

    # Traceback from the bottom-right corner to the origin. Characters are
    # collected in reverse and flipped once at the end.
    rev1: List[str] = []
    rev2: List[str] = []
    i, j = n, m
    while i > 0 or j > 0:
        here = score_matrix[i, j]
        if i > 0 and j > 0:
            sub = match if seq1[i-1] == seq2[j-1] else mismatch
            came_from_diag = here == score_matrix[i-1, j-1] + sub
        else:
            came_from_diag = False

        if came_from_diag:
            rev1.append(seq1[i-1])
            rev2.append(seq2[j-1])
            i -= 1
            j -= 1
        elif i > 0 and (j == 0 or here == score_matrix[i-1, j] + gap):
            # Gap in seq2 (forced once seq2 is exhausted).
            rev1.append(seq1[i-1])
            rev2.append('-')
            i -= 1
        else:
            # Gap in seq1 (forced once seq1 is exhausted).
            rev1.append('-')
            rev2.append(seq2[j-1])
            j -= 1

    aligned1 = ''.join(reversed(rev1))
    aligned2 = ''.join(reversed(rev2))
    return aligned1, aligned2, score_matrix
    

def smith_waterman(seq1: str, seq2: str, match: int=2, mismatch: int=-1, gap: int=-2) -> Tuple[str, str, np.ndarray, float]:
    """Locally align two sequences with the Smith-Waterman algorithm.

    Same recurrence as a global alignment with a linear ``gap`` penalty,
    except that every cell is floored at 0 so an alignment may start anywhere,
    and the traceback starts at the highest-scoring cell and stops at the
    first 0.

    Parameters
    ----------
    seq1, seq2 : str
        Sequences to align. Either may be empty.
    match, mismatch, gap : int
        Scores for a match, a mismatch and a single gap position.

    Returns
    -------
    aligned1, aligned2 : str
        The best local alignment (equal length, ``'-'`` marks gaps). Both are
        empty when no cell scores above 0.
    score_matrix : np.ndarray
        The ``(len(seq1)+1, len(seq2)+1)`` float DP matrix (all entries >= 0).
    max_score : float
        Score of the returned local alignment.

    Notes
    -----
    If several cells share the maximum score, the first one in row-major
    order is used. Ties during traceback prefer the diagonal move, then a gap
    in ``seq2``, then a gap in ``seq1``.
    """
    # uses 0 floor
    n, m = len(seq1), len(seq2)
    score_matrix = np.zeros((n+1, m+1), dtype=float)
    max_i = max_j = 0
    max_score = 0.0

    # Fill the matrix and remember where the best local score ends.
    for i in range(1, n+1):
        for j in range(1, m+1):
            sub = match if seq1[i-1] == seq2[j-1] else mismatch
            cell = max(
                0.0,
                score_matrix[i-1, j-1] + sub,
                score_matrix[i-1, j] + gap,
                score_matrix[i, j-1] + gap,
            )
            score_matrix[i, j] = cell
            if cell > max_score:
                max_score = float(cell)
                max_i, max_j = i, j

    # Traceback from the best cell until the score drops to 0 (or an edge).
    rev1: List[str] = []
    rev2: List[str] = []
    i, j = max_i, max_j
    while i > 0 and j > 0 and score_matrix[i, j] > 0:
        here = score_matrix[i, j]
        sub = match if seq1[i-1] == seq2[j-1] else mismatch
        if here == score_matrix[i-1, j-1] + sub:
            rev1.append(seq1[i-1])
            rev2.append(seq2[j-1])
            i -= 1
            j -= 1
        elif here == score_matrix[i-1, j] + gap:
            # Gap in seq2.
            rev1.append(seq1[i-1])
            rev2.append('-')
            i -= 1
        else:
            # Gap in seq1.
            rev1.append('-')
            rev2.append(seq2[j-1])
            j -= 1

    aligned1 = ''.join(reversed(rev1))
    aligned2 = ''.join(reversed(rev2))

    return aligned1, aligned2, score_matrix, max_score
    
# Demo: align one sequence pair globally (NW) and locally (SW)
seqA, seqB = "GATTACA", "GCATGCU"

# Global alignment with +1 / -1 / -2 scoring
alnA, alnB, S_nw = needleman_wunsch(seqA, seqB, match=1, mismatch=-1, gap=-2)
print("Needleman-Wunsch alignment:", alnA, alnB, sep="\n")

# Local alignment with +2 / -1 / -2 scoring
print("\nSmith-Waterman (local):")
alnA2, alnB2, S_sw, max_score = smith_waterman(seqA, seqB, match=2, mismatch=-1, gap=-2)
print(alnA2, alnB2, sep="\n")

# Show the global DP matrix
print("\nNW score matrix (snippet):", S_nw, sep="\n")



## Exercise 2 — Biopython `pairwise2` (Protein & Nucleotide examples)

**Goal:** Use the Biopython `pairwise2` API for local and global alignment with substitution matrices. Print the matrices for different cases (e.g., PAM30, PAM250, BLOSUM62).


In [None]:

from Bio import pairwise2
from Bio.Align import substitution_matrices

# Protein example (small Ras sequence fragment).
# prot2 is a variant of prot1: it has a Q->K substitution and a Q->GR change,
# so it is one residue longer (45 vs 44).
prot1 = "MTEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQV"
prot2 = "MTEYKLVVVGAGGVGKSALTIKLIGRNHFVDEYDPTIEDSYRKQV"  # variant of prot1

# Nucleotide example: seq2 is three bases shorter than seq1 (10 vs 13),
# so any global alignment of the pair must contain gaps.
seq1 = "ATGCGTACGTTAG"
seq2 = "ATGTACGTAG"

# TODO:
#   1. Load substitution matrices with substitution_matrices.load() and print them.
#   2. Align the protein pair with pairwise2.align.globalds (matrix-based scoring)
#      and the nucleotide pair with pairwise2.align.globalms (match/mismatch scoring).
#   3. Repeat with different scoring schemes and gap penalties and compare.
# NOTE(review): Bio.pairwise2 is marked deprecated in recent Biopython releases
# in favour of Bio.Align.PairwiseAligner — confirm which API the lab targets.




## Exercise 3 — BLAST via NCBIWWW (Protein and Nucleotide)

**Goal:** Submit BLASTp and BLASTn jobs programmatically and parse the results. Plot E-value vs. bit score, identity vs. query coverage, and bit score vs. identity.

**Warning:** These cells make live network calls to NCBI. If you are running in a restricted environment, comment out the network cells and use the provided offline fallback below.


In [None]:

# Protein BLAST (BLASTp) example - [NETWORK REQUIRED]
from Bio.Blast import NCBIWWW, NCBIXML
prot_query = 'MTEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQV'  # RAS fragment (44 residues)

# TODO:
#   1. Submit prot_query as a BLASTp job with NCBIWWW.qblast() and parse the
#      returned XML handle with NCBIXML.read().
#   2. Print the top 5 hits.
#   3. Plot:
#        - E-value vs. bit score
#        - identity vs. query coverage
#        - bit score vs. identity



In [None]:
# Nucleotide BLAST (BLASTn) example - [NETWORK REQUIRED]
# TODO: run a BLASTn search for nt_query, print the top 5 hits, and report the
# E-value and bit score of each hit.
from Bio.Blast import NCBIWWW, NCBIXML
nt_query = 'ATGCGTACGTTAGCTAGCTAGCTAGCTAG'  # synthetic sequence



## Exercise 4 — Sequence Embeddings (Protein & Nucleotide)
**Goal:** Provide an example of using pretrained models to compute embeddings and compare similarity without alignment.
- Protein models: ESM, ProtTrans (HuggingFace)  


In [None]:

# Example with HuggingFace transformers (protein model) - [NETWORK REQUIRED]
# First install: pip install transformers sentencepiece torch

try:
    from transformers import AutoTokenizer, AutoModel
    import torch
    import numpy as np
    from sklearn.metrics.pairwise import cosine_similarity
    
    print("Loading ESM2 model from HuggingFace...")
    tokenizer = AutoTokenizer.from_pretrained("facebook/esm2_t6_8M_UR50D")
    model = AutoModel.from_pretrained("facebook/esm2_t6_8M_UR50D")
    
    # Two protein sequences to compare
    seq1 = "MTEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQV"
    seq2 = "MTEYKLVVVGAGGVGKSALTIKLIGRNHFVDEYDPTIEDSYRKQV"  # Same sequence for demo
    
    print(f"Sequence 1: {seq1}")
    print(f"Sequence 2: {seq2}")
    
    # TODO Tokenize inputs using ESM2 tokenizer
    # TODO Get embeddings using ESM2 model
    # TODO Calculate cosine similarity between embeddings
    # TODO Print similarity score
