# Week 3

http://regulomics.mimuw.edu.pl/wp/2019/03/wbo-3-uliniowienie-par-sekwencji-2/

### Datasets 

In [3]:
import gzip
import requests
import shutil

In [4]:
def download_gz(url, target_file):
    r = requests.get(url, stream=True)
    if r.status_code == 200:
        with open(target_file, 'wb') as f:
            r.raw.decode_content = True  # just in case transport encoding was applied
            gzip_file = gzip.GzipFile(fileobj=r.raw)
            shutil.copyfileobj(gzip_file, f)

In [5]:
download_gz('http://regulomics.mimuw.edu.pl/wp/wp-content/uploads/2018/03/histones.fa_.gz', 'data/histones.fa')
download_gz('http://regulomics.mimuw.edu.pl/wp/wp-content/uploads/2018/03/bzips.fa_.gz', 'data/bzips.fa')

### Assignments

1 and 2: Reading the data, basic pairwise2 operations and visualization:

In [6]:
from Bio import SeqIO, Seq, pairwise2
import numpy as np
from tqdm import tqdm

In [7]:
seqs = list(SeqIO.parse('data/histones.fa', format='fasta'))
seqs[0]

SeqRecord(seq=Seq('ATGTCCGGTGGTAAAGGTGGTAAAGCTGGTTCAGCTGCTAAAGCTTCTCAATCT...TAA', SingleLetterAlphabet()), id='YDR225W', name='YDR225W', description='YDR225W cdna chromosome:R64-1-1:IV:915530:915928:1 gene:YDR225W gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:HTA1 description:Histone H2A; core histone protein required for chromatin assembly and chromosome function; one of two nearly identical subtypes (see also HTA2); DNA damage-dependent phosphorylation by Mec1p facilitates DNA repair; acetylated by Nat4p; N-terminally propionylated in vivo [Source:SGD;Acc:S000002633]', dbxrefs=[])

In [8]:
bzips = list(SeqIO.parse('data/bzips.fa', format='fasta'))
bzips[0]

SeqRecord(seq=Seq('ATGTCCGAATATCAGCCAAGTTTATTTGCTTTAAATCCAATGGGTTTCTCACCA...TGA', SingleLetterAlphabet()), id='YEL009C', name='YEL009C', description='YEL009C cdna chromosome:R64-1-1:V:138918:139763:-1 gene:YEL009C gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:GCN4 description:bZIP transcriptional activator of amino acid biosynthetic genes;  activator responds to amino acid starvation; expression is tightly regulated at both the transcriptional and translational levels [Source:SGD;Acc:S000000735]', dbxrefs=[])

In [9]:
alignments = pairwise2.align.globalxx("ACCGT", "ACG")
print(alignments, '\n')
print(pairwise2.format_alignment(*alignments[0]))

[('ACCGT', 'A-CG-', 3.0, 0, 5), ('ACCGT', 'AC-G-', 3.0, 0, 5)] 

ACCGT
| || 
A-CG-
  Score=3



##### Assignment 3:

For each pair of sequences in a file, calculate:
- global and local alignment
- match: +1, mismatch: -1 substitution matrix
- average score within files and between files

In [10]:
pairwise2.align.globalms("ACCGT", "ACG", 1, -1, -1, -0.5)
# this could've been done more explicitly using align with callback parameters:
# match = pairwise2.identity_match(match=1, mismatch=-1)
# gap_penalty = pairwise2.affine_penalty(open=-1, extend=-0.5)

[('ACCGT', 'A-CG-', 1.0, 0, 5), ('ACCGT', 'AC-G-', 1.0, 0, 5)]

In [11]:
def _alignment_score(
        collection1, 
        collection2, 
        strategy, 
        strategy_args=[1, -1, -1, -0.5], 
        strategy_kwargs={'score_only': True}
):
    n = len(collection1)
    m = len(collection2)
    scores = np.zeros(n*m)
    for i, seq1 in tqdm(enumerate(collection1), total=n):
        for j, seq2 in enumerate(collection2):
            scores[i*n + j] = strategy(seq1, seq2, *strategy_args, **strategy_kwargs)
    return np.mean(scores)

def mean_global_alignment_score(collection1, collection2):
    return _alignment_score(collection1, collection2, strategy=pairwise2.align.globalms)

def mean_local_alignment_score(collection1, collection2):
    return _alignment_score(collection1, collection2, strategy=pairwise2.align.localms)

Global alignment scores:

In [11]:
mean_global_alignment_score(seqs, seqs)

100%|████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:01<00:00,  4.49it/s]


345.328125

In [12]:
mean_global_alignment_score(bzips, bzips)

100%|██████████████████████████████████████████████████████████████████████████████████| 13/13 [00:05<00:00,  2.34it/s]


217.1656804733728

In [13]:
mean_global_alignment_score(seqs, bzips)

100%|████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:03<00:00,  2.54it/s]


81.875

Local alignment scores:

In [14]:
mean_local_alignment_score(seqs, seqs)

100%|████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:25<00:00,  3.05s/it]


345.421875

In [15]:
mean_local_alignment_score(bzips, bzips)

100%|██████████████████████████████████████████████████████████████████████████████████| 13/13 [01:06<00:00,  5.74s/it]


227.63313609467457

In [16]:
mean_local_alignment_score(seqs, bzips)

100%|████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:37<00:00,  4.62s/it]


92.11057692307692

##### Assignment 4

Translating DNA sequences into protein sequences and using BLOSUM substitution matrix to calculate their alignments.

In [12]:
from Bio.SubsMat.MatrixInfo import blosum30

In [13]:
seq_proteins = [seq.translate() for seq in seqs]
bzips_proteins = [seq.translate() for seq in bzips]

In [14]:
pairwise2.align.globalds(seq_proteins[0].seq[:-1], seq_proteins[1].seq[:-1], blosum30, -1, -0.5)

[('MSGGKGGKAGSAAKASQSRSAKAGLTFPVGRVHRLLRRGNYAQRIGSGAPVYLTAVLEYLAAEILELAGNAARDNKKTRIIPRHLQLAIRNDDELNKLLGNVTIAQGGVLPNIHQNLLPKKSAKAT-KASQEL',
  'MSGGKGGKAGSAAKASQSRSAKAGLTFPVGRVHRLLRRGNYAQRIGSGAPVYLTAVLEYLAAEILELAGNAARDNKKTRIIPRHLQLAIRNDDELNKLLGNVTIAQGGVLPNIHQNLLPKKSAK-TAKASQEL',
  812.0,
  0,
  133)]

In [15]:
def mean_proteine_global_alignment_score(collection1, collection2):
    def _preproc_seq_strategy(s1, s2, *args, **kwargs):
        return pairwise2.align.globalds(s1.seq[:-1], s2.seq[:-1], *args, **kwargs)
    
    return _alignment_score(
        collection1,
        collection2,
        strategy=_preproc_seq_strategy,
        strategy_args=[blosum30, -1, -0.5]
    )

def mean_proteine_local_alignment_score(collection1, collection2):
    def _preproc_seq_strategy(s1, s2, *args, **kwargs):
        return pairwise2.align.localds(s1.seq[:-1], s2.seq[:-1], *args, **kwargs)
    
    return _alignment_score(
        collection1,
        collection2,
        strategy=_preproc_seq_strategy,
        strategy_args=[blosum30, -1, -0.5]
    )

Proteine, global:

In [21]:
mean_proteine_global_alignment_score(seq_proteins, seq_proteins)

100%|████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:00<00:00,  9.23it/s]


380.640625

In [22]:
mean_proteine_global_alignment_score(bzips_proteins, bzips_proteins)

100%|██████████████████████████████████████████████████████████████████████████████████| 13/13 [00:15<00:00,  1.08s/it]


822.1715976331361

In [23]:
mean_proteine_global_alignment_score(seq_proteins, bzips_proteins)

100%|████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:03<00:00,  2.37it/s]


192.89903846153845

Proteine, local:

In [24]:
mean_proteine_local_alignment_score(seq_proteins, seq_proteins)

100%|████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:01<00:00,  5.11it/s]


382.890625

In [25]:
mean_proteine_local_alignment_score(bzips_proteins, bzips_proteins)

100%|██████████████████████████████████████████████████████████████████████████████████| 13/13 [00:33<00:00,  2.30s/it]


826.7100591715977

In [26]:
mean_proteine_local_alignment_score(seq_proteins, bzips_proteins)

100%|████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:06<00:00,  1.28it/s]


204.5528846153846

## Home assignment

Implementation:

In [18]:
# %%writefile "assignment2.py"
"""
Home assignment 19-03-2019.
Krzysztof Kowalczyk kk385830
"""
import Bio
from Bio.Seq import Seq
from Bio.pairwise2 import align
from Bio.SubsMat.MatrixInfo import blosum30

import math
from itertools import product
from typing import Iterable, Callable, List, NamedTuple
from collections import defaultdict


class Alignment(NamedTuple):
    align1: str
    align2: str
    score: float
    begin: int
    end: int


def _translated_offsets(seq: Bio.Seq.Seq, max_range=3) -> Iterable[Bio.Seq.Seq]:
    for offset in range(max_range):
        seq_end = 3 * ((len(seq)-offset)//3)
        yield seq[offset:offset+seq_end].translate()

def default_align_strategy(seq1, seq2) -> Alignment:
    return Alignment(*align.localxx(seq1, seq2)[0])

def blosum_align_strategy(seq1, seq2) -> Alignment:
    blosum_extended = defaultdict(lambda: -math.inf)
    blosum_extended.update(blosum30)
    return Alignment(*align.localds(seq1, seq2, blosum_extended, -1, -0.5)[0])


def optimal_alignment(
        dna1: Bio.Seq.Seq, 
        dna2: Bio.Seq.Seq, 
        align_strategy: Callable[[Bio.Seq.Seq, Bio.Seq.Seq], Alignment]=default_align_strategy
) -> Alignment:
    """
    Calculates optimal alignment of 2 translated sequences, checking all 9 possible offsets in
    the translated DNA with the supplied alignment strategy.
    """
    translated_pairs = product(_translated_offsets(dna1), _translated_offsets(dna2))
    return max(
        (align_strategy(seq1, seq2) for seq1, seq2 in translated_pairs), 
        key = lambda alignment: alignment.score
    )

Usage demonstration:

In [20]:
dna1, dna2 = seqs[0], seqs[1]

al = optimal_alignment(dna1, dna2)
print(pairwise2.format_alignment(*al))

al = optimal_alignment(dna1, dna2, blosum_align_strategy)
print(pairwise2.format_alignment(*al))

ID: <unknown id>
Name: <unknown name>
Description: <unknown description>
Number of features: 0
Seq('MSGGKGGKAGSAAKASQSRSAKAGLTFPVGRVHRLLRRGNYAQRIGSGAPVYLT...EL*', HasStopCodon(ExtendedIUPACProtein(), '*'))
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
ID: <unknown id>
Name: <unknown name>
Description: <unknown description>
Number of features: 0
Seq('MSGGKGGKAGSAAKASQSRSAKAGLTFPVGRVHRLLRRGNYAQRIGSGAPVYLT...EL*', HasStopCodon(ExtendedIUPACProtein(), '*'))
  Score=205

ID: <unknown id>
Name: <unknown name>
Description: <unknown description>
Number of features: 0
Seq('VRW*-RW*-SWFSC*-SFSI*-IC*-GWFDIPSR*-SAQIAKKR*-LRPKNWFWCSSLL-DC...---RII', ---Has--Stop---Codon(-----Extended-------IU-PACProtein(), '*'))
                                                                                                    |||  ||  |||||  ||||  ||  .||.|