# Edit Distance using dynamic programming

In [11]:
def editDistRecursive(x, y):
    """Return the edit distance between x and y by naive recursion.

    The distance counts single-character insertions, deletions and
    substitutions. Every call branches into three sub-calls on shorter
    prefixes and nothing is cached, so this version is only practical for
    short inputs.
    """
    # Base cases: against an empty string, the distance is simply the
    # length of the other string.
    if not x:
        return len(y)
    if not y:
        return len(x)

    x_prefix, y_prefix = x[:-1], y[:-1]
    # Drop the last character of x, or of y, at a cost of 1 each.
    drop_from_x = editDistRecursive(x_prefix, y) + 1
    drop_from_y = editDistRecursive(x, y_prefix) + 1
    # Pair up the two last characters: free on a match, 1 on a mismatch.
    penalty = 0 if x[-1] == y[-1] else 1
    pair_last = editDistRecursive(x_prefix, y_prefix) + penalty
    return min(drop_from_x, drop_from_y, pair_last)
        

In [12]:
def editDistance(x, y):
    """Return the edit distance between x and y via dynamic programming.

    D[i][j] holds the edit distance between the first i characters of x and
    the first j characters of y; the table has one extra row and one extra
    column for the empty prefix.
    """
    n_rows, n_cols = len(x) + 1, len(y) + 1
    # Row 0 is 0, 1, 2, ...: the empty prefix of x against a j-character
    # prefix of y costs j. Every other row starts with its own index,
    # because an i-character prefix of x against the empty prefix costs i.
    D = [list(range(n_cols))]
    for i in range(1, n_rows):
        D.append([i] + [0] * len(y))

    for i, x_char in enumerate(x, start=1):
        above, current = D[i - 1], D[i]
        for j, y_char in enumerate(y, start=1):
            step_cost = 0 if x_char == y_char else 1
            current[j] = min(
                current[j - 1] + 1,        # horizontal move
                above[j] + 1,              # vertical move
                above[j - 1] + step_cost,  # diagonal move (match/mismatch)
            )
    return D[-1][-1]  # bottom-right cell covers all of x and all of y
                
        

In [13]:
%%time
x = 'shake spea'
y = 'Shakespear'
print(editDistRecursive(x, y))

3
Wall time: 5.39 s


In [14]:
%%time
x = 'shake spea'
y = 'Shakespear'
print(editDistance(x, y))

3
Wall time: 0 ns


# Implementing Global Alignment

In [20]:
# Symbols allowed in the sequences being aligned; a symbol's position in this
# list is its row/column index into `score`.
alphabet = ['A', 'C', 'G', 'T']
# Penalty matrix, indexed in `alphabet` order, with one extra final row and
# column (index -1) that globalAlignment uses as the gap penalty:
#   - 0 on the A/C/G/T diagonal (identical symbols),
#   - 2 for A<->G and C<->T,
#   - 4 for every other substitution,
#   - 8 whenever either side is a gap.
score = [[0 ,4, 2, 4, 8],
         [4, 0, 4, 2, 8],
         [2, 4, 0, 4, 8],
         [4, 2, 4, 0, 8],
         [8, 8, 8, 8, 8]]

In [26]:
def globalAlignment(x, y):
    """Return the minimum total penalty of a global alignment of x and y.

    Penalties are read from the module-level `score` matrix, indexed through
    `alphabet`; the last row/column of `score` supplies the gap penalty.
    Every character of x and y must occur in `alphabet`, otherwise
    `alphabet.index` raises ValueError.
    """
    # Translate both sequences into score-matrix indices once, up front.
    x_codes = [alphabet.index(symbol) for symbol in x]
    y_codes = [alphabet.index(symbol) for symbol in y]

    n_rows, n_cols = len(x) + 1, len(y) + 1
    D = [[0] * n_cols for _ in range(n_rows)]

    # First column: each prefix of x aligned entirely against gaps.
    for i, x_code in enumerate(x_codes, start=1):
        D[i][0] = D[i - 1][0] + score[x_code][-1]

    # First row: each prefix of y aligned entirely against gaps.
    for j, y_code in enumerate(y_codes, start=1):
        D[0][j] = D[0][j - 1] + score[-1][y_code]

    # Remaining cells: cheapest of the three ways to reach (i, j).
    for i, x_code in enumerate(x_codes, start=1):
        for j, y_code in enumerate(y_codes, start=1):
            # Horizontal move: y[j-1] is aligned against a gap.
            gap_in_x = D[i][j - 1] + score[-1][y_code]
            # Vertical move: x[i-1] is aligned against a gap.
            gap_in_y = D[i - 1][j] + score[x_code][-1]
            # Diagonal move: x[i-1] is paired with y[j-1]; a match is free.
            if x[i - 1] == y[j - 1]:
                paired = D[i - 1][j - 1]
            else:
                paired = D[i - 1][j - 1] + score[x_code][y_code]
            D[i][j] = min(gap_in_x, gap_in_y, paired)
    return D[-1][-1]  # bottom-right cell: all of x aligned with all of y

In [27]:
x = 'TACCAGATTCGA'
y = 'TACCAGATTCGA'
globalAlignment(x, y)

0

In [28]:
x = 'TACCAGATTCGA'
y = 'TACCAATTCGA'
globalAlignment(x, y)

8

In [29]:
x = 'TATGTCATGC'
y = 'TATGGCAGC'
print(globalAlignment(x,y))

12


# Overlaps between pairs of reads

In [9]:
def overlap(a, b, min_length=3):
    """Return the length of the longest suffix of a that is a prefix of b.

    Only overlaps of at least min_length characters count; 0 is returned
    when no such overlap exists.
    """
    seed = b[:min_length]
    # If b is shorter than min_length the seed would be all of b, and a match
    # on it would report an overlap shorter than the requested minimum.
    if len(seed) < min_length:
        return 0

    # Scan a left to right for the seed, so the first suffix that matches is
    # also the longest one.
    pos = a.find(seed)
    while pos != -1:
        if b.startswith(a[pos:]):   # the whole suffix a[pos:] is a prefix of b
            return len(a) - pos
        pos = a.find(seed, pos + 1)  # try the next occurrence of the seed
    return 0
        

In [10]:
overlap('TTACGT', 'CGTACCGT')

3

In [11]:
overlap('TTACGT', 'GTACCGT')

0

In [12]:
overlap('TTACGT', 'ACGTACCGT')

4

In [13]:
overlap('TTACGTT', 'CGTACCGT')

0

In [14]:
from itertools import permutations

list(permutations([1,2,3], 1))

[(1,), (2,), (3,)]

In [16]:
list(permutations([1,2,3], 2))

[(1, 2), (1, 3), (2, 1), (2, 3), (3, 1), (3, 2)]

In [17]:
def naive_overlap_map(reads, k):
    """Map every ordered pair of reads that overlap to their overlap length.

    Each ordered pair (a, b) drawn from `reads` is passed to
    overlap(a, b, min_length=k); pairs for which it returns a value greater
    than 0 become keys of the result, with that value stored.
    """
    overlaps_by_pair = {}
    for pair in permutations(reads, 2):      # all ordered pairs (a, b)
        length = overlap(*pair, min_length=k)
        if length > 0:                       # keep only pairs that overlap
            overlaps_by_pair[pair] = length
    return overlaps_by_pair
        
    

In [19]:
reads = ['ACGGATGATC', 'GATCAAGT', 'TTCACGGA']
print(naive_overlap_map(reads, 3))

{('ACGGATGATC', 'GATCAAGT'): 4, ('TTCACGGA', 'ACGGATGATC'): 5}


# Quiz Week 3

In [1]:
import urllib.request

# Source URL of the FASTA excerpt and the local file name it is saved under.
url = 'https://d28rh4a8wq0iu5.cloudfront.net/ads1/data/chr1.GRCh38.excerpt.fasta'
filename = 'chr1.GRCh38.excerpt.fasta'
# Download the file; the relative name places it in the current working directory.
urllib.request.urlretrieve(url, filename)
print(filename)

chr1.GRCh38.excerpt.fasta


In [2]:
def readGenome(filename):
    """Read a FASTA file and return its sequence content as one string.

    Lines starting with '>' (record headers) are skipped. Trailing
    whitespace, including the newline, is stripped from every other line and
    the pieces are concatenated in file order.
    """
    with open(filename, 'r') as f:
        # Join once instead of growing a string with += on every line, which
        # can degrade to quadratic time on large genomes.
        return ''.join(line.rstrip() for line in f if not line.startswith('>'))
genome = readGenome('chr1.GRCh38.excerpt.fasta')
genome[:100]

'TTGAATGCTGAAATCAGCAGGTAATATATGATAATAGAGAAAGCTATCCCGAAGGTGCATAGGTCAACAATACTTGAGCCTAACTCAGTAGATCCTAAAA'

In [10]:
def editDistance(x, y):
    """Return the smallest edit distance between x and any substring of y.

    This is the approximate-matching variant of the edit-distance table:
    row 0 is all zeros, so an occurrence of x may begin at any offset of y
    for free, and the answer is the minimum over the last row, so it may
    end at any offset as well.
    """
    text_len = len(y)
    # Row 0 stays zero; row i starts with i, the cost of matching the first
    # i characters of x against nothing.
    D = [[0] * (text_len + 1)]
    for i in range(1, len(x) + 1):
        D.append([i] + [0] * text_len)

    for i, x_char in enumerate(x, start=1):
        above, current = D[i - 1], D[i]
        for j, y_char in enumerate(y, start=1):
            step_cost = 0 if x_char == y_char else 1
            current[j] = min(
                current[j - 1] + 1,        # horizontal move
                above[j] + 1,              # vertical move
                above[j - 1] + step_cost,  # diagonal move (match/mismatch)
            )
    return min(D[-1])  # best end position anywhere in y

In [11]:
print('Week 3 Question 1 :')
print(editDistance('GCTGATCGATCGTACG', genome))

Week 3 Question 1 :
3


In [12]:
print('Week 3 Question 2 :')
print(editDistance('GATTTACCAGATTGAG', genome))

Week 3 Question 2 :
2


In [13]:
import urllib.request

# Source URL of the FASTQ read set and the local file name it is saved under.
url = 'https://d28rh4a8wq0iu5.cloudfront.net/ads1/data/ERR266411_1.for_asm.fastq'
filename = 'ERR266411_1.for_asm.fastq'
# Download the file; the relative name places it in the current working directory.
urllib.request.urlretrieve(url, filename)
print(filename)

ERR266411_1.for_asm.fastq


In [14]:
def readFastq(filename):
    """Read a FASTQ file and return (sequences, qualities) as two lists.

    The file is consumed four lines at a time: name, bases, placeholder and
    quality string. Only the bases and quality lines are kept, with trailing
    whitespace stripped. Reading stops at the first record whose bases line
    is empty, which is what happens at end of file.
    """
    sequences, qualities = [], []
    with open(filename) as fh:
        while True:
            # Always pull a full four-line record before inspecting it.
            _name, bases, _placeholder, quals = (fh.readline() for _ in range(4))
            bases = bases.rstrip()
            if not bases:
                break
            sequences.append(bases)
            qualities.append(quals.rstrip())
    return sequences, qualities

In [15]:
seqs, quals = readFastq('ERR266411_1.for_asm.fastq')

In [16]:
from collections import defaultdict

In [17]:
def overlap(a, b, min_length=3):
    """Return the length of the longest suffix of a that is a prefix of b.

    Only overlaps of at least min_length characters count; 0 is returned
    when no such overlap exists.
    """
    seed = b[:min_length]
    # If b is shorter than min_length the seed would be all of b, and a match
    # on it would report an overlap shorter than the requested minimum.
    if len(seed) < min_length:
        return 0

    # Scan a left to right for the seed, so the first suffix that matches is
    # also the longest one.
    pos = a.find(seed)
    while pos != -1:
        if b.startswith(a[pos:]):   # the whole suffix a[pos:] is a prefix of b
            return len(a) - pos
        pos = a.find(seed, pos + 1)  # try the next occurrence of the seed
    return 0

def overlap_graph(reads, k):
    """Build the overlap graph of `reads` and return two counts.

    An edge r -> o exists when r and o are different reads and
    overlap(r, o, k) is non-zero. Candidate partners for r are limited to
    reads that contain r's last k characters somewhere, found through a
    k-mer index.

    Returns a tuple (number of edges, number of reads with at least one
    outgoing edge).
    """
    # Index every k-mer of every read -> the set of reads containing it.
    kmer_to_reads = defaultdict(set)
    for read in reads:
        for offset in range(len(read) - k + 1):
            kmer_to_reads[read[offset:offset + k]].add(read)

    # For each read, only reads sharing its length-k suffix can overlap it.
    successors = defaultdict(set)
    for left in reads:
        suffix = left[-k:]
        for right in kmer_to_reads[suffix]:
            if left != right and overlap(left, right, k):
                successors[left].add(right)

    # Every stored successor is one edge of the graph.
    total_edges = sum(len(targets) for targets in successors.values())
    return (total_edges, len(successors))
            

In [18]:
# Build the overlap graph over all reads with a minimum overlap of 30.
# overlap_graph returns (edge count, number of reads with an outgoing edge).
edges, suffixes = overlap_graph(seqs, 30)
print('Week 3 Question 3:')
print(edges)

# NOTE(review): the label below says 'Week 4' although this cell belongs to
# the Week 3 quiz; it looks like a typo. Left unchanged to match the recorded output.
print('Week 4 Question 4:')
print(suffixes)

Week 3 Question 3:
904746
Week 4 Question 4:
7161
