# Implementing Shortest Common Superstring (SCS)

In [2]:
def overlap(a, b, min_length=3):
    """Return the length of the longest suffix of a that is a prefix of b.

    Candidate positions in a are located by searching for the first
    min_length characters of b; the leftmost candidate whose suffix of a
    is a prefix of b wins, which makes it the longest such overlap.
    Returns 0 when no candidate matches.
    """
    seed = b[:min_length]
    pos = a.find(seed)
    while pos != -1:
        # The whole tail of a from pos onwards must be a prefix of b.
        if b.startswith(a[pos:]):
            return len(a) - pos
        # Otherwise look for the next occurrence of the seed further right.
        pos = a.find(seed, pos + 1)
    return 0

In [3]:
import itertools

def scs(ss):
    """Return a shortest common superstring of the strings in ss.

    Brute force: every ordering of ss is tried, adjacent strings are glued
    together on their overlap (at least 1 character), and the shortest
    resulting superstring is kept. On ties the first ordering produced by
    itertools.permutations wins. Returns '' when ss is empty.
    """
    if not ss:
        # permutations([]) yields a single empty tuple, which has no first
        # element to start from; the superstring of nothing is empty.
        return ''
    shortest_sup = None
    for ssperm in itertools.permutations(ss):
        sup = ssperm[0]  # start from the first string of this ordering
        for prev, nxt in zip(ssperm, ssperm[1:]):
            # Length of the suffix of prev that is also a prefix of nxt.
            olen = overlap(prev, nxt, min_length=1)
            # Append only the part of nxt not already covered by the overlap.
            sup += nxt[olen:]
        if shortest_sup is None or len(sup) < len(shortest_sup):
            shortest_sup = sup  # new best candidate
    return shortest_sup
            

In [4]:
# Brute-force shortest common superstring of three reads.
scs(['ACGGTACGAGC', 'GAGCTTCGGA', 'GACACGG'])

'GACACGGTACGAGCTTCGGA'

# Implementing Greedy Shortest Common Superstring (GSCS)

In [5]:
def pick_maximal_overlap(reads, k):
    """Find the ordered pair of reads with the largest suffix/prefix overlap.

    Every ordered pair of distinct positions in reads is scored with
    overlap(a, b, min_length=k). Returns (a, b, overlap_length) for the
    first pair reaching the maximum, or (None, None, 0) when no pair has
    a positive overlap.
    """
    best = (None, None, 0)
    for left, right in itertools.permutations(reads, 2):
        length = overlap(left, right, min_length=k)
        # Strict comparison keeps the earliest pair when lengths tie.
        if length > best[2]:
            best = (left, right, length)
    return best

In [8]:
def greedy_scs(reads, k):
    """Build a common superstring of reads by greedy merging.

    Repeatedly merges the pair of strings with the largest overlap (as
    chosen by pick_maximal_overlap with minimum length k) until no pair
    overlaps, then concatenates whatever strings remain. The result is a
    superstring of all reads but is not guaranteed to be the shortest one.

    reads may be any iterable of strings; it is not modified.
    """
    # Work on a private copy so the caller's list is left untouched.
    pool = list(reads)
    read_a, read_b, olen = pick_maximal_overlap(pool, k)
    while olen > 0:
        # Replace the two best-overlapping reads with their merge.
        pool.remove(read_a)
        pool.remove(read_b)
        pool.append(read_a + read_b[olen:])
        read_a, read_b, olen = pick_maximal_overlap(pool, k)
    # No overlaps left: join the remaining strings end to end.
    return ''.join(pool)

In [9]:
# Greedy SCS with a minimum overlap length of 2.
greedy_scs(['ABC', 'BCA', 'CAB'], 2)

'CABCA'

In [10]:
# Same reads with a minimum overlap length of 1.
greedy_scs(['ABC', 'BCA', 'CAB'], 1)

'CABCA'

In [11]:
# Greedy SCS on a second set of reads, minimum overlap length 1.
greedy_scs(['ABCD', 'CDBC', 'BCDA'], 1)

'CDBCABCDA'

In [16]:
# Greedy SCS can sometimes give a suboptimal (longer) answer, but it is faster than the brute-force SCS.

# Same reads through both algorithms, to compare the two results.
print(greedy_scs(['ABCD', 'CDBC', 'BCDA'], 1))
print(scs(['ABCD', 'CDBC', 'BCDA']))

CDBCABCDA
ABCDBCDA


In [18]:
# When the genome is repetitive, the shortest common superstring of the reads is not going to be the correct answer.
# To handle this, use a De Bruijn graph.

# Building De Bruijn Graph

In [26]:
def de_bruijn_ize(st, k):
    """Build the De Bruijn graph of st for k-mers of length k.

    Each k-mer of st contributes one directed edge from its left
    (k-1)-mer to its right (k-1)-mer. Returns (nodes, edges), where
    edges is a list in order of k-mer position (repeats kept) and nodes
    is the set of distinct (k-1)-mers appearing in any edge.
    """
    # One edge per k-mer start position: (left k-1 mer, right k-1 mer).
    edges = [
        (st[i:i + k - 1], st[i + 1:i + k])
        for i in range(len(st) - k + 1)
    ]
    # Nodes are simply every endpoint seen in the edge list.
    nodes = {endpoint for edge in edges for endpoint in edge}
    return nodes, edges

In [27]:
# Build the De Bruijn graph of an example string using 3-mers.
nodes, edges = de_bruijn_ize('ACGCGTCG', 3)

In [28]:
# Show the set of distinct nodes returned by de_bruijn_ize.
print(nodes)

{'AC', 'CG', 'TC', 'GT', 'GC'}


In [29]:
# Show the list of edges returned by de_bruijn_ize.
print(edges)

[('AC', 'CG'), ('CG', 'GC'), ('GC', 'CG'), ('CG', 'GT'), ('GT', 'TC'), ('TC', 'CG')]


# Quiz Week 4

In [34]:
# Week 4, question 1: brute-force SCS of six 3-mers, printed with its length.
print('Week 4 Question 1:')
# scs() tries every ordering of its input, so compute the result once and reuse it.
q1_scs = scs(['CCT', 'CTT', 'TGC', 'TGG', 'GAT', 'ATT'])
print(q1_scs, ' length = ', len(q1_scs))

Week 4 Question 1:
CCTTGGATTGC  length =  11
