In [1]:
def overlap(a, b, min_length=3):
    """ Return length of longest suffix of 'a' matching
        a prefix of 'b' that is at least 'min_length'
        characters long.  If no such overlap exists,
        return 0. """
    start = 0  # start all the way at the left
    while True:
        start = a.find(b[:min_length], start)  # look for b's suffx in a
        if start == -1:  # no more occurrences to right
            return 0
        # found occurrence; check for full suffix/prefix match
        if b.startswith(a[start:]):
            return len(a)-start
        start += 1  # move just past previous match

import itertools

def scs(ss):
    """ Returns shortest common superstring of given
        strings, which must be the same length """
    shortest_sup = None
    for ssperm in itertools.permutations(ss):
        sup = ssperm[0]  # superstring starts as first string
        for i in range(len(ss)-1):
            # overlap adjacent strings A and B in the permutation
            olen = overlap(ssperm[i], ssperm[i+1], min_length=1)
            # add non-overlapping portion of B to superstring
            sup += ssperm[i+1][olen:]
        if shortest_sup is None or len(sup) < len(shortest_sup):
            shortest_sup = sup  # found shorter superstring
    return shortest_sup  # return shortest

In [2]:
strings = ['CCT', 'CTT', 'TGC', 'TGG', 'GAT', 'ATT']
strings

['CCT', 'CTT', 'TGC', 'TGG', 'GAT', 'ATT']

In [3]:
len(scs(strings))

11

In [4]:
def scs_count(ss):
    """ Returns shortest common superstring of given
        strings, which must be the same length """
    shortest_sup = None
    super_count = 0
    for ssperm in itertools.permutations(ss):
        sup = ssperm[0]  # superstring starts as first string
        for i in range(len(ss)-1):
            # overlap adjacent strings A and B in the permutation
            olen = overlap(ssperm[i], ssperm[i+1], min_length=1)
            # add non-overlapping portion of B to superstring
            sup += ssperm[i+1][olen:]
        if shortest_sup is None or len(sup) < len(shortest_sup):
            shortest_sup = sup  # found shorter superstring
            super_count += 1
    return shortest_sup, super_count  # return shortest

In [5]:
print(scs_count(strings))

('CCTTGGATTGC', 2)


In [6]:
!wget https://d28rh4a8wq0iu5.cloudfront.net/ads1/data/ads1_week4_reads.fq

--2022-04-23 21:36:17--  https://d28rh4a8wq0iu5.cloudfront.net/ads1/data/ads1_week4_reads.fq
Resolving d28rh4a8wq0iu5.cloudfront.net (d28rh4a8wq0iu5.cloudfront.net)... 18.64.171.149, 18.64.171.223, 18.64.171.60, ...
Connecting to d28rh4a8wq0iu5.cloudfront.net (d28rh4a8wq0iu5.cloudfront.net)|18.64.171.149|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 395781 (387K) [video/m2ts]
Saving to: ‘ads1_week4_reads.fq.1’


2022-04-23 21:36:18 (3.91 MB/s) - ‘ads1_week4_reads.fq.1’ saved [395781/395781]



In [7]:
from Bio import SeqIO

In [8]:
path = 'ads1_week4_reads.fq'
with open(path) as f:
    for record in SeqIO.parse(f, 'fastq'):
        print(record)
        print(record.seq)
        break

ID: r0
Name: r0
Description: r0
Number of features: 0
Per letter annotation for: phred_quality
Seq('GTCCAGCAGAGCAAGTGATGCGAGAGCTGCCCATCCTCCAACCAGCATGCCCCT...AGT')
GTCCAGCAGAGCAAGTGATGCGAGAGCTGCCCATCCTCCAACCAGCATGCCCCTAGACATTGACACTGCATCGGAGTCAGGCCAAGATCCGCAGGACAGT


In [9]:
seqs = []
with open(path) as f:
    for record in SeqIO.parse(f, 'fastq'):
#         print(record)
#         print(record.seq)
#         break
        seqs.append(record.seq)

In [10]:
len(seqs[0])

100

In [None]:
%%time

assembled_genome = scs(seqs)

In [None]:
assembled_genome[:100]

In [None]:
len(assembled_genome)