In [1]:
def score_match(subject, query):
    """Score two aligned sequences position by position.

    Each position of ``subject`` is compared with the same position of
    ``query``; an identical character adds 1 to the score and a differing
    character subtracts 1.

    The comparison is driven by ``len(subject)``: a ``query`` shorter than
    ``subject`` raises ``IndexError``, and any characters of ``query`` past
    the end of ``subject`` are ignored.

    Returns the integer score (0 for an empty ``subject``).
    """
    total = 0
    for position, subject_base in enumerate(subject):
        # Index ``query`` explicitly (instead of zipping) so that a query
        # that is too short still fails loudly.
        total += 1 if subject_base == query[position] else -1
    return total


In [2]:
def find_motif(seq, leng):
    """Index every window of ``leng`` consecutive characters of ``seq``.

    Parameters
    ----------
    seq : sequence to scan (sliced with ``seq[i:i+leng]``).
    leng : window length.

    Returns
    -------
    dict
        Maps each distinct window to the list of 0-based start offsets at
        which it occurs, in ascending order. The dict is empty when ``seq``
        is shorter than ``leng``.
    """
    motif = {}
    for start in range(len(seq) - leng + 1):
        # ``setdefault`` needs one hash lookup per window. The previous
        # ``value not in motif.keys()`` test built and scanned a list of all
        # keys on every iteration under Python 2, which made the whole index
        # build quadratic in the number of distinct windows.
        motif.setdefault(seq[start:start + leng], []).append(start)
    return motif


In [3]:
def find_matches(s_s, q_s, length):
    """Locate the windows of size ``length`` that both sequences share.

    Both sequences are indexed with ``find_motif``; only windows present in
    both indexes are kept.

    Returns a pair ``(s_position, q_position)`` of parallel lists: entry
    ``k`` of each list holds the start offsets of the same shared window in
    ``s_s`` and in ``q_s`` respectively. The order of the shared windows is
    the iteration order of a set and therefore not meaningful.
    """
    subject_index = find_motif(s_s, length)
    query_index = find_motif(q_s, length)

    shared = set(subject_index.keys()) & set(query_index.keys())

    # Iterating the unmodified set twice yields the same order, so the two
    # lists stay aligned entry for entry.
    s_position = [subject_index[window] for window in shared]
    q_position = [query_index[window] for window in shared]

    return s_position, q_position


In [4]:
def extend_right_scores(subject, query, s_start, q_start, length, score_limit,
                        max_drop=50):
    """Grow a seed match to the right, one position at a time, and score it.

    Starting from the seed ``subject[s_start:s_start+length]`` /
    ``query[q_start:q_start+length]``, both windows are widened by one
    character per iteration and scored with ``score_match``.

    Parameters
    ----------
    subject, query : the two sequences.
    s_start, q_start : 0-based start of the seed in each sequence.
    length : seed length.
    score_limit : minimum score for a window pair to be reported.
    max_drop : extension is abandoned once the score is ``max_drop`` or more
        below ``score_limit`` (default 50, the previously hard-coded value).

    Returns
    -------
    list of ``[score, subject_window, query_window]`` for every extension
    whose score is at least ``score_limit``, in order of increasing width.
    """
    r_result = []
    for i in range(0, len(query)):
        s_end = s_start + length + i
        q_end = q_start + length + i
        s_s_r = subject[s_start:s_end]
        q_s_r = query[q_start:q_end]

        score = score_match(s_s_r, q_s_r)

        if score >= score_limit:
            r_result.append([score, s_s_r, q_s_r])

        if score <= score_limit - max_drop:
            break

        # Stop once either window has reached the end of its sequence.
        # This is decided by position: comparing the window's text with the
        # sequence suffix (as was done before) also fires when the window
        # merely *looks like* the suffix, e.g. in a repeat, and cut the
        # extension short.
        if s_end >= len(subject) or q_end >= len(query):
            break
    return r_result


In [5]:
def extend_left_scores(subject, query, s_start, q_start, length, score_limit,
                       max_drop=50):
    """Grow a seed match to the left, one position at a time, and score it.

    Starting from the seed ``subject[s_start:s_start+length]`` /
    ``query[q_start:q_start+length]``, the left edge of both windows is moved
    one character towards the start per iteration and the pair is scored with
    ``score_match``.

    Parameters
    ----------
    subject, query : the two sequences.
    s_start, q_start : 0-based start of the seed in each sequence.
    length : seed length.
    score_limit : minimum score for a window pair to be reported.
    max_drop : extension is abandoned once the score is ``max_drop`` or more
        below ``score_limit`` (default 50, the previously hard-coded value).

    Returns
    -------
    list of ``[score, subject_window, query_window]`` for every extension
    whose score is at least ``score_limit``, in order of increasing width.
    """
    l_result = []
    # The right edge of both windows never moves.
    s_end = s_start + length
    q_end = q_start + length
    for i in range(0, len(query)):
        s_left = s_start - i
        q_left = q_start - i
        s_s_l = subject[s_left:s_end]
        q_s_l = query[q_left:q_end]

        score = score_match(s_s_l, q_s_l)

        if score >= score_limit:
            l_result.append([score, s_s_l, q_s_l])

        if score <= score_limit - max_drop:
            break

        # Stop once either window has reached the start of its sequence.
        # This is decided by position: comparing the window's text with the
        # sequence prefix (as was done before) also fires when the window
        # merely *looks like* the prefix, e.g. in a repeat, and cut the
        # extension short. It also guarantees the slice start never goes
        # negative, which would wrap around to the end of the sequence.
        if s_left <= 0 or q_left <= 0:
            break
    return l_result
  

In [6]:
def _unique_hits(hits):
    """Drop repeated hits, keeping the first occurrence of each in order."""
    seen = set()
    unique = []
    for hit in hits:
        key = tuple(hit)  # hits are lists, which cannot be put in a set
        if key not in seen:
            seen.add(key)
            unique.append(hit)
    return unique


def calu_all(subject, query, length, score_limit):
    """Collect every scored extension of every seed shared by two sequences.

    Seeds are the windows of size ``length`` common to ``subject`` and
    ``query`` (from ``find_matches``). Each pairing of a subject offset with
    a query offset of the same seed is extended to the right and to the left,
    and all extensions scoring at least ``score_limit`` are gathered.

    Returns
    -------
    list of ``[score, subject_window, query_window]`` without duplicates:
    first the right-extension hits that do not also occur among the
    left-extension hits, then the left-extension hits, each group in order of
    first appearance.
    """
    s_position, q_position = find_matches(subject, query, length)

    right_hits = []
    left_hits = []
    for s_offsets, q_offsets in zip(s_position, q_position):
        for s in s_offsets:
            for q in q_offsets:
                right_hits.extend(extend_right_scores(subject, query, s, q,
                                                      length, score_limit))
                left_hits.extend(extend_left_scores(subject, query, s, q,
                                                    length, score_limit))

    # Set-backed de-duplication: the earlier ``x not in list`` scans were
    # quadratic in the number of hits.
    left_unique = _unique_hits(left_hits)
    left_keys = set(tuple(hit) for hit in left_unique)

    # A hit found in both directions (typically the bare seed) is reported
    # once, at its position among the left-extension hits.
    right_unique = [hit for hit in _unique_hits(right_hits)
                    if tuple(hit) not in left_keys]

    return right_unique + left_unique
       

In [7]:
def nice_print_match(subject, query, score, s_s, q_s):
    """Print one hit as a small aligned report.

    Output layout (one ``print`` per line)::

        score:  <score>
        start: <subject start><gap>end: <subject end>
                <s_s>
                <q_s>
        start: <query start><gap>end: <query end>
        --------------------

    where ``<gap>`` is ``len(q_s)`` spaces and the start/end offsets are
    0-based with an exclusive end.

    Parameters
    ----------
    subject, query : the full sequences the hit came from.
    score : score of the hit.
    s_s, q_s : the matched subject and query segments.

    Raises
    ------
    ValueError
        If ``s_s`` does not occur in ``subject`` or ``q_s`` does not occur in
        ``query``.

    Notes
    -----
    Offsets are those of the *first* occurrence of each segment in its
    sequence; if a segment occurs more than once this is not necessarily the
    place where the hit was found.
    """
    # ``str.find`` instead of ``re.search``: the segments are literal text,
    # not regular expressions, so characters such as '.' must not be treated
    # as wildcards. Both offsets are resolved before anything is printed so
    # that a failure leaves no partial report behind.
    s_begin = subject.find(s_s)
    q_begin = query.find(q_s)
    if s_begin < 0 or q_begin < 0:
        raise ValueError("matched segment not found in its source sequence")

    gap = ' ' * len(q_s)

    # Every print below receives a single string, so it produces the same
    # text whether ``print`` is a statement (Python 2) or a function
    # (Python 3).
    print('score: ' + ' ' + str(score))
    print("start: " + str(s_begin) + gap + "end: " + str(s_begin + len(s_s)))
    print(' ' * 8 + s_s)
    print(' ' * 8 + q_s)
    print("start: " + str(q_begin) + gap + "end: " + str(q_begin + len(q_s)))

    print('--------------------')
    

In [8]:
def read_festa(filename):
    """Read a FASTA file into a mapping of record name to sequence.

    A line starting with ``>`` opens a new record; its name is the first
    whitespace-delimited word of that line without the leading ``>``. All
    following lines up to the next header are concatenated (trailing
    whitespace stripped) to form the record's sequence. Blank lines are
    ignored. If a name appears more than once, the later record replaces the
    earlier one.

    Parameters
    ----------
    filename : path of the FASTA file.

    Returns
    -------
    collections.OrderedDict
        ``{name: sequence}`` in the order the records appear in the file.

    Raises
    ------
    IOError
        If the file cannot be opened or read (after printing a notice).
    ValueError
        If sequence data appears before the first header line.
    """
    # Local import: this notebook has no shared import cell to extend.
    from collections import OrderedDict

    try:
        with open(filename) as f:
            lines = f.read().split('\n')
    except IOError:
        print("File does not exist!")
        # Re-raise: continuing without data would only fail a few lines
        # further down with an unrelated unbound-variable error.
        raise

    # Keep file order: callers select records by position, and a plain dict
    # does not guarantee insertion order on Python 2.
    chunks = OrderedDict()
    name = None
    for line in lines:
        line = line.rstrip()
        if not line:
            # Blank lines (including the empty string produced by a trailing
            # newline) carry no data.
            continue
        if line.startswith('>'):
            words = line.split()
            name = words[0][1:]  # drop the leading ">"
            chunks[name] = []
        elif name is None:
            raise ValueError(
                "sequence data found before the first '>' header line")
        else:
            chunks[name].append(line)

    # Join once per record instead of re-building the string for every line.
    return OrderedDict((record, ''.join(parts))
                       for record, parts in chunks.items())


In [9]:
# Load every record of the FASTA file as a {name: sequence} mapping.
seqs = read_festa('dna1.fasta')

In [10]:
# Record names, in the mapping's iteration order.
# NOTE(review): the first two names become subject and query below; with a
# plain dict on Python 2 that order is not guaranteed to follow the file.
key = [x for x in seqs.keys()]

# Lower-case both sequences so that matching ignores letter case.
id1 = seqs[key[0]].lower()
id2 = seqs[key[1]].lower()

# Seed on shared windows of length 5 and keep extensions scoring at least 9.
total = calu_all(id1, id2, 5, 9)
# Each hit is [score, subject_segment, query_segment]; list comparison sorts
# by score first, then by the segment strings.
total.sort()

# Unpack each hit into the (score, s_s, q_s) arguments of the printer.
for i in total:
    nice_print_match(id1, id2, *i)

score:  9
start: 1321           end: 1332
        aaatgccgaac
        aaattccgaac
start: 244           end: 255
--------------------
score:  9
start: 1078               end: 1093
        aaccgagccgctatg
        aaccggcccgccatg
start: 1596               end: 1611
--------------------
score:  9
start: 1078                 end: 1095
        aaccgagccgctatgaa
        aaccggcccgccatgca
start: 1596                 end: 1613
--------------------
score:  9
start: 1078                   end: 1097
        aaccgagccgctatgaact
        aaccggcccgccatgcacg
start: 1596                   end: 1615
--------------------
score:  9
start: 2596           end: 2607
        aaccgccgcag
        aagcgccgcag
start: 1036           end: 1047
--------------------
score:  9
start: 1093               end: 1108
        aactctctcggcgtc
        aactcggtgggcgtc
start: 1003               end: 1018
--------------------
score:  9
start: 1093                     end: 1114
        aactctctcggcgtctgaccg
        aactcggtgggcgt