In [1]:
# Import Libraries
%matplotlib inline
import re
import matplotlib.pyplot as plt
import numpy as np
import Bio.SeqIO as SeqIO
import time
from sklearn.cluster import KMeans, k_means

In [2]:
# Extract Data
f = open("sequence.gb","r")
out = f.read()
f.close()

j = 0
dict = []
for i, record in enumerate(SeqIO.parse("sequence.gb", "genbank")):
    if all (key in record.features[0].qualifiers for key in ('host','country','collection_date')):
        j+=1
        
        host = record.features[0].qualifiers['host'][0]
        host = host.split(';')[0]
        
        location = record.features[0].qualifiers['country'][0]
        location = location.split(':')[0]
        
        year = record.features[0].qualifiers['collection_date'][0]
        year = year.split('-')[-1]
        
        #print (j)
        #print (record.id)
        #print (record.name)
        #print (record.description)
        #print (host)
        #print (location)
        #print (year)
        #print ('\n')
        
        dict.append(
            {
                'id'          :record.id,
                'name'        :record.name,
                'description' :record.description,
                'host'        :host,
                'location'    :location,
                'year'        :year,
                'seq'         :record.seq,
                'url'         :'http://www.ncbi.nlm.nih.gov/nuccore/'+record.id
            }
        )

In [None]:
for row in dict:
    print (row['id'] + ' ' + row['url'])

In [3]:
def match_score(A,B):
    if A == B: return 1
    else: return -1

In [4]:
def print_table(S,header):
    print(header)
    for i in range(0,len(S)):
        print(S[i])
    print('\n')

In [5]:
# Global Alignment
def global_alignment(v,w):
    m = len(v)
    n = len(w)

    # Backtrack enum
    R_UP = 1
    R_LEFT = 2
    R_DIAG = 3

    d = -1
    S = [[0 for x in range(len(v)+1)] for y in range(len(w)+1)]
    B = [[0 for x in range(len(v)+1)] for y in range(len(w)+1)]

    for i in range(1,len(w)+1):
        S[i][0] = d*i
        B[i][0] = R_UP
    for j in range(1,len(v)+1):
        S[0][j] = d*j
        B[0][j] = R_LEFT
    for i in range(1,len(w)+1):
        for j in range(1,len(v)+1):
            Match = S[i-1][j-1] + match_score(v[j-1],w[i-1])
            Insert = S[i-1][j] + d
            Delete = S[i][j-1] + d
            S[i][j] = max(Match, Insert, Delete)
            if S[i][j] == Match:
                B[i][j] = R_DIAG
            elif S[i][j] == Insert:
                B[i][j] = R_UP
            elif S[i][j] == Delete:
                B[i][j] = R_LEFT
    
    # Print resulting table
    #print_table(S,'Resulting Table')
    
    # Print backtrack matrix
    #print_table(B,'Backtrack Matrix')
    
    # Print resulting string
    #print('Backtrack Process')
    vr = ""
    wr = ""
    i = n
    j = m
    k = 0
    while B[i][j] != 0:
        if B[i][j] == R_DIAG:
            vr = v[j-1] + vr
            wr = w[i-1] + wr
            #print(i,j,'DIAG')
            i = i-1
            j = j-1
        elif B[i][j] == R_LEFT:
            wr = '-' + wr
            vr = v[j-1] + vr
            #print(i,j,'LEFT')
            j = j-1
        elif B[i][j] == R_UP:
            vr = '-' + vr
            wr = w[i-1] + wr
            #print(i,j,'UP')
            i = i-1

    #print('\n')
    
    #print('Global Alignment')
    #print('Length v:')
    #print(len(vr))
    #print('Length w:')
    #print(len(wr))
    #print(vr)
    #print(wr)
    #print('Score:')
    #print(S[len(w)][len(v)])
    return S[len(w)][len(v)]

In [6]:
def local_alignment(v,w):
    m = len(v)
    n = len(w)

    # Backtrack enum
    R_UP = 1
    R_LEFT = 2
    R_DIAG = 3

    d = -1
    max_val = -1
    max_row = 0
    max_col = 0
    S = [[0 for x in range(len(v)+1)] for y in range(len(w)+1)]
    B = [[0 for x in range(len(v)+1)] for y in range(len(w)+1)]

    for i in range(1,len(w)+1):
        for j in range(1,len(v)+1):
            Match = S[i-1][j-1] + match_score(v[j-1],w[i-1])
            Insert = S[i-1][j] + d
            Delete = S[i][j-1] + d
            S[i][j] = max(Match, Insert, Delete, 0)
            if S[i][j] > max_val:
                max_val = S[i][j]
                max_row = i
                max_col = j
            if S[i][j] == 0:
                continue
            elif S[i][j] == Match:
                B[i][j] = R_DIAG
            elif S[i][j] == Insert:
                B[i][j] = R_UP
            elif S[i][j] == Delete:
                B[i][j] = R_LEFT

    # Print resulting table
    #print_table(S,'Resulting Table')
          
    # Print backtrack matrix
    #print_table(B,'Backtrack Matrix')
    
    # Print resulting string
    #print('Backtrack Process')
    vr = ""
    wr = ""
    i = max_row
    j = max_col
    #print('Max Row',max_row,'Max Col',max_col)
    while B[i][j] > 0:
        if B[i][j] == R_DIAG:
            vr = v[j-1] + vr
            wr = w[i-1] + wr
            #print(i,j,'DIAG')
            i = i-1
            j = j-1
        elif B[i][j] == R_LEFT:
            #wr = '-' + wr
            vr = v[j-1] + vr
            #print(i,j,'LEFT')
            j = j-1
        elif B[i][j] == R_UP:
            #vr = '-' + vr
            wr = w[i-1] + wr
            #print(i,j,'UP')
            i = i-1

    #print('\n')
    
    #print('Local Alignment')
    #print('Length v:')
    #print(len(vr))
    #print('Length w:')
    #print(len(wr))
    #print(vr)
    #print(wr)
    #vx = vr
    #wx = wr
    #print('Score:')
    #print(max_val)
    #return vr,wr
    return max_val

In [16]:
def local_alignment(v,w):
    m = len(v)
    n = len(w)

    # Backtrack enum
    #R_UP = 1
    #R_LEFT = 2
    #R_DIAG = 3

    d = -1
    max_val = -1
    max_row = 0
    max_col = 0
    S = [[0 for x in range(len(v)+1)] for y in range(len(w)+1)]
    #B = [[0 for x in range(len(v)+1)] for y in range(len(w)+1)]

    for i in range(1,len(w)+1):
        for j in range(1,len(v)+1):
            Match = S[i-1][j-1] + match_score(v[j-1],w[i-1])
            Insert = S[i-1][j] + d
            Delete = S[i][j-1] + d
            S[i][j] = max(Match, Insert, Delete, 0)
            if S[i][j] > max_val:
                max_val = S[i][j]
            #    max_row = i
            #    max_col = j
            #if S[i][j] == 0:
            #    continue
            #elif S[i][j] == Match:
            #    B[i][j] = R_DIAG
            #elif S[i][j] == Insert:
            #    B[i][j] = R_UP
            #elif S[i][j] == Delete:
            #    B[i][j] = R_LEFT

    # Print resulting table
    #print_table(S,'Resulting Table')

    return max_val

In [None]:
def backtrack(B,v,w,max_row,max_col):
    m = len(v)
    n = len(w)

    # Backtrack enum
    R_UP = 1
    R_LEFT = 2
    R_DIAG = 3
          
    # Print backtrack matrix
    #print_table(B,'Backtrack Matrix')
    
    # Print resulting string
    #print('Backtrack Process')
    vr = ""
    wr = ""
    i = max_row
    j = max_col
    
    while B[i][j] > 0:
        if B[i][j] == R_DIAG:
            vr = v[j-1] + vr
            wr = w[i-1] + wr
            #print(i,j,'DIAG')
            i = i-1
            j = j-1
        elif B[i][j] == R_LEFT:
            #wr = '-' + wr
            vr = v[j-1] + vr
            #print(i,j,'LEFT')
            j = j-1
        elif B[i][j] == R_UP:
            #vr = '-' + vr
            wr = w[i-1] + wr
            #print(i,j,'UP')
            i = i-1

    #print('\n')
    
    #print('Local Alignment')
    #print('Length v:')
    #print(len(vr))
    #print('Length w:')
    #print(len(wr))
    #print(vr)
    #print(wr)

In [14]:
score = local_alignment(dict[1]['seq'],dict[100]['seq'])
score

48

In [16]:
score = global_alignment(dict[1]['seq'],dict[100]['seq'])
score

29

In [6]:
indonesian_virus = []
for row in dict:
    if row['location'] == 'Indonesia':
        indonesian_virus.append(row)

In [7]:
seq_len_kmeans = [[len(row['seq'])] for row in dict]

In [8]:
seq_centroid, seq_cluster, seq_cluster_error = k_means(seq_len_kmeans, n_clusters=2)

In [19]:
seq_centroid

array([[   813.76033058],
       [ 10596.31578947]])

In [9]:
#threshold: the average of centroids
threshold = (seq_centroid[0] + seq_centroid[1])/2
threshold

array([ 5705.03806003])

In [54]:
start_time = time.time() # execution time
score = []
for ind in indonesian_virus:
    i = 0
    for row in dict:
        if ind['id'] == row['id']:
            continue
        print(ind['id'] , ' vs ' , row['id'])
        i += 1
        s = ""
        if len(row['seq']) > threshold:
            print('Local')
            s = local_alignment(ind['seq'],row['seq'])
        else:
            print('Global')
            s = global_alignment(ind['seq'],row['seq'])
        score.append(s)
        print(s)
        if i > 2:
            break
print("Time elapsed: %s seconds" % (time.time() - start_time))

KU179098.1  vs  KX062044.1
Global
-468
KU179098.1  vs  KX062045.1
Global
-468
KU179098.1  vs  KX051563.1
Local


KeyboardInterrupt: 

In [11]:
print(len(indonesian_virus[0]['seq']))

1148


In [12]:
print(len(indonesian_virus[1]['seq']))

402


In [13]:
print(len(dict[2]['seq']))

10807


In [14]:
# before
start_time = time.time() # execution time
score = local_alignment(indonesian_virus[0]['seq'],dict[2]['seq'])
print(score)
print("Time elapsed: %s seconds" % (time.time() - start_time))

1130
Time elapsed: 26.79222083091736 seconds


In [18]:
# after
start_time = time.time() # execution time
score = local_alignment(indonesian_virus[0]['seq'],dict[2]['seq'])
print(score)
print("Time elapsed: %s seconds" % (time.time() - start_time))

1130
Time elapsed: 23.00524592399597 seconds


In [15]:
start_time = time.time() # execution time
score = local_alignment(indonesian_virus[1]['seq'],dict[2]['seq'])
print(score)
print("Time elapsed: %s seconds" % (time.time() - start_time))

390
Time elapsed: 9.176923990249634 seconds


In [15]:
start_time = time.time() # execution time
score = local_alignment(indonesian_virus[1]['seq'],dict[2]['seq'])
print(score)
print("Time elapsed: %s seconds" % (time.time() - start_time))

390
Time elapsed: 8.534305095672607 seconds


In [74]:
print(len(indonesian_virus)*(len(dict)-2))

352


In [17]:
start_time = time.time() # execution time
score = global_alignment(indonesian_virus[0]['seq'],dict[2]['seq'])
print(score)
print("Time elapsed: %s seconds" % (time.time() - start_time))

-8511
Time elapsed: 25.488998889923096 seconds


In [16]:
start_time = time.time() # execution time
score = global_alignment(indonesian_virus[1]['seq'],dict[2]['seq'])
print(score)
print("Time elapsed: %s seconds" % (time.time() - start_time))

-10003
Time elapsed: 8.755798816680908 seconds
