In [29]:
# Import Libraries
%matplotlib inline
import re
import matplotlib.pyplot as plt
import numpy as np
import Bio.SeqIO as SeqIO
from sklearn.cluster import KMeans, k_means

In [59]:
# Extract Data
# Keep the raw GenBank text available as `out`; the context manager closes the
# handle even if reading raises.
with open("sequence.gb","r") as f:
    out = f.read()

j = 0      # number of records kept (those carrying all three required qualifiers)
dict = []  # NOTE(review): shadows the builtin `dict`; name kept because later cells index it
for i, record in enumerate(SeqIO.parse("sequence.gb", "genbank")):
    # Metadata is read from the qualifiers of the record's first feature, so a
    # record without any feature cannot be used and is skipped instead of
    # raising IndexError.
    if not record.features:
        continue
    qualifiers = record.features[0].qualifiers

    if all(key in qualifiers for key in ('host','country','collection_date')):
        j += 1

        # Keep only the text before the first ';' of the host qualifier.
        host = qualifiers['host'][0].split(';')[0]

        # Keep only the text before the first ':' of the country qualifier.
        location = qualifiers['country'][0].split(':')[0]

        # Keep only the last '-'-separated field of the collection date.
        year = qualifiers['collection_date'][0].split('-')[-1]

        dict.append(
            {
                'id'          :record.id,
                'name'        :record.name,
                'description' :record.description,
                'host'        :host,
                'location'    :location,
                'year'        :year,
                'seq'         :record.seq,
                'url'         :'http://www.ncbi.nlm.nih.gov/nuccore/'+record.id
            }
        )

In [16]:
# List each kept record's accession id next to its NCBI nuccore URL.
for entry in dict:
    print (' '.join((entry['id'], entry['url'])))

KX062044.1 http://www.ncbi.nlm.nih.gov/nuccore/KX062044.1
KX062045.1 http://www.ncbi.nlm.nih.gov/nuccore/KX062045.1
KX051563.1 http://www.ncbi.nlm.nih.gov/nuccore/KX051563.1
KX056898.1 http://www.ncbi.nlm.nih.gov/nuccore/KX056898.1
KX059013.1 http://www.ncbi.nlm.nih.gov/nuccore/KX059013.1
KX059014.1 http://www.ncbi.nlm.nih.gov/nuccore/KX059014.1
KU978616.1 http://www.ncbi.nlm.nih.gov/nuccore/KU978616.1
KU963796.1 http://www.ncbi.nlm.nih.gov/nuccore/KU963796.1
KU963573.1 http://www.ncbi.nlm.nih.gov/nuccore/KU963573.1
KU963574.1 http://www.ncbi.nlm.nih.gov/nuccore/KU963574.1
KU991811.1 http://www.ncbi.nlm.nih.gov/nuccore/KU991811.1
KU985087.1 http://www.ncbi.nlm.nih.gov/nuccore/KU985087.1
KU985088.1 http://www.ncbi.nlm.nih.gov/nuccore/KU985088.1
KU940224.1 http://www.ncbi.nlm.nih.gov/nuccore/KU940224.1
KU940227.1 http://www.ncbi.nlm.nih.gov/nuccore/KU940227.1
KU940228.1 http://www.ncbi.nlm.nih.gov/nuccore/KU940228.1
KU954085.1 http://www.ncbi.nlm.nih.gov/nuccore/KU954085.1
KU955589.1 htt

In [60]:
def match_score(A,B):
    """Score a pair of symbols: 1 when they compare equal, -1 otherwise."""
    return 1 if A == B else -1

In [61]:
def print_table(S,header):
    """Print `header`, then each row of the table `S` on its own line.

    A newline string is printed last so consecutive tables are visually
    separated.
    """
    print(header)
    for table_row in S:
        print(table_row)
    print('\n')

In [62]:
# Global Alignment
def global_alignment(v,w,gap=-1,score=None):
    """Globally align `v` against `w` by dynamic programming with a linear gap penalty.

    The score table has one row per symbol of `w` and one column per symbol
    of `v` (plus the empty-prefix row/column). Ties between moves are resolved
    in the order diagonal, up, left.

    Parameters:
        v, w:  indexable sequences whose items are single-character strings.
        gap:   score added for every gap position (default -1).
        score: callable ``score(a, b)`` giving the score of aligning symbol
               `a` with symbol `b`; defaults to the notebook's `match_score`.

    Prints the aligned strings, their lengths and the final score, and
    returns ``(vr, wr)``: `v` and `w` with '-' inserted at gap positions.
    """
    if score is None:
        score = match_score

    m = len(v)
    n = len(w)

    # Backtrack enum (0 is reserved for "stop", i.e. the origin cell)
    R_UP = 1
    R_LEFT = 2
    R_DIAG = 3

    S = [[0] * (m + 1) for _ in range(n + 1)]
    B = [[0] * (m + 1) for _ in range(n + 1)]

    # First column / first row: a prefix aligned entirely against gaps.
    for i in range(1, n + 1):
        S[i][0] = gap * i
        B[i][0] = R_UP
    for j in range(1, m + 1):
        S[0][j] = gap * j
        B[0][j] = R_LEFT

    for i in range(1, n + 1):
        for j in range(1, m + 1):
            Match = S[i-1][j-1] + score(v[j-1], w[i-1])
            Insert = S[i-1][j] + gap
            Delete = S[i][j-1] + gap
            best = max(Match, Insert, Delete)
            S[i][j] = best
            if best == Match:
                B[i][j] = R_DIAG
            elif best == Insert:
                B[i][j] = R_UP
            elif best == Delete:
                B[i][j] = R_LEFT

    # Walk back from the bottom-right corner. Symbols are collected in reverse
    # and joined once, instead of prepending to a string on every step.
    v_rev = []
    w_rev = []
    i = n
    j = m
    while B[i][j] != 0:
        move = B[i][j]
        if move == R_DIAG:
            v_rev.append(v[j-1])
            w_rev.append(w[i-1])
            i -= 1
            j -= 1
        elif move == R_LEFT:
            v_rev.append(v[j-1])
            w_rev.append('-')
            j -= 1
        elif move == R_UP:
            v_rev.append('-')
            w_rev.append(w[i-1])
            i -= 1

    vr = ''.join(reversed(v_rev))
    wr = ''.join(reversed(w_rev))

    print('Global Alignment')
    print('Length v:')
    print(len(vr))
    print('Length w:')
    print(len(wr))
    print(vr)
    print(wr)
    print('Score:')
    print(S[n][m])
    return vr,wr

In [63]:
def local_alignment(v,w,gap=-1,score=None):
    """Find a best-scoring local alignment between `v` and `w`.

    Cell scores are clamped at 0, and the traceback starts at the first cell
    (row-major order) holding the maximum score and stops at the first cell
    without a recorded move. Ties between moves are resolved in the order
    diagonal, up, left.

    Parameters:
        v, w:  indexable sequences whose items are single-character strings.
        gap:   score added for every gap position (default -1).
        score: callable ``score(a, b)`` giving the score of aligning symbol
               `a` with symbol `b`; defaults to the notebook's `match_score`.

    Prints the aligned fragments, their lengths and the best score, and
    returns ``(vr, wr)``: the aligned fragments with '-' at gap positions.
    """
    if score is None:
        score = match_score

    m = len(v)
    n = len(w)

    # Backtrack enum (0 means "no move recorded": the alignment starts here)
    R_UP = 1
    R_LEFT = 2
    R_DIAG = 3

    # Start the running maximum at 0 so that empty inputs report a score of 0
    # (the empty alignment) rather than the -1 sentinel.
    max_val = 0
    max_row = 0
    max_col = 0
    S = [[0] * (m + 1) for _ in range(n + 1)]
    B = [[0] * (m + 1) for _ in range(n + 1)]

    for i in range(1, n + 1):
        for j in range(1, m + 1):
            Match = S[i-1][j-1] + score(v[j-1], w[i-1])
            Insert = S[i-1][j] + gap
            Delete = S[i][j-1] + gap
            best = max(Match, Insert, Delete, 0)
            S[i][j] = best
            if best > max_val:
                max_val = best
                max_row = i
                max_col = j
            if best == 0:
                # Clamped cell: leave B at 0 so the traceback stops here.
                continue
            elif best == Match:
                B[i][j] = R_DIAG
            elif best == Insert:
                B[i][j] = R_UP
            elif best == Delete:
                B[i][j] = R_LEFT

    # Walk back from the best cell. Symbols are collected in reverse and joined
    # once, instead of prepending to a string on every step.
    v_rev = []
    w_rev = []
    i = max_row
    j = max_col
    while B[i][j] > 0:
        move = B[i][j]
        if move == R_DIAG:
            v_rev.append(v[j-1])
            w_rev.append(w[i-1])
            i -= 1
            j -= 1
        elif move == R_LEFT:
            v_rev.append(v[j-1])
            w_rev.append('-')
            j -= 1
        elif move == R_UP:
            v_rev.append('-')
            w_rev.append(w[i-1])
            i -= 1

    vr = ''.join(reversed(v_rev))
    wr = ''.join(reversed(w_rev))

    print('Local Alignment')
    print('Length v:')
    print(len(vr))
    print('Length w:')
    print(len(wr))
    print(vr)
    print(wr)
    print('Score:')
    print(max_val)
    return vr,wr

In [85]:
# Locally align the sequences of records 1 and 19; keep both gapped strings.
vr,wr = local_alignment(dict[1]['seq'],dict[19]['seq'])

Local Alignment
Length v:
349
Length w:
349
GAAGCACTGGTAGAGTTCAAGGACGCACATGCCAAAAGGCAAACTGTCGTGGTTCTAGGGAGTCAAGAAGGAGCAGTTCACACGGCCCTTGCTGGAGCTCTGGAGGCTGAGATGGATGGTGCAAAGGGAAGGCTGTCCTCTGGCCACTTGAAATGTCGCCTGAAAATGGATAAACTTAGATTGAAGGGCGTGTCATACTCCTTGTGTACCGCAGCGTTCACATTCACCAAGATCCCGGCTGAAACACTGCACGGGACAGTCACAGTGGAGGTACAGTACGCAGGGACAGATGGACCTTGCAAGGTTCCAGCTCAGATGGCGGTGGACATGCAAACTCTGACCCCAGTTG
GAGGCATTGGTGGAGTTCAAGGACGCCCACGCCAAGAGGCAAACTGTTGTGGTTCTGGGGAGCCAAGAGGGAGCTGTTCATACGGCCCTCGCTGGAGCTTTGGAGGCTGAGATGGATGGTGCAAAGGGAAGGCTATTCTCTGGCCATTTGAAATGCCGCCTAAAAATGGACAAGCTTAGGTTGAAGGGTGTGTCATATTCCCTGTGTACCGCAGCGTTCACATTTACCAAGGTCCCAGCTGAAACATTGCATGGAACAGTCACAGTGGAGGTGCAGTATGCAGGGACAGACGGACCCTGCAAAGTCCCAGCCCAGATGGCGGTGGACATGCAGACCCTGACCCCAGTTG
Score:
269


In [84]:
# Length of the sequence stored for record 1.
len(dict[1]['seq'])

349

In [80]:
# Print the full sequence stored for record 21.
print(dict[21]['seq'])

AGTTGTTGATCTGTGTGAATCAGACTGCGACAGTTCGAGTTTGAAGCGAAAGCTAGCAACAGTATCAACAGGTTTTATTTTGGATTTGGAAACGAGAGTTTCTGGTCATGAAAAACCCAAAGAAGAAATCCGGAGGATTCCGGATTGTCAATATGCTAAAACGCGGAGTAGCCCGTGTGAGCCCCTTTGGGGGCTTGAAGAGGCTGCCAGCCGGACTTCTGCTGGGTCATGGGCCCATCAGGATGGTCTTGGCGATTCTAGCCTTTTTGAGATTCACGGCAATCAAGCCATCACTGGGTCTCATCAATAGATGGGGTTCAGTGGGGAAAAAAGAGGCTATGGAAATAATAAAGAAGTTTAAGAAAGATCTGGCTGCCATGCTGAGAATAATCAATGCTAGGAAGGAGAAGAAGAGACGAGGCACAGATACTAGTGTCGGAATTGTTGGCCTCCTGCTGACCACAGCCATGGCAGTGGAGGTCACTAGACGTGGGAATGCATACTATATGTACTTGGACAGAAGCGATGCTGGGGAGGCCATATCTTTTCCAACCACAATGGGGATGAATAAGTGTTATATACAGATCATGGATCTTGGACACATGTGTGATGCCACCATGAGCTATGAATGCCCTATGCTGGATGAGGGGGTAGAACCAGATGACGTCGATTGTTGGTGCAACACGACGTCAACTTGGGTTGTGTACGGAACCTGCCACCACAAAAAAGGTGAAGCACGGAGATCTAGAAGAGCTGTGACGCTCCCCTCCCATTCCACTAGGAAGCTGCAAACGCGGTCGCAGACCTGGTTGGAATCAAGAGAATACACAAAGCACCTGATTAGAGTCGAAAATTGGATATTCAGGAACCCTGGCTTCGCGTTAGCAGCAGCTGCCATCGCTTGGCTTTTGGGAAGCTCAACGAGCCAAAAAGTCATATACTTGGTCATGATACTGCTGATTGCCCCGGCATACAGCATCAGGTGCATAGGAGTCAGCAA

In [65]:
# NOTE(review): `s` is not assigned in any cell visible here; presumably an
# indexable collection of sequences left over from an earlier kernel session.
# Confirm and define it explicitly, or this cell fails after Restart & Run All.
global_alignment(s[0],s[1])

Global Alignment
Length v:
360
Length w:
360
CCGCTGCCCAACACAAGGTGAAGCCTACCTTGACAAGCAATCAGACACTCAATATGTCTGCAAAAGAACGTTAGTGGACAGAGGCTGGGGAAATGGATGTGGACTTTTTGGCAAAGGGAGCCTGGTGACATGCGCTAAGTTTGCATGCTCCAAGAAAATGACCGGGAAGAGCATCCAGCCAGAGAATCTGGAGTACCGGATAATGCTGTCAGTTCATGGCTCCCAGCACAGTGGGATGATCGTTAATGACACAGGACATGAAACTGATGAGAATAGAGCGAAGGTTGAGATAACGCCCAATTCACCAAGAGCCGAAGCCACCCTGGGGGGTTTTGGAAGCCTAGGACTTGATTGTGAACC
CCGCTGCCCAACACAAGGTGAAGCCTACCTTGACAAGCAATCAGACACTCAATATGTCTGCAAAAGAACGTTAGTGGACAGAGGCTGGGGAAATGGATGTGGACTTTTTGGCAAAGGGAGCCTGGTGACATGCGCTAAGTTTGCATGCTCCAAGAAAATGACCGGGAAGAGCATCCAGCCAGAGAATCTGGAGTACCGGATAATGCTGTCAGTTCATGGCTCCCAGCACAGTGGGATGATCGTTAATGACACAGGACATGAAACTGATGAGAATAGAGCGAAGGTTGAGATAACGCCCAATTCACCAAGAGCCGAAGCCACCCTGGGGGGTTTTGGAAGCCTAGGACTTGATTGTGAACC
Score:
360


In [4]:
# Print the sequence length of every kept record, one per line.
for entry in dict:
    seq_length = len(entry['seq'])
    print(seq_length)

349
349
10807
10272
1024
1024
489
10709
10710
10442
10643
755
360
10525
10551
10676
240
10574
10569
10806
10806
10807
10795
10806
10798
10795
10805
231
231
231
231
10617
10617
10272
10272
10636
10636
459
564
281
10800
10805
10645
10729
10401
651
10676
10574
1813
1148
10793
238
10648
10808
665
663
688
667
682
679
679
672
676
693
678
661
692
692
1275
10807
10807
10807
749
2601
2601
10617
10675
10272
10272
10662
10727
10662
10662
705
10374
2826
2825
1922
10676
281
281
310
310
309
310
789
330
330
789
226
296
296
296
296
296
976
976
976
976
976
976
976
976
976
976
976
976
976
976
976
976
976
976
976
976
976
976
976
976
976
976
976
976
976
976
976
976
976
976
976
976
976
976
976
976
976
976
976
976
976
976
976
976
976
976
976
893
792
281
10141
1512
281
281
10617
201
281
281
841
772
467
10788
402
10269
10269
10251
10269
10269
10272


In [5]:
# Display the whole list of parsed records (this echoes every entry).
dict

[{'description': 'Zika virus isolate Haiti/1227/2014 envelope protein gene, partial cds.',
  'host': 'Homo sapiens',
  'id': 'KX062044.1',
  'location': 'Haiti',
  'name': 'KX062044',
  'seq': Seq('GAAGCACTGGTAGAGTTCAAGGACGCACATGCCAAAAGGCAAACTGTCGTGGTT...TTG', IUPACAmbiguousDNA()),
  'url': 'http://www.ncbi.nlm.nih.gov/nuccore/KX062044.1',
  'year': '2014'},
 {'description': 'Zika virus isolate Haiti/1230/2014 envelope protein gene, partial cds.',
  'host': 'Homo sapiens',
  'id': 'KX062045.1',
  'location': 'Haiti',
  'name': 'KX062045',
  'seq': Seq('GAAGCACTGGTAGAGTTCAAGGACGCACATGCCAAAAGGCAAACTGTCGTGGTT...TTG', IUPACAmbiguousDNA()),
  'url': 'http://www.ncbi.nlm.nih.gov/nuccore/KX062045.1',
  'year': '2014'},
 {'description': 'Zika virus isolate Haiti/1/2016, complete genome.',
  'host': 'Homo sapiens',
  'id': 'KX051563.1',
  'location': 'USA',
  'name': 'KX051563',
  'seq': Seq('AGTTGTTACTGTTGCTGACTCAGACTGCGACAGTTCGAGTTTGAAGCGAAAGCT...CTT', IUPACAmbiguousDNA()),
  'url': 'http://w

In [13]:
# One single-feature row per record: the length of its sequence.
seq_len_kmeans = [[len(entry['seq'])] for entry in dict]

In [37]:
# Fixed seed value, presumably meant for the stochastic clustering step; it
# only takes effect where it is passed explicitly (e.g. as `random_state=`).
random_state = 1000

In [57]:
# Split the records into two clusters by sequence length. The seed defined in
# the `random_state` cell is passed explicitly so that centroid initialisation,
# and therefore the labels and the error, are reproducible across runs.
seq_centroid, seq_cluster, seq_cluster_error = k_means(seq_len_kmeans, n_clusters=2, random_state=random_state)

In [56]:
# Third value returned by k_means; presumably the final inertia (sum of squared
# distances to the closest centroid) per scikit-learn's documentation; confirm.
seq_cluster_error

30665512.365376152

In [54]:
# NOTE(review): this cell repeats an earlier `seq_cluster_error` display and
# shows the same value; it looks like a leftover and can probably be removed.
seq_cluster_error

30665512.365376152

In [58]:
# Second value returned by k_means: one cluster label (0 or 1 here, since
# n_clusters=2) per row of seq_len_kmeans.
seq_cluster

array([0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0,
       1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1])