In [1]:
# import regex library and matplotlib
import re
import matplotlib.pyplot as plt

In [125]:
# open file
f = open("sequence.gb","r")
out = f.read()
f.close()

In [126]:
# see the data
print(out)

LOCUS       KX062044                 349 bp ss-RNA     linear   VRL 14-APR-2016
DEFINITION  Zika virus isolate Haiti/1227/2014 envelope protein gene, partial
            cds.
ACCESSION   KX062044
VERSION     KX062044.1  GI:1018266330
KEYWORDS    .
SOURCE      Zika virus
  ORGANISM  Zika virus
            Viruses; ssRNA viruses; ssRNA positive-strand viruses, no DNA
            stage; Flaviviridae; Flavivirus.
REFERENCE   1  (bases 1 to 349)
  AUTHORS   Lednicky,J.A., Morris,J.G. Jr., Beau De Rochars,V.M., Elbadry,M.A.,
            Okech,B.A. and Loeb,J.C.
  TITLE     Envelope protein gene sequences of two different Zika virus
            isolates from blood drawn in 2014 from Haitian children
  JOURNAL   Unpublished
REFERENCE   2  (bases 1 to 349)
  AUTHORS   Lednicky,J.A., Morris,J.G. Jr., Beau De Rochars,V.M., Elbadry,M.A.,
            Okech,B.A. and Loeb,J.C.
  TITLE     Direct Submission
  JOURNAL   Submitted (12-APR-2016) Environmental and Global Health, University
            of 

In [130]:
# implement regex
regex = re.compile('\/host="(.*?)"[\s\S]*?\/country="(.*?)"[\s\S]*?\/collection_date="(.*?)"')

In [131]:
# insert data to dictionary
result = regex.finditer(out)
dict = []
for i in result:
    dict.append(
        {
            'host': i.group(1).split(';')[0],
            'location': i.group(2).split(':')[0],
            'year': i.group(3).split('-')[-1]
        }
    )

In [132]:
# sort the dictionary
dict.sort(key=lambda x: (x['host'], x['location'], x['year']))

In [128]:
# print resulting dictionary
dict

[{'host': 'Aedes aegypti', 'location': 'Malaysia', 'year': '1966'},
 {'host': 'Aedes africanus',
  'location': 'Central African Republic',
  'year': '1976'},
 {'host': 'Aedes africanus', 'location': 'Senegal', 'year': '1984'},
 {'host': 'Homo sapiens', 'location': 'Brazil', 'year': '2015'},
 {'host': 'Homo sapiens', 'location': 'Brazil', 'year': '2015'},
 {'host': 'Homo sapiens', 'location': 'Brazil', 'year': '2015'},
 {'host': 'Homo sapiens', 'location': 'Brazil', 'year': '2015'},
 {'host': 'Homo sapiens', 'location': 'Brazil', 'year': '2015'},
 {'host': 'Homo sapiens', 'location': 'Brazil', 'year': '2015'},
 {'host': 'Homo sapiens', 'location': 'Brazil', 'year': '2015'},
 {'host': 'Homo sapiens', 'location': 'Brazil', 'year': '2015'},
 {'host': 'Homo sapiens', 'location': 'Brazil', 'year': '2015'},
 {'host': 'Homo sapiens', 'location': 'Brazil', 'year': '2015'},
 {'host': 'Homo sapiens', 'location': 'Brazil', 'year': '2015'},
 {'host': 'Homo sapiens', 'location': 'Brazil', 'year': '2

In [133]:
# Creating table for the dictionary
print('{:>5} {:<15} {:<25} {:<10}'.format('No', 'Host','Location','Year'))
i = 0
for key in dict:
    i = i+1
    print('{:>5} {:<15} {:<25} {:<5}'.format(i, key['host'], key['location'], key['year']))

   No Host            Location                  Year      
    1 Aedes aegypti   Malaysia                  1966 
    2 Aedes africanus Central African Republic  1976 
    3 Aedes africanus Senegal                   1984 
    4 Aedes africanus Senegal                   1984 
    5 Aedes opok      Central African Republic  2012 
    6 Aedes taylori   Senegal                   1984 
    7 Aedes taylori   Senegal                   1984 
    8 Homo sapiens    Brazil                    2015 
    9 Homo sapiens    Brazil                    2015 
   10 Homo sapiens    Brazil                    2015 
   11 Homo sapiens    Brazil                    2015 
   12 Homo sapiens    Brazil                    2015 
   13 Homo sapiens    Brazil                    2015 
   14 Homo sapiens    Brazil                    2015 
   15 Homo sapiens    Brazil                    2015 
   16 Homo sapiens    Brazil                    2015 
   17 Homo sapiens    Brazil                    2015 
   18 Homo sapiens    B

In [3]:
# example of generating graph
D = {u'Label1':26, u'Label2': 17, u'Label3':30}

plt.bar(range(len(D)), D.values(), align='center')
plt.xticks(range(len(D)), ['Label1', 'Label2', 'Label3'])

plt.show()

In [8]:
# example of iterating
from collections import Counter
from itertools import chain

year = []
location = []
host = []
for key in dict:
    year.append(key['year'])
    location.append(key['location'])
    host.append(key['host'])

In [9]:
# host count
Counter(host)

Counter({'Aedes aegypti': 1,
         'Aedes africanus': 3,
         'Aedes opok': 1,
         'Aedes taylori': 2,
         'Homo sapiens': 167,
         'Macaca mulatta': 2,
         'sentinel monkey': 1,
         'sentinel rhesus': 1})

In [10]:
# location count
Counter(location)

Counter({'Brazil': 37,
         'Cambodia': 2,
         'Canada': 1,
         'Central African Republic': 2,
         'Chile': 51,
         'China': 12,
         'Colombia': 4,
         'Cook Islands': 3,
         'French Polynesia': 7,
         'Gabon': 2,
         'Guatemala': 2,
         'Haiti': 5,
         'Indonesia': 1,
         'Israel': 4,
         'Italy': 3,
         'Malaysia': 1,
         'Martinique': 1,
         'Mexico': 6,
         'New Caledonia': 8,
         'Nigeria': 2,
         'Norway': 1,
         'Philippines': 2,
         'Puerto Rico': 1,
         'Russia': 3,
         'Senegal': 4,
         'Suriname': 4,
         'Thailand': 3,
         'USA': 2,
         'Uganda': 4})

In [11]:
# year count
Counter(year)

Counter({'06': 1,
         '1947': 3,
         '1966': 1,
         '1968': 2,
         '1976': 1,
         '1984': 4,
         '2007': 3,
         '2010': 2,
         '2012': 3,
         '2013': 9,
         '2014': 69,
         '2015': 51,
         '2016': 29})

In [12]:
# getting all unique years
s = set()
for key in dict:
    s.add(key['year'])
s

{'06',
 '1947',
 '1966',
 '1968',
 '1976',
 '1984',
 '2007',
 '2010',
 '2012',
 '2013',
 '2014',
 '2015',
 '2016'}

In [13]:
D = {u'Label1':26, u'Label2': 17, u'Label3':30}

plt.bar(range(len(dict)), dict.values(), align='center')
plt.xticks(range(len(dict)), ['Host', 'Location', 'Year'])

plt.show()

AttributeError: 'list' object has no attribute 'values'

In [14]:
D

{'Label1': 26, 'Label2': 17, 'Label3': 30}

In [15]:
dict

[{'host': 'Aedes aegypti', 'location': 'Malaysia', 'year': '1966'},
 {'host': 'Aedes africanus',
  'location': 'Central African Republic',
  'year': '1976'},
 {'host': 'Aedes africanus', 'location': 'Senegal', 'year': '1984'},
 {'host': 'Aedes africanus', 'location': 'Senegal', 'year': '1984'},
 {'host': 'Aedes opok',
  'location': 'Central African Republic',
  'year': '2012'},
 {'host': 'Aedes taylori', 'location': 'Senegal', 'year': '1984'},
 {'host': 'Aedes taylori', 'location': 'Senegal', 'year': '1984'},
 {'host': 'Homo sapiens', 'location': 'Brazil', 'year': '2015'},
 {'host': 'Homo sapiens', 'location': 'Brazil', 'year': '2015'},
 {'host': 'Homo sapiens', 'location': 'Brazil', 'year': '2015'},
 {'host': 'Homo sapiens', 'location': 'Brazil', 'year': '2015'},
 {'host': 'Homo sapiens', 'location': 'Brazil', 'year': '2015'},
 {'host': 'Homo sapiens', 'location': 'Brazil', 'year': '2015'},
 {'host': 'Homo sapiens', 'location': 'Brazil', 'year': '2015'},
 {'host': 'Homo sapiens', 'loc

In [92]:
import numpy

str = 'gcat'
S = []
for i in range(0,len(str)):
    for j in range(0,len(str)):
        if str[i] == str[j]:
            S[str[i]][str[j]] = 1
        else:
            S[str[i]][str[j]] = -1

TypeError: list indices must be integers, not str

In [154]:
def match_score(A,B):
    if A == B: return 1
    else: return -1

def global_alignment(v,w):
    m = len(v)
    n = len(w)

    # Backtrack enum
    R_UP = 1
    R_LEFT = 2
    R_DIAG = 3

    d = -1
    S = [[0 for x in range(len(v)+1)] for y in range(len(w)+1)]
    B = [[0 for x in range(len(v)+1)] for y in range(len(w)+1)]

    for i in range(0,len(w)+1):
        S[i][0] = d*i
        B[i][0] = R_UP
    for j in range(0,len(v)+1):
        S[0][j] = d*j
        B[0][j] = R_LEFT
    for i in range(1,len(w)+1):
        for j in range(1,len(v)+1):
            print(v[j-1],' = ',w[i-1])
            Match = S[i-1][j-1] + match_score(v[j-1],w[i-1])
            Insert = S[i-1][j] + d
            Delete = S[i][j-1] + d
            S[i][j] = max(Match, Insert, Delete)
            if S[i][j] == Match:
                B[i][j] = R_DIAG
            elif S[i][j] == Insert:
                B[i][j] = R_UP
            elif S[i][j] == Delete:
                B[i][j] = R_LEFT

    # Print resulting matrix
    for i in range(0,len(S)):
        print(S[i])
        
    print('\n')
    
    # Print backtrack matrix
    for i in range(0,len(B)):
        print(B[i])

    # Print resulting string
    vr = ""
    wr = ""
    i = n-1
    j = m-1
    k = 0
    while i >= 0 or j >= 0:
        if B[i][j] == R_DIAG:
            vr = v[j] + vr
            wr = w[i] + wr
            print(i,j,'DIAG')
            i = i-1
            j = j-1
        elif B[i][j] == R_LEFT:
            wr = '-' + wr
            vr = v[j] + vr
            print(i,j,'LEFT')
            j = j-1
        elif B[i][j] == R_UP:
            vr = '-' + vr
            wr = w[i] + wr
            print(i,j,'UP')
            i = i-1
        k = k+1
        if k == 8:
            break

    print('\n')
    print(vr)
    print(wr)

In [155]:
v = 'GCATGC'
w = 'ATGC'

global_alignment(v,w)

G  =  A
C  =  A
A  =  A
T  =  A
G  =  A
C  =  A
G  =  T
C  =  T
A  =  T
T  =  T
G  =  T
C  =  T
G  =  G
C  =  G
A  =  G
T  =  G
G  =  G
C  =  G
G  =  C
C  =  C
A  =  C
T  =  C
G  =  C
C  =  C
[0, -1, -2, -3, -4, -5, -6]
[-1, -1, -2, -1, -2, -3, -4]
[-2, -2, -2, -2, 0, -1, -2]
[-3, -1, -2, -3, -1, 1, 0]
[-4, -2, 0, -1, -2, 0, 2]


[2, 2, 2, 2, 2, 2, 2]
[1, 3, 3, 3, 2, 2, 2]
[1, 3, 3, 1, 3, 2, 2]
[1, 3, 2, 3, 1, 3, 2]
[1, 1, 3, 2, 1, 1, 3]
3 5
2 4 DIAG
1 3 DIAG
0 2 DIAG
0 1 LEFT
0 0 LEFT
0 -1 LEFT
0 -2 LEFT
0 -3 LEFT


GCGCATGC
-----TGC






NameError: name 'B' is not defined

In [86]:
v = 'PLEASANTLY'
w = 'MEANLY'

global_alignment(v,w)

[0, -1, -2, -3, -4, -5, -6]
[-1, -1, -2, -3, -4, -5, -6]
[-2, -2, -2, -3, -4, -3, -4]
[-3, -3, -1, -2, -3, -4, -4]
[-4, -4, -2, 0, -1, -2, -3]
[-5, -5, -3, -1, -1, -2, -3]
[-6, -6, -4, -2, -2, -2, -3]
[-7, -7, -5, -3, -1, -2, -3]
[-8, -8, -6, -4, -2, -2, -3]
[-9, -9, -7, -5, -3, -1, -2]
[-10, -10, -8, -6, -4, -2, 0]


[1, 1, 1, 1, 1, 1, 1]
[2, 3, 3, 3, 3, 3, 3]
[2, 3, 3, 3, 3, 3, 2]
[2, 3, 3, 2, 2, 1, 3]
[2, 3, 1, 3, 2, 2, 2]
[2, 3, 1, 1, 3, 3, 3]
[2, 3, 1, 3, 3, 3, 3]
[2, 3, 1, 1, 3, 2, 3]
[2, 3, 1, 1, 1, 3, 3]
[2, 3, 1, 1, 1, 3, 2]
[2, 3, 1, 1, 1, 1, 3]


PL-EASANTLY
-YMEA--NL-Y


In [87]:
v = 'RDNDMHLCLLMPTLLDNSLHISGEHEEWGVKPWGCIDCDLMTIILQGWNNMMLNWGLFLHNDFVCAYVNICTISHRSMAGLYCYACNYDWYCQNPSWWDCEQYYPAVLINDGCCMHIQQDIMFHWMCWGSFCAEGPFQHYRILLFEWMMNFPATDKEQYAYPCAFSRNCWIRVMVENQCTNVSKLLSILQLKWCNENIAPYGQTMNVEYMNIRKSCTKMWIDFTKESTHDFRIMVEWMRIMNHHMQFEHHSTAFMAVVCALEECLSESVDRLILYMSENWALKWAGIYRPCKPLTKFQQTGNWMFGRIQFALASKEWAVIAHVCEVRLHMIPTVDNRCWCPEVQSQYSPHDTEGCKGRSWGWKLEEQVFNTAGCGLIELKHYDGIVTKVNIIWGLCNVLWNDWLRCWRSVLLLKIYGKSYWALNQENHYTIKSVTNHGCRGQMPNLGDTYRGPTFWDQVDWYAYLYCWEPTQGEICLHYQHNAVLKPLMSIMCGTPSRCMWAKYSTHHSWRKYFIFGSKSTSQDFWRRTWTSATETPAEYEVKKFTCHGKDKGFGPSCHVGTFCAIYAPAKFNAFNFNADVEMKMDHIRISSWAFLGAVHQHAPYGCIRASWVPQMCGFSCMNFKTDHQRRPEFDGQGQVWINLDYHSHECMPYCVNRYGLWLCPVIDCNTWNSTGIHACGSLDLVSKGKTWRKTAIYCKRRKEMGIQRDNCKNEGMFQIWERITSMPAIFYAWHSPMFAGQRGANQQTAVEPQKSVVMVIQYGDKMPTWIHCASDFPFVSTCLDKKQTQDGEKAQSDPHHSNNDGHAWDSAFAQGRARDLCLWDTMMRDIVKAQGKGLDGMFPKMFGILKKGVMWFKWAFAVNDIPKWCNHKT'
w = 'RDNKMHLCLNMPYLLDLSLHISGSHEEWGVKNWGCIVCDLLTIILQGWNNMMLNWGLFLHNDFVCAYVNICTISHRSMAVQPCMLYCYACPYDEWCQYVLIWDGCIWWQFFSWMCWGSFCEDRRDICHEGPFQHQRILLPETHILDKNTDRFQCWLHRFENQCTNLSKLGYHPCVCCLQCKSNNAPYGQWSLKVANIMNVEYMNIRKSCTKMWIDFTKESTFGPCGEYDFRIEWMRIMKLRNCRPGYHHMQFEHHTAFMAVVACHLRHVALEEDFHRNESHHFSESVDHLITFQMEENWAKPLRKFQQTGRNQFALASKEWAVEYRWTSVLLAHMCEVRAHMIPTVDNRCWCPEVQSQPIMHISTKHDCSWGWKHGEEDLMVGWQRYRFNTAGCGLIELKHVDGAVTKVNCIWGLCNVLGNDWLRCSVLLLKNQENHYTIGYCDSQCTNPGMSPGYRCKAQKWYFGQMPNLGCTYRGPTFNDYIGIFPEWSMACFQWYAYLYCCWQDFSFWHETQGEIAVGCLHYQMKMPDYIVKIMCGTPSRMWAKYSTHHSWIFGSLSTSCDFWRRTWYSATPAEMEHEGQEVHKFTCHIWQWRRYCNCGQRFRAPWAPAKFNAFNFNADVEMKMDHIRISSWAFLRAVHQHAPYRCHFRASWVPQMCGFEKRHLPNEQKHYMGLHQRRPEFDNLDEHNHEIMPYCAVIDNTWKSTGIHVVSVQIMMCLDLVSKGKTWRKTAIYCKRRRDDYFPEMGIQRCNCKNEGMFQIWERILSMPAKFYAWHSPMYAGQRGANQQTALEPMPIYQLEYGDMPTWIHAASDFPTTWMKEQTQSGEKADPHRDWSNNDAWDSAFAQGRARDLCLWDTMMRDMVTHFAQGKGLDGMFPKMFGYYLKKGVMVFRWAFASNDGPKWCNHKTC'

global_alignment(v,w)

[0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29, -30, -31, -32, -33, -34, -35, -36, -37, -38, -39, -40, -41, -42, -43, -44, -45, -46, -47, -48, -49, -50, -51, -52, -53, -54, -55, -56, -57, -58, -59, -60, -61, -62, -63, -64, -65, -66, -67, -68, -69, -70, -71, -72, -73, -74, -75, -76, -77, -78, -79, -80, -81, -82, -83, -84, -85, -86, -87, -88, -89, -90, -91, -92, -93, -94, -95, -96, -97, -98, -99, -100, -101, -102, -103, -104, -105, -106, -107, -108, -109, -110, -111, -112, -113, -114, -115, -116, -117, -118, -119, -120, -121, -122, -123, -124, -125, -126, -127, -128, -129, -130, -131, -132, -133, -134, -135, -136, -137, -138, -139, -140, -141, -142, -143, -144, -145, -146, -147, -148, -149, -150, -151, -152, -153, -154, -155, -156, -157, -158, -159, -160, -161, -162, -163, -164, -165, -166, -167, -168, -169, -170, -171, -172, -173, -174, -175, -176, -177, -178, -179, -180, -181, -182, -183, -184, 