# Chapter 5 Searching Data

## 5.2 STORY: TRANSLATING AN RNA SEQUENCE INTO THE CORRESPONDING PROTEIN SEQUENCE
### 5.2.2 Example Python Session

In [10]:
# RNA codon -> one-letter amino-acid code; the three stop codons map to the
# string 'STOP'. Laid out four codons per line, insertion order unchanged.
codon_table = {
    'GCU': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
    'CGU': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
    'AGA': 'R', 'AGG': 'R', 'UCU': 'S', 'UCC': 'S',
    'UCA': 'S', 'UCG': 'S', 'AGU': 'S', 'AGC': 'S',
    'AUC': 'I', 'AUA': 'I', 'AUU': 'I', 'UUA': 'L',
    'UUG': 'L', 'CUU': 'L', 'UAU': 'Y', 'UAC': 'Y',
    'AUG': 'M', 'CUC': 'L', 'CUA': 'L', 'CUG': 'L',
    'GGU': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G',
    'GUU': 'V', 'GUC': 'V', 'GUA': 'V', 'GUG': 'V',
    'ACU': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
    'CCU': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
    'AAU': 'N', 'AAC': 'N', 'GAU': 'D', 'GAC': 'D',
    'UGU': 'C', 'UGC': 'C', 'CAA': 'Q', 'CAG': 'Q',
    'GAA': 'E', 'GAG': 'E', 'CAU': 'H', 'CAC': 'H',
    'AAA': 'K', 'AAG': 'K', 'UUU': 'F', 'UUC': 'F',
    'UGG': 'W', 'UAG': 'STOP', 'UGA': 'STOP', 'UAA': 'STOP',
}

# RNA codon -> three-letter amino-acid abbreviation; the three stop codons
# (UAA, UGA, UAG) map to None. Insertion order is unchanged.
STANDARD_GENETIC_CODE = {
    'UUU': 'Phe', 'UUC': 'Phe',
    'UCU': 'Ser', 'UCC': 'Ser',
    'UAU': 'Tyr', 'UAC': 'Tyr',
    'UGU': 'Cys', 'UGC': 'Cys',
    'UUA': 'Leu', 'UCA': 'Ser',
    'UAA': None,  'UGA': None,
    'UUG': 'Leu', 'UCG': 'Ser',
    'UAG': None,  'UGG': 'Trp',
    'CUU': 'Leu', 'CUC': 'Leu',
    'CCU': 'Pro', 'CCC': 'Pro',
    'CAU': 'His', 'CAC': 'His',
    'CGU': 'Arg', 'CGC': 'Arg',
    'CUA': 'Leu', 'CUG': 'Leu',
    'CCA': 'Pro', 'CCG': 'Pro',
    'CAA': 'Gln', 'CAG': 'Gln',
    'CGA': 'Arg', 'CGG': 'Arg',
    'AUU': 'Ile', 'AUC': 'Ile',
    'ACU': 'Thr', 'ACC': 'Thr',
    'AAU': 'Asn', 'AAC': 'Asn',
    'AGU': 'Ser', 'AGC': 'Ser',
    'AUA': 'Ile', 'ACA': 'Thr',
    'AAA': 'Lys', 'AGA': 'Arg',
    'AUG': 'Met', 'ACG': 'Thr',
    'AAG': 'Lys', 'AGG': 'Arg',
    'GUU': 'Val', 'GUC': 'Val',
    'GCU': 'Ala', 'GCC': 'Ala',
    'GAU': 'Asp', 'GAC': 'Asp',
    'GGU': 'Gly', 'GGC': 'Gly',
    'GUA': 'Val', 'GUG': 'Val',
    'GCA': 'Ala', 'GCG': 'Ala',
    'GAA': 'Glu', 'GAG': 'Glu',
    'GGA': 'Gly', 'GGG': 'Gly',
}

# Common imports
import numpy as np
import os
# Directory containing this chapter's input data files (FASTA, PDB, text).
# The cells shown in this notebook only open files from here for reading.
PROJECT_ROOT_DIR = "./data/chap_5/"

In [12]:
# Read the RNA sequence from the FASTA file, skipping header lines (">").
# The `with` block guarantees the file handle is closed after reading.
rna = ''
with open(os.path.join(PROJECT_ROOT_DIR,"A06662-RNA.fasta"),"r") as fasta_file:
    for line in fasta_file:
        if not line.startswith(">"):
            rna += line.strip()

# translate one frame at a time
for frame in range(3):
    print("Reading frame "+str(frame+1))

    residues = []
    for i in range(frame,len(rna),3):
        codon = rna[i:i+3]
        # Codons missing from the table (e.g. a too-short trailing codon)
        # are rendered as '-'; stop codons are rendered as '*'.
        amino_acid = codon_table.get(codon, '-')
        residues.append('*' if amino_acid == 'STOP' else amino_acid)
    prot = ''.join(residues)

    # format to blocks of 48 columns
    for start in range(0, len(prot), 48):
        print(prot[start:start+48])

Reading frame 1
WDQSAEAACVRVRVRVCACVCVRLHLCRVGKEIEMGGQ*AQVPKALNP
LVWSLLRAMGAIEKSEQGCV*M*GLEGSSREASSKAFAIIW*ENPARM
DRQNGIEMSWQLKWTGFGTSLVVGSKQRRIWDSGGLAWGRRGCLRGWE
G*E*DDTWWCLAGGGQG*LCEGTARATEAF*DPAVPEPGRQDLHCGRP
GEHLA
Reading frame 2
GTSQQRQRVCACVCVCVRVCVYACICVGWVRR*RWAGSRPRSRRP*TH
WFGVS*GQWGPLRSLNRAVSECEV*KDPPEKPALKLLQSSGERTQQGW
TGRME*R*VGS*SGQDLVLAWLWGASRGESGTLVVWPGADGGVSGAGR
DESRMIHGGVWQEAGKDDYVKALPGQLKPFETLLSQNQGGKTFIVGDQ
VSIW-
Reading frame 3
GPVSRGSVCARACACVCVCVCTLAFVSGG*GDRDGRAVGPGPEGLEPT
GLESPKGNGGH*EV*TGLCLNVRSRRILQRSQL*SFCNHLVREPSKDG
QAEWNRDELAAEVDRIWY*PGCGEQAEENLGLWWSGLGQTGVSQGLGG
MRVG*YMVVSGRRRARMTM*RHCPGN*SLLRPCCPRTREARPSLWETR
*ASG-


### 5.3.3 Searching with while Loops

In [15]:
# Search a FASTA file for the header whose accession (the second
# "|"-separated field) equals `insulin_ac`, using a while loop.
# NOTE(review): the variable is named insulin_ac, but the recorded run reports
# this accession as not found in SwissProt.fasta - confirm the intended value.
insulin_ac = 'P61981'
result = None   # stays None when the accession is not found

# `with` closes the file on every exit path (match found or end of file).
with open(os.path.join(PROJECT_ROOT_DIR,"SwissProt.fasta"),"r") as swissprot:
    while result is None:
        # Only next() can raise StopIteration, so keep the try block narrow.
        try:
            line = next(swissprot) #Python3 doesn't have f.next,use next(f) instead
        except StopIteration:
            print("Not found %s"%(insulin_ac))
            break
        if line.startswith(">"):
            fields = line.split("|")
            # Headers without a "|" have no accession field; treat them as
            # non-matching instead of raising IndexError.
            ac = fields[1] if len(fields) > 1 else ''
            if ac == insulin_ac:
                result = line.strip()
                print(result)

Not found P61981


### Example 5.1 How to Fill a Dictionary from a FASTA File 
### Where the Uniprot ACs Are the Keys and the Corresponding Sequences Are Their Values

In [16]:
# Build a dictionary mapping each accession (second "|"-separated field of a
# FASTA header) to the concatenated sequence lines that follow that header.
sequence = {}
seq = ''
ac = ''

# `with` guarantees the file handle is closed once parsing finishes.
with open(os.path.join(PROJECT_ROOT_DIR,"SwissProt.fasta"),"r") as fasta_file:
    for line in fasta_file:
        if line.startswith(">"):
            # A new header ends the previous record: store it (if any
            # sequence was collected) before switching to the new accession.
            if seq != '':
                sequence[ac] = seq
                seq = ''
            ac = line.split("|")[1]
        else:
            seq += line.strip()

# Store the last record; skip when no header was ever read so that an empty
# file does not create a bogus '' key.
if ac:
    sequence[ac] = seq
print(sequence.keys())
print(sequence)

dict_keys(['P06213', 'P01308', 'P08025', 'P05019', 'P09208', 'P15127', 'P08069', 'P24062', 'P01344', 'P15208'])
{'P06213': 'MATGGRRGAAAAPLLVAVAALLLGAAGHLYPGEVCPGMDIRNNLTRLHELENCSVIEGHLQILLMFKTRPEDFRDLSFPKLIMITDYLLLFRVYGLESLKDLFPNLTVIRGSRLFFNYALVIFEMVHLKELGLYNLMNITRGSVRIEKNNELCYLATIDWSRILDSVEDNYIVLNKDDNEECGDICPGTAKGKTNCPATVINGQFVERCWTHSHCQKVCPTICKSHGCTAEGLCCHSECLGNCSQPDDPTKCVACRNFYLDGRCVETCPPPYYHFQDWRCVNFSFCQDLHHKCKNSRRQGCHQYVIHNNKCIPECPSGYTMNSSNLLCTPCLGPCPKVCHLLEGEKTIDSVTSAQELRGCTVINGSLIINIRGGNNLAAELEANLGLIEEISGYLKIRRSYALVSLSFFRKLRLIRGETLEIGNYSFYALDNQNLRQLWDWSKHNLTITQGKLFFHYNPKLCLSEIHKMEEVSGTKGRQERNDIALKTNGDQASCENELLKFSYIRTSFDKILLRWEPYWPPDFRDLLGFMLFYKEAPYQNVTEFDGQDACGSNSWTVVDIDPPLRSNDPKSQNHPGWLMRGLKPWTQYAIFVKTLVTFSDERRTYGAKSDIIYVQTDATNPSVPLDPISVSNSSSQIILKWKPPSDPNGNITHYLVFWERQAEDSELFELDYCLKGLKLPSRTWSPPFESEDSQKHNQSEYEDSAGECCSCPKTDSQILKELEESSFRKTFEDYLHNVVFVPRKTSSGTGAEDPRPSRKRRSLGDVGNVTVAVPTVAAFPNTSSTSVPTSPEEHRPFEKVVNKESLVISGLRHFTGYRIELQACNQDTPEERCSVAAYVSARTMPEAKADDIVGPVTHEIFENNVVHLMWQEPKE

### Example 5.2 How to Write a Simple Protein Sequence Loop Predictor

In [17]:
# Per-residue scores used by the loop predictor below, keyed by one-letter
# amino-acid code.
# NOTE(review): 'L' is the only entry in its column with a positive score,
# which puts leucine above the 0.3 threshold - confirm against the source
# table (possibly a dropped minus sign).
propensities = {
    'N': 0.2299, 'P': 0.5523, 'Q': -0.18770, 'A': -0.2615,
    'R': -0.1766, 'S': 0.1429, 'C': -0.01515, 'T': 0.0089,
    'D': 0.2276, 'E': -0.2047, 'V': -0.38620, 'F': -0.2256,
    'W': -0.2434, 'G': 0.4332, 'H': -0.00120, 'Y': -0.2075,
    'I': -0.4222, 'K': -0.1001, 'L': 0.33793, 'M': -0.2259,
}

# Residues scoring at or above this value are written in uppercase.
threshold = 0.3

input_seq = (
    "IVGGYTCGANTVPYQVSLNSGYHFCGGSLINS"
    "QWVVSAAHCYKSGIQVRLGEDNINVVEGNEQFISASKSIVH"
    "PSYNSNTLNNDIMLIKLKSAASLNSRVASISLPTSCASAGTQ"
    "CLISGWGNTKSSGTSYPDVLKCLKAPILSDSSCKSAYPGQI"
    "TSNMFCAGYLEGGKDSCQGDSGGPVVCSGKLQGIVSWG"
    "SGCAQKNKPGVYTKVCNYVSWIKQTIASN"
)

# Walk the sequence once; stop at the first character that has no score, in
# which case only the residues converted so far are kept.
converted = []
for residue in input_seq:
    score = propensities.get(residue)
    if score is None:
        print("Unrecognized character:", residue)
        break
    converted.append(residue.upper() if score >= threshold else residue.lower())
output_seq = "".join(converted)

print(output_seq)

ivGGytcGantvPyqvsLnsGyhfcGGsLinsqwvvsaahcyksGiqvrLGedninvveGneqfisasksivhPsynsntLnndimLikLksaasLnsrvasisLPtscasaGtqcLisGwGntkssGtsyPdvLkcLkaPiLsdsscksayPGqitsnmfcaGyLeGGkdscqGdsGGPvvcsGkLqGivswGsGcaqknkPGvytkvcnyvswikqtiasn


### Example 5.3 How to Extract the Amino Acid Sequence from a PDB File

In [22]:
# Three-letter residue name -> one-letter code for the 20 standard amino acids.
aa_codes = {
'ALA':'A', 'CYS':'C', 'ASP':'D', 'GLU':'E',
'PHE':'F', 'GLY':'G', 'HIS':'H', 'LYS':'K',
'ILE':'I', 'LEU':'L', 'MET':'M', 'ASN':'N',
'PRO':'P', 'GLN':'Q', 'ARG':'R', 'SER':'S',
'THR':'T', 'VAL':'V', 'TYR':'Y', 'TRP':'W'}

# Collect the residues listed in the SEQRES records of the PDB file.
seq = ''
with open(os.path.join(PROJECT_ROOT_DIR,"1tld.pdb"),"r") as pdb_file:
    for line in pdb_file:
        if line[0:6] == "SEQRES":
            # PDB records are fixed-width: residue names occupy columns 20-70.
            # Slicing (rather than whitespace-splitting the whole line) keeps
            # the first residue when the chain-ID column is blank.
            for resname in line[19:70].split():
                # Non-standard residue names are written as 'X' instead of
                # aborting with a KeyError.
                seq += aa_codes.get(resname, 'X')

# Print the sequence in FASTA format, 64 residues per line.
print(">1TLD")
for start in range(0, len(seq), 64):
    print(seq[start:start+64])

>1TLD
IVGGYTCGANTVPYQVSLNSGYHFCGGSLINSQWVVSAAHCYKSGIQVRLGEDNINVVEGNEQF
ISASKSIVHPSYNSNTLNNDIMLIKLKSAASLNSRVASISLPTSCASAGTQCLISGWGNTKSSG
TSYPDVLKCLKAPILSDSSCKSAYPGQITSNMFCAGYLEGGKDSCQGDSGGPVVCSGKLQGIVS
WGSGCAQKNKPGVYTKVCNYVSWIKQTIASN


In [19]:
# Sanity check: number of residue names in the lookup table defined in the
# PDB example cell (the recorded output is 20).
len(aa_codes)

20

## 5.5 TESTING YOURSELF
### Exercise 5.1 A Simple Dictionary

Create a dictionary where the following five codons are associated with their corresponding values:

In [25]:
# Five codons mapped to their meaning: three stop codons, the start codon,
# and the glycine codon GGG.
d = {'UAA':'Stop','UAG':'Stop','UGA':'Stop','AUG':'Start','GGG':'Glycine'}

### Exercise 5.2 Counting START and STOP Codons in a Nucleotide Sequence

Write a program that counts the number of STOP codons and the number of START codons in an input nucleotide sequence. The program must
print two elements: the number of START codons and the number of STOP codons.

**Hint**: Download an RNA sequence from NCBI in FASTA format and use what you learned from Section 5.2.2.

In [26]:
from collections import Counter

# Codons of interest and the label under which each one is counted.
particular_codon = {'UAA':'Stop','UAG':'Stop','UGA':'Stop','AUG':'Start'}

# Read the RNA sequence, skipping FASTA header lines; `with` closes the file.
rna = ''
with open(os.path.join(PROJECT_ROOT_DIR,"A06662-RNA.fasta"),"r") as fasta_file:
    for line in fasta_file:
        if not line.startswith(">"):
            rna += line.strip()

# Tally start/stop codons in the first reading frame only (step of 3 from
# position 0); a Counter returns 0 for labels that never occur.
counts = Counter(
    particular_codon[rna[i:i + 3]]
    for i in range(0,len(rna),3)
    if rna[i:i + 3] in particular_codon
)
print("There are %s stop codon in this sequence."%(counts['Stop']))
print("There are %s start codon in this sequence."%(counts['Start']))

There are 8 stop codon in this sequence.
There are 5 start codon in this sequence.


### Exercise 5.3 Search Keywords in a PubMed Abstract

Copy and paste to a text file the title and abstract of a paper of your choice (e.g., you can download it from PubMed: http://www.ncbi.nlm.nih.gov/pubmed). Check if two (or more) keywords of your choice (e.g., “calmodulin” or “CALM2”) are (both or alternatively) present in the abstract, and if yes, print that you found them or otherwise that you didn’t.

In [31]:
# Report which of two keywords occur anywhere in the saved title/abstract.
keyword1 = "calmodulin"
keyword2 = "CALM2"

# Read the whole text once; `with` closes the file afterwards.
with open(os.path.join(PROJECT_ROOT_DIR,"pubmed.txt"),"r") as f:
    abstract = f.read()

# Keep the search results in separate booleans so the keyword strings are
# never overwritten (a non-string left operand of `in` raises TypeError).
found1 = keyword1 in abstract
found2 = keyword2 in abstract

# Exactly one message is printed, covering all four combinations; "both" is
# reported even when the two keywords appear on different lines.
if found1 and found2:
    print("Both keywords calmodulin and CALM2 in this article.")
elif found1:
    print("Keyword calmodulin in this article.")
elif found2:
    print("Keyword CALM2 in this article.")
else:
    print("Both keywords calmodulin and CALM2 not in this article.")

Keyword calmodulin in this article.


### Exercise 5.4 Secondary Structure Predictor

Write a Sequence-Based Predictor for Secondary Structure Elements.

**Hint**: Use the following table of preferences, where the second column is for helices and the third is for beta sheets (http://www.bmrb.wisc.edu/referenc/choufas.html):

Scan the input sequence residue by residue and replace each residue with H (helix) if its pref_H ≥ 1 and its pref_E < pref_H, with E (sheet) if its pref_E ≥ 1 and its pref_H < pref_E, and with L (loop) otherwise. Print (or write to a file) the input and output sequences, one on top of the other.

In [37]:
# Preference table: (one-letter residue, helix preference, sheet preference).
table = [
    ('A', 1.45, 0.97),
    ('C', 0.77, 1.30),
    ('D', 0.98, 0.80),
    ('E', 1.53, 0.26),
    ('F', 1.12, 1.28),
    ('G', 0.53, 0.81),
    ('H', 1.24, 0.71),
    ('I', 1.00, 1.60),
    ('K', 1.07, 0.74),
    ('L', 1.34, 1.22),
    ('M', 1.20, 1.67),
    ('N', 0.73, 0.65),
    ('P', 0.59, 0.62),
    ('Q', 1.17, 1.23),
    ('R', 0.79, 0.90),
    ('S', 0.79, 0.72),
    ('T', 0.82, 1.20),
    ('V', 1.14, 1.65),
    ('W', 1.14, 1.19),
    ('Y', 0.61, 1.29),
]

# Split the table into two lookups keyed by residue: second column -> pref_H,
# third column -> pref_E.
pref_H = {residue: helix for residue, helix, _sheet in table}
pref_E = {residue: sheet for residue, _helix, sheet in table}

In [40]:
input_seq = "IVGGYTCGANTVPYQVSLNSGYHFCGGSLINS\
QWVVSAAHCYKSGIQVRLGEDNINVVEGNEQFISASKSIVH\
PSYNSNTLNNDIMLIKLKSAASLNSRVASISLPTSCASAGTQ\
CLISGWGNTKSSGTSYPDVLKCLKAPILSDSSCKSAYPGQI\
TSNMFCAGYLEGGKDSCQGDSGGPVVCSGKLQGIVSWG\
SGCAQKNKPGVYTKVCNYVSWIKQTIASN"
pref_seq = ''

# Emit exactly one symbol per residue so that both printed lines stay aligned:
#   H - helix preference >= 1 and greater than the sheet preference
#   E - sheet preference >= 1 and greater than the helix preference
#   L - everything else, including ties between the two preferences
#   - - residue missing from the preference tables
for letter in input_seq:
    helix = pref_H.get(letter)
    sheet = pref_E.get(letter)
    if helix is None or sheet is None:
        pref_seq += '-'
    elif helix >= 1 and sheet < helix:
        pref_seq += 'H'
    elif sheet >= 1 and helix < sheet:
        pref_seq += 'E'
    else:
        pref_seq += 'L'

print(input_seq)
print(pref_seq)

IVGGYTCGANTVPYQVSLNSGYHFCGGSLINSQWVVSAAHCYKSGIQVRLGEDNINVVEGNEQFISASKSIVHPSYNSNTLNNDIMLIKLKSAASLNSRVASISLPTSCASAGTQCLISGWGNTKSSGTSYPDVLKCLKAPILSDSSCKSAYPGQITSNMFCAGYLEGGKDSCQGDSGGPVVCSGKLQGIVSWGSGCAQKNKPGVYTKVCNYVSWIKQTIASN
EELLEEELHLEELEEELHLLLEHEELLLHELLEEEELHHHEEHLLEEELHLHLLELEEHLLHEEELHLHLEEHLLELLLEHLLLEEHEHHHLHHLHLLLEHLELHLELEHLHLEEEHELLELLEHLLLELELLEHHEHHHLEHLLLLEHLHELLEEELLEEEHLEHHLLHLLEELLLLLLEEELLHHELEELELLLEHEHLHLLEEEHEELEELEEHEEEHLL


### Exercise 5.5 Write a Predictor for the Solvent Accessibility of Amino Acidic Residues in a Protein Sequence

The input of the predictor must be a protein sequence file in FASTA format. The output must be the same sequence with residues in uppercase if
they are predicted to be accessible to the solvent and in lowercase otherwise. You can find the solvent-exposed area of PDB residues (in Appendix C, Section C.10, “Solvent Accessibility of Amino Acids in Known Protein Structures”) and consider that a residue has a propensity to be accessible if it has >70% in the >30 $Å^2$ column. Try to change the propensity threshold and see what happens to your output.

In [44]:
# Per-residue solvent-accessibility value keyed by one-letter amino-acid code;
# the predictor cell compares these against a threshold.
# NOTE(review): presumably the fraction of residues in the ">30 A^2" column
# mentioned in the exercise text - confirm against the source table.
solvent = {
    'A': 0.48, 'R': 0.84, 'D': 0.81, 'N': 0.82,
    'C': 0.32, 'E': 0.93, 'Q': 0.81, 'G': 0.51,
    'H': 0.66, 'I': 0.39, 'L': 0.41, 'K': 0.93,
    'M': 0.44, 'F': 0.42, 'P': 0.78, 'S': 0.70,
    'T': 0.71, 'W': 0.49, 'Y': 0.67, 'V': 0.40,
}


20

In [45]:
# Residues whose value in `solvent` exceeds this are written in uppercase
# (predicted accessible); change it to see how the prediction shifts.
solvent_threshold = 0.7

input_seq = ''
out_seq = ''

# For every FASTA record print the sequence followed by its prediction.
with open(os.path.join(PROJECT_ROOT_DIR,"SwissProt.fasta"),"r") as f:
    for line in f:
        if line.startswith(">"):
            # A new header ends the previous record: print it and reset.
            if input_seq != '':
                print(input_seq)
                print(out_seq)
                input_seq = ''
                out_seq = ''
        else:
            line = line.strip()
            input_seq += line
            # Convert only the residues of the current line; iterating over
            # the accumulated input_seq would re-append earlier residues.
            for letter in line:
                if letter in solvent:
                    if solvent[letter] > solvent_threshold:
                        out_seq += letter.upper()
                    else:
                        out_seq += letter.lower()
                else:
                    # Residue code without accessibility data.
                    out_seq += '-'

# The last record has no following header, so print it after the loop.
if input_seq != '':
    print(input_seq)
    print(out_seq)

MATGGRRGAAAAPLLVAVAALLLGAAGHLYPGEVCPGMDIRNNLTRLHELENCSVIEGHLQILLMFKTRPEDFRDLSFPKLIMITDYLLLFRVYGLESLKDLFPNLTVIRGSRLFFNYALVIFEMVHLKELGLYNLMNITRGSVRIEKNNELCYLATIDWSRILDSVEDNYIVLNKDDNEECGDICPGTAKGKTNCPATVINGQFVERCWTHSHCQKVCPTICKSHGCTAEGLCCHSECLGNCSQPDDPTKCVACRNFYLDGRCVETCPPPYYHFQDWRCVNFSFCQDLHHKCKNSRRQGCHQYVIHNNKCIPECPSGYTMNSSNLLCTPCLGPCPKVCHLLEGEKTIDSVTSAQELRGCTVINGSLIINIRGGNNLAAELEANLGLIEEISGYLKIRRSYALVSLSFFRKLRLIRGETLEIGNYSFYALDNQNLRQLWDWSKHNLTITQGKLFFHYNPKLCLSEIHKMEEVSGTKGRQERNDIALKTNGDQASCENELLKFSYIRTSFDKILLRWEPYWPPDFRDLLGFMLFYKEAPYQNVTEFDGQDACGSNSWTVVDIDPPLRSNDPKSQNHPGWLMRGLKPWTQYAIFVKTLVTFSDERRTYGAKSDIIYVQTDATNPSVPLDPISVSNSSSQIILKWKPPSDPNGNITHYLVFWERQAEDSELFELDYCLKGLKLPSRTWSPPFESEDSQKHNQSEYEDSAGECCSCPKTDSQILKELEESSFRKTFEDYLHNVVFVPRKTSSGTGAEDPRPSRKRRSLGDVGNVTVAVPTVAAFPNTSSTSVPTSPEEHRPFEKVVNKESLVISGLRHFTGYRIELQACNQDTPEERCSVAAYVSARTMPEAKADDIVGPVTHEIFENNVVHLMWQEPKEPNGLIVLYEVSYRRYGDEELHLCVSRKHFALERGCRLRGLSPGNYSVRIRATSLAGNGSWTEPTYFYVTDYLDVPSNIAKIIIGPLIFVFLFSVVIGSIYLFLRKRQPDGPLGPLYASSNPEYL