Given: A protein string of length at most 1000 aa.

Return: The total number of different RNA strings from which the protein could have been translated, modulo 1,000,000. (Don't neglect the importance of the stop codon in protein translation.)

See the full problem statement at https://rosalind.info/problems/mrna/

In [1]:
# Read the Rosalind dataset, dropping the newline character from every line.
# Iterating the file object yields one line at a time, so no index loop is needed.
with open('./Text_Files/rosalind_mrna.txt', 'r') as f:
    lines = [line.replace('\n', '') for line in f]

In [2]:
# Display the first line read from the input file (the protein string)
lines[0]

'MPQRCGEEYNPWQADMDYFPTVQEFDEIHPFVPWTESDYMKACRWVEFDVPTRRSHMCYFSAKNCLICESNVTNDSNMSRQVDTLPQYMRNVQWYHSKVKIDDRLMAFLCTRQCLMVRMTWFQSWHMHAQQGYHISRFATKEIIHHMVWLRYSQYQLYPDYNCFLMEVGDASQEPQHDNITMLQKKWMPHVPNVRHGKFQVISGCNAIPITREPRGHWYWNASMSPIAPMHNNGRWMRATFDAVWRTINYWAPDKEDTMQWERIMNPEMPRLEDAYGDYANSMRQWFEYSERIQYHTTRLTGPNTWPFHYYDRDNDMDYGPFTKYNGKDEHGFHEQSYTGCLCTPCTATRGHFIKPDVNDTRDQHYPHVNDTTHYTKKYTKYIRWIGTNECDHASHTILVHSKDDCNVTHDFKFIPPTGTYCPWLMYYDPPREKNMQQIKGKYIYGQFCNILFKKMPVNSGSEGSETISNWCTELMQQEDKRRKKYHAYRCPIDLYAYTRCFEKTGNSCYSSMNQNTLKPHCLTKLQYQLLGLWNYALDSLDGIWNEFEHIRNRCQFEIVRDNAYIGKASNLNCHVMSYPHCMQPMMLEASKYELILQYQSMLKRKEMMVTWYFNNCVQFMSYSKMCIYVWCPQFGKPNVLNGWREDVCCELTRTYSYQACSDNWDHVPPYKWQKKCMIGGSSMDLFKGLGLEAGVMSKDSRENASTWTYIGETKVECKAPRIDDHRNRITHDHQCHNGRTAWVDELISECMTYYHEDEMFIHFDPQWCMEAFEWCDPDDAWDCQWAWWTYEGDFREPIVVNQEQQVAMWKSIAHYASSNRPHERLQADHDTWIFPIKFGKIESYAMPTTTGYIQAQKKHFSFGMDFTTFPLENRCYWQKLIQAGDNRPTQDKTGGSQPCPKQHLHDMFNHPNQPTPICMAMITRGIKFCVEAGDKNRGLIKAWMCLLYWLDLSYVICWVSMNMTALSYLPQHHNELWEWENINLRRYPQGYVGYSPYY

In [3]:
# Codon table copied from https://rosalind.info/problems/orf/solutions/
# Each row holds four "codon amino-acid" pairs separated by whitespace.
codon_table = """TTT F      CTT L      ATT I      GTT V
TTC F      CTC L      ATC I      GTC V
TTA L      CTA L      ATA I      GTA V
TTG L      CTG L      ATG M      GTG V
TCT S      CCT P      ACT T      GCT A
TCC S      CCC P      ACC T      GCC A
TCA S      CCA P      ACA T      GCA A
TCG S      CCG P      ACG T      GCG A
TAT Y      CAT H      AAT N      GAT D
TAC Y      CAC H      AAC N      GAC D
TAA Stop   CAA Q      AAA K      GAA E
TAG Stop   CAG Q      AAG K      GAG E
TGT C      CGT R      AGT S      GGT G
TGC C      CGC R      AGC S      GGC G
TGA Stop   CGA R      AGA R      GGA G
TGG W      CGG R      AGG R      GGG G"""

# Splitting on whitespace gives codon, amino acid, codon, amino acid, ...
# so even-indexed tokens are codons and odd-indexed tokens are their products.
codon_table = {
    codon: amino_acid
    for codon, amino_acid in zip(codon_table.split()[0::2], codon_table.split()[1::2])
}

##############################################################
# Invert the parsed codon table (codon -> amino acid) into
# amino acid -> list of codons, which is easier than typing it out by hand.
# The keys are spelled out so the result is ordered alphabetically, 'Stop' last.
aa_table = {
    amino_acid: []
    for amino_acid in [
        'A', 'C', 'D', 'E', 'F',
        'G', 'H', 'I', 'K', 'L',
        'M', 'N', 'P', 'Q', 'R',
        'S', 'T', 'V', 'W', 'Y', 'Stop',
    ]
}

for codon, amino_acid in codon_table.items():
    aa_table[amino_acid].append(codon)

In [4]:
# aa_table now maps each amino acid (and 'Stop') to the list of codons that encode it
aa_table

{'A': ['GCT', 'GCC', 'GCA', 'GCG'],
 'C': ['TGT', 'TGC'],
 'D': ['GAT', 'GAC'],
 'E': ['GAA', 'GAG'],
 'F': ['TTT', 'TTC'],
 'G': ['GGT', 'GGC', 'GGA', 'GGG'],
 'H': ['CAT', 'CAC'],
 'I': ['ATT', 'ATC', 'ATA'],
 'K': ['AAA', 'AAG'],
 'L': ['CTT', 'CTC', 'TTA', 'CTA', 'TTG', 'CTG'],
 'M': ['ATG'],
 'N': ['AAT', 'AAC'],
 'P': ['CCT', 'CCC', 'CCA', 'CCG'],
 'Q': ['CAA', 'CAG'],
 'R': ['CGT', 'CGC', 'CGA', 'AGA', 'CGG', 'AGG'],
 'S': ['TCT', 'TCC', 'TCA', 'TCG', 'AGT', 'AGC'],
 'T': ['ACT', 'ACC', 'ACA', 'ACG'],
 'V': ['GTT', 'GTC', 'GTA', 'GTG'],
 'W': ['TGG'],
 'Y': ['TAT', 'TAC'],
 'Stop': ['TAA', 'TAG', 'TGA']}

In [5]:
def protein_to_mrna(protein, modulus=1000000):
    """Count the mRNA strings that could translate into ``protein``.

    The count is the product of the number of codons encoding each amino
    acid, times the number of stop codons (every mRNA must end with one),
    reduced modulo ``modulus``.

    Args:
        protein: Sequence of one-letter amino acid codes (upper case).
        modulus: Positive integer the count is reduced by. Defaults to
            1,000,000, as required by the Rosalind MRNA problem.

    Returns:
        The number of candidate mRNA strings modulo ``modulus``. An empty
        protein yields the number of stop codons (3 with the default modulus).

    Raises:
        KeyError: If ``protein`` contains a symbol that is not in the table.
    """
    # The table is embedded in the function so that it keeps working even if
    # the notebook cells that build the codon tables are removed.
    aa_table = {
        'A': ['GCT', 'GCC', 'GCA', 'GCG'],
        'C': ['TGT', 'TGC'],
        'D': ['GAT', 'GAC'],
        'E': ['GAA', 'GAG'],
        'F': ['TTT', 'TTC'],
        'G': ['GGT', 'GGC', 'GGA', 'GGG'],
        'H': ['CAT', 'CAC'],
        'I': ['ATT', 'ATC', 'ATA'],
        'K': ['AAA', 'AAG'],
        'L': ['CTT', 'CTC', 'TTA', 'CTA', 'TTG', 'CTG'],
        'M': ['ATG'],
        'N': ['AAT', 'AAC'],
        'P': ['CCT', 'CCC', 'CCA', 'CCG'],
        'Q': ['CAA', 'CAG'],
        'R': ['CGT', 'CGC', 'CGA', 'AGA', 'CGG', 'AGG'],
        'S': ['TCT', 'TCC', 'TCA', 'TCG', 'AGT', 'AGC'],
        'T': ['ACT', 'ACC', 'ACA', 'ACG'],
        'V': ['GTT', 'GTC', 'GTA', 'GTG'],
        'W': ['TGG'],
        'Y': ['TAT', 'TAC'],
        'Stop': ['TAA', 'TAG', 'TGA'],
    }

    # Start from the stop-codon choices: translation must terminate with one.
    possible = len(aa_table['Stop']) % modulus
    for aa in protein:
        # Reducing at every step keeps the running product small; the final
        # value is the same as reducing the full product once at the end.
        possible = (possible * len(aa_table[aa])) % modulus
    return possible

In [6]:
# Solve for the protein string on the first line of the input file
protein_to_mrna(lines[0])

883008