https://www.ncbi.nlm.nih.gov/nuccore/NC_045512

In [216]:
import re

In [217]:
def read_seq(inputfile): 
    with open(inputfile, "r") as f: 
        seq = f.read() 
    seq = seq.replace("\n", "") 
    seq = seq.replace("\r", "") 
    return seq 

In [218]:
def DNA_to_pre_mRNA(DNA):
    table = {
        'C': 'G', 'G': 'C', 'A': 'U', 'T': 'A'
    }
    
    pre_mRNA = ""
    for base in DNA:
        pre_mRNA += table[base]
    
    return pre_mRNA

In [219]:
def pre_mRNA_to_mRNA(pre_mRNA):
    introns = []
    exon = pre_mRNA
    regex = r"GU(?:\w{0,}?)AG" 
    introns = re.findall(regex, pre_mRNA)

    for intron in introns: 
        exon = exon.replace(intron, "") 
    """
    found_G = False
    is_intron = False
    found_GU_A = False
    for base in pre_mRNA:
        if base == 'G':
            found_G = True
        if base == 'U' and found_G:
            is_intron = True
        if is_intron and base == 'A':
            found_GU_A = True
        if found_GU_A and base == 'G':
            is_intron = False
        if is_intron:
            intron += base
        else:
            exon += base
    """

    return introns, exon

In [220]:
def mRNA_to_proteins(mRNA):
    table = {"UUU":"F", "UUC":"F", "UUA":"L", "UUG":"L",
    "UCU":"S", "UCC":"S", "UCA":"S", "UCG":"S",
    "UAU":"Y", "UAC":"Y", "UAA":"STOP", "UAG":"STOP",
    "UGU":"C", "UGC":"C", "UGA":"STOP", "UGG":"W",
    "CUU":"L", "CUC":"L", "CUA":"L", "CUG":"L",
    "CCU":"P", "CCC":"P", "CCA":"P", "CCG":"P",
    "CAU":"H", "CAC":"H", "CAA":"Q", "CAG":"Q",
    "CGU":"R", "CGC":"R", "CGA":"R", "CGG":"R",
    "AUU":"I", "AUC":"I", "AUA":"I", "AUG":"M",
    "ACU":"T", "ACC":"T", "ACA":"T", "ACG":"T",
    "AAU":"N", "AAC":"N", "AAA":"K", "AAG":"K",
    "AGU":"S", "AGC":"S", "AGA":"R", "AGG":"R",
    "GUU":"V", "GUC":"V", "GUA":"V", "GUG":"V",
    "GCU":"A", "GCC":"A", "GCA":"A", "GCG":"A",
    "GAU":"D", "GAC":"D", "GAA":"E", "GAG":"E",
    "GGU":"G", "GGC":"G", "GGA":"G", "GGG":"G",}

    protein = ""
    proteins = []
    for i in range(0, len(mRNA)-(len(mRNA)%3), 3):
        codon = table[mRNA[i:i+3]]
        if codon == "STOP":
            proteins.append(protein)
            protein = ""
        else:
            protein += codon
    return proteins

In [243]:
"""
https://en.wikipedia.org/wiki/Start_codon
The most common start codon is AUG (i.e., ATG in the corresponding DNA sequence).
An alternative start codon sequence, such as GUG or UUG, 
may commence translation sequence if the AUG codon is unavailable.

If truncated (no stop sequence before next AUG) it will produce a non functional protein.
the next frame starting by AUG will be considered therefore as starting sequence
"""
def find_start_codon(mRNA):
    for i in range(0, len(mRNA)-(len(mRNA)%3), 1):
        codon = mRNA[i:i+3]
        if codon == "AUG":
            start_index = i

In [221]:
DNA = read_seq("datasets/NM_207618.txt")
DNA

'GGTCAGAAAAAGCCCTCTCCATGTCTACTCACGATACATCCCTGAAAACCACTGAGGAAGTGGCTTTTCAGATCATCTTGCTTTGCCAGTTTGGGGTTGGGACTTTTGCCAATGTATTTCTCTTTGTCTATAATTTCTCTCCAATCTCGACTGGTTCTAAACAGAGGCCCAGACAAGTGATTTTAAGACACATGGCTGTGGCCAATGCCTTAACTCTCTTCCTCACTATATTTCCAAACAACATGATGACTTTTGCTCCAATTATTCCTCAAACTGACCTCAAATGTAAATTAGAATTCTTCACTCGCCTCGTGGCAAGAAGCACAAACTTGTGTTCAACTTGTGTTCTGAGTATCCATCAGTTTGTCACACTTGTTCCTGTTAATTCAGGTAAAGGAATACTCAGAGCAAGTGTCACAAACATGGCAAGTTATTCTTGTTACAGTTGTTGGTTCTTCAGTGTCTTAAATAACATCTACATTCCAATTAAGGTCACTGGTCCACAGTTAACAGACAATAACAATAACTCTAAAAGCAAGTTGTTCTGTTCCACTTCTGATTTCAGTGTAGGCATTGTCTTCTTGAGGTTTGCCCATGATGCCACATTCATGAGCATCATGGTCTGGACCAGTGTCTCCATGGTACTTCTCCTCCATAGACATTGTCAGAGAATGCAGTACATATTCACTCTCAATCAGGACCCCAGGGGCCAAGCAGAGACCACAGCAACCCATACTATCCTGATGCTGGTAGTCACATTTGTTGGCTTTTATCTTCTAAGTCTTATTTGTATCATCTTTTACACCTATTTTATATATTCTCATCATTCCCTGAGGCATTGCAATGACATTTTGGTTTCGGGTTTCCCTACAATTTCTCCTTTACTGTTGACCTTCAGAGACCCTAAGGGTCCTTGTTCTGTGTTCTTCAACTGTTGAAAGCCAGAGTCACTAAAAATGCCAAACACAGAAGACAGCTTTGCTAATACCATTAAATACT

In [222]:
pre_mRNA = DNA_to_pre_mRNA(DNA)
pre_mRNA

'CCAGUCUUUUUCGGGAGAGGUACAGAUGAGUGCUAUGUAGGGACUUUUGGUGACUCCUUCACCGAAAAGUCUAGUAGAACGAAACGGUCAAACCCCAACCCUGAAAACGGUUACAUAAAGAGAAACAGAUAUUAAAGAGAGGUUAGAGCUGACCAAGAUUUGUCUCCGGGUCUGUUCACUAAAAUUCUGUGUACCGACACCGGUUACGGAAUUGAGAGAAGGAGUGAUAUAAAGGUUUGUUGUACUACUGAAAACGAGGUUAAUAAGGAGUUUGACUGGAGUUUACAUUUAAUCUUAAGAAGUGAGCGGAGCACCGUUCUUCGUGUUUGAACACAAGUUGAACACAAGACUCAUAGGUAGUCAAACAGUGUGAACAAGGACAAUUAAGUCCAUUUCCUUAUGAGUCUCGUUCACAGUGUUUGUACCGUUCAAUAAGAACAAUGUCAACAACCAAGAAGUCACAGAAUUUAUUGUAGAUGUAAGGUUAAUUCCAGUGACCAGGUGUCAAUUGUCUGUUAUUGUUAUUGAGAUUUUCGUUCAACAAGACAAGGUGAAGACUAAAGUCACAUCCGUAACAGAAGAACUCCAAACGGGUACUACGGUGUAAGUACUCGUAGUACCAGACCUGGUCACAGAGGUACCAUGAAGAGGAGGUAUCUGUAACAGUCUCUUACGUCAUGUAUAAGUGAGAGUUAGUCCUGGGGUCCCCGGUUCGUCUCUGGUGUCGUUGGGUAUGAUAGGACUACGACCAUCAGUGUAAACAACCGAAAAUAGAAGAUUCAGAAUAAACAUAGUAGAAAAUGUGGAUAAAAUAUAUAAGAGUAGUAAGGGACUCCGUAACGUUACUGUAAAACCAAAGCCCAAAGGGAUGUUAAAGAGGAAAUGACAACUGGAAGUCUCUGGGAUUCCCAGGAACAAGACACAAGAAGUUGACAACUUUCGGUCUCAGUGAUUUUUACGGUUUGUGUCUUCUGUCGAAACGAUUAUGGUAAUUUAUGA

In [223]:
(introns, mRNA) = pre_mRNA_to_mRNA(pre_mRNA)
print(introns)
print(mRNA)

['GUCUUUUUCGGGAG', 'GUACAG', 'GUGCUAUGUAG', 'GUGACUCCUUCACCGAAAAG', 'GUAG', 'GUCAAACCCCAACCCUGAAAACGGUUACAUAAAG', 'GUUAG', 'GUCUCCGGGUCUGUUCACUAAAAUUCUGUGUACCGACACCGGUUACGGAAUUGAG', 'GUGAUAUAAAG', 'GUUUGUUGUACUACUGAAAACGAG', 'GUUAAUAAG', 'GUUUGACUGGAG', 'GUGAG', 'GUUCUUCGUGUUUGAACACAAG', 'GUAG', 'GUGUGAACAAG', 'GUCCAUUUCCUUAUGAG', 'GUUCACAG', 'GUUUGUACCGUUCAAUAAG', 'GUCAACAACCAAG', 'GUCACAG', 'GUAG', 'GUAAG', 'GUUAAUUCCAG', 'GUGUCAAUUGUCUGUUAUUGUUAUUGAG', 'GUUCAACAAG', 'GUGAAG', 'GUCACAUCCGUAACAG', 'GUACUACGGUGUAAG', 'GUAG', 'GUCACAG', 'GUACCAUGAAG', 'GUAUCUGUAACAG', 'GUCAUGUAUAAG', 'GUUAG', 'GUCCCCGGUUCGUCUCUGGUGUCGUUGGGUAUGAUAG', 'GUGUAAACAACCGAAAAUAG', 'GUAG', 'GUGGAUAAAAUAUAUAAG', 'GUAG', 'GUAACGUUACUGUAAAACCAAAG', 'GUUAAAG', 'GUCUCUGGGAUUCCCAG', 'GUUGACAACUUUCGGUCUCAG', 'GUUUGUGUCUUCUGUCGAAACGAUUAUGGUAAUUUAUGAAAUAAG', 'GUAUUUAUACAAAAAUUUUCGAACAUACUUGUUCCAUACCACGAG', 'GUGAACAACUAUACUUUUCUAAAG', 'GUCACUCAAUAAG', 'GUGGGAG', 'GUAAG']
CCAAGAUGAGGACUUUUGUCUAAACGAAACGAGAAACAGAUAUUAAAGAGA

In [224]:
proteins = mRNA_to_proteins(mRNA)
proteins

['PR',
 'GLLSKRNEKQILKRELTKI',
 'KEIYI',
 'S',
 'ERSTLNTRLIVKQTINL',
 'Q',
 'NLLLTRFSQD',
 'KELQTGTTVTLPDLRGVSYVMYKSWGLRPSKIQNKHKNIRDSPKGIGNDNWKNKTQE',
 'FLRDDMNIFSFQY',
 'PTLD']

In [225]:
pre_mRNA2 = "acguccgcaagagaagccuuaauauauucaaaaagcuacgccucagauuucgcgcucgagcccaaaacaacugguguacggguugaucacaucaaaugaagucgcuaaagucggugaucucacuauccuugucuucggcuuuugcucucucggcuaucaucuaagcaggcgaguuccauggugaccggaacgacggcuacuggaguccaugaucgcaagcgucgggcugggguaaaagaggcucagcucauaauaguccgccccaccaguacgggacucgauaggccccgucguugccguagaaacgcaauuuuccucagacccacuauacgcaccucgauuuagcaugguuccgggguugcgcuuugagaaucauacguaaggaucggaaccuaggaaugcaccacagaacuuugaaauacuagaacaaguugauugacaacggaguaucggcgccccacauuuaacgaauaauugcaggcgccagacgaugcuaggugcguccguaucaagauucgaggucgcuacuggcuucgcuugccgaucgagcucagaguuugugagaguuguuacuaauugcguggucgccuaauauccuugauacuacguggguguacuagacaucccggacagaaaaucucuuaaacgcuagaguucucuuggaagcgccugcacuucuugugaacauacgaugauagccacucuaagcccaacgcacuucgcuuggcccacauugcccccagagcuuauucaucgacaggcguuccacucuuggauucaucaguaaacuuuauuauacgugguaagcgugcuuauagcugucggaaucucacuuaggcggauugaagugagacagccugaaaguaaccguguacaggcgccgucaauguguuuugagugugcaccuacaaaaaguguuauuuaggcaggggagcuuuguaguuucuuuagaagagccgcgaaugaaccaacgguagacugcgagcgcguucaaccuaau"
(introns, mRNA) = pre_mRNA_to_mRNA(pre_mRNA2.upper())
proteins = mRNA_to_proteins(mRNA)
proteins

['TEALIYSKSYASDFALEPKTTVAKQAIHDRKQAQLIIYGTR',
 'APNAIFLRPTIRTSI',
 'HESYDRNLGMHHRTLKY',
 'NNIGAPHLTNNCRRQTMLDSSSETSRTENLLNARRLHFFHSKPNALRLAHIAPRAYSSTG',
 'TLLYVPAD']

In [234]:
DNA = read_seq("datasets/covid.txt")
DNA

'ATTAAAGGTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGTGTGGCTGTCACTCGGCTGCATGCTTAGTGCACTCACGCAGTATAATTAATAACTAATTACTGTCGTTGACAGGACACGAGTAACTCGTCTATCTTCTGCAGGCTGCTTACGGTTTCGTCCGTGTTGCAGCCGATCATCAGCACATCTAGGTTTCGTCCGGGTGTGACCGAAAGGTAAGATGGAGAGCCTTGTCCCTGGTTTCAACGAGAAAACACACGTCCAACTCAGTTTGCCTGTTTTACAGGTTCGCGACGTGCTCGTACGTGGCTTTGGAGACTCCGTGGAGGAGGTCTTATCAGAGGCACGTCAACATCTTAAAGATGGCACTTGTGGCTTAGTAGAAGTTGAAAAAGGCGTTTTGCCTCAACTTGAACAGCCCTATGTGTTCATCAAACGTTCGGATGCTCGAACTGCACCTCATGGTCATGTTATGGTTGAGCTGGTAGCAGAACTCGAAGGCATTCAGTACGGTCGTAGTGGTGAGACACTTGGTGTCCTTGTCCCTCATGTGGGCGAAATACCAGTGGCTTACCGCAAGGTTCTTCTTCGTAAGAACGGTAATAAAGGAGCTGGTGGCCATAGTTACGGCGCCGATCTAAAGTCATTTGACTTAGGCGACGAGCTTGGCACTGATCCTTATGAAGATTTTCAAGAAAACTGGAACACTAAACATAGCAGTGGTGTTACCCGTGAACTCATGCGTGAGCTTAACGGAGGGGCATACACTCGCTATGTCGATAACAACTTCTGTGGCCCTGATGGCTACCCTCTTGAGTGCATTAAAGACCTTCTAGCACGTGCTGGTAAAGCTTCATGCACTTTGTCCGAACAACTGGACTTTATTGACACTAAGAGGGGTGTATACTGCTGCCGTGAACATGAGCATGAAATTGCTTGGTACACGGAACGTTC

In [227]:
pre_mRNA = DNA_to_pre_mRNA(DNA)
pre_mRNA

'UAAUUUCCAAAUAUGGAAGGGUCCAUUGUUUGGUUGGUUGAAAGCUAGAGAACAUCUAGACAAGAGAUUUGCUUGAAAUUUUAGACACACCGACAGUGAGCCGACGUACGAAUCACGUGAGUGCGUCAUAUUAAUUAUUGAUUAAUGACAGCAACUGUCCUGUGCUCAUUGAGCAGAUAGAAGACGUCCGACGAAUGCCAAAGCAGGCACAACGUCGGCUAGUAGUCGUGUAGAUCCAAAGCAGGCCCACACUGGCUUUCCAUUCUACCUCUCGGAACAGGGACCAAAGUUGCUCUUUUGUGUGCAGGUUGAGUCAAACGGACAAAAUGUCCAAGCGCUGCACGAGCAUGCACCGAAACCUCUGAGGCACCUCCUCCAGAAUAGUCUCCGUGCAGUUGUAGAAUUUCUACCGUGAACACCGAAUCAUCUUCAACUUUUUCCGCAAAACGGAGUUGAACUUGUCGGGAUACACAAGUAGUUUGCAAGCCUACGAGCUUGACGUGGAGUACCAGUACAAUACCAACUCGACCAUCGUCUUGAGCUUCCGUAAGUCAUGCCAGCAUCACCACUCUGUGAACCACAGGAACAGGGAGUACACCCGCUUUAUGGUCACCGAAUGGCGUUCCAAGAAGAAGCAUUCUUGCCAUUAUUUCCUCGACCACCGGUAUCAAUGCCGCGGCUAGAUUUCAGUAAACUGAAUCCGCUGCUCGAACCGUGACUAGGAAUACUUCUAAAAGUUCUUUUGACCUUGUGAUUUGUAUCGUCACCACAAUGGGCACUUGAGUACGCACUCGAAUUGCCUCCCCGUAUGUGAGCGAUACAGCUAUUGUUGAAGACACCGGGACUACCGAUGGGAGAACUCACGUAAUUUCUGGAAGAUCGUGCACGACCAUUUCGAAGUACGUGAAACAGGCUUGUUGACCUGAAAUAACUGUGAUUCUCCCCACAUAUGACGACGGCACUUGUACUCGUACUUUAACGAACCAUGUGCCUUGCAAG

In [228]:
(introns, mRNA) = pre_mRNA_to_mRNA(pre_mRNA)
print(introns)
print(mRNA)

['GUCCAUUGUUUGGUUGGUUGAAAG', 'GUGAG', 'GUACGAAUCACGUGAG', 'GUCAUAUUAAUUAUUGAUUAAUGACAG', 'GUCCUGUGCUCAUUGAG', 'GUCCGACGAAUGCCAAAG', 'GUCGGCUAG', 'GUCGUGUAG', 'GUUGCUCUUUUGUGUGCAG', 'GUUGAG', 'GUCCAAG', 'GUCUCCGUGCAG', 'GUAG', 'GUGAACACCGAAUCAUCUUCAACUUUUUCCGCAAAACGGAG', 'GUCGGGAUACACAAG', 'GUUUGCAAG', 'GUGGAG', 'GUACAAUACCAACUCGACCAUCGUCUUGAG', 'GUAAG', 'GUGAACCACAG', 'GUACACCCGCUUUAUGGUCACCGAAUGGCGUUCCAAG', 'GUAUCAAUGCCGCGGCUAG', 'GUAAACUGAAUCCGCUGCUCGAACCGUGACUAG', 'GUUCUUUUGACCUUGUGAUUUGUAUCGUCACCACAAUGGGCACUUGAG', 'GUAUGUGAG', 'GUUGAAG', 'GUAAUUUCUGGAAG', 'GUGCACGACCAUUUCGAAG', 'GUGAAACAG', 'GUUGACCUGAAAUAACUGUGAUUCUCCCCACAUAUGACGACGGCACUUGUACUCGUACUUUAACGAACCAUGUGCCUUGCAAG', 'GUCUGUGGAAAACUUUAAUUUAACCGUUUCUUUAAACUGUGGAAG', 'GUUUAAAACAUAAAG', 'GUAUUAG', 'GUUGGUUCCCAACUUUUCUUUUUCGAACUACCGAAAUACCCAUCUUAAG', 'GUCAACGCAG', 'GUUUACUUACGUUGGUUUACACGGAAAG', 'GUACUUCACACUAG', 'GUACCGUCUGCCCGCUAAAACAAUUUCGGUGAACGCUUAAAACACCGUGACUCUUAAACUGAUUUCUUCCACGGUGAUGAACACCAAUGAAUGGGGUUUUACGACAACAAUUUU

In [229]:
proteins = mRNA_to_proteins(mRNA)
proteins

['',
 'FPNMEG',
 'RTSRQEICLKF',
 'THRHRRTNHCQLR',
 'KTRHN',
 'SKAGPHWLSILPLGTGTKSNGQNRCTSMHRNL',
 'GTSSRILISTLNLSGYTICKPTSLTTTSSCQHHHSEQGKKHSCHYFLDHRFQNTSKIRTRIASPYRYSYYTGTTDGRTHIYAYFSRYLTYPLQGI',
 'VLITRQIVVYLRWFTRKITPL',
 'ILHPGLNGL',
 'PNF',
 'SIPTSRIQPTASRITNFMSFITTTEI',
 'ITSLAVIKPKRRRCSRKHL',
 'HFPNLIFL',
 'HH',
 'NFNL',
 'LRRNT',
 'TLRRAQHAKGNFDEHAQNLTLILNR',
 'LSTLPDSTEASPID',
 'TFGQELTELLFKFLPHLKESLPTL',
 'QFK',
 'TTLDTLNSNIYLKTETHD',
 'DMLSLTLRRFSGRISLFELLTTKSLFSATENLNYFPNYFMNYSSRDGYPSDTTPIFETLNEYHSNLTEKDGRSTPTSSSTNTSSSQTRYYTHTMTSTNGSIWKP',
 'TTTTRILFCRHSFFFGSDNFTYQNHQ',
 'ILNDYY',
 'CIDDYLVNFTHHQHKIRL',
 'RFNRVYNCFHFFEEFSRILLKLMNSTIKTTTGI',
 'ILKHIYDKHESTETIFRDTFEKPLLLTFPTLRISPSIRQLFRTQLLQCYLL',
 'FKECLLNNEI',
 'L',
 'LPNEQSL',
 'LERISFYEFSHKIDDTNMDDFSDHLPNLPM',
 'HLLRFCHEFFTFSRKI',
 '',
 '',
 'RLLFNLDKEP',
 'TLFTNTHL',
 'FR',
 'Y',
 'IIYFENNPYTLENKENEEEDFVDFL',
 'NNFEIFLTRIRPIILSTIFTYNDGWKLILKKLSSLLIIPQIYLLSVLPFCKIQNGLLLCTTPKTHDGSKDPSIL',
 'FTQR',
 'RNNYNRMTLFFNNLHHTF',
 'T

In [230]:
proteins = mRNA_to_proteins(pre_mRNA)

In [242]:
mRNA_to_proteins(DNA[266-1:13483+600].replace('T', 'U'))

['MESLVPGFNEKTHVQLSLPVLQVRDVLVRGFGDSVEEVLSEARQHLKDGTCGLVEVEKGVLPQLEQPYVFIKRSDARTAPHGHVMVELVAELEGIQYGRSGETLGVLVPHVGEIPVAYRKVLLRKNGNKGAGGHSYGADLKSFDLGDELGTDPYEDFQENWNTKHSSGVTRELMRELNGGAYTRYVDNNFCGPDGYPLECIKDLLARAGKASCTLSEQLDFIDTKRGVYCCREHEHEIAWYTERSEKSYELQTPFEIKLAKKFDTFNGECPNFVFPLNSIIKTIQPRVEKKKLDGFMGRIRSVYPVASPNECNQMCLSTLMKCDHCGETSWQTGDFVKATCEFCGTENLTKEGATTCGYLPQNAVVKIYCPACHNSEVGPEHSLAEYHNESGLKTILRKGGRTIAFGGCVFSYVGCHNKCAYWVPRASANIGCNHTGVVGEGSEGLNDNLLEILQKEKVNINIVGDFKLNEEIAIILASFSASTSAFVETVKGLDYKAFKQIVESCGNFKVTKGKAKKGAWNIGEQKSILSPLYAFASEAARVVRSIFSRTLETAQNSVRVLQKAAITILDGISQYSLRLIDAMMFTSDLATNNLVVMAYITGGVVQLTSQWLTNIFGTVYEKLKPVLDWLEEKFKEGVEFLRDGWEIVKFISTCACEIVGGQIVTCAKEIKESVQTFFKLVNKFLALCADSIIIGGAKLKALNLGETFVTHSKGLYRKCVKSREETGLLMPLKAPKEIIFLEGETLPTEVLTEEVVLKTGDLQPLEQPTSEAVEAPLVGTPVCINGLMLLEIKDTEKYCALAPNMMVTNNTFTLKGGAPTKVTFGDDTVIEVQGYKSVNITFELDERIDKVLNEKCSAYTVELGTEVNEFACVVADAVIKTLQPVSELLTPLGIDLDEWSMATYYLFDESGEFKLASHMYCSFYPPDEDEEEGDCEEEEFEPSTQYEYGTEDDYQGKPLEFGATSAALQPEEEQEEDWLDDDSQQTVGQQDGSEDNQ

In [232]:
mRNA_to_proteins(DNA[26523-1:27191].replace('T', 'U'))

['MADSNGTITVEELKKLLEQWNLVIGFLFLTWICLLQFAYANRNRFLYIIKLIFLWLLWPVTLACFVLAAVYRINWITGGIAIAMACLVGLMWLSYFIASFRLFARTRSMWSFNPETNILLNVPLHGTILTRPLLESELVIGAVILRGHLRIAGHHLGRCDIKDLPKEITVATSRTLSYYKLGASQRVAGDSGFAAYSRYRIGNYKLNTDHSSSSDNIALLVQ']

In [233]:
DNA[26523-1:27191].replace('T', 'U')

'AUGGCAGAUUCCAACGGUACUAUUACCGUUGAAGAGCUUAAAAAGCUCCUUGAACAAUGGAACCUAGUAAUAGGUUUCCUAUUCCUUACAUGGAUUUGUCUUCUACAAUUUGCCUAUGCCAACAGGAAUAGGUUUUUGUAUAUAAUUAAGUUAAUUUUCCUCUGGCUGUUAUGGCCAGUAACUUUAGCUUGUUUUGUGCUUGCUGCUGUUUACAGAAUAAAUUGGAUCACCGGUGGAAUUGCUAUCGCAAUGGCUUGUCUUGUAGGCUUGAUGUGGCUCAGCUACUUCAUUGCUUCUUUCAGACUGUUUGCGCGUACGCGUUCCAUGUGGUCAUUCAAUCCAGAAACUAACAUUCUUCUCAACGUGCCACUCCAUGGCACUAUUCUGACCAGACCGCUUCUAGAAAGUGAACUCGUAAUCGGAGCUGUGAUCCUUCGUGGACAUCUUCGUAUUGCUGGACACCAUCUAGGACGCUGUGACAUCAAGGACCUGCCUAAAGAAAUCACUGUUGCUACAUCACGAACGCUUUCUUAUUACAAAUUGGGAGCUUCGCAGCGUGUAGCAGGUGACUCAGGUUUUGCUGCAUACAGUCGCUACAGGAUUGGCAACUAUAAAUUAAACACAGACCAUUCCAGUAGCAGUGACAAUAUUGCUUUGCUUGUACAGUAA'

In [235]:
DNA.replace('T', 'U')

'AUUAAAGGUUUAUACCUUCCCAGGUAACAAACCAACCAACUUUCGAUCUCUUGUAGAUCUGUUCUCUAAACGAACUUUAAAAUCUGUGUGGCUGUCACUCGGCUGCAUGCUUAGUGCACUCACGCAGUAUAAUUAAUAACUAAUUACUGUCGUUGACAGGACACGAGUAACUCGUCUAUCUUCUGCAGGCUGCUUACGGUUUCGUCCGUGUUGCAGCCGAUCAUCAGCACAUCUAGGUUUCGUCCGGGUGUGACCGAAAGGUAAGAUGGAGAGCCUUGUCCCUGGUUUCAACGAGAAAACACACGUCCAACUCAGUUUGCCUGUUUUACAGGUUCGCGACGUGCUCGUACGUGGCUUUGGAGACUCCGUGGAGGAGGUCUUAUCAGAGGCACGUCAACAUCUUAAAGAUGGCACUUGUGGCUUAGUAGAAGUUGAAAAAGGCGUUUUGCCUCAACUUGAACAGCCCUAUGUGUUCAUCAAACGUUCGGAUGCUCGAACUGCACCUCAUGGUCAUGUUAUGGUUGAGCUGGUAGCAGAACUCGAAGGCAUUCAGUACGGUCGUAGUGGUGAGACACUUGGUGUCCUUGUCCCUCAUGUGGGCGAAAUACCAGUGGCUUACCGCAAGGUUCUUCUUCGUAAGAACGGUAAUAAAGGAGCUGGUGGCCAUAGUUACGGCGCCGAUCUAAAGUCAUUUGACUUAGGCGACGAGCUUGGCACUGAUCCUUAUGAAGAUUUUCAAGAAAACUGGAACACUAAACAUAGCAGUGGUGUUACCCGUGAACUCAUGCGUGAGCUUAACGGAGGGGCAUACACUCGCUAUGUCGAUAACAACUUCUGUGGCCCUGAUGGCUACCCUCUUGAGUGCAUUAAAGACCUUCUAGCACGUGCUGGUAAAGCUUCAUGCACUUUGUCCGAACAACUGGACUUUAUUGACACUAAGAGGGGUGUAUACUGCUGCCGUGAACAUGAGCAUGAAAUUGCUUGGUACACGGAACGUUC

In [252]:
RNA = DNA.replace('T', 'U')

65

In [295]:
def is_stop(codon):
    stop_seqs = ["UAA", "UAG", "UGA"]
    return codon in stop_seqs

def find_first_valid_protein(RNA):
    start_idx = 0
    last_codon = ""
    while not is_stop(last_codon):
        start_idx = find_start_codon(RNA[start_idx:])
        last_codon = RNA[start_idx-3:start_idx]
    return start_idx

find_first_valid_protein(RNA)

KeyboardInterrupt: 

In [294]:
start_idx = 265
RNA[start_idx-3:start_idx+3]

'AAGAUG'

In [253]:
find_start_codon(RNA[0:])

106

In [270]:
find_start_codon(RNA[106+3:])

156

In [258]:
107+158

265

In [260]:
159/3

53.0

In [269]:
RNA[106:268]

'AUGCUUAGUGCACUCACGCAGUAUAAUUAAUAACUAAUUACUGUCGUUGACAGGACACGAGUAACUCGUCUAUCUUCUGCAGGCUGCUUACGGUUUCGUCCGUGUUGCAGCCGAUCAUCAGCACAUCUAGGUUUCGUCCGGGUGUGACCGAAAGGUAAGAUG'

In [266]:
len("CUUAGUGCACUCACGCAGUAUAAUUAAUAACUAAUUACUGUCGUUGACAGGACACGAGUAACUCGUCUAUCUUCUGCAGGCUGCUUACGGUUUCGUCCGUGUUGCAGCCGAUCAUCAGCACAUCUAGGUUUCGUCCGGGUGUGACCGAAAGGUAAG")

156

In [267]:
156/3

52.0

In [289]:
RNA[265:268]

'AUG'