https://www.ncbi.nlm.nih.gov/nuccore/NC_045512

In [216]:
import re

In [7]:
def read_seq(inputfile): 
    with open(inputfile, "r") as f: 
        seq = f.read() 
    seq = seq.replace("\n", "") 
    seq = seq.replace("\r", "") 
    return seq 

In [218]:
def DNA_to_pre_mRNA(DNA):
    table = {
        'C': 'G', 'G': 'C', 'A': 'U', 'T': 'A'
    }
    
    pre_mRNA = ""
    for base in DNA:
        pre_mRNA += table[base]
    
    return pre_mRNA

In [219]:
def pre_mRNA_to_mRNA(pre_mRNA):
    introns = []
    exon = pre_mRNA
    regex = r"GU(?:\w{0,}?)AG" 
    introns = re.findall(regex, pre_mRNA)

    for intron in introns: 
        exon = exon.replace(intron, "") 
    """
    found_G = False
    is_intron = False
    found_GU_A = False
    for base in pre_mRNA:
        if base == 'G':
            found_G = True
        if base == 'U' and found_G:
            is_intron = True
        if is_intron and base == 'A':
            found_GU_A = True
        if found_GU_A and base == 'G':
            is_intron = False
        if is_intron:
            intron += base
        else:
            exon += base
    """

    return introns, exon

In [73]:
def is_minus1_ribosomal_frameshift(buf):
    # detect X_XXY_YYZ pattern
    if len(buf) < 9:
        return False
    seq = buf[2:9]
    X = seq[0]
    Y = seq[3]
    Z = seq[6]
    # check X Y Z are all different between each other
    if len(list(set([X, Y, Z]))) != 3:
        return False
    pattern = seq.replace(X, 'X').replace(Y, 'Y').replace(Z, 'Z')
    return pattern == 'XXXYYYZ'

def mRNA_to_proteins(mRNA):
    table = {
        "UUU":"F", "UUC":"F", "UUA":"L", "UUG":"L",
        "UCU":"S", "UCC":"S", "UCA":"S", "UCG":"S",
        "UAU":"Y", "UAC":"Y", "UAA":"-", "UAG":"-",
        "UGU":"C", "UGC":"C", "UGA":"-", "UGG":"W",
        "CUU":"L", "CUC":"L", "CUA":"L", "CUG":"L",
        "CCU":"P", "CCC":"P", "CCA":"P", "CCG":"P",
        "CAU":"H", "CAC":"H", "CAA":"Q", "CAG":"Q",
        "CGU":"R", "CGC":"R", "CGA":"R", "CGG":"R",
        "AUU":"I", "AUC":"I", "AUA":"I", "AUG":"M",
        "ACU":"T", "ACC":"T", "ACA":"T", "ACG":"T",
        "AAU":"N", "AAC":"N", "AAA":"K", "AAG":"K",
        "AGU":"S", "AGC":"S", "AGA":"R", "AGG":"R",
        "GUU":"V", "GUC":"V", "GUA":"V", "GUG":"V",
        "GCU":"A", "GCC":"A", "GCA":"A", "GCG":"A",
        "GAU":"D", "GAC":"D", "GAA":"E", "GAG":"E",
        "GGU":"G", "GGC":"G", "GGA":"G", "GGG":"G",
    }

    protein = ""
    proteins = []
    minus1_ribosomal_frameshift_ring_buffer = ""
    
    start_idx = find_first_valid_protein(mRNA)
    
    for i in range(start_idx, len(mRNA)-(len(mRNA)%3), 3):
        codon_source = mRNA[i:i+3]
        if len(minus1_ribosomal_frameshift_ring_buffer) == 9:
            minus1_ribosomal_frameshift_ring_buffer = minus1_ribosomal_frameshift_ring_buffer[3:] + codon_source
        else:
            minus1_ribosomal_frameshift_ring_buffer += codon_source
        if is_minus1_ribosomal_frameshift(minus1_ribosomal_frameshift_ring_buffer):
            print("detected -1 ribosomal frameshift at index: {0}".format(i))
            #i -= 1
        codon = table[codon_source]
        if codon == "-":
            proteins.append(protein)
            protein = ""
        else:
            protein += codon
    return proteins

In [40]:
"""
https://en.wikipedia.org/wiki/Start_codon
The most common start codon is AUG (i.e., ATG in the corresponding DNA sequence).
An alternative start codon sequence, such as GUG or UUG, 
may commence translation sequence if the AUG codon is unavailable.

If truncated (no stop sequence before next AUG) it will produce a non functional protein.
the next frame starting by AUG will be considered therefore as the starting sequence
"""
def find_start_codon(mRNA):
    for i in range(0, len(mRNA)-(len(mRNA)%3), 1):
        codon = mRNA[i:i+3]
        if codon == "AUG":
            return i

def is_stop(codon):
    stop_seqs = ["UAA", "UAG", "UGA"]
    return codon in stop_seqs

def find_first_valid_protein(RNA):
    start_idx = 0
    total_idx = 0
    last_codon = ""
    while not is_stop(last_codon):
        start_idx = find_start_codon(RNA[start_idx:]) + 3
        total_idx += start_idx
        last_codon = RNA[start_idx-6:start_idx-3]
    return total_idx - 3

In [221]:
DNA = read_seq("datasets/NM_207618.txt")
DNA

'GGTCAGAAAAAGCCCTCTCCATGTCTACTCACGATACATCCCTGAAAACCACTGAGGAAGTGGCTTTTCAGATCATCTTGCTTTGCCAGTTTGGGGTTGGGACTTTTGCCAATGTATTTCTCTTTGTCTATAATTTCTCTCCAATCTCGACTGGTTCTAAACAGAGGCCCAGACAAGTGATTTTAAGACACATGGCTGTGGCCAATGCCTTAACTCTCTTCCTCACTATATTTCCAAACAACATGATGACTTTTGCTCCAATTATTCCTCAAACTGACCTCAAATGTAAATTAGAATTCTTCACTCGCCTCGTGGCAAGAAGCACAAACTTGTGTTCAACTTGTGTTCTGAGTATCCATCAGTTTGTCACACTTGTTCCTGTTAATTCAGGTAAAGGAATACTCAGAGCAAGTGTCACAAACATGGCAAGTTATTCTTGTTACAGTTGTTGGTTCTTCAGTGTCTTAAATAACATCTACATTCCAATTAAGGTCACTGGTCCACAGTTAACAGACAATAACAATAACTCTAAAAGCAAGTTGTTCTGTTCCACTTCTGATTTCAGTGTAGGCATTGTCTTCTTGAGGTTTGCCCATGATGCCACATTCATGAGCATCATGGTCTGGACCAGTGTCTCCATGGTACTTCTCCTCCATAGACATTGTCAGAGAATGCAGTACATATTCACTCTCAATCAGGACCCCAGGGGCCAAGCAGAGACCACAGCAACCCATACTATCCTGATGCTGGTAGTCACATTTGTTGGCTTTTATCTTCTAAGTCTTATTTGTATCATCTTTTACACCTATTTTATATATTCTCATCATTCCCTGAGGCATTGCAATGACATTTTGGTTTCGGGTTTCCCTACAATTTCTCCTTTACTGTTGACCTTCAGAGACCCTAAGGGTCCTTGTTCTGTGTTCTTCAACTGTTGAAAGCCAGAGTCACTAAAAATGCCAAACACAGAAGACAGCTTTGCTAATACCATTAAATACT

In [222]:
pre_mRNA = DNA_to_pre_mRNA(DNA)
pre_mRNA

'CCAGUCUUUUUCGGGAGAGGUACAGAUGAGUGCUAUGUAGGGACUUUUGGUGACUCCUUCACCGAAAAGUCUAGUAGAACGAAACGGUCAAACCCCAACCCUGAAAACGGUUACAUAAAGAGAAACAGAUAUUAAAGAGAGGUUAGAGCUGACCAAGAUUUGUCUCCGGGUCUGUUCACUAAAAUUCUGUGUACCGACACCGGUUACGGAAUUGAGAGAAGGAGUGAUAUAAAGGUUUGUUGUACUACUGAAAACGAGGUUAAUAAGGAGUUUGACUGGAGUUUACAUUUAAUCUUAAGAAGUGAGCGGAGCACCGUUCUUCGUGUUUGAACACAAGUUGAACACAAGACUCAUAGGUAGUCAAACAGUGUGAACAAGGACAAUUAAGUCCAUUUCCUUAUGAGUCUCGUUCACAGUGUUUGUACCGUUCAAUAAGAACAAUGUCAACAACCAAGAAGUCACAGAAUUUAUUGUAGAUGUAAGGUUAAUUCCAGUGACCAGGUGUCAAUUGUCUGUUAUUGUUAUUGAGAUUUUCGUUCAACAAGACAAGGUGAAGACUAAAGUCACAUCCGUAACAGAAGAACUCCAAACGGGUACUACGGUGUAAGUACUCGUAGUACCAGACCUGGUCACAGAGGUACCAUGAAGAGGAGGUAUCUGUAACAGUCUCUUACGUCAUGUAUAAGUGAGAGUUAGUCCUGGGGUCCCCGGUUCGUCUCUGGUGUCGUUGGGUAUGAUAGGACUACGACCAUCAGUGUAAACAACCGAAAAUAGAAGAUUCAGAAUAAACAUAGUAGAAAAUGUGGAUAAAAUAUAUAAGAGUAGUAAGGGACUCCGUAACGUUACUGUAAAACCAAAGCCCAAAGGGAUGUUAAAGAGGAAAUGACAACUGGAAGUCUCUGGGAUUCCCAGGAACAAGACACAAGAAGUUGACAACUUUCGGUCUCAGUGAUUUUUACGGUUUGUGUCUUCUGUCGAAACGAUUAUGGUAAUUUAUGA

In [223]:
(introns, mRNA) = pre_mRNA_to_mRNA(pre_mRNA)
print(introns)
print(mRNA)

['GUCUUUUUCGGGAG', 'GUACAG', 'GUGCUAUGUAG', 'GUGACUCCUUCACCGAAAAG', 'GUAG', 'GUCAAACCCCAACCCUGAAAACGGUUACAUAAAG', 'GUUAG', 'GUCUCCGGGUCUGUUCACUAAAAUUCUGUGUACCGACACCGGUUACGGAAUUGAG', 'GUGAUAUAAAG', 'GUUUGUUGUACUACUGAAAACGAG', 'GUUAAUAAG', 'GUUUGACUGGAG', 'GUGAG', 'GUUCUUCGUGUUUGAACACAAG', 'GUAG', 'GUGUGAACAAG', 'GUCCAUUUCCUUAUGAG', 'GUUCACAG', 'GUUUGUACCGUUCAAUAAG', 'GUCAACAACCAAG', 'GUCACAG', 'GUAG', 'GUAAG', 'GUUAAUUCCAG', 'GUGUCAAUUGUCUGUUAUUGUUAUUGAG', 'GUUCAACAAG', 'GUGAAG', 'GUCACAUCCGUAACAG', 'GUACUACGGUGUAAG', 'GUAG', 'GUCACAG', 'GUACCAUGAAG', 'GUAUCUGUAACAG', 'GUCAUGUAUAAG', 'GUUAG', 'GUCCCCGGUUCGUCUCUGGUGUCGUUGGGUAUGAUAG', 'GUGUAAACAACCGAAAAUAG', 'GUAG', 'GUGGAUAAAAUAUAUAAG', 'GUAG', 'GUAACGUUACUGUAAAACCAAAG', 'GUUAAAG', 'GUCUCUGGGAUUCCCAG', 'GUUGACAACUUUCGGUCUCAG', 'GUUUGUGUCUUCUGUCGAAACGAUUAUGGUAAUUUAUGAAAUAAG', 'GUAUUUAUACAAAAAUUUUCGAACAUACUUGUUCCAUACCACGAG', 'GUGAACAACUAUACUUUUCUAAAG', 'GUCACUCAAUAAG', 'GUGGGAG', 'GUAAG']
CCAAGAUGAGGACUUUUGUCUAAACGAAACGAGAAACAGAUAUUAAAGAGA

In [224]:
proteins = mRNA_to_proteins(mRNA)
proteins

['PR',
 'GLLSKRNEKQILKRELTKI',
 'KEIYI',
 'S',
 'ERSTLNTRLIVKQTINL',
 'Q',
 'NLLLTRFSQD',
 'KELQTGTTVTLPDLRGVSYVMYKSWGLRPSKIQNKHKNIRDSPKGIGNDNWKNKTQE',
 'FLRDDMNIFSFQY',
 'PTLD']

In [225]:
pre_mRNA2 = "acguccgcaagagaagccuuaauauauucaaaaagcuacgccucagauuucgcgcucgagcccaaaacaacugguguacggguugaucacaucaaaugaagucgcuaaagucggugaucucacuauccuugucuucggcuuuugcucucucggcuaucaucuaagcaggcgaguuccauggugaccggaacgacggcuacuggaguccaugaucgcaagcgucgggcugggguaaaagaggcucagcucauaauaguccgccccaccaguacgggacucgauaggccccgucguugccguagaaacgcaauuuuccucagacccacuauacgcaccucgauuuagcaugguuccgggguugcgcuuugagaaucauacguaaggaucggaaccuaggaaugcaccacagaacuuugaaauacuagaacaaguugauugacaacggaguaucggcgccccacauuuaacgaauaauugcaggcgccagacgaugcuaggugcguccguaucaagauucgaggucgcuacuggcuucgcuugccgaucgagcucagaguuugugagaguuguuacuaauugcguggucgccuaauauccuugauacuacguggguguacuagacaucccggacagaaaaucucuuaaacgcuagaguucucuuggaagcgccugcacuucuugugaacauacgaugauagccacucuaagcccaacgcacuucgcuuggcccacauugcccccagagcuuauucaucgacaggcguuccacucuuggauucaucaguaaacuuuauuauacgugguaagcgugcuuauagcugucggaaucucacuuaggcggauugaagugagacagccugaaaguaaccguguacaggcgccgucaauguguuuugagugugcaccuacaaaaaguguuauuuaggcaggggagcuuuguaguuucuuuagaagagccgcgaaugaaccaacgguagacugcgagcgcguucaaccuaau"
(introns, mRNA) = pre_mRNA_to_mRNA(pre_mRNA2.upper())
proteins = mRNA_to_proteins(mRNA)
proteins

['TEALIYSKSYASDFALEPKTTVAKQAIHDRKQAQLIIYGTR',
 'APNAIFLRPTIRTSI',
 'HESYDRNLGMHHRTLKY',
 'NNIGAPHLTNNCRRQTMLDSSSETSRTENLLNARRLHFFHSKPNALRLAHIAPRAYSSTG',
 'TLLYVPAD']

In [8]:
DNA = read_seq("datasets/covid.txt")
DNA

'ATTAAAGGTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGTGTGGCTGTCACTCGGCTGCATGCTTAGTGCACTCACGCAGTATAATTAATAACTAATTACTGTCGTTGACAGGACACGAGTAACTCGTCTATCTTCTGCAGGCTGCTTACGGTTTCGTCCGTGTTGCAGCCGATCATCAGCACATCTAGGTTTCGTCCGGGTGTGACCGAAAGGTAAGATGGAGAGCCTTGTCCCTGGTTTCAACGAGAAAACACACGTCCAACTCAGTTTGCCTGTTTTACAGGTTCGCGACGTGCTCGTACGTGGCTTTGGAGACTCCGTGGAGGAGGTCTTATCAGAGGCACGTCAACATCTTAAAGATGGCACTTGTGGCTTAGTAGAAGTTGAAAAAGGCGTTTTGCCTCAACTTGAACAGCCCTATGTGTTCATCAAACGTTCGGATGCTCGAACTGCACCTCATGGTCATGTTATGGTTGAGCTGGTAGCAGAACTCGAAGGCATTCAGTACGGTCGTAGTGGTGAGACACTTGGTGTCCTTGTCCCTCATGTGGGCGAAATACCAGTGGCTTACCGCAAGGTTCTTCTTCGTAAGAACGGTAATAAAGGAGCTGGTGGCCATAGTTACGGCGCCGATCTAAAGTCATTTGACTTAGGCGACGAGCTTGGCACTGATCCTTATGAAGATTTTCAAGAAAACTGGAACACTAAACATAGCAGTGGTGTTACCCGTGAACTCATGCGTGAGCTTAACGGAGGGGCATACACTCGCTATGTCGATAACAACTTCTGTGGCCCTGATGGCTACCCTCTTGAGTGCATTAAAGACCTTCTAGCACGTGCTGGTAAAGCTTCATGCACTTTGTCCGAACAACTGGACTTTATTGACACTAAGAGGGGTGTATACTGCTGCCGTGAACATGAGCATGAAATTGCTTGGTACACGGAACGTTC

In [9]:
RNA = DNA.replace('T', 'U')

In [31]:
start_idx = find_first_valid_protein(RNA)
start_idx

265

In [72]:
mRNA_to_proteins(RNA)

detected -1 ribosomal frameshift at index: 3625
detected -1 ribosomal frameshift at index: 4255
detected -1 ribosomal frameshift at index: 4264
detected -1 ribosomal frameshift at index: 4639
detected -1 ribosomal frameshift at index: 6088
detected -1 ribosomal frameshift at index: 6748
detected -1 ribosomal frameshift at index: 13465
detected -1 ribosomal frameshift at index: 14674
detected -1 ribosomal frameshift at index: 16672
detected -1 ribosomal frameshift at index: 18064
detected -1 ribosomal frameshift at index: 18478
detected -1 ribosomal frameshift at index: 20230
detected -1 ribosomal frameshift at index: 21124
detected -1 ribosomal frameshift at index: 21406
detected -1 ribosomal frameshift at index: 24439
detected -1 ribosomal frameshift at index: 24697
detected -1 ribosomal frameshift at index: 27268
detected -1 ribosomal frameshift at index: 27553
detected -1 ribosomal frameshift at index: 29308


['MESLVPGFNEKTHVQLSLPVLQVRDVLVRGFGDSVEEVLSEARQHLKDGTCGLVEVEKGVLPQLEQPYVFIKRSDARTAPHGHVMVELVAELEGIQYGRSGETLGVLVPHVGEIPVAYRKVLLRKNGNKGAGGHSYGADLKSFDLGDELGTDPYEDFQENWNTKHSSGVTRELMRELNGGAYTRYVDNNFCGPDGYPLECIKDLLARAGKASCTLSEQLDFIDTKRGVYCCREHEHEIAWYTERSEKSYELQTPFEIKLAKKFDTFNGECPNFVFPLNSIIKTIQPRVEKKKLDGFMGRIRSVYPVASPNECNQMCLSTLMKCDHCGETSWQTGDFVKATCEFCGTENLTKEGATTCGYLPQNAVVKIYCPACHNSEVGPEHSLAEYHNESGLKTILRKGGRTIAFGGCVFSYVGCHNKCAYWVPRASANIGCNHTGVVGEGSEGLNDNLLEILQKEKVNINIVGDFKLNEEIAIILASFSASTSAFVETVKGLDYKAFKQIVESCGNFKVTKGKAKKGAWNIGEQKSILSPLYAFASEAARVVRSIFSRTLETAQNSVRVLQKAAITILDGISQYSLRLIDAMMFTSDLATNNLVVMAYITGGVVQLTSQWLTNIFGTVYEKLKPVLDWLEEKFKEGVEFLRDGWEIVKFISTCACEIVGGQIVTCAKEIKESVQTFFKLVNKFLALCADSIIIGGAKLKALNLGETFVTHSKGLYRKCVKSREETGLLMPLKAPKEIIFLEGETLPTEVLTEEVVLKTGDLQPLEQPTSEAVEAPLVGTPVCINGLMLLEIKDTEKYCALAPNMMVTNNTFTLKGGAPTKVTFGDDTVIEVQGYKSVNITFELDERIDKVLNEKCSAYTVELGTEVNEFACVVADAVIKTLQPVSELLTPLGIDLDEWSMATYYLFDESGEFKLASHMYCSFYPPDEDEEEGDCEEEEFEPSTQYEYGTEDDYQGKPLEFGATSAALQPEEEQEEDWLDDDSQQTVGQQDGSEDNQ

In [53]:
mRNA_to_proteins(RNA[13476-1:13503-1])
RNA[13476-1:13503-1]

'CGGUGUAAGUGCAGCCCGUCUUACACC'

In [49]:
#X_XXY_YYZ
"0123456789abcdef"[2:9]

'2345678'

In [54]:
RNA[266-1+4398*3:13468]

'UUUUUAAAC'

In [55]:
mRNA_to_proteins(RNA[266-1+4398*3:13468])

[]