In [53]:
# returns list with indexes of start and stop codons
def find_start_stop_codons(seq):
    """Scan *seq* codon by codon in a single reading frame.

    The sequence is lower-cased before matching, so the scan is
    case-insensitive. Triplets are read at positions 0, 3, 6, ...; a
    trailing remainder shorter than three characters is ignored.

    Returns a list of two-element lists in order of appearance:
    ``[1, i]`` for a start codon whose first base is at index ``i``, and
    ``[-1, j]`` for a stop codon where ``j`` is the index just past its
    last base (so ``seq[i:j]`` includes the stop codon).
    """
    lowered = seq.lower()
    start_codons = ("atg",)
    stop_codons = ("taa", "tag", "tga")
    found = []
    for pos in range(0, len(lowered) - 2, 3):
        triplet = lowered[pos:pos + 3]
        if triplet in stop_codons:
            # -1 marks a stop codon; store the exclusive end index
            found.append([-1, pos + 3])
        elif triplet in start_codons:
            # +1 marks a start codon; store the index of its first base
            found.append([1, pos])
    return found

# returns the start/stop codons for the three frames
def find_frames(seq):
    """Collect start/stop codon markers for the three forward reading frames.

    Returns a 3-tuple whose element ``k`` is the result of
    ``find_start_stop_codons`` applied to ``seq[k:]``, i.e. the sequence
    shifted by 0, 1 and 2 bases. Indexes inside each element are relative
    to the shifted sequence, not to the original ``seq``.
    """
    return tuple(find_start_stop_codons(seq[shift:]) for shift in range(3))
        
# find the longest subsequence that is an ORF
def find_longest_orf(seq):
    """Return the longest open reading frame (ORF) found in *seq*.

    An ORF here is the span from a start codon up to and including the
    next stop codon in the same reading frame. All three forward frames
    reported by ``find_frames`` are searched. When several ORFs share the
    maximum length, the first one encountered wins (lowest frame offset,
    then earliest position in that frame).

    The returned text is sliced from the original ``seq``, so its letter
    case is preserved. An empty string is returned when no frame contains
    a start codon followed by a stop codon.
    """
    longest = ""
    # there are only 3 frames, so the nested loop stays linear in len(seq)
    for offset, frame in enumerate(find_frames(seq)):
        # The open start is tracked per frame: a start codon left without
        # a stop in one frame must never pair with a stop from another.
        open_start = None
        for kind, pos in frame:
            if kind == 1:
                # Only the first start after a stop opens an ORF; later
                # start codons before the stop lie inside the same ORF.
                if open_start is None:
                    open_start = pos
            elif kind == -1 and open_start is not None:
                # Codon positions are relative to the shifted frame, so
                # add the frame offset to index into the original seq.
                if pos - open_start > len(longest):
                    longest = seq[offset + open_start:offset + pos]
                # a stop codon always closes the ORF currently open
                open_start = None
    return longest
        
# Demo: run the ORF search on a sample sequence and print the result.
seq = "caatgaaacccaaatagcccaaaatgaaacccaaacccaaacccaaaccctagaaacccatgaaaccctga"
print(find_longest_orf(seq))

atgaaacccaaacccaaacccaaaccctag
