# Translation Answer

In [20]:
import re
import os

# Codon -> one-letter amino-acid lookup ('*' marks a stop codon).
# Codons are enumerated with T, C, A, G cycling fastest in the third position,
# and the amino-acid string is laid out in that same order.
codon_table = {
    first + second + third: amino_acid
    for (first, second, third), amino_acid in zip(
        ((x, y, z) for x in "TCAG" for y in "TCAG" for z in "TCAG"),
        'FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG',
    )
}
# Small hand-checkable sequence: one ATG followed by an in-frame TAA stop.
test_seq = "AAACCATGCCCCCCGGGAAATAACCC"

def find_orfs(seq):
    """Find every open reading frame (ATG ... in-frame stop codon) in ``seq``.

    Each occurrence of ``ATG`` is treated as a candidate start. From there the
    sequence is read codon by codon until the first TAA, TAG or TGA.

    Args:
        seq: Nucleotide sequence as a string of upper-case bases.

    Returns:
        A list of ``(start, stop)`` tuples using 1-based, inclusive
        coordinates: ``start`` is the position of the ``A`` of the start codon
        and ``stop`` is the position of the last base of the stop codon.
        Start codons that never reach an in-frame stop codon are skipped.
    """
    stops = {"TAA", "TAG", "TGA"}
    start_ends = []
    for match in re.finditer("ATG", seq):
        start_codon_pos = match.start()
        # Only complete codons are inspected: the last usable codon begins at
        # len(seq) - 3.
        for codon_start in range(start_codon_pos, len(seq) - 2, 3):
            if seq[codon_start:codon_start + 3] in stops:
                # Record the ORF only once a stop codon is actually found, so
                # an unterminated frame can never reuse a stale stop position.
                start_ends.append((start_codon_pos + 1, codon_start + 3))
                break
    return start_ends


def translate(seq, start, stop, codons):
    """Translate ``codons`` consecutive codons of ``seq`` into amino acids.

    Args:
        seq: Nucleotide sequence; each 3-base slice is looked up in the
            module-level ``codon_table``.
        start: 1-based position of the first base of the first codon.
        stop: Not used by the translation; kept so existing callers that pass
            the ORF end position keep working.
        codons: Number of codons to translate, counted from ``start``.

    Returns:
        The protein as a string of one-letter amino-acid codes.

    Raises:
        KeyError: If a 3-base slice is not a key of ``codon_table`` (for
            example when the requested range runs past the end of ``seq``).
    """
    first_base = start - 1  # convert the 1-based position to a 0-based index
    return "".join(
        codon_table[seq[pos:pos + 3]]
        for pos in range(first_base, first_base + 3 * codons, 3)
    )
    

def find_longest_orf(orf_list):
    """Pick the longest ORF from a list of ``(start, stop)`` positions.

    Args:
        orf_list: Iterable of ``(start, stop)`` tuples with inclusive
            coordinates, so an ORF spans ``stop - start + 1`` bases.

    Returns:
        A tuple ``(start, stop, codons)`` for the longest ORF, where
        ``codons`` is the number of whole codons in the span minus one (the
        final codon is excluded from the count). When several ORFs share the
        maximum length, the first one in ``orf_list`` wins.

    Raises:
        ValueError: If ``orf_list`` contains no ORF of positive length.
    """
    longest_len = 0
    longest_pos = None
    for orf_pos in orf_list:
        orf_len = orf_pos[1] - orf_pos[0] + 1
        # Strict comparison keeps the earliest ORF on ties.
        if orf_len > longest_len:
            longest_len, longest_pos = orf_len, orf_pos
    if longest_pos is None:
        raise ValueError("orf_list contains no ORF of positive length")
    # Integer arithmetic; clamped at zero so spans shorter than two codons
    # yield a count of 0 rather than a negative number.
    codons = max(longest_len // 3 - 1, 0)
    return longest_pos[0], longest_pos[1], codons
    
    
def read_file(filepath):
    """Read a FASTA-style file and return its sequence as one string.

    Lines that begin with ``>`` are treated as headers and skipped; every
    other line is stripped of surrounding whitespace and the pieces are
    concatenated in file order.
    """
    with open(filepath) as handle:
        return "".join(
            row.strip() for row in handle if not row.startswith(">")
        )


def main(folder="files", out_file="answers.csv"):
    """Translate the longest ORF of each numbered FASTA file and save the answers.

    The ``.fasta`` files in ``folder`` are counted and then processed as
    ``1.fasta``, ``2.fasta``, ... up to that count, so the files are expected
    to be numbered consecutively from 1. For each file the first five amino
    acids of the longest ORF are printed and written to ``out_file``, one
    line per input file in numeric order.

    Args:
        folder: Directory that holds the numbered ``.fasta`` files.
        out_file: Path of the text file that receives one answer per line.
    """
    num_of_files = sum(
        1 for filename in os.listdir(folder) if filename.endswith(".fasta")
    )
    aa_list = []
    for filenum in range(1, num_of_files + 1):
        filepath = os.path.join(folder, str(filenum) + ".fasta")
        seq = read_file(filepath)
        orf_pos = find_orfs(seq)
        if orf_pos:
            start, stop, codons = find_longest_orf(orf_pos)
            aa_seq = translate(seq, start, stop, codons)
        else:
            # No ORF in this sequence: keep an empty answer so the output
            # lines stay aligned with the file numbers.
            aa_seq = ""
        print(filepath, aa_seq[:5])
        aa_list.append(aa_seq)
    with open(out_file, "w") as f:
        f.writelines("{}\n".format(aa_seq[:5]) for aa_seq in aa_list)
    print("done")


# Run only when executed as a script or notebook cell, not when imported.
if __name__ == "__main__":
    main()

files\1.fasta MSHGN
files\2.fasta MSLLV
files\3.fasta MKFTE
files\4.fasta MPNYA
files\5.fasta MGNPE
files\6.fasta MYSGH
files\7.fasta MFRCF
files\8.fasta MSIPS
files\9.fasta MVRAV
files\10.fasta MVRYA
files\11.fasta MEDID
files\12.fasta MVSSE
files\13.fasta MRDGA
files\14.fasta MFPTS
files\15.fasta MLSSL
files\16.fasta MPAEC
files\17.fasta MLGTC
files\18.fasta MPALA
files\19.fasta MWLDA
files\20.fasta MLALG
files\21.fasta MRSDM
files\22.fasta MGSVL
files\23.fasta MYTAK
files\24.fasta MAVEV
files\25.fasta MTLGC
files\26.fasta MDTFL
files\27.fasta MVPRR
files\28.fasta MRPHY
files\29.fasta MQSDP
files\30.fasta MLRLP
files\31.fasta MATTT
files\32.fasta MDIND
files\33.fasta MNDHK
files\34.fasta MVEAR
files\35.fasta MLLLE
files\36.fasta MPSHR
files\37.fasta MFGTR
files\38.fasta MPVQL
files\39.fasta MLPCY
files\40.fasta MRTAT
files\41.fasta MNLLP
files\42.fasta MGLIC
files\43.fasta MSTIP
files\44.fasta MTLNL
files\45.fasta MLGLF
files\46.fasta MIRIP
files\47.fasta MSTNA
files\48.fasta MIYQV
f