In [None]:
#function to translate DNA sequences into Protein sequences
def protein_translation(seq1):
  # Remove any hyphens from the sequence and convert it to uppercase for consistent processing.
  seq1 = seq1.replace("-","")
  seq1 = seq1.upper()

  # Check if the sequence length is not divisible by 3. If so, it cannot be translated into proteins.
  #if len(seq1) % 3 != 0:
    #print(f"Sequence length ({len(seq1)}) is not a multiple of 3. Cannot translate.")
    #return None

  # Check if the sequence starts with a valid start codon ('ATG').
  # If not, translation cannot initiate so we stop the function
  if seq1[0:3] != "ATG":
    print(f"Sequence does not start with an 'ATG' start codon. Cannot translate.")
    return None

  protein_seq = ""
  # Define the standard genetic code table for translating codons to amino acids.
  table = {
        'ATA':'I', 'ATC':'I', 'ATT':'I', 'ATG':'M',
        'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T',
        'AAC':'N', 'AAT':'N', 'AAA':'K', 'AAG':'K',
        'AGC':'S', 'AGT':'S', 'AGA':'R', 'AGG':'R',
        'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L',
        'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P',
        'CAC':'H', 'CAT':'H', 'CAA':'Q', 'CAG':'Q',
        'CGA':'R', 'CGC':'R', 'CGG':'R', 'CGT':'R',
        'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V',
        'GCA':'A', 'GCC':'A', 'GCG':'A', 'GCT':'A',
        'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E',
        'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G',
        'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S',
        'TTC':'F', 'TTT':'F', 'TTA':'L', 'TTG':'L',
        'TAC':'Y', 'TAT':'Y', 'TGC':'C', 'TGT':'C','TGG':'W'}
  # Define common stop codons.
  stop_codon = ["TAA","TGA","TAG"]

  # Iterate through the sequence, reading three nucleotides at a time (codons).
  for i in range(0, len(seq1)-2, 3):
    codon = seq1[i:i+3]
    # If the codon is in the translation table, add its corresponding amino acid to the protein sequence.
    if codon in table:
      protein_seq += table[codon]
    # If the codon is a stop codon, terminate translation.
    elif codon in stop_codon:
      break
    # If an unknown codon is encountered, translation cannot proceed, and the function returns None.
    else:
      return None

  # Return the translated protein sequence.
  return protein_seq

In [None]:
seqRS18320_ref = "ATGTCGTTTGTGTTGGTTTCGCCGGAGACCGTGGCGGCGGTGGCCACGGATCTCAAGCGCATCGGCGCCTCGCTGGCCCACGAAAACGCGTCGGCGGCCGCTTCGACGACGGCGGTGGTCTCCGCGGCCGCCGACGAGGTATCGACGGCGGTCGCCGCTCTGTTCTCCCAACACGCCCAGGGCTACCAAGCGGCGGCCGCTCAGGTAGCAGCGTTTCATAGCCGGTTTGTGCAAGCCCTGACGGCCGGTGCCGGGGCGTACGCATTTGCCGAGGCGGCCAACGCGTCGCCGCTACAGTCAGCCATGGGTGCGGTAAGCGCGTCTGCGCAGACGCTGTTGTCGCGCCCGTTGATCGGCAATGGCGCCAATGCGACGACGCCGGGCGGTAACGGCGGCGACGGCGGATGGCTATTCGGCAGCGGCGGCAACGGCGCGCCCGGCGCGGCGGGCCAGTCCGGCGGTAACGGCGGGTCAGCCGGACTGTGGGGTAACGGCGGCGCGGGTGGCGCCGGCGGCAGCGGCGGCGCCGCCGGCGGCAACGGCGGTAACGGCGGGTGGCTGTTCGGCGCCGGCGGCACCGGCGGTATCGGCGGCACCGGTGCTCCCGGCGCCATGGGCGGCACCGGCGGCAACGGCGGCAACGGCGCGCTGCTGATCGGCGGCGGCGGCCTCGGCGGCGCCGGCGGCATGGGTGGCACCGGCGGCGGCACCGGCGGCACCGGCGGCAACGGCGGCAACGGCGCGCTGCTGATCGGCGCTGGTGGTGTCGGAGGTGCTGGCGGGATCGGTGGCCAGGGTACCGGCGCCGGCGGTGCCGCCGGCGCCGGCGGCACCGGGGGCAACGGCGGCGCCGGGGGGTTGTTCATGAACGGCGGCGACGGCGGCGCCGGCGGTCAAGGCGGCGACGGTGCGGCCGGCGACGCGGCTGCCAGCGCCGGCGGCACCGGCGGCAAAGGCGGCCAAGGCGGCGACGGCGGCACCGGAGGGGCCGGCGGCGCAGGCCCAGTGCTGTTCGGCCACGGCGGCGCCGGCGGCATGGGCGGCCAAGGCGGCACCGGTGGAATGGGCGGCGCCGGCGGAGACGGCACCACCGTCATCGCGGCCGGTACCGGGGGGGAGGGCGGCACCGGCGGCACCGGCGGTACCGGCGGCAACGGCGCTGACGCCGCTGCTGTGGTGGGCTTCGGCGCGAACGGCGACCCTGGCTTCGCTGGCGGCAAAGGCGGTAACGGCGGAATAGGTGGGGCCGCGGTGACAGGCGGGGTCGCCGGCGACGGCGGCACCGGCGGCAAAGGTGGCACCGGCGGTGCCGGCGGCGCCGGCAACGACGCCGGCAGCACCGGCAATCCCGGCGGTAAGGGCGGCGACGGCGGGATCGGCGGTGCCGGCGGGGCCGGCGGCGCGGCCGGCACCGGCAACGGCGGCCATGCCGGCAACACAGGTGACGGCGGCGACGGCGGGACCGGCGGTAACGGCGGCAACGGCACCGGAGGCGTGAACGGCGCCGACAACACCCTCAACCCCGACACCCCCGGCGGCGCCGGGGAGCCCGGCGGGGCCGGCGGGGCCGGCGGGGCCGGCGGGGCCGCCGGCGGCCCGGGCGGTACCGGCGGTACCGGCGGTAACGGCGGCAACGGCGGCAACGGCGGCAACGGCGGCAATGCCGGCAACAACAGCACCAATGCCCCAGTCGGTGGCGAAGGCGGCGCCGGCGGCGACGGCGGCGCCGGCGGCGCAGGCGGGGCCGCCAACGGCGGCACCGCGGGCAGCCAGGGCACTGGGGGCGTCGGCGGCGACGGCGGCGCGGGCGGCAACGGCGGCGGCGGCAAGGCTGGCACCGGCAACAGCGGCAACTTTGGGGTGGACGGCGAAGCCGGCTTCAGCGGCGGCGCCGGTGGCAACGGCGGCGTAGGCGGGGCCGCCGGCGCCAATGGCGGAACCGGCGGCAGCGGTGGTAATGGCGGTGACGGCGGTGCGGGAGGCATTGGCGGGGCCGGCGGCAACGGCATACCGGGCACTGGCACAGAGCCTGCCGGGGGCACCGGCGCCAAAGGTGGAGACGGCGGCGACGGTGGCGCCGGCGGCGCAGGCGGCAATGCCGGCGGGGCCGGCGGTAACGGCGGGGCCGGCGGCCAGGGCGGCAATGCCGGCCAGGGTGGCGCCGGCGGTGCGGGCGGCAACGCCGTGATTCCCGGCGACGGCGTCGGGAAGGCGCCGCACGGCGGCGCGGGCGGCAGCGGCGGAGACGGCGGCAAAGGCGGCCAGGGCGGTAGTGGCGGCACCGGCGGATCCGGTGCCCCGATCGGTGGCGGCGCCGGAGGCACCGGAGGGTCCGGCGGACACGCCGGCAAGGGTGGCGCCGGCGGCATCGGCGCACAGGGCACCACCATCACCGTGCCCGGGAACGGCGGCAACGCCGGCGACGGCGGCAACGGCGGTGGCGGCGGCGCGGGCGGCACCGGCGGCGACGGCGCCACCGGCACGCCAGCCGGCAACGGCGGCGACGGCGGCAACGGCGGCAACGCCGGCGACGGCGGCAACGGCGGCTCCGGCGACTTCGGTGGCAATACCACCAGCGGCGCCTCCGGCAGCGGCGGCAACGGCGGCAACGCCGGCACCGCGGGTAGCGGCGGTGCGGGCGGAACCGGCGGCACCGGCCTTAGCGGCGGCAACGGCGGCAACGGCGGTGACGGCGGTAACGGCGCCCACGGCACCGTCGGCGCCCAGTTCGTCCCGGCCACCAGCTTGCCCACACCCAACGGCGGGGCCGGTGGCAACGGTGGCACCGGAAGCAACGGCGGCGCGCCCGGCCCCGCCGGGGCGCCCGGCCCCACTACCGGCGGTAACGCTGGCAGCCAGGGCATCGGCGGCGACGGGGGCAACGGCGGCGACGGCGGTAAAGGCGGTGACGGCGCCGACGCTGTCAACGTCGTATTCATGCCGACTGAGCCACAGGCCGCGACCGGCACTGCCGGCAGCGCCGGTGACCCCACCGGCGGTAACGGAGGGCCCGGCACTCCCGGCAGCCCCATGGTTGCCCCGCCCCCGCCAACGCCAATCACTCAAGTCCAACAGGGCGGTGACGGTGGCGCCGGGGGCACCGGATCCACCAACGCCAACGACGGCACAGCCACCGGCGGAAAGGGCGGAGAAGGCGGAGTCGGCAGCATTCTCGGCGGGCCCGGCGGCAACGGCGGAACTGGCGGCAACGCCTCGGCAACCGGCACCAACGGGGTGGCCAACGCCGGGAATGGCGGCAAGGGTGGCGACGGCGGCCAGTTTGGGGCCGGCGGCAACGGTGGTGCCGGCGGCAGCGTAACCGACGGATCCGCCGGCAGCACCGCAGGCAACGGCGGCAACGGCGGCAACGCAACCAACGGCACCATCGCAGGCCAACCCGCCGGCGGCAACGGCTCGGCCGGCGGGAAAGGCGGCGACGGCGGCAACATCGCCGCCGGTGCCACCGGCACCGCCGGCAACGGCGGGAACGGCGGCAACGGCAACGACGGCGCCGTCAACGCCGGCACCGGCGGCTCCGGCGGGAACGGCGGTAACGCCGGTGGCGGCGGCGCCAATGGCGGCGACGGCGGCGCCGGCGGCGCCGGCGGGGCCGGCGGGCGTGGCGGCAAGGGCATCGACGGCGGGTTCGGCGGTGACGGCGGCAACGGCGGCAGCAACAACGGCACCGGCGCCGGTGGCAACGGCGGCAACGGCGGCACCGGCGGGGTCGGCTCGGTTGGCGCGGCTGGTGGCGATGGCGGCAACGGCGGCACCGGAGGCTTCGCCGGTTTCGGCGGCACCGCAGGCAATGGCGGTTCCGGCGGCACGGGCGGGGCCGGCGGCGACGGCGGCACCGGCGGGGGCGGCGGCAACGGCGGCACCGGCGTTATCGCCGGCGGCGGGGGGACCGGCGGCAACGGCGGCGCCAGCGGGGCCGGCGGCGCCGGCGGCACGGGCGGGTTCGCCGGCAACGGCAATGCCGGCGGCAATGGCGGCACCGGCGGCGCGAGCGAGGACGGCGACAACGGCAACGCTGGCAGCGGCGCCACCGGCGGTACCGGCGGCAACGGCGGCACCGGCGGCGACGGCGGCGCTGCCGGGCTGGGCGGCGTCGCGTGA"

In [None]:
seqRS18320_Aer = "ATGTCGTTTGTGTTGGTTTCGCCGGAGACCGTGGCGGCGGTGGCCACGGATCTCAAGCGCATCGGCGCCTCGCTGGCCCACGAAAACGCGTCGGCGGCCGCTTCGACGACGGCGGTGGTCTCCGCGGCCGCCGACGAGGTATCGACGGCGGTCGCCGCTCTGTTCTCCCAACACGCCCAGGGCTACCAAGCGGCGGCCGCTCAGGTAGCAGCGTTTCATAGCCGGTTTGTGCAAGCCCTGACGGCCGGTGCCGGGGCGTACGCATTTGCCGAGGCGGCCAACGCGTCGCCGCTACAGTCAGCCATGGGTGCGGTAAGCGCGTCTGCGCAGACGCTGTTGTCGCGCCCGTTGATCGGCAATGGCGCCAATGCGACGACGCCGGGCGGTAACGGCGGCGACGGCGGATGGCTATTCGGCAGCGGCGGCAACGGCGCGCCCGGCGCGGCGGGCCAGTCCGGCGGTAACGGCGGGTCAGCCGGACTGTGGGGTAACGGCGGCGCGGGTGGCGCCGGCGGCAGCGGCGGCGCCGCCGGCGGCAACGGCGGTAACGGCGGGTGGCTGTTCGGCGCCGGCGGCACCGGCGGTATCGGCGGCACCGGTGCTCCCGGCGCCATGGGCGGCACCGGCGGCAACGGCGGCAACGGCGCGCTGCTGATCGGCGGCGGCGGCCTCGGCGGCGCCGGCGGCATGGGTGGCACCGGCGGCGGCACCGGCGGCACCGGCGGCAACGGCGGCAACGGCGCGCTGCTGATCGGCGCTGGTGGTGTCGGAGGTGCTGGCGGGATCGGTGGCCAGGGTACCGGCGCCGGCGGTGCCGCCGGCGCCGGCGGCACCGGGGGCAACGGCGGCGCCGGGGGGTTGTTCATGAACGGCGGCGACGGCGGCGCCGGCGGTCAAGGCGGCGACGGTGCGGCCGGCGACGCGGCTGCCAGCGCCGGCGGCACCGGCGGCAAAGGCGGCCAAGGCGGCGACGGCGGCACCGGAGGGGCCGGCGGCGCAGGCCCAGTGCTGTTCGGCCACGGCGGCGCCGGCGGCATGGGCGGCCAAGGCGGCACCGGTGGAATGGGCGGCGCCGGCGGAGACGGCACCACCGTCATCGCGGCCGGTACCGGGGGGGAGGGCGGCACCGGCGGCACCGGCGGTACCGGCGGCAACGGCGCTGACGCCGCTGCTGTGGTGGGCTTCGGCGCGAACGGCGACCCTGGCTTCGCTGGCGGCAAAGGCGGTAACGGCGGAATAGGTGGGGCCGCGGTGACAGGCGGGGTCGCCGGCGACGGCGGCACCGGCGGCAAAGGTGGCACCGGCGGTGCCGGCGGCGCCGGCAACGACGCCGGCAGCACCGGCAATCCCGGCGGTAAGGGCGGCGACGGCGGGATCGGCGGTGCCGGCGGGGCCGGCGGCGCGGCCGGCACCGGCAACGGCGGCCATGCCGGCAACACAGGTGACGGCGGCGACGGCGGGACCGGCGGTAACGGCGGCAACGGCACCGGAGGCGTGAACGGCGCCGACAACACCCTCAACCCCGACACCCCCGGCGGCGCCGGGGAGCCCGGCGGGGCCGGCGGGGCCGGCGGGGCCGGCGGGGCCGCCGGCGGCCCGGGCGGTACCGGCGGTACCGGCGGTAACGGCGGCAACGGCGGCAACGGCGGCAACGGCGGCAATGCCGGCAACAACAGCACCAATGCCCCAGTCGGTGGCGAAGGCGGCGCCGGCGGCGACGGCGGCGCCGGCGGCGCAGGCGGGGCCGCCAACGGCGGCACCGCGGGCAGCCAGGGCACTGGGGGCGTCGGCGGCAACGGCGGCGCGGGCGGCAACGGCGGCGGCGGCAAGGCTGGCACCGGCAACAGCGGCAACTTTGGGGTGGACGGCGAAGCCGGCTTCAGCGGCGGCGCCGGTGGCAACGGCGGCGTAGGCGGGGCCGCCGGCGCCAATGGCGGAACCGGCGGCAGCGGTGGTAATGGCGGTGACGGCGGTGCGGGAGGCATTGGCGGGGCCGGCGGCAACGGCATACCGGGCACTGGCACAGAGCCTGCCGGGGGCACCGGCGCCAAAGGTGGAGACGGCGGCGACGGTGGCGCCGGCGGCGCAGGCGGCAATGCCGGCGGGGCCGGCGGCAACGGCGGGGCCGGCGGCCAGGGCGGCAATGCCGGCCAGGGTGGCGCCGGCGGTGCGGGCGGCAACGCCGTGATTCCCGGCGACGGCGTCGGGAAGGCGCCGCACGGCGGCGCGGGCGGCAGCGGCGGAGACGGCGGCAAAGGCGGCCAGGGCGGTAGTGGCGGCACCGGCGGATCCGGTGCCCCGATCGGTGGCGGCGCCGGAGGCACCGGAGGGTCCGGCGGACACGCCGGCAAGGGTGGCGCCGGCGGCATCGGCGCACAGGGCACCACCATCACCGTGCCCGGGAACGGCGGCAACGCCGGCGACGGCGGCAACGGCGGTGGCGGCGGCGCGGGCGGCACCGGCGGCGACGGCGCCACCGGCACGCCAGCCGGCAACGGCGGCGACGGCGGCAACGGCGGCAACGCCGGCGACGGCGGCAACGGCGGCTCCGGCGACTTCGGTGGCAATACCACCAGCGGCGCCTCCGGCAGCGGCGGCAACGGCGGCAACGCCGGCACCGCGGGTAGCGGCGGTGCGGGCGGAACCGGCGGCACCGGCCTTAGCGGCGGCAACGGCGGCAACGGCGGTGACGGCGGTAACGGCGCCCACGGCACCGTCGGCGCCCAGTTCGTCCCGGCCACCAGCTTGCCCACACCCAACGGCGGGGCCGGTGGCAACGGTGGCACCGGAAGCAACGGCGGCGCGCCCGGCCCCGCCGGGGCGCCCGGCCCCACTACCGGCGGTAACGCTGGCAGCCAGGGCATCGGCGGCGACGGGGGCAACGGCGGCGACGGCGGTAAAGGCGGTGACGGCGCCGACGCTGTCAACGTCGTATTCATGCCGACTGAGCCACAGGCCGCGACCGGCACTGCCGGCAGCGCCGGTGACCCCACCGGCGGTAACGGAGGGCCCGGCACTCCCGGCAGCCCCATGGTTGCCCCGCCCCCGCCAACGCCAATCACTCAAGTCCAACAGGGCGGTGACGGTGGCGCCGGGGGCACCGGATCCACCAACGCCAACGACGGCACAGCCACCGGCGGAAAGGGCGGAGAAGGCGGAGTCGGCAGCATTCTCGGCGGGCCCGGCGGCAACGGCGGAACTGGCGGCAACGCCTCGGCAACCGGCACCAACGGGGTGGCCAACGCCGGGAATGGCGGCAAGGGTGGCGACGGCGGCCAGTTTGGGGCCGGCGGCAACGGTGGTGCCGGCGGCAGCGTAACCGACGGATCCGCCGGCAGCACCGCAGGCAACGGCGGCAACGGCGGCAACGCAACCAACGGCACCATCGCAGGCCAACCCGCCGGCGGCAACGGCTCGGCCGGCGGGAAAGGCGGCGACGGCGGCAACATCGCCGCCGGTGCCACCGGCACCGCCGGCAACGGCGGGAACGGCGGCAACGGCAACGACGGCGCCGTCAACGCCGGCACCGGCGGCTCCGGCGGGAACGGCGGTAACGCCGGTGGCGGCGGCGCCAATGGCGGCGACGGCGGCGCCGGCGGCGCCGGCGGGGCCGGCGGGCGTGGCGGCAAGGGCATCGACGGCGGGTTCGGCGGTGACGGCGGCAACGGCGGCAGCAACAACGGCACCGGCGCCGGTGGCAACGGCGGCAACGGCGGCACCGGCGGGGTCGGCTCGGTTGGCGCGGCTGGTGGCGATGGCGGCAACGGCGGCACCGGAGGCTTCGCCGGTTTCGGCGGCACCGCAGGCAATGGCGGTTCCGGCGGCACGGGCGGGGCCGGCGGCGACGGCGGCACCGGCGGGGGCGGCGGCAACGGCGGCACCGGCGTTATCGCCGGCGGCGGGGGGACCGGCGGCAACGGCGGCGCCAGCGGGGCCGGCGGCGCCGGCGGCACGGGCGGGTTCGCCGGCAACGGCAATGCCGGCGGCAATGGCGGCACCGGCGGCGCGAGCGAGGACGGCGACAACGGCAACGCTGGCAGCGGCGCCACCGGCGGTACCGGCGGCAACGGCGGCACCGGCGGCGACGGCGGCGCTGCCGGGCTGGGCGGCGTCGCGTGA"

In [None]:
#function to check if the reference protein sequence and the inserted protein sequence matching or not (positive strand) ?
def protein_translation_positivestrand_check(seq1, seq2):
  # Remove hyphens and convert sequences to uppercase for consistency
  seq1 = seq1.replace("-","")
  seq1 = seq1.upper()
  seq2 = seq2.replace("-","")
  seq2 = seq2.upper()

  # Translate both DNA sequences to protein sequences using the helper function
  protein_seq1 = protein_translation(seq1)
  protein_seq2 = protein_translation(seq2)
  print(protein_seq1)
  print(protein_seq2)

  nonconserved_sequence = []

  # Check if either protein sequence could not be translated (e.g., invalid start codon, length not divisible by 3)
  if protein_seq1 is None or protein_seq2 is None:
    print("Stop the function because of the missing protein sequence")
    return None

  # Case 1: Both protein sequences have the same length
  if len(protein_seq1) == len(protein_seq2):
    print("the protein sequence of both reference and BCG Aeres have the same length")
    # Iterate through the sequences to find non-conserved positions
    for i in range(len(protein_seq1)):
      if protein_seq1[i] != protein_seq2[i]:
        nonconserved_sequence.append({"position" : i, "reference sequence" : protein_seq1[i], "BCG Aeras sequence" : protein_seq2[i]})

  # Case 2: Protein sequences have different lengths
  else:
    print("the protein sequence of both reference and BCG Aeres have different length")
    # Iterate up to the length of the shorter sequence to find non-conserved positions
    min_len = min(len(protein_seq1), len(protein_seq2))
    for i in range(min_len):
      if protein_seq1[i] != protein_seq2[i]:
         nonconserved_sequence.append({"position" : i, "reference sequence" : protein_seq1[i], "BCG Aeras sequence" : protein_seq2[i]})

  # Check if any non-conserved sequences were found
  if len(nonconserved_sequence) == 0:
    print("the protein sequence of both reference and BCG Aeres have conserved sequences")
  else:
    # Return the list of non-conserved positions and amino acids
    return nonconserved_sequence

In [None]:
protein_translation_positivestrand_check(seqRS18320_ref, seqRS18320_Aer)


MSFVLVSPETVAAVATDLKRIGASLAHENASAAASTTAVVSAAADEVSTAVAALFSQHAQGYQAAAAQVAAFHSRFVQALTAGAGAYAFAEAANASPLQSAMGAVSASAQTLLSRPLIGNGANATTPGGNGGDGGWLFGSGGNGAPGAAGQSGGNGGSAGLWGNGGAGGAGGSGGAAGGNGGNGGWLFGAGGTGGIGGTGAPGAMGGTGGNGGNGALLIGGGGLGGAGGMGGTGGGTGGTGGNGGNGALLIGAGGVGGAGGIGGQGTGAGGAAGAGGTGGNGGAGGLFMNGGDGGAGGQGGDGAAGDAAASAGGTGGKGGQGGDGGTGGAGGAGPVLFGHGGAGGMGGQGGTGGMGGAGGDGTTVIAAGTGGEGGTGGTGGTGGNGADAAAVVGFGANGDPGFAGGKGGNGGIGGAAVTGGVAGDGGTGGKGGTGGAGGAGNDAGSTGNPGGKGGDGGIGGAGGAGGAAGTGNGGHAGNTGDGGDGGTGGNGGNGTGGVNGADNTLNPDTPGGAGEPGGAGGAGGAGGAAGGPGGTGGTGGNGGNGGNGGNGGNAGNNSTNAPVGGEGGAGGDGGAGGAGGAANGGTAGSQGTGGVGGDGGAGGNGGGGKAGTGNSGNFGVDGEAGFSGGAGGNGGVGGAAGANGGTGGSGGNGGDGGAGGIGGAGGNGIPGTGTEPAGGTGAKGGDGGDGGAGGAGGNAGGAGGNGGAGGQGGNAGQGGAGGAGGNAVIPGDGVGKAPHGGAGGSGGDGGKGGQGGSGGTGGSGAPIGGGAGGTGGSGGHAGKGGAGGIGAQGTTITVPGNGGNAGDGGNGGGGGAGGTGGDGATGTPAGNGGDGGNGGNAGDGGNGGSGDFGGNTTSGASGSGGNGGNAGTAGSGGAGGTGGTGLSGGNGGNGGDGGNGAHGTVGAQFVPATSLPTPNGGAGGNGGTGSNGGAPGPAGAPGPTTGGNAGSQGIGGDGGNGGDGGKGGDGADAVNVVFMPTEPQAATGTAGSAGDPTGG

[{'position': 598, 'reference sequence': 'D', 'BCG Aeras sequence': 'N'}]

In [None]:
seqRS18350_ref = "ATGTCGTTCGTGTTGATCGCACCGGAATTCGTGACAGCAGCCGCGGGGGATCTGACGAATCTGGGTTCGTCGATTAGCGCGGCCAACGCGTCGGCAGCCAGTGCGACCACGCAGGTGCTGGCTGCGGGCGCCGATGAGGTGTCTGCCCGTATTGCGGCGCTGTTCGGCGGGTTTGGCCTGGAGTACCAGGCGATTAGTGCGCAGGTGGCGGCCTACCACCAGCGGTTTGTGCAGGCCTCGAGTACCGGCGCGGGCGCATATGCCTCGGCCGAGGCCACCGCCGCTGAGCAGATCGTGCTGGGCGTGATCAATGCGCCCACCCAGGCGCTGCTGGGGCGCCCGTTGATCGGTGACGGCGCCAATGCGACGACTCCCGGCGGGGCCGGCGGGGCCGGCGGCAGCGGTGGGGGCACCGGCGGTGCCGGCGGCGCCGGTGGGTGGCTGTTCGGGGTTGGCGGCGCCGGCGGTGTCGGTGGGGCCGGTGGCGGCACCGGCGGGGCGGGCGGGCCCGGTGGTTTGATCTGGGGCGGCGGCGGGGCCGGCGGTGTCGGTGGGGCCGGTGGCGGCACCGGCGGGGCCGGCGGCCGCGCCGAGCTGCTGTTCGGCGCCGGCGGTGCGGGCGGCGCGGGTGGGGCGGGCACCGACGGCGGGCCCGGTGCTACCGGCGGGACCGGCGGACACGGCGGAGTCGGCGGCGACGGCGGATGGCTGGCACCCGGCGGGGCCGGCGGGGCCGGCGGGCAAGGCGGGGCAGGTGGTGCCGGCAGCGATGGTGGCGCGTTGGGTGGTACCGGCGGGACGGGCGGTACCGGCGGCGCCGGTGGCGCCGGCGGTCGCGGCGCACTGCTGCTGGGCGCTGGCGGACAGGGCGGCCTCGGCGGCGCCGGCGGACAAGGCGGGATGGGGGGTGCTGGCGGGGCCGGCGCCGATAACCCCACCGGCATCGGCGGCACCGGCGGTGACGGCGGCACCGGCGGTAGCGCCGGTGAGGGCGGGGCCGGTGGTGCCGCGGGCCAGCTCTTCAGCGCCAGCGGAGCGGCCGGTAACGCCGGTGTCGGCGGGGCCGGCGGCCAAGGCGGTGACGGCGGAGCCGGCGGGGCCGGCGCCGACGCCGACCAGCCCGGCGCCACCGGCGGCACCGGGTTCGCCGGTGGAGCCGGCGGAGCCGGCGGGGCCGGCGGTAGCAGCGGTGCCGGCGGCACCAACGGCTCCGGCGGCGCCGGCGGCACCGGCGGACAAGGCGGGATGGGGGGTGCTGGCGGGGCCGGCGCCGATAACCCCACCGGCATCGGCGGCACCGGCGGTGACGGCGGCACCGGCGGAGCGGCCGGAGCCGGCGGGGCCGGCGGAGCGGCCGGCACCGGAGGCACCGGCGGCATGATCGGCACCACAGGCAACGCCGGTGTCGGCGGGGCCGGCGGCCAAGGCGGTGACGGCGGAGCCGGCGGGGCCGGCGCCGACGCCGACCAGCCCGGCGCCACCGGCGGCACCGGGTTCGCCGGTGGAGCCGGCGGGGCCGGCGGGGCCGGCGGTAGCAGCGGTGCCGGCGGCACCAACGGCTCCGGCGGCGCCGGCGGCACCGGCGGACAAGGCGGCGCCGGGGGTGCTGGCGGGGCCGGCGCCGATAACCCCACCGGCATCGGCGGCACCGGCGGTGACGGCGGCACCGGCGGAGCGGCCGGAGCCGGCGGGGCCGGCGGAGCGGCCGGCACCGGAGGCACCGGCGGCATGATCGGCACCACAGGCAACGCCGGTGTCGGCGGGGCCGGCGGCCAAGGCGGTGACGGCGGAGCCGGCGGGGCCGGCGGGGCCGGCGGTAGCAGCGGTGCCGGCGGCACCAACGGCTCCGGCGGCGCCGGCGGCACCGGCGGACAAGGCGGCGCCGGGGGTGCTGGCGGGGCCGGCGCCGATAACCCCACCGGCATCGGCGGCACCGGCGGTGACGGCGGCACCGGCGGAGCGGCCGGAGCCGGCGGGGCCGGCGGAGCGGCCGGCACCGGAGGCACCGGCGGCATGATCGGCACCACAGGCAACGCCGGTGTCGGCGGGGCCGGCGGCCAAGGCGGTGACGGCGGAGCCGGCGGGGCCGGCGCCGACGCCGACCAGCCCGGCGCCACCGGCGGCACCGGGTTCGCCGGTGGAGCCGGCGGGGCCGGCGGGGCCGGCGGGGCCGGCGGTAGCAGCGGTGCCGGCGGCACCAACGGCTCCGGCGGCGCCGGCGGCACCGGCGGACAAGGCGGCGCCGGGGGTGCTGGCGGGGCCGGCGCCGATAACCCCACCGGCATCGGCGGCACCGGCGGTGACGGCGGCACCGGCGGAGCGGCCGGAGCCGGCGGGGCCGGCGGAGCGGCCGGCACCGGAGGCACCGGCGGCATGATCGGCACCACAGGCAACGCCGGTGTCGGCGGGGCCGGCGGCCAAGGCGGTGACGGCGGAGCCGGCGGGGCCGGCGCCGACGCCGACCAGCCCGGCGCCACCGGCGGCACCGGGTTCGCCGGTGGAGCCGGCGGGGCCGGCGGGGCCGGCGGTAGCAGCGGTGCCGGCGGCACCAACGGCTCCGGCGGCGCCGGCGGCACCGGCGGACAAGGCGGCGCCGGGGGTGCTGGCATCAGCTTCAGCAACGGCAGCAACGGCGGCACCGGCGGCACCGGGGGCGTGGGCGGCACCGGGGGCGACGGCGGCAACGCAGGCACCGGCGCCGGCGACCCCGGCAAAGGCGGCACCGGCGGCACCGGCGGCAGCGGCGGGGCCGGCGGTAGCGGCGGGGCCAACTTCAACGGCGGCACCGGCGGCACCGGCGGCACCGGCGGCACCGGCGGCAAAGGCGGCATGGGCGGCATCGCTGGCGACGGCGGGCCCGGCGGTGACGGCGGCAACGCCGGGGTCGGAGGAAAAGGCGGCACCAACGGCAACGGCGGCAGCGGCGGGACCGGCGGCACAGGCGGGCCCGGCGGCAGCGGCGGCGCGCCCACCGGCAGCGGCACCGGCGGCAAAGGCGGCGCCGGCGGTGACGGCGGCGATGGCGCCGACGGAGGGGCAGCCACCGGCGTCGGCGACGGCGGCGACGGTGGTAACGGTGGTAACGGTGGTAACGGCGGCACGGGCGTCGGCTCGCCCGGCGGCCTCGGCGGGGCAGGAGGCACTGGAGGCCTCGGCGGCGCCGGTGCAGGCGGCGGAGCCGACGGCGATGATGGCGACGACGGCCAACCCGGCAACAACGGCAGCTGA"

In [None]:
seqRS18350_Aer = "ATGTCGTTCGTGTTGATCGCACCGGAATTCGTGACAGCAGCCGCGGGGGATCTGACGAATCTGGGTTCGTCGATTAGCGCGGCCAACGCGTCGGCAGCCAGTGCGACCACGCAGGTGCTGGCTGCGGGCGCCGATGAGGTGTCTGCCCGTATTGCGGCGCTGTTCGGCGGGTTTGGCCTGGAGTACCAGGCGATTAGTGCGCAGGTGGCGGCCTACCACCAGCGGTTTGTGCAGGCCTCGAGTACCGGCGCGGGCGCATATGCCTCGGCCGAGGCCACCGCCGCTGAGCAGATCGTGCTGGGCGTGATCAATGCGCCCACCCAGGCGCTGCTGGGGCGCCCGTTGATCGGTGACGGCGCCAATGCGACGACTCCCGGCGGGGCCGGCGGGGCCGGCGGCAGCGGTGGGGGCACCGGCGGTGCCGGCGGCGCCGGTGGGTGGCTGTTCGGGGTTGGCGGCGCCGGCGGTGTCGGTGGGGCCGGTGGCGGCACCGGCGGGGCGGGCGGGCCCGGTGGTTTGATCTGGGGCGGCGGCGGGGCCGGCGGTGTCGGTGGGGCCGGTGGCGGCACCGGCGGGGCCGGCGGCCGCGCCGAGCTGCTGTTCGGCGCCGGCGGTGCGGGCGGCGCGGGTGGGGCGGGCACCGACGGCGGGCCCGGTGCTACCGGCGGGACCGGCGGACACGGCGGAGTCGGCGGCGACGGCGGATGGCTGGCACCCGGCGGGGCCGGCGGGGCCGGCGGGCAAGGCGGGGCAGGTGGTGCCGGCAGCGATGGTGGCGCGTTGGGTGGTACCGGCGGGACGGGCGGTACCGGCGGCGCCGGTGGCGCCGGCGGTCGCGGCGCACTGCTGCTGGGCGCTGGCGGACAGGGCGGCCTCGGCGGCGCCGGCGGACAAGGCGGGATGGGGGGTGCTGGCGGGGCCGGCGCCGATAACCCCACCGGCATCGGCGGCACCGGCGGTGACGGCGGCACCGGCGGTAGCGCCGGTGAGGGCGGGGCCGGTGGTGCCGCGGGCCAGCTCTTCAGCGCCAGCGGAGCGGCCGGTAACGCCGGTGTCGGCGGGGCCGGCGGCCAAGGCGGTGACGGCGGAGCCGGCGGGGCCGGCGCCGACGCCGACCAGCCCGGCGCCACCGGCGGCACCGGGTTCGCCGGTGGAGCCGGCGGAGCCGGCGGGGCCGGCGGTAGCAGCGGTGCCGGCGGCACCAACGGCTCCGGCGGCGCCGGCGGCACCGGCGGACAAGGCGGGATGGGGGGTGCTGGCGGGGCCGGCGCCGATAACCCCACCGGCATCGGCGGCACCGGCGGTGACGGCGGCACCGGCGGAGCGGCCGGAGCCGGCGGGGCCGGCGGAGCGGCCGGCACCGGAGGCACCGGCGGCATGATCGGCACCACAGGCAACGCCGGTGTCGGCGGGGCCGGCGGCCAAGGCGGTGACGGCGGAGCCGGCGGGGCCGGCGCCGACGCCGACCAGCCCGGCGCCACCGGCGGCACCGGGTTCGCCGGTGGAGCCGGCGGGGCCGGCGGGGCCGGCGGTAGCAGCGGTGCCGGCGGCACCAACGGCTCCGGCGGCGCCGGCGGCACCGGCGGACAAGGCGGCGCCGGGGGTGCTGGCGGGGCCGGCGCCGATAACCCCACCGGCATCGGCGGCACCGGCGGTGACGGCGGCACCGGCGGAGCGGCCGGAGCCGGCGGGGCCGGCGGAGCGGCCGGCACCGGAGGCACCGGCGGCATGATCGGCACCACAGGCAACGCCGGTGTCGGCGGGGCCGGCGGCCAAGGCGGTGACGGCGGAACCGGCGGGGCCGGCGGGGCCGGCGGTAGCAGCGGTGCCGGCGGCACCAACGGCTCCGGCGGCGCCGGCGGCACCGGCGGACAAGGCGGCGCCGGGGGTGCTGGCGGGGCCGGCGCCGATAACCCCACCGGCATCGGCGGCACCGGCGGTGACGGCGGCACCGGCGGAGCGGCCGGAGCCGGCGGGGCCGGCGGAGCGGCCGGCACCGGAGGCACCGGCGGCATGATCGGCACCACAGGCAACGCCGGTGTCGGCGGGGCCGGCGGCCAAGGCGGTGACGGCGGAGCCGGCGGGGCCGGCGCCGACGCCGACCAGCCCGGCGCCACCGGCGGCACCGGGTTCGCCGGTGGAGCCGGCGGGGCCGGCGGGGCCGGCGGGGCCGGCGGTAGCAGCGGTGCCGGCGGCACCAACGGCTCCGGCGGCGCCGGCGGCACCGGCGGACAAGGCGGCGCCGGGGGTGCTGGCGGGGCCGGCGCCGATAACCCCACCGGCATCGGCGGCACCGGCGGTGACGGCGGCACCGGCGGAGCGGCCGGAGCCGGCGGGGCCGGCGGAGCGGCCGGCACCGGAGGCACCGGCGGCATGATCGGCACCACAGGCAACGCCGGTGTCGGCGGGGCCGGCGGCCAAGGCGGTGACGGCGGAGCCGGCGGGGCCGGCGCCGACGCCGACCAGCCCGGCGCCACCGGCGGCACCGGGTTCGCCGGTGGAGCCGGCGGGGCCGGCGGGGCCGGCGGTAGCAGCGGTGCCGGCGGCACCAACGGCTCCGGCGGCGCCGGCGGCACCGGCGGACAAGGCGGCGCCGGGGGTGCTGGCATCAGCTTCAGCAACGGCAGCAACGGCGGCACCGGCGGCACCGGGGGCGTGGGCGGCACCGGGGGCGACGGCGGCAACGCAGGCACCGGCGCCGGCGACCCCGGCAAAGGCGGCACCGGCGGCACCGGCGGCAGCGGCGGGGCCGGCGGTAGCGGCGGGGCCAACTTCAACGGCGGCACCGGCGGCACCGGCGGCACCGGCGGCACCGGCGGCAAAGGCGGCATGGGCGGCATCGCTGGCGACGGCGGGCCCGGCGGTGACGGCGGCAACGCCGGGGTCGGAGGAAAAGGCGGCACCAACGGCAACGGCGGCAGCGGCGGGACCGGCGGCACAGGCGGGCCCGGCGGCAGCGGCGGCGCGCCCACCGGCAGCGGCACCGGCGGCAAAGGCGGCGCCGGCGGTGACGGCGGCGATGGCGCCGACGGAGGGGCAGCCACCGGCGTCGGCGACGGCGGCGACGGTGGTAACGGTGGTAACGGTGGTAACGGCGGCACGGGCGTCGGCTCGCCCGGCGGCCTCGGCGGGGCAGGAGGCACTGGAGGCCTCGGCGGCGCCGGTGCAGGCGGCGGAGCCGACGGCGATGATGGCGACGACGGCCAACCCGGCAACAACGGCAGCTGA"

In [None]:
protein_translation_positivestrand_check(seqRS18350_ref, seqRS18350_Aer)

MSFVLIAPEFVTAAAGDLTNLGSSISAANASAASATTQVLAAGADEVSARIAALFGGFGLEYQAISAQVAAYHQRFVQASSTGAGAYASAEATAAEQIVLGVINAPTQALLGRPLIGDGANATTPGGAGGAGGSGGGTGGAGGAGGWLFGVGGAGGVGGAGGGTGGAGGPGGLIWGGGGAGGVGGAGGGTGGAGGRAELLFGAGGAGGAGGAGTDGGPGATGGTGGHGGVGGDGGWLAPGGAGGAGGQGGAGGAGSDGGALGGTGGTGGTGGAGGAGGRGALLLGAGGQGGLGGAGGQGGMGGAGGAGADNPTGIGGTGGDGGTGGSAGEGGAGGAAGQLFSASGAAGNAGVGGAGGQGGDGGAGGAGADADQPGATGGTGFAGGAGGAGGAGGSSGAGGTNGSGGAGGTGGQGGMGGAGGAGADNPTGIGGTGGDGGTGGAAGAGGAGGAAGTGGTGGMIGTTGNAGVGGAGGQGGDGGAGGAGADADQPGATGGTGFAGGAGGAGGAGGSSGAGGTNGSGGAGGTGGQGGAGGAGGAGADNPTGIGGTGGDGGTGGAAGAGGAGGAAGTGGTGGMIGTTGNAGVGGAGGQGGDGGAGGAGGAGGSSGAGGTNGSGGAGGTGGQGGAGGAGGAGADNPTGIGGTGGDGGTGGAAGAGGAGGAAGTGGTGGMIGTTGNAGVGGAGGQGGDGGAGGAGADADQPGATGGTGFAGGAGGAGGAGGAGGSSGAGGTNGSGGAGGTGGQGGAGGAGGAGADNPTGIGGTGGDGGTGGAAGAGGAGGAAGTGGTGGMIGTTGNAGVGGAGGQGGDGGAGGAGADADQPGATGGTGFAGGAGGAGGAGGSSGAGGTNGSGGAGGTGGQGGAGGAGISFSNGSNGGTGGTGGVGGTGGDGGNAGTGAGDPGKGGTGGTGGSGGAGGSGGANFNGGTGGTGGTGGTGGKGGMGGIAGDGGPGGDGGNAGVGGKGGTNGNGGSGGTGGTGGPGGSGGAPTGSGTGGKGG

[{'position': 597, 'reference sequence': 'A', 'BCG Aeras sequence': 'T'}]

In [None]:
def reverse_complement_strand(seq1):
  # Remove any hyphens and convert the sequence to uppercase for consistent processing.
  seq1 = seq1.replace("-","")
  seq1 = seq1.upper()

  complement_chars = []
  # Iterate through each nucleotide in the input sequence to find its complement.
  for i in range(len(seq1)):
    if seq1[i] == "A":
      complement_chars.append("T")
    elif seq1[i] == "T":
      complement_chars.append("A")
    elif seq1[i] == "G":
      complement_chars.append("C")
    elif seq1[i] == "C":
      complement_chars.append("G")

  # Join the complementary nucleotides to form the complementary DNA strand.
  complement_seq = "".join(complement_chars)

  # Reverse the complementary sequence to obtain the reverse-complement DNA strand.
  seq_reversecomplement = complement_seq[::-1]

  return seq_reversecomplement

In [None]:
#function to check if the reference protein sequence and the inserted protein sequence matching or not (negative strand) ?
def protein_translation_negativestrand_check(seq1, seq2):
  # Remove hyphens and convert sequences to uppercase for consistency
  seq1 = seq1.replace("-","")
  seq1 = seq1.upper()
  seq2 = seq2.replace("-","")
  seq2 = seq2.upper()
  seq1 = reverse_complement_strand(seq1)
  seq2 = reverse_complement_strand(seq2)

  # Translate both DNA sequences to protein sequences using the helper function
  protein_seq1 = protein_translation(seq1)
  protein_seq2 = protein_translation(seq2)
  print(protein_seq1)
  print(protein_seq2)

  nonconserved_sequence = []

  # Check if either protein sequence could not be translated (e.g., invalid start codon, length not divisible by 3)
  if protein_seq1 is None or protein_seq2 is None:
    print("Stop the function because of the missing protein sequence")
    return None

  # Case 1: Both protein sequences have the same length
  if len(protein_seq1) == len(protein_seq2):
    print("the protein sequence of both reference and BCG Aeres have the same length")
    # Iterate through the sequences to find non-conserved positions
    for i in range(len(protein_seq1)):
      if protein_seq1[i] != protein_seq2[i]:
        nonconserved_sequence.append({"position" : i, "reference sequence" : protein_seq1[i], "BCG Aeras sequence" : protein_seq2[i]})

  # Case 2: Protein sequences have different lengths
  else:
    print("the protein sequence of both reference and BCG Aeres have different length")
    # Iterate up to the length of the shorter sequence to find non-conserved positions
    min_len = min(len(protein_seq1), len(protein_seq2))
    for i in range(min_len):
      if protein_seq1[i] != protein_seq2[i]:
         nonconserved_sequence.append({"position" : i, "reference sequence" : protein_seq1[i], "BCG Aeras sequence" : protein_seq2[i]})

  # Check if any non-conserved sequences were found
  if len(nonconserved_sequence) == 0:
    print("the protein sequence of both reference and BCG Aeres have conserved sequences")
  else:
    # Return the list of non-conserved positions and amino acids
    return nonconserved_sequence

In [None]:
seqRS07850_ref = "TCAGATGGTTGGATCGCCACCGGCGCCACCGGCGCCGCCCGCGCCACCAGCACCGCCGCTGCCATCTGGGTCCGTCGAGTCGCCGAGGACGCCGGCGCCGCCATTGTCGCCAAATACCGTGAGACCTAGCAGGGTGCCGGCGCCGCCCTTGCCGCCGGCCCCGCCGGCGCCGCCCAATCCACCGAAGCCCCTCCCTTCGGTGGGGTCGCTGCCGCCGTCGCCGCCGTCACCGCCCTTGCCGCCGGCCCCGCCGTCGCCGCCGGCTCCGGCGGTGCCGTCGCCGCCCTGGCCGCCGGCCCCGCCGTCGCCGCCGGCTCCGGCGGTGCCGTCGCCGCCCTGGCCGCCGGCCCCGCCGTTTCCGCCGCCGCCGCCATCGCCGATGATGTTTTCCCCGCCCTTGCCGCCAGCCCCAGCGTTCCCGCCGGCTCCGCCACTGGCGCCGGTGCCGCCGGGTGCAACGGCGTTGGCGCCGTTACCGCCGTTGCCGCCTTTGCCCCCGGTGTCTGCAAAGTCGGGGGTCGCACCCTGCGCGGCGCGGGTCACGCCGTCACCGCTGAGCCCCCCGAGCCCGCCAGCGCCGCTGAAGCCAGGATTGCCGCCGTTGCCGCCATGGCCGCCGTTGGCACCGGGTGCGACGGCGTTGCCGCCGGTCCCGCCGACCCCACCGTTGCCGCCTTTACCACCGTCCTGGCCACGCTCGCCCGCGGTGGTGGCATTGGCACCCTCGGCACCACTACCACCGAGCCCGCCGTCTGCGCCGCGGCCGCCAGTCCCACCGGCCCCGCCATTGCCGGCGAGAGTTCCGCCGTCGCCGCCGGCGCCGCCCTGGCCGCCGTTGCCGCCGCTATTGCCTTTGCCACCGACTGCGCCCGAATCGCTCGCGTTCGTCCCTGCGGCGCCGTTGGCGCCGTTGCCGCCGGCCCCGCCGGCGCCGCCGTTGCCGACCAGCCCGCCATGGCCGCCGGGTCCGCCGTTGGCGCCGTTGGTGCCCGCGGTGGTGGCGTTGGCGCCGTTGCCGCCGGCACCGCCGTTGCCGCCGCTGGTGGGGGTGGCGCCGATGGCGCCCTGAGCGCCGGTGATGGAGCCGGCTCCGCCGGTGCCTCCGGCCCCGCCGGTGCCGGGGTTGCCGCCGTTGCCGCCGTGACCGCCGGCACCACCGTTGACGGCCTGGTTGCCGTTGGCGCCGGCTCCGCCGATACCGCCGTCCCCGCCGGCCCCGCCGGCACCGGCCAGCCAGCCACCCCGGCCCCCGGCCCCGCCATCGCCGGCGTCGCCGCCGGCCCCGCCGTTGCTACCGTCTGGGAAGATACCGCCCTTACCGGCGGCGCCGGCGATACCCGCAGCGCCGTGTCCGCCGGCACCACCGTGCCCGCCCACGCCCAACAGCCCGGCCGCACCCCCGACACCGCCGTGTCCACCCACACCACCGATCGGGCCGGGCCCGCCGGCACCTCCGTGCCCGCCGGCCCCGTAGAGGGTCCCGCCCAGGCCACCGGCACCACCGGTACCGCCGACCCCGCCGGGCCCGCCGGGCCCGCCGGGCCCGCCGGTTCCGCCGACCCCGAACAGTCCGGCGTTGCCGCCGGCCCCGCCGGTTGCCCCGCCCAGCAGGCTCTGCCCGCCGGCCCCGCCGACTCCACCATTGCCCAGCAGCCAGCCGCCGCTACCCCCGGCCCCACCGGCGGCGCCGGCCCCACCGGCCCCACCGGCCCCGCCGGTGCCGAACAACCCGGCGGCCCCGCCGGCCCCGCCGACTTGGCCGGGCGCGCCCGAGCCGCCGGCCCCACCGTTGCCCCACAAGATCCCGCCGGCCCCGCCGGCCTGCCCGGTGCCGGGTGCTCCAGCCGCCCCATCACCGATCAACGGGCGACCCAGCAACGCCTGGGTGGGCGCATTGAGGGCATTGAGCACGTTGTGCTCCAGCGTCGCCAACGGTGCGGCGTTGGTCGCCTCCGCGCTGACATACGAGCCGACCGCGGCGCTTAACGTCTGCGCAAATCGGTCATGAAACGCTGCCACCTGCGTGCTGATCGCCTGATACTCCCGAGCATGGCTGCCAAACAGCGTCGCGATCGCCGCCGACACCTCATCGGCGCCCGCGGCCAGCACGCTGGTGGTTGACCCCGCCGCCGCGCTGTTGGCTACACCGATCGATGACCCGATGCGCGCCACATCTAAGGCTGCGGCCGCCACCGTCTCCGGGGCCACGATCACCAACGACAT"

In [None]:
seqRS07850_Aer = "TCAGATGGTTGGATCGCCACCGGCGCCACCGGCGCCGCCCGCGCCACCAGCACCGCCGCTGCCATCTGGGTCCGTCGAGTCGCCGAGGACGCCGGCGCCGCCATTGTCGCCAAATACCGTGAGACCTAGCAGGGTGCCGGCGCCGCCCTTGCCGCCGGCCCCGCCGGCGCCGCCCAATCCACCGAAGCCCCTCCCTTCGGTGGGGTCGCTGCCGCCGTCGCCGCCGTCACCGCCCTTGCCGCCGGCCCCGCCGTCGCCGCCGGCTCCGGCGGCGACGATGATGTCCTGGCCGCCGGCGCCGCCGTCGCCGCCGGCTCCGGCGGTGCCGTCGCCGCCCTGGCCGCCGGCCCCGCCGTTTCCGCCGCCGCCGCCATCGCCGATGATGTTTTCCCCGCCCTTGCCGCCAGCCCCAGCGTTCCCGCCGGCTCCGCCACTGGCGCCGGTGCCGCCGGGTGCAACGGCGTTGGCGCCGTTACCGCCGTTGCCGCCTTTGCCCCCGGTGTCTGCAAAGTCGGGGGTCGCACCCTGCGCGGCGCGGGTCACGCCGTCACCGCTGAGCCCCCCGAGCCCGCCAGCGCCGCTGAAGCCAGGATTGCCGCCGTTGCCGCCATGGCCGCCGTTGGCACCGGGTGCGACGGCGTTGCCGCCGGTCCCGCCGACCCCACCGTTGCCGCCTTTACCACCGTCCTGGCCACGCTCGCCCGCGGTGGTGGCATTGGCACCCTCGGCACCACTACCACCGAGCCCGCCGTCTGCGCCGCGGCCGCCAGTCCCACCGGCCCCGCCATTGCCGGCGAGAGTTCCGCCGTCGCCGCCGGCGCCGCCCTGGCCGCCGTTGCCGCCGCTATTGCCTTTGCCACCGACTGCGCCCGAATCGCTCGCGTTCGTCCCTGCGGCGCCGTTGGCGCCGTTGCCGCCGGCCCCGCCGGCGCCGCCGTTGCCGACCAGCCCGCCATGGCCGCCGGGTCCGCCGTTGGCGCCGTTGGTGCCCGCGGTGGTGGCGTTGGCGCCGTTGCCGCCGGCACCGCCGTTGCCGCCGCTGGTGGGGGTGGCGCCGATGGCGCCCTGAGCGCCGGTGATGGAGCCGGCTCCGCCGGTGCCTCCGGCCCCGCCGGTGCCGGGGTTGCCGCCGTTGCCGCCGTGACCGCCGGCACCACCGTTGACGGCCTGGTTGCCGTTGGCGCCGGCTCCGCCGATACCGCCGTCCCCGCCGGCCCCGCCGGCACCGGCCAGCCAGCCACCCCGGCCCCCGGCCCCGCCATCGCCGGCGTCGCCGCCGGCCCCGCCGTTGCTACCGTCTGGGAAGATACCGCCCTTACCGGCGGCGCCGGCGATACCCGCAGCGCCGTGTCCGCCGGCACCACCGTGCCCGCCCACGCCCAACAGCCCGGCCGCACCCCCGACACCGCCGTGTCCACCCACACCACCGATCGGGCCGGGCCCGCCGGCACCTCCGTGCCCGCCGGCCCCGTAGAGGGTCCCGCCCAGGCCACCGGCACCACCGGTACCGCCGACCCCGCCGGGCCCGCCGGGCCCGCCGGGCCCGCCGGTTCCGCCGACCCCGAACAGTCCGGCGTTGCCGCCGGCCCCGCCGGTTGCCCCGCCCAGCAGGCTCTGCCCGCCGGCCCCGCCGACTCCACCATTGCCCAGCAGCCAGCCGCCGCTACCCCCGGCCCCACCGGCGGCGCCGGCCCCACCGGCCCCACCGGCCCCGCCGGTGCCGAACAACCCGGCGGCCCCGCCGGCCCCGCCGACTTGGCCGGGCGCGCCCGAGCCGCCGGCCCCACCGTTGCCCCACAAGATCCCGCCGGCCCCGCCGGCCTGCCCGGTGCCGGGTGCTCCAGCCGCCCCATCACCGATCAACGGGCGACCCAGCAACGCCTGGGTGGGCGCATTGAGGGCATTGAGCACGTTGTGCTCCAGCGTCGCCAACGGTGCGGCGTTGGTCGCCTCCGCGCTGACATACGAGCCGACCGCGGCGCTTAACGTCTGCGCAAATCGGTCATGAAACGCTGCCACCTGCGTGCTGATCGCCTGATACTCCCGAGCATGGCTGCCAAACAGCGTCGCGATCGCCGCCGACACCTCATCGGCGCCCGCGGCCAGCACGCTGGTGGTTGACCCCGCCGCCGCGCTGTTGGCTACACCGATCGATGACCCGATGCGCGCCACATCTAAGGCTGCGGCCGCCACCGTCTCCGGGGCCACGATCACCAACGACAT"

In [None]:
protein_translation_negativestrand_check(seqRS07850_ref, seqRS07850_Aer)

MSLVIVAPETVAAAALDVARIGSSIGVANSAAAGSTTSVLAAGADEVSAAIATLFGSHAREYQAISTQVAAFHDRFAQTLSAAVGSYVSAEATNAAPLATLEHNVLNALNAPTQALLGRPLIGDGAAGAPGTGQAGGAGGILWGNGGAGGSGAPGQVGGAGGAAGLFGTGGAGGAGGAGAAGGAGGSGGWLLGNGGVGGAGGQSLLGGATGGAGGNAGLFGVGGTGGPGGPGGPGGVGGTGGAGGLGGTLYGAGGHGGAGGPGPIGGVGGHGGVGGAAGLLGVGGHGGAGGHGAAGIAGAAGKGGIFPDGSNGGAGGDAGDGGAGGRGGWLAGAGGAGGDGGIGGAGANGNQAVNGGAGGHGGNGGNPGTGGAGGTGGAGSITGAQGAIGATPTSGGNGGAGGNGANATTAGTNGANGGPGGHGGLVGNGGAGGAGGNGANGAAGTNASDSGAVGGKGNSGGNGGQGGAGGDGGTLAGNGGAGGTGGRGADGGLGGSGAEGANATTAGERGQDGGKGGNGGVGGTGGNAVAPGANGGHGGNGGNPGFSGAGGLGGLSGDGVTRAAQGATPDFADTGGKGGNGGNGANAVAPGGTGASGGAGGNAGAGGKGGENIIGDGGGGGNGGAGGQGGDGTAGAGGDGGAGGQGGDGTAGAGGDGGAGGKGGDGGDGGSDPTEGRGFGGLGGAGGAGGKGGAGTLLGLTVFGDNGGAGVLGDSTDPDGSGGAGGAGGAGGAGGDPTI
MSLVIVAPETVAAAALDVARIGSSIGVANSAAAGSTTSVLAAGADEVSAAIATLFGSHAREYQAISTQVAAFHDRFAQTLSAAVGSYVSAEATNAAPLATLEHNVLNALNAPTQALLGRPLIGDGAAGAPGTGQAGGAGGILWGNGGAGGSGAPGQVGGAGGAAGLFGTGGAGGAGGAGAAGGAGGSGGWLLGNGGVGGAGGQSLLGGATGGAGGNAGLFGVGGTGGPGGPGGPGGVGGTGGAGGLGGTLYGAGGHGGA

[{'position': 646, 'reference sequence': 'G', 'BCG Aeras sequence': 'D'},
 {'position': 647, 'reference sequence': 'G', 'BCG Aeras sequence': 'I'},
 {'position': 648, 'reference sequence': 'D', 'BCG Aeras sequence': 'I'},
 {'position': 649, 'reference sequence': 'G', 'BCG Aeras sequence': 'V'},
 {'position': 650, 'reference sequence': 'T', 'BCG Aeras sequence': 'A'}]

In [None]:
seqRS03180_ref = "CTAGAAGCTGCCGCCGGCGCCGCCGCCCCCGCCTGCGCCCCCGGCCCCGCCGCGGCCGTCGGCGCCGGGGCTGCCGAACTGGCCAGGCTGGCCGGATTGGCCGATGATGGCCAGGGGCCCGAGGTGTGCGGTGCCGCCGGTGCCACCGGTGCCACCCTTACCGCCAGCCCCAGGGATCGGGAATAAACCGCCGGGGTCGGCCCCTTTGCCGCCGTCCCCACCTCGCCCGCCCGCCCCAGCGGTCCTGAAGCCGTCGCCACCGTGCCCGCCGTCCCCGCCATTCCCACCGGAACTGGCATCAAGGCCGTCGCCGCCGAAGCCGCCCCTTCCGCCGTCACCGCCGGCGCTGACGGTGCTGGTGCCGCCGGCGCCGCCCATGCCGCCGGTGCCGCCGGGGCCAAAGGCGGAGCCAAGGCCGCCACTGCCGCCGACGCCACCGTTTCCGGCGCGGCCGGCCGCCCCTGTCGCACCGGTCGCGCCCAGGGTGGAACCGGTGCCGCCGGCACCGCCGGCACCACCGGTGCCGCCGGTGCCGCCGGTGCCGCCATTTCCGCCAGTCCCGCCAGTGCCAGCGAGGCTGCTGAAGAGAGTGCCGTGGGCACCTCTGCCGCCGTCGCCGCCGGTGCCGCCGGTGCCGCCGGCGCCACCGGCCCCACCATCTCCGCCGGCGCCTTGGCTGCCGTTGTTGCCCGTTGGCGACAGCGCTTTGCCGCCGGCCCCGCCGTTGCCGCCGCCGCCGCCGGCGCCGCCGGTCCCGCCAACCCCGCCGGTGCCACCGTTACCGCCGTGACCGTCCGCGCCAGCGTCGAATGTGCCGGTCGCACCGGTGGCGCCGGTGGTGCCCCGCAGGCCCGTCCCGCCCGTGCCGCCGGCCCCGCCCCGGCCGCCGTCAGCGCCGTCGCCGGCGACGCTCCCACCTTGCCCGCCTACGCCGCCGTCGCCGCCGCGGCCGCCGCTGCCGGTAATGGCTCCGGGATTGCCGTCACTACCGGTGCCGCCGTCTCCGCCATTGCCGCCCGCTCCGCCGTTGCCAATCTGCCCGGCGTTTCCGCCGGCGCCACCGGTTCCGCCGTCACCGCCCATGCCCCTGCTGGCATTGCCGCCGTTGCCGCCGTGGCCGCCGGCCCCACCGCTGCCGCGCAGGCTGCCGTTGCCGCCGTTGCCGCCGTTGCCGCCGGCCGCGCCGTTGCCGCTGAGGGCATGGTCGCCGTTGCCGCCGTTGCCGCCGTTGCCGCCGTTGACGTGAATGCTGCTGCTTGAGCCGGTCGCACCGAAAGTGGAGCCGGCGCCGCCACTCCCGCCGGCCCCGCTGGGGCCGGCGTTGCCGCCGTTGCCGCCGTTGCCGCCGATGCCGTTGTTGGTGAACACGCTGCCGTTAGCGCCGTTGCCGCCGTCACCGGGGTCCCCGCCGGTGCCGCCGCTGCCGCCGTTGCCGCCGGCGCCTTGGCTGCCGGTTGTGCCCGCCGGCCCGGCCCCGCCCGGCCCGCCGGTCCCGCCTCGGCCGCCCTTTCCGCCGGCCCCGCCATCCTGGCCGCGGGCACCCGCGGTGGCGCCGTCGGCGCCGTCAATGCCGCGGCCGCCGTTACCGCCAACTCCGCCGGTCCCACCGTCGCCGCCGGCACCGCCGGGGCCTTGGCTGCCGGCGACGCCGTTGGGTGCGGCCCCGCCGTCCCCGCCGTCCCCACCTTTTCCGCCGGTACCGCCAACTCCGCCGGTGCCGCCGGGGTGCCCGTCCGCGCCCGCGCTGGAACCGTTGACACCGTCGCTGCCGGACCCTCCAGTCCCGCCGACGCCGCCGGTGCCGCCGGCCCCGCCGGTGCCACCGTTGCCCGCCCAGGCGCCGCCGGATCCACCGGCCCCACCGTTTCCGCCGGTGCCGCCATCCAGGCCGGGGTTGCCGAGCCTGCCCAGACCGGGCAGGCCTTTGCTGCCGTTGCCGCCGGCGCCGCCGGCGCCGCCGTTGCCGACCAAACCGCCATCACCGCCCCTGCCGCCGGACGCGCCGGTCTGGCCAAAGCCGGTGGCATCGGCGCCTCTGCCGCCGTTGCCGCCGTTGCCGCCGCTGGTGGGGGTGTTGCCGGGTGCGCCGTTGGCACCGGGGGTGGAGCCGCTTCCGCCCTGGCCGCCGGCACCGCCGACACCGGGATCACCGCCGTGGCCACCGGCGCCACCTACACCACCGTTGACACCGAGCGCGCCGGCGGCGCCGTGACCGCCGTTGCCAGGAGTCCCGCCGTTCCCGCCGGCTCCGCCGTCACCGCCAGCGCCCTGGCTGCCGTTCTGGCCCGAGGCGGCCAACGCGAGACCGCCGGCCCCGCCCTCGCCGCCGGCTCCGCCAGGCCCACCGTTACCGCCATTCCCGCCGGGTGAGCCTGCGGCCCCGGGAGCGGACGCATTGAAGCCGATGCTGCCAGCACCTCCGGATCCGCCATCGCCGCCGGCCCCGCCAGCACCTCCGGTGCCGCCGTCACCGGCCTGAGTTCCGCCGTTGCCGCCGGCCCCGCCGGTGCCGCCGGCCCCGCCGGGGCGACCGGGCGCTTCGGATCCAAATCCGAGACCGCCGGCCCCGCCGCGGCCACCGGCCCCACCGGCACCGCCATTACCCACCTGACCGCCGTCGCCACCCCTGCCACCGTTCGCGCCGGTCTGTCCGCTGCTGATAGCGTCGGCGCCTTTGCCGCCGTCGCCGCCGTTACCACCGCTGGTGGAGGTGGTGCCGGGCGCGCCGTTCGCGCCATGCGCGCTGCCGCCGACGCTGGCGCCACCGGCGCCACCGGCCCCACCGGCGCCCGGGTTGCCGCCATTGCCACCGGTCCCGCCGGCACCAAGGTTGTGACCCCACGTCCCGGTAGCGCCGTTGCCGCCGTCACCGGGAGCTCCGCCGTCACCGCCGCTACCGCCAGCCCCGCCGGCGCCGTGGCTGCCGCCGAGGCCGAGCAGACCGTGGCCGCCGCCGGGCCCGCCGACCCCGCCGGTCCCGCCAGCCCCACCATTCCCGCCGTTTCCGCCGGCTTGACCGTCAGCGCCCAAGTTGGTGGCGTGGGCGCCGCTGGCGCCCGCACCGCCGGCGCCGCCGGGCCCGCCCTCGCCGCCGGCCCCGCCGTTGCCGCCGTTGCCCATCAGCACCCCGCCGGCCCCGCCGGCCCCGCCGTTGCCGCCGATCCCGCCGGCCCCGCCAGCGGTGCCGGATCCACCCGGTGTGCTGGCCGACGTACCCGTGACACCGGCGATGCCGTTGCCTCCGGCCCCACCGGCCCCGCCGACACCGAACAACCCGGCGGTACCGCCGGCCCCGCCGTTGCCGCCGACCGCCCCGGCCCCGCCAAAACCCCCGGCGCCTCCGTTGCCATACAGCCACCCGCCCGCGCCGCCGTGACCACCGGCCCCGCCGGTGGTACCCACGCCGCCGGCTCCACCGTTGCCGCCGTTACCGATTAGGCCCGCCGCCCCGCCGGCCCCGCCTCGTTGTCCTGGCGCCCCAGACCCGCCGTTGCCGCCGTTGCCGTACAAGATGCCGCCTGGCCCGCCGGCCTGCCCGGTCCCGGGGGAGCCGTCGGCGCCGTTGCCGATCAGCGGGCGTCCGAACAGCGCCTGGGTGGGCGCATTGACCGCGGCTAGCAAACTCTGTTCAACGTTGACCGCCTCGGCGGCCACGTACGAGCTCGCGGCCGCGGACAGGGTCTGCACGAACCGGTCATGAAACGTCGCCACTTGGGCGCTGACGGTCTGATATTCCTGGGCGTGCGTGCCGAACAACGCCGCAACGGCCACCGACACCTCGTCGGCTGACGCGGGCAGCACTTTCGCCACCGCGGCCGCTGCGGTGTTGGCCGCAGTGATCGTCGAACCAATTTTCGCCAAATCCGTTGCCGCCGTGGTCAGCATCTCCGGCGTCGCGATTACGAACGACAT"

In [None]:
seqRS03180_Aer = "CTAGAAGCTGCCGCCGGCGCCGCCGCCCCCGCCTGCGCCCCCGGCCCCGCCGCGGCCGTCGGCGCCGGGGCTGCCGAACTGGCCAGGCTGGCCGGATTGGCCGATGATGGCCAGGGGCCCGAGGTGTGCGGTGCCGCCGGTGCCACCGGTGCCACCCTTACCGCCAGCCCCAGGGATCGGGAATAAACCGCCGGGGTCGGCCCCTTTGCCGCCGTCCCCACCTCGCCCGCCCGCCCCAGCGGTCCTGAAGCCGTCGCCACCGTGCCCGCCGTCCCCGCCATTCCCACCGGAACTGGCATCAAGGCCGTCGCCGCCGAAGCCGCCCCTTCCGCCGTCACCGCCGGCGCTGACGGTGCTGGTGCCGCCGGCGCCGCCCATGCCGCCGGTGCCGCCGGGGCCAAAGGCGGAGCCAAGGCCGCCACTGCCGCCGACGCCACCGTTTCCGGCGCGGCCGGCCGCCCCTGTCGCACCGGTCGCGCCCAGGGTGGAACCGGTGCCGCCGGCACCGCCGGCACCACCGGTGCCGCCGGTGCCGCCGGTGCCGCCATTTCCGCCAGTCCCGCCAGTGCCAGCGAGGCTGCTGAAGAGAGTGCCGTGGGCACCTCTGCCGCCGTCGCCGCCGGTGCCGCCGGTGCCGCCGGCGCCACCGGCCCCACCATCTCCGCCGGCGCCTTGGCTGCCGTTGTTGCCCGTTGGCGACAGCGCTTTGCCGCCGGCCCCGCCGTTGCCGCCGCCGCCGCCGGCGCCGCCGGTCCCGCCAACCCCGCCGGTGCCACCGTTACCGCCGTGACCGTCCGCGCCAGCGTCGAATGTGCCGGTCGCACCGGTGGCGCCGGTGGTGCCCCGCAGGCCCGTCCCGCCCGTGCCGCCGGCCCCGCCCCGGCCGCCGTCAGCGCCGTCGCCGGCGACGCTCCCACCTTGCCCGCCTACGCCGCCGTCGCCGCCGCGGCCGCCGCTGCCGGTAATGGCTCCGGGATTGCCGTCACTACCGGTGCCGCCGTCTCCGCCATTGCCGCCCGCTCCGCCGTTGCCAATCTGCCCGGCGTTTCCGCCGGCGCCACCGGTTCCGCCGTCACCGCCCATGCCCCTGCTGGCATTGCCGCCGTTGCCGCCGTGGCCGCCGGCCCCACCGCTGCCGCGCAGGCTGCCGTTGCCGCCGTTGCCGCCGTTGCCGCCGGCCGCGCCGTTGCCGCTGAGGGCATGGTCGCCGTTGCCGCCGTTGCCGCCGTTGCCGCCGTTGACGTGAATGCTGCTGCTTGAGCCGGTCGCACCGAAAGTGGAGCCGGCGCCGCCACTCCCGCCGGCCCCGCTGGGGCCGGCGTTGCCGCCGTTGCCGCCGTTGCCGCCGATGCCGTTGTTGGTGAACACGCTGCCGTTAGCGCCGTTGCCGCCGTCACCGGGGTCCCCGCCGGTGCCGCCGCTGCCGCCGTTGCCGCCGGCGCCTTGGCTGCCGGTTGTGCCCGCCGGCCCGGCCCCGCCCGGCCCGCCGGTCCCGCCTCGGCCGCCCTTTCCGCCGGCCCCGCCATCCTGGCCGCGGGCACCCGCGGTGGCGCCGTCGGCGCCGTCAATGCCGCGGCCGCCGTTACCGCCAACTCCGCCGGTCCCACCGTCGCCGCCGGCACCGCCGGGGCCTTGGCTGCCGGCGACGCCGTTGGGTGCGGCCCCGCCGTCCCCGCCGTCCCCACCTTTTCCGCCGGTACCGCCAACTCCGCCGGTGCCGCCGGGGTGCCCGTCCGCGCCCGCGCTGGAACCGTTGACACCGTCGCTGCCGGACCCTCCAGTCCCGCCGACGCCGCCGGTGCCGCCGGCCCCGCCGGTGCCACCGTTGCCCGCCCAGGCGCCGCCGGATCCACCGGCCCCACCGTTTCCGCCGGTGCCGCCATCCAGGCCGGGGTTGCCGAGCCTGCCCAGACCGGGCAGGCCTTTGCTGCCGTTGCCGCCGGCGCCGCCGGCGCCGCCGTTGCCGACCAAACCGCCATCACCGCCCCTGCCGCCGGACGCGCCGGTCTGGCCAAAGCCGGTGGCATCGGCGCCTCTGCCGCCGTTGCCGCCGTTGCCGCCGCTGGTGGGGGTGTTGCCGGGTGCGCCGTTGGCACCGGGGGTGGAGCCGCTTCCGCCCTGGCCGCCGGCACCGCCGACACCGGGATCACCGCCGTGGCCACCGGCGCCACCTACACCACCGTTGACACCGAGCGCGCCGGCGGCGCCGTGACCGCCGTTGCCAGGAGTCCCGCCGTTCCCGCCGGCTCCGCCGTCACCGCCAGCGCCCTGGCTGCCGTTCTGGCCCGAGGCGGCCAACGCGAGACCGCCGGCCCCGCCCTCGCCGCCGGCTCCGCCAGGCCCACCGTTACCGCCATTCCCGCCGGGTGAGCCTGCGGCCCCGGGAGCGGACGCATTGAAGCCGATGCTGCCAGCACCTCCGGATCCGCCATCGCCGCCGGCCCCGCCAGCACCTCCGGTGCCGCCGTCACCGGCCTGAGTTCCGCCGTTGCCGCCGGCCCCGCCGGTGCCGCCGGCCCCGCCGGGGCGACCGGGCGCTTCGGATCCAAATCCGAGACCGCCGGCCCCGCCGCGGCCACCGGCCCCACCGGCACCGCCATTACCCACCTGACCGCCGTCGCCACCCCTGCCACCGTTCGCGCCGGTCTGTCCGCTGCTGATAGCGTCGGCGCCTTTGCCGCCGTCGCCGCCGTTACCACCGCTGGTGGAGGTGGTGCCGGGCGCGCCGTTCGCGCCGTGGGCGCTGCCGCCGACGCTGGCGCCACCGGCGCCACCGGCCCCACCGGCGCCCGGGTTGCCGCCATTGCCACCGGTCCCGCCGGCACCAAGGTTGTGACCCCACGTCCCGGTAGCGCCGTTGCCGCCGTCACCGGGAGCTCCGCCGTCACCGCCGCTACCGCCAGCCCCGCCGGCGCCGTGGCTGCCGCCGAGGCCGAGCAGACCGTGGCCGCCGCCGGGCCCGCCGACCCCGCCGGTCCCGCCAGCCCCACCATTCCCGCCGTTTCCGCCGGCTTGACCGTCAGCGCCCAAGTTGGTGGCGTGGGCGCCGCTGGCGCCCGCACCGCCGGCGCCGCCGGGCCCGCCCTCGCCGCCGGCCCCGCCGTTGCCGCCGTTGCCCATCAGCACCCCGCCGGCCCCGCCGGCCCCGCCGTTGCCGCCGATCCCGCCGGCCCCGCCAGCGGTGCCGGATCCACCCGGTGTGCTGGCCGACGTACCCGTGACACCGGCGATGCCGTTGCCTCCGGCCCCACCGGCCCCGCCGACACCGAACAACCCGGCGGTACCGCCGGCCCCGCCGTTGCCGCCGACCGCCCCGGCCCCGCCAAAACCCCCGGCGCCTCCGTTGCCATACAGCCACCCGCCCGCGCCGCCGTGACCACCGGCCCCGCCGGTGGTACCCACGCCGCCGGCTCCACCGTTGCCGCCGTTACCGATTAGGCCCGCCGCCCCGCCGGCCCCGCCTCGTTGTCCTGGCGCCCCAGACCCGCCGTTGCCGCCGTTGCCGTACAAGATGCCGCCTGGCCCGCCGGCCTGCCCGGTCCCGGGGGAGCCGTCGGCGCCGTTGCCGATCAGCGGGCGTCCGAACAGCGCCTGGGTGGGCGCATTGACCGCGGCTAGCAAACTCTGTTCAACGTTGACCGCCTCGGCGGCCACGTACGAGCTCGCGGCCGCGGACAGGGTCTGCACGAACCGGTCATGAAACGTCGCCACTTGGGCGCTGACGGTCTGATATTCCTGGGCGTGCGTGCCGAACAACGCCGCAACGGCCACCGACACCTCGTCGGCTGACGCGGGCAGCACTTTCGCCACCGCGGCCGCTGCGGTGTTGGCCGCAGTGATCGTCGAACCAATTTTCGCCAAATCCGTTGCCGCCGTGGTCAGCATCTCCGGCGTCGCGATTACGAACGACAT"

In [None]:
protein_translation_negativestrand_check(seqRS03180_ref,seqRS03180_Aer)

MSFVIATPEMLTTAATDLAKIGSTITAANTAAAAVAKVLPASADEVSVAVAALFGTHAQEYQTVSAQVATFHDRFVQTLSAAASSYVAAEAVNVEQSLLAAVNAPTQALFGRPLIGNGADGSPGTGQAGGPGGILYGNGGNGGSGAPGQRGGAGGAAGLIGNGGNGGAGGVGTTGGAGGHGGAGGWLYGNGGAGGFGGAGAVGGNGGAGGTAGLFGVGGAGGAGGNGIAGVTGTSASTPGGSGTAGGAGGIGGNGGAGGAGGVLMGNGGNGGAGGEGGPGGAGGAGASGAHATNLGADGQAGGNGGNGGAGGTGGVGGPGGGHGLLGLGGSHGAGGAGGSGGDGGAPGDGGNGATGTWGHNLGAGGTGGNGGNPGAGGAGGAGGASVGGSAHGANGAPGTTSTSGGNGGDGGKGADAISSGQTGANGGRGGDGGQVGNGGAGGAGGRGGAGGLGFGSEAPGRPGGAGGTGGAGGNGGTQAGDGGTGGAGGAGGDGGSGGAGSIGFNASAPGAAGSPGGNGGNGGPGGAGGEGGAGGLALAASGQNGSQGAGGDGGAGGNGGTPGNGGHGAAGALGVNGGVGGAGGHGGDPGVGGAGGQGGSGSTPGANGAPGNTPTSGGNGGNGGRGADATGFGQTGASGGRGGDGGLVGNGGAGGAGGNGSKGLPGLGRLGNPGLDGGTGGNGGAGGSGGAWAGNGGTGGAGGTGGVGGTGGSGSDGVNGSSAGADGHPGGTGGVGGTGGKGGDGGDGGAAPNGVAGSQGPGGAGGDGGTGGVGGNGGRGIDGADGATAGARGQDGGAGGKGGRGGTGGPGGAGPAGTTGSQGAGGNGGSGGTGGDPGDGGNGANGSVFTNNGIGGNGGNGGNAGPSGAGGSGGAGSTFGATGSSSSIHVNGGNGGNGGNGDHALSGNGAAGGNGGNGGNGSLRGSGGAGGHGGNGGNASRGMGGDGGTGGAGGNAGQIGNGGAGGNGGDGGTGSDGNPGAITGSGGRGGDGGVGGQGG

In [None]:
seqFCU26_2107_ref = "TCAGTGATCACCAACCCGTGTGGCACGTGGCGACCGGCGACCGGCGAGCCCGCATCGCACCAGGTATCGAGGAACTCGGACCCACCCTGGTCGAAACGGTACGCCGCCGCGACGCACTGCCCCGCATCGCCCAAGCCGTAGTAGTGGCCGCCACCCGCAACTACGGCGTCCCCGACAACGAAACCGACCTACTGCGGTCGCCCAGGCCAAGGTGGCCACCAAACGCTGCTGGCATGCAGGTGGAGTGCACAGACACGGCAGCTGCAATAGCCTTACGCGGGTGACCAACACCCCCCCCCCCCCCCCCCCCCCACCCACCACAGGACAATGGACACCAACCCACCCCCCAGCGCCGCCGCGTTCACGCAATTGGCCGTTGGCGGCGGTGGCCAGCGTCGCGATTGCCGCGGTTGTGCTGGGTGCCGCAGCTTTAATCGTGGCACTGACGCGCCCGACGAACAGCGGTCCAGCCACCGCCGCTGGAACGACCGCCGAGCCGACATACACCGCAGCAGAAACCGCCGCCGCGCACCAAAAGTTATGCGAGGTGTACAAACTGGCAGCGCGGGCGGTCCAAATCGCGACAAACGGCGACAACCCGGCGTTCGCAAACAT"

In [None]:
seqFCU26_2107_Aer = "TCAGTGATCACCAACCCGTGTGGCACGTGGCGACCGGCGACCGGCGAGCCCGCATCGCACCAGGTATCGAGGAACTCGGACCCACCCTGGTCGAAACGGTACGCCGCCGCGACGCACTGCCCCGCATCGCCCAAGCCGTAGTAGTGGCCGCCACCCGCAACTACGGCGTCCCCGACAACGAAACCGACCTACTGCGGTCGCCCAGGCCAAGGTGGCCACCAAACGCTGCTGGCATGCAGGTGGAGTGCACAGACACGGCAGCTGCAATAGCCTTACGCGGGTGACCAACACCCCCCCCCCCCC---------ACCCACCACAGGACAATGGACACCAACCCACCCCCCAGCGCCGCCGCGTTCACGCAATTGGCCGTTGGCGGCGGTGGCCAGCGTCGCGATTGCCGCGGTTGTGCTGGGTGCCGCAGCTTTAATCGTGGCACTGACGCGCCCGACGAACAGCGGTCCAGCCACCGCCGCTGGAACGACCGCCGAGCCGACATACACCGCAGCAGAAACCGCCGCCGCGCACCAAAAGTTATGCGAGGTGTACAAACTGGCAGCGCGGGCGGTCCAAATCGCGACAAACGGCGACAACCCGGCGTTCGCAAACAT"

In [None]:
protein_translation_negativestrand_check(seqFCU26_2107_ref, seqFCU26_2107_Aer)

MFANAGLSPFVAIWTARAASLYTSHNFWCAAAVSAAVYVGSAVVPAAVAGPLFVGRVSATIKAAAPSTTAAIATLATAANGQLRERGGAGGWVGVHCPVVGGGGGGGGVLVTRVRLLQLPCLCTPPACQQRLVATLAWATAVGRFRCRGRRSCGWRPLLRLGRCGAVRRGGVPFRPGWVRVPRYLVRCGLAGRRSPRATRVGDH
MFANAGLSPFVAIWTARAASLYTSHNFWCAAAVSAAVYVGSAVVPAAVAGPLFVGRVSATIKAAAPSTTAAIATLATAANGQLRERGGAGGWVGVHCPVVGGGGGVLVTRVRLLQLPCLCTPPACQQRLVATLAWATAVGRFRCRGRRSCGWRPLLRLGRCGAVRRGGVPFRPGWVRVPRYLVRCGLAGRRSPRATRVGDH
the protein sequence of both reference and BCG Aeres have different length


[{'position': 105, 'reference sequence': 'G', 'BCG Aeras sequence': 'V'},
 {'position': 106, 'reference sequence': 'G', 'BCG Aeras sequence': 'L'},
 {'position': 107, 'reference sequence': 'G', 'BCG Aeras sequence': 'V'},
 {'position': 108, 'reference sequence': 'V', 'BCG Aeras sequence': 'T'},
 {'position': 109, 'reference sequence': 'L', 'BCG Aeras sequence': 'R'},
 {'position': 111, 'reference sequence': 'T', 'BCG Aeras sequence': 'R'},
 {'position': 112, 'reference sequence': 'R', 'BCG Aeras sequence': 'L'},
 {'position': 113, 'reference sequence': 'V', 'BCG Aeras sequence': 'L'},
 {'position': 114, 'reference sequence': 'R', 'BCG Aeras sequence': 'Q'},
 {'position': 116, 'reference sequence': 'L', 'BCG Aeras sequence': 'P'},
 {'position': 117, 'reference sequence': 'Q', 'BCG Aeras sequence': 'C'},
 {'position': 119, 'reference sequence': 'P', 'BCG Aeras sequence': 'C'},
 {'position': 120, 'reference sequence': 'C', 'BCG Aeras sequence': 'T'},
 {'position': 121, 'reference sequence