IMPORT THE INPUT FILE

In [2]:
# Import the Google Colab drive lib
from google.colab import drive
drive.mount('/content/drive')

# Read the input file.
# The context manager closes the handle as soon as the contents are read.
# `input_path` keeps its original name; after this statement it refers to the
# (closed) file object, not to a path string.
# NOTE(review): `input` shadows the built-in input() for the rest of the
# notebook; the name is kept so any other cell that reads it keeps working.
with open('/content/drive/My Drive/UFRN/10 - Cursos online/Finding Hidden Messages in DNA (Bioinformatics I)/1.2 Hidden Messages in the Replication Origin/Code challenge/PatternToNumber_debug.txt', 'r') as input_path:
  input = input_path.read()

# Print the input
print(input)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
CTTCTCACGTACAACAAAATC


THE CODE LOGIC BEGINS HERE!

In [3]:
# ======================================
# Prefix function
# ======================================
def Prefix(input_):
  """Return the pattern without its final symbol, upper-cased.

  An empty or single-character pattern yields the empty string.
  """
  # Dropping the last element with a negative stop index.
  return input_[:-1].upper()

# ======================================
# LastSymbol function
# ======================================
def LastSymbol(input_):
  """Return the final symbol of the pattern, upper-cased.

  Raises IndexError when the pattern is empty.
  """
  final_position = len(input_) - 1
  final_symbol = input_[final_position]
  return final_symbol.upper()

# ======================================
# SymbolToNumber function
# ======================================
def SymbolToNumber(input_):
  """Translate nucleotide symbols into their lexicographic digits.

  Each base maps to one digit (A=0, C=1, G=2, T=3), case-insensitively. The
  digits are concatenated in order and the resulting text is parsed as a
  base-10 integer, so "CA" gives 10 and leading zeros vanish ("AC" gives 1).

  Returns:
    The parsed integer, or -1 as soon as a character other than A/C/G/T is
    encountered.

  Raises:
    ValueError: if input_ is empty (an empty digit string cannot be parsed).
  """
  # Lexicographic order of the bases; built once per call rather than once
  # per character.
  switcher = {
    "A": "0",
    "C": "1",
    "G": "2",
    "T": "3"
  }

  digits = []
  for symbol in input_.upper():
    digit = switcher.get(symbol)

    # Verify if the current base is valid!
    if digit is None:
      return -1
    digits.append(digit)

  # Return the final result
  return int("".join(digits))

# ======================================
# NumberToSymbol function
# ======================================
def NumberToSymbol(input_):
  """Translate digits into nucleotide symbols (0=A, 1=C, 2=G, 3=T).

  The argument is converted with str() and every character of that text is
  mapped independently, so 10 gives "CA" and "213" gives "GCT".

  Returns:
    The string of symbols, or the integer -1 as soon as a character other
    than 0/1/2/3 is encountered (this includes the minus sign of a negative
    number). An empty string maps to an empty string.
  """
  # Lexicographic order of the bases; built once per call rather than once
  # per character.
  switcher = {
    "0": "A",
    "1": "C",
    "2": "G",
    "3": "T"
  }

  symbols = []
  for digit in str(input_):
    symbol = switcher.get(digit)

    # Verify if the current digit is valid!
    if symbol is None:
      return -1
    symbols.append(symbol)

  # Return the final result
  return "".join(symbols)

In [4]:
# ======================================
# PatternToNumber function
# ======================================
def PatternToNumber(input_):
  """Return the lexicographic index of a DNA pattern.

  The pattern is read as a base-4 number with digits A=0, C=1, G=2, T=3
  (case-insensitive), most significant symbol first. The empty pattern has
  index 0.

  The index is accumulated left to right in a loop, which evaluates the
  recurrence 4 * PatternToNumber(prefix) + digit(last symbol) without
  recursing, so the pattern length is not bounded by the interpreter's
  recursion limit and no intermediate prefixes are copied.

  Raises:
    ValueError: if the pattern contains a character other than A/C/G/T.
      Such a character cannot be given a meaningful index.
  """
  digit_of = {"A": 0, "C": 1, "G": 2, "T": 3}

  number = 0
  for symbol in input_.upper():
    if symbol not in digit_of:
      raise ValueError("Invalid base in pattern: " + repr(symbol))

    # Shift the digits seen so far one base-4 place and add the new one
    number = 4 * number + digit_of[symbol]

  return number

# ======================================
# NumberToPattern function
# ======================================
def NumberToPattern(index_, k_):
  """Return the k-mer of length k_ whose lexicographic index is index_.

  This is the inverse of reading a pattern as a base-4 number with digits
  A=0, C=1, G=2, T=3: index_ is repeatedly split into quotient and remainder
  by 4, each remainder giving one symbol from the right end of the pattern.
  The result is left-padded with "A" up to k_ symbols.

  The digits are extracted in a loop, so k_ is not bounded by the
  interpreter's recursion limit.

  Args:
    index_: integer index, 0 <= index_ < 4 ** k_.
    k_: length of the pattern, k_ >= 0 (k_ == 0 yields the empty string).

  Raises:
    ValueError: if k_ is negative or index_ is outside [0, 4 ** k_). No
      k-mer of that length corresponds to such an index.
  """
  if k_ < 0:
    raise ValueError("k_ must be non-negative, got " + str(k_))
  if index_ < 0 or index_ >= 4 ** k_:
    raise ValueError(
      "index_ " + str(index_) + " is out of range for k_ = " + str(k_))

  symbols = []
  for _ in range(k_):
    # Quotient(index, 4) feeds the next round; Remainder(index, 4) is the
    # digit of the right-most symbol not yet emitted
    index_, remainder = divmod(index_, 4)
    symbols.append("ACGT"[remainder])

  # Symbols were produced from last to first
  symbols.reverse()
  return "".join(symbols)

In [39]:
# =============================================================================
# FindingFrequentWordsBySorting  function
#
# [Loop1] + [Sorting] + [Loop2] + [Loop3] = [|Text|*k] + [|Text|*log|Text|] + 
# [|Text|] + [|Text|*k]
#
# O(|Text|*k + |Text|*log|Text|) -> O(n*log(n) + n*k)
# =============================================================================
def FindingFrequentWordsBySorting(input_, k_):
  """Return the most frequent k-mers of a text, found by sorting indices.

  Every k-mer of input_ is converted to its index with PatternToNumber, the
  indices are sorted so equal k-mers become adjacent, and run lengths are
  counted along the sorted list. The k-mers whose run reaches the maximum
  length are converted back with NumberToPattern.

  Args:
    input_: the text to scan.
    k_: k-mer length, must be at least 1.

  Returns:
    A list of the most frequent k-mers, each listed once, ordered by
    ascending index. The list is empty when the text is shorter than k_
    (which includes the empty text), since it then contains no k-mer.

  Raises:
    ValueError: if k_ is smaller than 1.
  """
  if k_ < 1:
    raise ValueError("k_ must be a positive integer, got " + str(k_))

  # Number of k-mer start positions in the text
  loopLimit = len(input_) - (k_ - 1)

  # No k-mer fits in the text: nothing can be frequent
  if loopLimit <= 0:
    return []

  # Loop 1 + sort: index of every k-mer, in ascending order
  sortedIndex = sorted(
    PatternToNumber(input_[idx:(idx + k_)]) for idx in range(loopLimit))

  # Initialize every k-mer count with 1
  count = [1] * loopLimit

  # Loop 2: Count the repeated k-mers. Within a run of equal indices the
  # count grows by one per position, so only the last position of a run
  # holds the full number of occurrences.
  for idx in range(1, loopLimit):
    if sortedIndex[idx] == sortedIndex[idx - 1]:
      count[idx] = count[idx - 1] + 1

  # Get the max occurrences
  maxCount = max(count)

  # Loop 3: Select the most frequent k-mers. Because only the end of a run
  # carries the full count, each k-mer is emitted exactly once.
  return [
    NumberToPattern(sortedIndex[idx], k_)
    for idx in range(loopLimit)
    if count[idx] == maxCount
  ]

In [41]:
# ======================================
# Test section
# ======================================
# Text searched by every test below; the printed labels are built from it so
# that each label shows the input that was really used.
test_text = "AAGCAAAGGTGGG"

# Test 1
output1 = FindingFrequentWordsBySorting(test_text, 2)
print("(" + test_text + ", 2) = " + str(output1))

# Test 2
output2 = FindingFrequentWordsBySorting(test_text, 3)
print("(" + test_text + ", 3) = " + str(output2))

# Test 3
output3 = FindingFrequentWordsBySorting(test_text, 4)
print("(" + test_text + ", 4) = " + str(output3))

# Test 4
output4 = FindingFrequentWordsBySorting(test_text, 5)
print("(" + test_text + ", 5) = " + str(output4))

(ACGCGGCTCTGAAA, 2) = ['AA', 'GG']
(ACGCGGCTCTGAAA, 3) = ['AAG']
(ACGCGGCTCTGAAA, 4) = ['AAAG', 'AAGC', 'AAGG', 'AGCA', 'AGGT', 'CAAA', 'GCAA', 'GGTG', 'GTGG', 'TGGG']
(ACGCGGCTCTGAAA, 5) = ['AAAGG', 'AAGCA', 'AAGGT', 'AGCAA', 'AGGTG', 'CAAAG', 'GCAAA', 'GGTGG', 'GTGGG']
