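1. **GetProfileMatrix(motifs)**: Given a list of equal-length motifs, returns a dictionary that maps each nucleotide (A, C, G, T) to its per-position values in the profile matrix of those motifs. Pseudocounts are applied: every nucleotide count starts at 1.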
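2. **GetProfMostProbKmer(genome, k, prof_dict)**: Given a genome, a k-mer size k, and a profile matrix (prof_dict), returns the most probable k-mer in the genome; when several k-mers tie, the first one is returned. The profile matrix is expected to include pseudocounts (GetProfileMatrix adds them).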
3. **GetProfMostProbMotifs(DNA, prof_dict)**: Given DNA (a list of genomes) and a profile matrix (prof_dict), returns a list containing the first most probable k-mer of each genome. The k-mer size is not passed in; it is taken from the number of columns in the profile matrix. Uses GetProfMostProbKmer(genome, k, prof_dict).
4. **GetMotifsScore(motifs)**: Given a list of motifs, returns the score of those motifs. For every column of the motif matrix, the score counts the sites that differ from that column's most frequent nucleotide; these counts are summed over all columns.
5. **GetRandomizedMotifSearch(DNA, k, t)**: Given DNA, the k-mer size k, and the number of genomes t in the DNA, returns a list of motifs. Starting from one randomly selected k-mer per genome, it builds a profile matrix from the current motifs and then uses that profile to pick new motifs with GetProfMostProbMotifs. This process repeats for as long as the score of the motifs keeps improving. Remember to run it many times from different random starts to get better motifs.
6. **GetProfProbGeneList(genome, k, prof_dict)**: Given a genome, k, and a profile matrix prof_dict, returns a list with the probability of every k-mer in that genome, ordered by starting position.
7. **GetGibbsSampler(DNA, k, t, N)**: Given DNA, k, the number of genomes t in the DNA, and the number of iterations N, returns a list of motifs. It starts from one randomly selected k-mer per genome. In each iteration it discards one randomly chosen motif and builds a profile matrix from the remaining motifs. It passes this profile matrix to GetProfProbGeneList to obtain the probability of every k-mer in the genome whose motif was discarded, and uses these probabilities as weights to randomly choose one k-mer (a biased choice). This k-mer is put back into the list of motifs and the score of the list is evaluated. The process is repeated N times. Finally, it returns the best-scoring motifs found within these N iterations.

In [None]:
# Mount Google Drive inside the Colab runtime so files under /content/drive can be read.
from google.colab import drive
drive.mount('/content/drive')
# Base folder of the input data; the Check* functions append a relative
# path such as "Week4/rms1.txt" to this string (note the trailing slash).
main_directory = '/content/drive/My Drive/Colab Notebooks/Bioinformatics Code Challenges/Data/'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
def GetProfileMatrix(motifs):
  """Build a pseudocount-smoothed profile matrix from a list of motifs.

  Every nucleotide count in a column starts at 1 (a pseudocount), so no
  entry of the profile is ever zero. Each column is divided by
  ``len(motifs) + 4`` -- the real observations plus the four pseudocounts --
  so the four values of a column sum to 1.

  Args:
    motifs: Non-empty list of strings over 'ACGT'. The length of the first
      motif determines the number of columns of the profile.

  Returns:
    A dict mapping each of 'A', 'C', 'G', 'T' to a list of k floats, where
    entry j is the smoothed frequency of that nucleotide at position j.

  Raises:
    IndexError: If motifs is empty, or a motif is shorter than the first one.
    KeyError: If a motif contains a character other than A, C, G or T.
  """
  nucleotides = 'ACGT'
  # One pseudocount per nucleotide adds len(nucleotides) to each column total.
  column_total = len(motifs) + len(nucleotides)
  k = len(motifs[0])
  prof_dict = {s: [] for s in nucleotides}
  for col in range(k):
    counts = {s: 1.0 for s in nucleotides}
    for motif in motifs:
      counts[motif[col]] += 1
    for s in nucleotides:
      prof_dict[s].append(counts[s] / column_total)
  return prof_dict

def GetProfMostProbKmer(genome, k, prof_dict):
  """Return the first k-mer of genome with the highest profile probability.

  The probability of a k-mer is the product, over its positions j, of
  ``prof_dict[nucleotide][j]``.

  Args:
    genome: String to scan.
    k: Length of the k-mers to consider.
    prof_dict: Dict mapping each nucleotide to a list of per-position values.

  Returns:
    The best-scoring k-mer; the earliest one wins when several tie. An empty
    string is returned when genome is shorter than k.
  """
  best_kmer = ''
  best_prob = -1
  last_start = len(genome) - k
  start = 0
  while start <= last_start:
    candidate = genome[start:start + k]
    prob = 1
    for pos, base in enumerate(candidate):
      prob *= prof_dict[base][pos]
    # Strict comparison: a later k-mer with an equal probability never
    # replaces an earlier one.
    if prob > best_prob:
      best_prob, best_kmer = prob, candidate
    start += 1
  return best_kmer

def GetProfMostProbMotifs(DNA, prof_dict):
  """Return the profile-most-probable k-mer of every genome in DNA.

  Args:
    DNA: Iterable of genome strings.
    prof_dict: Profile matrix as a dict of per-position lists; the length of
      its 'A' row is used as the k-mer size.

  Returns:
    A list with one k-mer per genome, in the same order as DNA, each chosen
    by GetProfMostProbKmer.
  """
  kmer_len = len(prof_dict['A'])
  return [GetProfMostProbKmer(seq, kmer_len, prof_dict) for seq in DNA]

def GetMotifsScore(motifs):
  """Score a list of motifs; lower means more conserved.

  For each column (position) the nucleotides are tallied, and the number of
  motifs that do NOT carry the column's most frequent nucleotide is added to
  the score.

  Args:
    motifs: Non-empty list of strings over 'ACGT'. The length of the first
      motif determines how many columns are scored.

  Returns:
    The integer score summed over all columns.

  Raises:
    IndexError: If motifs is empty, or a motif is shorter than the first one.
    KeyError: If a motif contains a character other than A, C, G or T.
  """
  motif_count = len(motifs)
  width = len(motifs[0])
  total = 0
  for col in range(width):
    tally = dict.fromkeys('ACGT', 0)
    for motif in motifs:
      tally[motif[col]] += 1
    # Everything except the majority nucleotide counts as a mismatch.
    total += motif_count - max(tally.values())
  return total

# RandomizedMotifSearch
def GetRandomizedMotifSearch(DNA, k, t):
  """Run one randomized motif search from a random starting point.

  One k-mer is drawn uniformly at random from every genome. The motifs are
  then refined repeatedly: a profile is built from the current motifs
  (GetProfileMatrix) and the most probable k-mer of each genome under that
  profile becomes the new motif set (GetProfMostProbMotifs). Refinement
  stops at the first round that does not strictly lower the score
  (GetMotifsScore).

  Args:
    DNA: List of genome strings, each at least k characters long.
    k: Motif length.
    t: Not used by the body; kept so the signature stays the same.

  Returns:
    The best-scoring list of motifs seen during this run.
  """
  import random
  motifs = [
    seq[start:start + k]
    for seq in DNA
    for start in (random.randint(0, len(seq) - k),)
  ]
  best_motifs = list(motifs)
  best_score = GetMotifsScore(best_motifs)
  improved = True
  while improved:
    motifs = GetProfMostProbMotifs(DNA, GetProfileMatrix(motifs))
    score = GetMotifsScore(motifs)
    improved = score < best_score
    if improved:
      best_score, best_motifs = score, motifs
  return best_motifs


In [None]:
def CheckGetRandomizedMotifSearch():
  """Run GetRandomizedMotifSearch on Week4/rms1.txt and print the best motifs.

  The data file is read from ``main_directory``. Its first line is skipped
  (presumably a header holding k and t -- the values are hard-coded here
  instead); every remaining non-blank line is treated as one genome.
  The search is run 1001 times from random starts and the lowest-scoring
  motif set is printed, one motif per line.
  """
  k = 15
  t = 20
  # The context manager closes the file even if reading raises.
  with open(main_directory+"Week4/rms1.txt", "r") as file:
    lines = file.readlines()
  # Blank lines (e.g. a trailing empty line) are dropped: an empty genome
  # would make the random start-position draw fail.
  DNA = [line.strip() for line in lines[1:] if line.strip()]
  best_motifs = GetRandomizedMotifSearch(DNA, k, t)
  best_score = GetMotifsScore(best_motifs)
  for _ in range(1000):
    motifs = GetRandomizedMotifSearch(DNA, k, t)
    motifs_score = GetMotifsScore(motifs)
    if motifs_score < best_score:
      best_score = motifs_score
      best_motifs = motifs
  for m in best_motifs:
    print(m)
  
# Run the randomized-motif-search check; it reads Week4/rms1.txt under main_directory.
CheckGetRandomizedMotifSearch()

In [None]:
# Evaluates 1 - (10 * 585**9 / 586**10 + (585/586)**10).
# Algebraically this is the binomial tail P(X >= 2) for 10 independent trials
# with success chance 1/586: the first term is P(X == 1), the second P(X == 0).
# NOTE(review): presumably 586 is the number of k-mer start positions per
# string in the exercise this cell answers -- confirm against the exercise.
x =((585**9/ 586**10)*10 + (585/586)**10 )
x = 1 - x
print(x)

0.00012985670567622343


In [None]:
def GetProfProbGeneList(genome, k, prof_dict):
  """Return the profile probability of every k-mer in genome.

  Args:
    genome: String to scan.
    k: Length of the k-mers.
    prof_dict: Dict mapping each nucleotide to a list of per-position values.

  Returns:
    A list whose entry i is the product of ``prof_dict[nucleotide][j]`` over
    the positions j of the k-mer starting at index i. The list is empty when
    genome is shorter than k.
  """
  def kmer_probability(start):
    # Multiply the per-position profile values of the k-mer at `start`.
    prob = 1
    for offset, base in enumerate(genome[start:start + k]):
      prob *= prof_dict[base][offset]
    return prob

  return [kmer_probability(start) for start in range(len(genome) - k + 1)]

# GibbsSampler(Dna, k, t, N)
def GetGibbsSampler(DNA, k, t, N):
  """Run one Gibbs-sampling motif search of N iterations.

  One k-mer is drawn uniformly at random from every genome. In each
  iteration a genome index is drawn from 0..t-1, its motif is left out, a
  profile is built from the remaining motifs (GetProfileMatrix), and a new
  k-mer for that genome is drawn with weights given by
  GetProfProbGeneList. The motif list is scored after every replacement
  (GetMotifsScore) and a copy is kept whenever the score strictly improves.

  Args:
    DNA: List of genome strings, each at least k characters long.
    k: Motif length.
    t: Number of genomes; indices 0..t-1 are sampled.
    N: Number of sampling iterations.

  Returns:
    The best-scoring list of motifs seen during the run.
  """
  import random
  motifs = []
  for seq in DNA:
    start = random.randint(0, len(seq) - k)
    motifs.append(seq[start:start + k])
  best_motifs = list(motifs)
  best_score = GetMotifsScore(best_motifs)
  for _ in range(N):
    held_out = random.randint(0, t - 1)
    others = [m for idx, m in enumerate(motifs) if idx != held_out]
    profile = GetProfileMatrix(others)
    weights = GetProfProbGeneList(DNA[held_out], k, profile)
    (start,) = random.choices(range(len(weights)), weights=weights, k=1)
    motifs[held_out] = DNA[held_out][start:start + k]
    score = GetMotifsScore(motifs)
    if score < best_score:
      best_score = score
      # Snapshot the list: `motifs` keeps being mutated in later iterations.
      best_motifs = list(motifs)
  return best_motifs


In [None]:
def CheckGetGibbsSampler():
  """Run GetGibbsSampler on Week4/gs2.txt and print the best motifs.

  The data file is read from ``main_directory``. Its first line is skipped
  (presumably a header holding k, t and N -- the values are hard-coded here
  instead); every remaining non-blank line is treated as one genome.
  The sampler is run 21 times from random starts and the lowest-scoring
  motif set is printed, one motif per line.
  """
  k = 15
  t = 20
  N = 2000
  # The context manager closes the file even if reading raises.
  with open(main_directory+"Week4/gs2.txt", "r") as file:
    lines = file.readlines()
  # Blank lines (e.g. a trailing empty line) are dropped: an empty genome
  # would make the random start-position draw fail.
  DNA = [line.strip() for line in lines[1:] if line.strip()]
  best_motifs = GetGibbsSampler(DNA, k, t, N)
  best_score = GetMotifsScore(best_motifs)
  for _ in range(20):
    motifs = GetGibbsSampler(DNA, k, t, N)
    motifs_score = GetMotifsScore(motifs)
    if motifs_score < best_score:
      best_score = motifs_score
      best_motifs = motifs
  for m in best_motifs:
    print(m)

# Run the Gibbs-sampler check; it reads Week4/gs2.txt under main_directory.
CheckGetGibbsSampler()

TACAACAGGGGGGTT
CAGCTGGCGGTGGTG
AAGAACGCGGTGGGC
CAGAACGCGTCAGTG
CAGAGTACGGTGGTG
CAGAACGTCATGGTG
CACCTCGCGGTGGTG
CAGAACGCGGCTATG
CAGAAGTGGGTGGTG
TCGAACGCGGTGGTA
CAGAATCGGGTGGTG
CAGGGGGCGGTGGTG
TGAAACGCGGTGGTG
CAGAACCACGTGGTG
CAGAACTGCGTGGTG
CAGAACGCGGTGTGT
CGCGACGCGGTGGTG
CAGAGACCGGTGGTG
CAGAACGCTCGGGTG
CAGAACGCGGTATAG


In [None]:
# Demonstrates that `x[:]` produces an independent copy of a list:
# changing one list afterwards leaves the other untouched.
x = list(range(3))
y = x[:]
y[2] = 20
x[0] = 10
print(x, y)

[10, 1, 2] [0, 1, 20]
