1. **GetMinSkewIndecies(genome):** Returns the list of indices i at which the skew (#G - #C, counted over the first i nucleotides of the genome) reaches its minimum. Used to locate the Ori.
2. **GetHamDist(s1, s2):** Returns the hamming distance between two equal sized strings s1 and s2.
3. **GetApprxPatMatchIndices(pattern, genome, d):** Returns the list of starting indices in the genome at which the k-mer beginning there is within Hamming distance d of the pattern.
4. **GetApproximatePatternCount(pattern, genome, d):** Returns the number of k-mers in the genome that are within Hamming distance d of the pattern.
5. **GetImmediateNeighbors(pattern):** Returns the set of k-mers within the 1-neighborhood of the pattern.
6. **GetNeighbors(pattern, d):** Returns the set of k-mers in the d-neighborhood of the pattern (k-mers at Hamming distance at most d from the pattern).
7. **GetNeighborsEq(pattern, d):** Returns the set of k-mers that are exact d-neighbors of the pattern (k-mers at Hamming distance exactly d from the pattern).
8. **GetFreqWordsMisMtach(genome, k, d):** Returns the list of k-mers that occur most frequently in the genome when up to d mismatches are allowed, i.e. counting every genome k-mer within Hamming distance d. **These k-mers may not appear directly in the genome sequences.**
9. **GetFreqWordsMisMtachWithRC(genome, k, d):** Same as GetFreqWordsMisMtach, but each k-mer's count also includes the approximate occurrences of its reverse complement.

In [None]:
# Mount Google Drive inside the Colab runtime so the data files can be read.
from google.colab import drive
drive.mount('/content/drive')
# Base folder for the input files; later cells append relative paths such as
# "Week2/apm.txt" to this string.
main_directory = '/content/drive/My Drive/Colab Notebooks/Bioinformatics Code Challenges/Data/'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Previous Codes

In [None]:
# Minimum Skew Problem

def GetMinSkewIndecies(genome):
  """Return every position at which the running G-C skew is minimal.

  The skew at position i is (#G - #C) counted over the first i characters of
  genome, so position 0 (the empty prefix) always has skew 0. Characters
  other than 'C' and 'G' leave the skew unchanged.

  Args:
    genome: iterable of nucleotide characters.

  Returns:
    List of positions in 0..len(genome) whose skew equals the minimum,
    in ascending order.
  """
  skews = [0]  # skew of the empty prefix
  val = 0
  for nuc in genome:
    if nuc == 'C':
      val -= 1
    elif nuc == 'G':
      val += 1
    skews.append(val)
  lowest = min(skews)
  return [indx for indx, skew in enumerate(skews) if skew == lowest]

# Disabled loader: reads Week2/genome.txt and concatenates every line except
# the first into one genome string.
# file = open(main_directory+"Week2/genome.txt", "r")
# lines = file.readlines()
# genome = ''
# for i in range(1, len(lines)):
#   genome += lines[i].strip()

# print(len(lines))
# Small hard-coded sample used instead of the file contents.
genome = 'GCATACACTTCCCAGTAGGTACTG'
min_indx = GetMinSkewIndecies(genome)
# Print the returned indices on one line, space-separated.
for i in min_indx:
  print(i, end=' ')


1 1
0 5
0 4
0 3
0 2
0 0
-1 7
-1 6
-2 24
-2 21
-2 20
-2 19
-2 10
-2 9
-2 8
-3 23
-3 22
-3 18
-3 11
-4 17
-4 16
-4 15
-4 12
-5 14
-5 13
1 

In [None]:
# Report the length of whatever sequence is currently bound to `genome`.
print(len(genome))

4809037


Hamming Distance

In [None]:
def GetHamDist(s1, s2):
  """Return the Hamming distance between two equal-length sequences.

  Args:
    s1: first sequence (a string or any sized iterable of symbols).
    s2: second sequence, of the same length as s1.

  Returns:
    The number of positions at which s1 and s2 hold different symbols.

  Raises:
    ValueError: if s1 and s2 differ in length, since the Hamming distance is
      not defined in that case.
  """
  if len(s1) != len(s2):
    raise ValueError('GetHamDist requires sequences of equal length')
  return sum(1 for a, b in zip(s1, s2) if a != b)

# Disabled loader for the Hamming-distance input file.
# file = open(main_directory+"Week2/hd.txt", "r")
# lines = file.readlines()

# Two hard-coded sequences of equal length, compared position by position.
s1 = 'CTACAGCAATACGATCATATGCGGATCCGCAGTGGCCGGTAGACACACGT'
s2 = 'CTACCCCGCTGCTCAATGACCGGGACTAAAGAGGCGAAGATTATGGTGTG'
print(GetHamDist(s1, s2))  

36


In [None]:
#Approximate Pattern Matching Problem
def GetApprxPatMatchIndices(pattern, genome, d):
  """Return the start positions where pattern approximately matches genome.

  A position counts as a match when the window of len(pattern) symbols
  starting there is at Hamming distance at most d from pattern.

  Args:
    pattern: the sequence to look for.
    genome: the sequence to scan.
    d: maximum number of mismatches allowed.

  Returns:
    List of matching start positions in ascending order.
  """
  k = len(pattern)
  last_start = len(genome) - k
  return [
      start
      for start in range(last_start + 1)
      if GetHamDist(genome[start:start + k], pattern) <= d
  ]

# Input file layout as read below: line 1 = pattern, line 2 = genome,
# line 3 = maximum number of mismatches d.
# The with-block closes the file handle once the lines have been read.
with open(main_directory+"Week2/apm.txt", "r") as file:
  lines = file.readlines()

pattern = lines[0].strip()
genome = lines[1].strip()
d = int(lines[2].strip())

ind = GetApprxPatMatchIndices(pattern, genome, d)
# Print the matching start positions on one line, space-separated.
for i in ind:
  print(i, end=' ')

8 12 23 24 26 32 35 36 71 91 92 94 98 100 106 110 119 122 139 141 149 159 160 168 189 192 203 211 215 220 224 230 231 243 245 251 255 265 266 272 282 301 303 304 318 326 327 333 339 345 353 357 359 362 368 377 380 382 390 396 397 398 408 425 432 435 451 458 461 473 476 485 493 495 499 508 510 517 519 541 549 558 582 583 588 589 599 603 615 626 637 660 669 670 671 675 677 689 696 712 724 725 735 739 745 747 749 758 760 771 773 775 783 785 787 809 814 817 820 826 827 841 843 844 847 851 853 854 867 875 884 898 900 908 910 916 924 926 931 933 935 937 941 943 952 953 957 961 982 986 988 990 998 1000 1005 1007 1009 1017 1027 1029 1056 1057 1058 1093 1098 1119 1120 1123 1129 1143 1146 1147 1149 1155 1156 1161 1165 1172 1177 1179 1204 1206 1208 1212 1219 1221 1225 1236 1238 1240 1255 1256 1258 1271 1278 1283 1289 1293 1302 1319 1320 1328 1329 1331 1341 1345 1352 1366 1371 1375 1378 1384 1390 1400 1402 1403 1405 1420 1422 1433 1456 1461 1465 1474 1476 1478 1487 1488 1494 1513 1521 1529 1533 15

In [None]:
#ApproximatePatternCount
def GetApproximatePatternCount(pattern, genome, d):
  """Count the windows of genome that approximately match pattern.

  Args:
    pattern: the sequence to look for.
    genome: the sequence to scan.
    d: maximum number of mismatches allowed.

  Returns:
    Number of start positions whose len(pattern)-long window is at Hamming
    distance at most d from pattern.
  """
  k = len(pattern)
  starts = range(len(genome) - k + 1)
  return sum(1 for s in starts if GetHamDist(genome[s:s + k], pattern) <= d)

# Disabled loader: would read pattern, genome and d from the first three
# lines of Week2/apc.txt.
# file = open(main_directory+"Week2/apc.txt", "r")
# lines = file.readlines()

# pattern = lines[0].strip()
# genome = lines[1].strip()
# d = int(lines[2].strip())
# Hard-coded sample input used instead of the file.
pattern = 'TGT'
genome = 'CGTGACAGTGTATGGGCATCTTT'
d = 1

_count = GetApproximatePatternCount(pattern, genome, d)
print(_count)

8


In [None]:
# ImmediateNeighbors
def GetImmediateNeighbors(pattern):
  """Return all strings that differ from pattern in exactly one position.

  Each position is replaced in turn by every symbol of 'ATGC' other than the
  one already there; pattern itself is never included.

  Args:
    pattern: the string to mutate.

  Returns:
    Set of single-substitution variants of pattern.
  """
  alphabet = 'ATGC'
  return {
      pattern[:pos] + sub + pattern[pos + 1:]
      for pos, current in enumerate(pattern)
      for sub in alphabet
      if sub != current
  }

# Sample run: print the neighbor count, then the neighbors themselves.
pattern = 'ACG'
neighood = GetImmediateNeighbors(pattern)
print(len(neighood))
# Iteration order of a set is arbitrary, so the printed order is not meaningful.
for n in neighood:
  print(n, end=' ')


9
GCG AGG AAG ATG ACT ACA ACC TCG CCG 

In [None]:
# Neighbors
def GetFirstSymbol(pattern):
  """Return the first element of pattern (indexing fails on an empty one)."""
  head = pattern[0]
  return head

def GetSuffix(pattern):
  """Return pattern without its first element."""
  tail = pattern[1:]
  return tail

def GetNeighbors(pattern, d):
  """Return the d-neighborhood of pattern as a set of strings.

  The d-neighborhood holds every string of the same length as pattern that
  differs from it in at most d positions. Substituted symbols are drawn from
  the alphabet 'ACGT'.

  Args:
    pattern: the string whose neighborhood is wanted.
    d: maximum number of mismatches allowed.

  Returns:
    A set of strings. It is {pattern} when d == 0 or pattern is empty, and
    an empty set when d is negative.
  """
  if d < 0:
    return set()
  if d == 0 or not pattern:
    # Always a set, so callers can iterate over whole k-mers.
    return {pattern}
  first, suffix = pattern[0], pattern[1:]
  # Case 1: the first symbol is kept, so the suffix may use all d mismatches.
  neighood = {first + tail for tail in GetNeighbors(suffix, d)}
  # Case 2: the first symbol may change, which leaves d-1 mismatches for the
  # suffix.
  for tail in GetNeighbors(suffix, d - 1):
    for x in 'ACGT':
      neighood.add(x + tail)
  return neighood

# Disabled loader: would read pattern and d from the first two lines of
# Week2/neig.txt.
# file = open(main_directory+"Week2/neig.txt", "r")
# lines = file.readlines()

# pattern = lines[0].strip()
# d = int(lines[1].strip())

# Hard-coded sample input used instead of the file.
pattern = 'TGCAT'
d = 2
neighood = GetNeighbors(pattern, d)
print(len(neighood))
# One neighbor per line.
for n in neighood:
  print(n)


106
CGAAT
AGCAC
TGCCA
TTCAG
TCCGT
TGGCT
TTAAT
TACAC
TGCAA
TACCT
TCCTT
TTCAT
GGAAT
TGAAA
GGCAA
AGTAT
TGCTG
AGCAA
CGCCT
TGCGA
ATCAT
TTGAT
TGACT
TGTCT
TGAAG
AGCCT
AGCTT
CGCAA
TAAAT
AGAAT
TACAA
TGGAA
TGTAT
TAGAT
GGCTT
TCCCT
TTCCT
AGCAG
TGTGT
TGCCC
GGCAG
GTCAT
TGCAG
TGCTC
TGCGG
TGAAC
TGAAT
TGTAC
TACGT
CGCAG
CGCAT
TGCAT
CGCGT
TGATT
TCTAT
TTCAA
CGTAT
TGTAG
TGTAA
TGCCT
TACAT
TCAAT
TACTT
CGCAC
TGAGT
TTCTT
CGCTT
GCCAT
CTCAT
TACAG
AGCGT
TGGTT
TGCGT
ACCAT
GGGAT
TGGAG
AACAT
TCCAC
TGCTA
TCCAA
GGTAT
TTCAC
GACAT
TCGAT
CACAT
AGGAT
TGTTT
TGCTT
TGCAC
TCCAT
TATAT
TGCGC
TGGAC
GGCGT
CCCAT
TGCCG
CGGAT
GGCAC
AGCAT
GGCCT
TGGAT
TGGGT
GGCAT
TTCGT
TCCAG
TTTAT


In [None]:
def GetNeighborsEq(pattern, d):
  """Return the strings at Hamming distance exactly d from pattern.

  Results have the same length as pattern; substituted symbols are drawn
  from the alphabet 'ACGT'.

  Args:
    pattern: the reference string.
    d: the exact number of mismatches required.

  Returns:
    A set of strings. It is {pattern} when d == 0, and an empty set when d
    is negative or larger than len(pattern), because no string of that
    length can differ in that many positions.
  """
  if d < 0 or d > len(pattern):
    return set()
  if d == 0:
    return {pattern}
  first, suffix = pattern[0], pattern[1:]
  # Case 1: first symbol unchanged, so all d mismatches fall in the suffix.
  neighood = {first + tail for tail in GetNeighborsEq(suffix, d)}
  # Case 2: first symbol changed, so exactly d-1 mismatches fall in the suffix.
  for tail in GetNeighborsEq(suffix, d - 1):
    for x in 'ACGT':
      if x != first:
        neighood.add(x + tail)
  return neighood

# Sample run: list each returned neighbor next to its Hamming distance from
# the pattern, so the "exactly d" property can be checked by eye.
pattern = 'AGC'
d = 2

neighood = GetNeighborsEq(pattern, d)
print(len(neighood))
for n in neighood:
  print(n, GetHamDist(n,pattern))


27
GAC 2
TGT 2
TGG 2
AAG 2
ACA 2
ACG 2
TCC 2
ATT 2
GGG 2
CTC 2
GCC 2
CGA 2
GGA 2
CCC 2
GTC 2
AAT 2
ACT 2
ATA 2
TGA 2
ATG 2
TTC 2
AAA 2
TAC 2
CGT 2
GGT 2
CAC 2
CGG 2


In [None]:
# Frequent Words with Mismatches Problem:
def GetFreqWordsMisMtach(genome, k, d):
  """Return the most frequent k-mers in genome, allowing up to d mismatches.

  A candidate's count is the number of windows of genome that lie within
  Hamming distance d of it. Rather than rescanning the genome for every
  candidate, each window adds one to the tally of every member of its own
  d-neighborhood. This yields the same counts provided GetNeighbors
  enumerates the full neighborhood, i.e. the genome uses the alphabet that
  GetNeighbors substitutes from.

  Args:
    genome: the string to analyse.
    k: length of the words to look for.
    d: maximum number of mismatches allowed.

  Returns:
    List of the k-mers sharing the highest count, in the order they were
    first encountered; an empty list when genome contains no k-mer.
  """
  counts = {}
  for i in range(len(genome)-k+1):
    neighood = GetNeighbors(genome[i:i+k], d)
    # Guard: a bare string would otherwise be iterated character by
    # character instead of being treated as one k-mer.
    if isinstance(neighood, str):
      neighood = (neighood,)
    for pat in neighood:
      counts[pat] = counts.get(pat, 0) + 1
  if not counts:
    return []
  _max = max(counts.values())
  return [pat for pat, _count in counts.items() if _count == _max]

# Only the first line of the input file is used as the genome; k and d are
# set by hand below.
# The with-block closes the file handle once the lines have been read.
with open(main_directory+"Week2/fw.txt", "r") as file:
  lines = file.readlines()

genome = lines[0].strip()
k = 5
d = 3

freq_words = GetFreqWordsMisMtach(genome, k, d)
# Print the most frequent words on one line, space-separated.
for fw in freq_words:
  print(fw, end=' ')
    

AAAAA 

# New Code

In [None]:
# Frequent Words with Mismatches and Reverse Complements Problem: 

def GetReverseCompliment(genome):
  """Return the reverse complement of genome as a list of characters.

  The input is walked from its last symbol to its first; 'A' and 'T' are
  swapped, 'C' becomes 'G', and every other symbol (including 'G') becomes
  'C'.

  Args:
    genome: sliceable sequence of nucleotide characters.

  Returns:
    List of single-character strings, one per input symbol.
  """
  pairing = {'A': 'T', 'T': 'A', 'C': 'G'}
  # Anything without an entry above falls through to 'C'.
  return [pairing.get(base, 'C') for base in genome[::-1]]

def List2String(l):
  """Concatenate the strings in l into a single string.

  Args:
    l: iterable of strings (for example a list of characters).

  Returns:
    The elements of l joined with no separator; '' for an empty iterable.
  """
  # str.join builds the result in one pass instead of re-copying the
  # accumulated string on every `+=`.
  return ''.join(l)

def GetFreqWordsMisMtachWithRC(genome, k, d):
  """Return the most frequent k-mers, counting mismatches and reverse complements.

  A candidate's score is the number of windows of genome within Hamming
  distance d of the candidate, plus the number of windows within distance d
  of its reverse complement. Window counts are tallied once, by letting each
  window add one to every member of its own d-neighborhood, instead of
  rescanning the genome for every candidate. This yields the same counts
  provided GetNeighbors enumerates the full neighborhood, i.e. the genome
  uses the alphabet that GetNeighbors substitutes from.

  Args:
    genome: the string to analyse.
    k: length of the words to look for.
    d: maximum number of mismatches allowed.

  Returns:
    List of the k-mers sharing the highest score, in the order they were
    first encountered; an empty list when genome contains no k-mer.
  """
  counts = {}
  for i in range(len(genome)-k+1):
    neighood = GetNeighbors(genome[i:i+k], d)
    # Guard: a bare string would otherwise be iterated character by
    # character instead of being treated as one k-mer.
    if isinstance(neighood, str):
      neighood = (neighood,)
    for pat in neighood:
      counts[pat] = counts.get(pat, 0) + 1
  if not counts:
    return []
  totals = {}
  for pat in counts:
    pat_rc = List2String(GetReverseCompliment(pat))
    # A reverse complement that is in no window's neighborhood has no
    # approximate occurrence, hence the default of 0.
    totals[pat] = counts[pat] + counts.get(pat_rc, 0)
  _max = max(totals.values())
  return [pat for pat, total in totals.items() if total == _max]

# file = open(main_directory+"Week2/fwrc.txt", "r")
# lines = file.readlines()

# genome = lines[0].strip()
# # genome = 'ACGTTGCATGTCGCATGATGCATGAGAGCT'
# k = 7
# d = 2

# freq_words = GetFreqWordsMisMtachWithRC(genome, k, d)
# for fw in freq_words:
#   print(fw, end=' ')

In [None]:
# Read the genome file and join every line except the first into one string
# (the first line is presumably a header; confirm against the data file).
# The with-block closes the file handle once the lines have been read.
with open(main_directory+"Week2/genome.txt", "r") as file:
  lines = file.readlines()
genome = ''.join(line.strip() for line in lines[1:])

# Restrict the search to the 1000 characters around position `start`.
start = 3764856 
genome = genome[start-500:start+500]
k = 9
d = 1

freq_words = GetFreqWordsMisMtachWithRC(genome, k, d)
# Print the most frequent words on one line, space-separated.
for fw in freq_words:
  print(fw, end=' ')

TTATCCACA TGTGGATAA 