<a href="https://colab.research.google.com/github/jonaMclaurin/bioinformatics/blob/main/Bioinformatics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# === Helper functions and classes ===

class MyList(list):
  """A ``list`` subclass that can report the index of its final element."""

  def last_index(self):
    """Return the index of the last item, i.e. ``len(self) - 1``.

    An empty list yields ``-1``.
    """
    size = len(self)
    return size - 1

In [None]:
#Validate DNA sequence
def validate_DNA(dna_seq):
  """Tell whether a sequence consists solely of the bases A, C, G and T.

  The check is case-insensitive: the input is upper-cased before it is
  inspected. An empty sequence is reported as valid.
  """
  return all(base in 'ACGT' for base in dna_seq.upper())

#Frequency of each symbol
def frequency(seq):
  """Count how many times each symbol occurs in ``seq``.

  Returns a dict mapping symbol -> number of occurrences, with keys in
  order of first appearance. Symbols are compared as-is (no case folding).
  """
  counts = {}
  for item in seq:
    counts[item] = counts.get(item, 0) + 1
  return counts

#GC content (fraction)
def gc_content(dna_seq):
  """
    Returns the fraction (0.0 to 1.0) of G and C nucleotides in a DNA sequence.

    Both upper- and lower-case symbols are counted. Multiply the result by
    100 to obtain a percentage. An empty sequence contains no bases, so its
    GC content is reported as 0.0 instead of raising ZeroDivisionError.
  """
  total = len(dna_seq)
  if total == 0:
    # Nothing to count; avoid dividing by zero.
    return 0.0
  gc_count = sum(1 for s in dna_seq if s in "GgCc")
  return gc_count / total

#frequency sorted by count
def sorted_frequency(seq):
  """Print every symbol of ``seq`` with its count, most frequent first.

  Counts come from ``frequency``; symbols with equal counts stay in the
  order that function produced them. Nothing is returned.
  """
  ranked = list(frequency(seq).items())
  ranked.sort(key=lambda pair: pair[1], reverse=True)
  for base, count in ranked:
    print(f'Base {base} => {count}')

#gc content in a sub-sequence
def gc_content_subseq(dna_seq, k=4):
  """Report the GC content of consecutive, non-overlapping windows of length k.

  Each window is printed together with its ``gc_content`` value scaled by
  100, and the unscaled values are returned as a list in window order.
  Symbols left over at the end that do not fill a whole window are skipped.
  """
  window_starts = range(0, len(dna_seq) - k + 1, k)
  values = []
  for start in window_starts:
    window = dna_seq[start:start + k]
    value = gc_content(window)
    values.append(value)
    print(f'{window} => {value * 100} % ')
  return values

#transcription 
def transcription(dna_seq):
  """Transcribe DNA into RNA: upper-case the sequence and turn every T into U.

  The input is first checked with ``validate_DNA``; a rejected sequence
  fails the assertion with the message 'Invalid DNA sequence'.
  """
  assert validate_DNA(dna_seq), 'Invalid DNA sequence'
  upper_dna = dna_seq.upper()
  rna = upper_dna.replace('T', 'U')
  return rna

#Reverse complement 
def reverse_complement(dna_seq):
  """Return the reverse complement of a DNA sequence, in upper case.

  Each base is replaced by its pairing partner (A<->T, C<->G) and the order
  of the bases is reversed. The input is first checked with
  ``validate_DNA``; a rejected sequence fails the assertion with the
  message 'Invalid DNA sequence'.
  """
  assert validate_DNA(dna_seq), 'Invalid DNA sequence'
  pairing = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}
  # Walk the strand back to front and join once: prepending to a string for
  # every base would copy the growing result each time (quadratic work).
  # Symbols missing from the table contribute nothing to the output.
  return ''.join(pairing.get(base, '') for base in reversed(dna_seq.upper()))

#Translation

def translate_codon(cod):
  """Look up the amino acid encoded by a three-base DNA codon.

  The lookup is case-insensitive. The table holds the standard genetic code
  with one-letter amino-acid symbols; the stop codons TAA, TAG and TGA map
  to "_". Returns None when ``cod`` is not a key of the table.
  """
  codon_table = {
    "GCT":"A", "GCC":"A", "GCA":"A", "GCG":"A",
    "TGT":"C", "TGC":"C",
    "GAT":"D", "GAC":"D",
    "GAA":"E", "GAG":"E",
    "TTT":"F", "TTC":"F",
    "GGT":"G", "GGC":"G", "GGA":"G", "GGG":"G",
    "CAT":"H", "CAC":"H",
    "ATA":"I", "ATT":"I", "ATC":"I",
    "AAA":"K", "AAG":"K",
    "TTA":"L", "TTG":"L", "CTT":"L", "CTC":"L", "CTA":"L", "CTG":"L",
    "ATG":"M",
    "AAT":"N", "AAC":"N",
    "CCT":"P", "CCC":"P", "CCA":"P", "CCG":"P",
    "CAA":"Q", "CAG":"Q",
    "CGT":"R", "CGC":"R", "CGA":"R", "CGG":"R", "AGA":"R", "AGG":"R",
    "TCT":"S", "TCC":"S", "TCA":"S", "TCG":"S", "AGT":"S", "AGC":"S",
    "ACT":"T", "ACC":"T", "ACA":"T", "ACG":"T",
    "GTT":"V", "GTC":"V", "GTA":"V", "GTG":"V",
    "TGG":"W",
    "TAT":"Y", "TAC":"Y",
    "TAA":"_", "TAG":"_", "TGA":"_",
  }
  # dict.get yields None for anything that is not a known codon.
  return codon_table.get(cod.upper())

#Translate sequence 
def translate_seq(dna_seq, init_pos=0):
  """Translate a DNA sequence into a string of one-letter amino-acid symbols.

  Reading starts at index ``init_pos`` and advances three bases at a time,
  passing each codon to ``translate_codon``. Bases left over at the end
  that do not form a complete codon are ignored. The input is first checked
  with ``validate_DNA``; a rejected sequence fails the assertion with the
  message 'Invalid DNA sequence'.
  """
  assert validate_DNA(dna_seq), 'Invalid DNA sequence'
  bases = dna_seq.upper()
  codon_starts = range(init_pos, len(bases) - 2, 3)
  return ''.join(translate_codon(bases[i:i + 3]) for i in codon_starts)

In [None]:
# Demo: GC content of each consecutive 4-symbol window (default k=4).
# The input has 17 symbols, so the final 'C' is not part of any window.
gc_content_subseq('AAAACGGGGCTTTGCCC')

In [None]:
# Split a 17-symbol example string into a list of single characters.
seq = list('AAAACGGGGCTTTGCCX')
# Start indices of the complete 4-symbol windows: [0, 4, 8, 12].
# NOTE(review): despite its name, `last` holds every window start, not only the last one.
last = list(range(0, len(seq) - 4 + 1, 4))

In [None]:
# Demo: lower-case input is accepted; the codons AAA CCC GGG CGA give 'KPGR'.
translate_seq('aaacccgggcga')

In [None]:
# DeepChem: install the library (pre-release versions allowed)
!pip install --pre deepchem

In [None]:
import deepchem as dc
import numpy as np

In [None]:
# Two random arrays with values drawn uniformly from [0, 1):
# x has shape (4, 5) and y has shape (4, 1).
x = np.random.random((4,5))
y = np.random.random((4,1))

In [None]:
# Wrap the arrays in a DeepChem NumpyDataset: x is passed as the feature
# array (X) and y as the labels.
dataset = dc.data.NumpyDataset(x, y)

In [None]:
# Print the dataset's X attribute.
print(dataset.X)

In [None]:
import string 

# All ASCII characters Python treats as printable
# (digits, letters, punctuation and whitespace).
characters = string.printable

In [None]:
# Number the characters from 1 upward: keys are the integers, values the characters.
# NOTE(review): the name suggests a character -> index lookup; if that is what is
# wanted, the two zip() arguments need to be swapped — confirm against the caller.
token_index = dict(zip(range(1, len(characters) + 1), characters))

In [None]:
import numpy as np
import re

In [None]:
# Example inputs, each split into a list of single-character symbols.
seq = list('AGAAG')
pattern = list('AGAA')

In [None]:
def overlap(s1, s2):
  """Return the length of the longest suffix of ``s1`` that equals a prefix of ``s2``.

  Candidate lengths are tried from the longest possible down to 1, so the
  first match is the largest. Returns 0 when no non-empty suffix of ``s1``
  matches a prefix of ``s2``.
  """
  longest = min(len(s1), len(s2))
  matches = (size for size in range(longest, 0, -1) if s1[-size:] == s2[:size])
  return next(matches, 0)

In [None]:
import pandas as pd