## My Seq

In [23]:
def translate_codon (cod):
    """Translates a codon into an aminoacid using an internal dictionary with the standard genetic code."""
    tc = {"GCT":"A", "GCC":"A", "GCA":"A", "GCG":"A", 
      "TGT":"C", "TGC":"C",
      "GAT":"D", "GAC":"D",
      "GAA":"E", "GAG":"E",
      "TTT":"F", "TTC":"F",
      "GGT":"G", "GGC":"G", "GGA":"G", "GGG":"G",
      "CAT":"H", "CAC":"H",
      "ATA":"I", "ATT":"I", "ATC":"I",
      "AAA":"K", "AAG":"K",
      "TTA":"L", "TTG":"L", "CTT":"L", "CTC":"L", "CTA":"L", "CTG":"L",
      "ATG":"M", "AAT":"N", "AAC":"N",
      "CCT":"P", "CCC":"P", "CCA":"P", "CCG":"P",
      "CAA":"Q", "CAG":"Q",
      "CGT":"R", "CGC":"R", "CGA":"R", "CGG":"R", "AGA":"R", "AGG":"R",
      "TCT":"S", "TCC":"S", "TCA":"S", "TCG":"S", "AGT":"S", "AGC":"S",
      "ACT":"T", "ACC":"T", "ACA":"T", "ACG":"T",
      "GTT":"V", "GTC":"V", "GTA":"V", "GTG":"V",
      "TGG":"W",
      "TAT":"Y", "TAC":"Y",
      "TAA":"_", "TAG":"_", "TGA":"_"}
    if cod in tc: return tc[cod]
    else: return None

class MySeq: 
    """ Class for biological sequences. """
    
    def __init__ (self, seq, seq_type = "DNA"): 
        self.seq = seq.upper()
        self.seq_type = seq_type

    def __len__(self):
        return len(self.seq)
    
    def __getitem__(self, n):
        return self.seq[n]

    def __getslice__(self, i, j):
        return self.seq[i:j]

    def __str__(self):
        return self.seq
        
    def get_seq_biotype (self):
        return self.seq_type
        
    def show_info_seq (self):
        print ("Sequence: " + self.seq + " biotype: " + self.seq_type)
        
    def alphabet (self):
        if (self.seq_type=="DNA" or self.seq_type=="dna"): return "ACGT"
        elif (self.seq_type=="RNA" or self.seq_type=="rna"): return "ACGU"
        elif (self.seq_type=="PROTEIN" or self.seq_type=="protein"): return "ACDEFGHIKLMNPQRSTVWY"
        else: return None
        
    def validate (self):
        alp = self.alphabet()
        res = True
        i = 0
        while i < len(self.seq) and res:
            if self.seq[i] not in alp: res = False
            else: i += 1
        return res 
        
    def transcription (self):
        if (self.seq_type == "DNA"):
            return MySeq(self.seq.replace("T","U"), "RNA")
        else:
            return None
    
    def reverse_comp (self):
        if (self.seq_type != "DNA"): return None
        comp = ""
        for c in self.seq:
            if (c == 'A'): comp = "T" + comp 
            elif (c == "T"): comp = "A" + comp 
            elif (c == "G"): comp = "C" + comp
            elif (c== "C"): comp = "G" + comp
        return MySeq(comp, "DNA")
        
    def translate (self, iniPos= 0):
        if (self.seq_type != "DNA"): return None
        seq_aa = ""
        for pos in range(iniPos,len(self.seq)-2,3):
            cod = self.seq[pos:pos+3]
            seq_aa += translate_codon(cod)
        return MySeq(seq_aa, "PROTEIN")      
        
if __name__ == "__main__":   
    s1 = MySeq("ATGTGATAAGAATAGAATGCTGAATAAATAGAATGACAT")
    s2 = MySeq("MKVVLSVQERSVVSLL", "PROTEIN")
    print(s1.validate(), s2.validate())
    print(s1)
    s3 = s1.transcription()
    s3.show_info_seq()
    s4 = s1.reverse_comp().translate()
    s4.show_info_seq()

True True
ATGTGATAAGAATAGAATGCTGAATAAATAGAATGACAT
Sequence: AUGUGAUAAGAAUAGAAUGCUGAAUAAAUAGAAUGACAU biotype: RNA
Sequence: MSFYLFSILFLSH biotype: PROTEIN


# Магадлалт мотиф ба Санамсаргүй алгоритмууд(Probabilistic Motifs and Stochastic Algorithms)

## 1. Магадлалт мотиф

Магадлалт мотифыг ерөнхийдөө Магадлалт жингийн матриц **(Probabilistic Weight Matrics - PWM)** - аар дүрсэлдэг.

## My Motif

In [21]:
def create_matrix_zeros (nrows, ncols):
    res = [ ] 
    for i in range(0, nrows):
        res.append([0]*ncols)
    return res

def print_matrix(mat):
    for i in range(0, len(mat)): print(mat[i])
        
# 1. PWM үүсгэх
# 2. Мотифын детерминистик дүрслэлүүдийг гаргаж авах,
# 3. Дараалал дээр мотиф илрэх магадлалыг тодорхойлох

class MyMotifs:

    def __init__(self, seqs = [], pwm = [], alphabet = None):
        if seqs:
            self.size = len(seqs[0])
            self.seqs = seqs # objet from class MySeq
            self.alphabet = seqs[0].alphabet()
            self.do_counts()
            self.create_pwm()
        else:
            self.pwm = pwm
            self.size = len(pwm[0])
            self.alphabet = alphabet   
        
    def __len__ (self):
        return self.size        
    
    # do_counts болон create_pwm функцуудээр үндсэн аттрифутуудыг байгуулна.
    # PWM загварт ашигласан дарааллууд;
    # Дарааллын нийт тоо;
    # Цагаан толгой;
    # Тоо ширхэгийн матриц
    # Мотифын байрлал бүр дээрх тэмдэгтийн давтамжын матриц.
    def do_counts(self):
        self.counts = create_matrix_zeros(len(self.alphabet), self.size)
        for s in self.seqs:
            for i in range(self.size):
                lin = self.alphabet.index(s[i])
                self.counts[lin][i] += 1
                
    def create_pwm(self):
        if self.counts == None: self.do_counts()
        self.pwm = create_matrix_zeros(len(self.alphabet), self.size)
        for i in range(len(self.alphabet)):
            for j in range(self.size):
                self.pwm[i][j] = float(self.counts[i][j]) / len(self.seqs)
    
    # МОТИФЫН ДЕТЕРМИНИСТИК ДҮРСЛЭЛҮҮД
    
    # Мотифын байрлал бүрт хамгийн их илэрсэн тэмдэгт бүхий дундаж дүрслэлийг PWM - ээс үүсгэнэ/
    # - consnsus: Мотифын байрлал бүрийг сканнердаж, байрлал бүрт хамгийн олон давтамжтай тэмдэгтийг сонгоно.
    def consensus(self):
        """ returns the sequence motif obtained with the most frequent symbol at each position of the motif"""
        res = ""
        for j in range(self.size):
            maxcol = self.counts[0][j]
            maxcoli = 0
            for i in range(1, len(self.alphabet) ):
                if self.counts[i][j] > maxcol: 
                    maxcol = self.counts[i][j]
                    maxcoli = i
            res += self.alphabet[maxcoli]        
        return res
    
    # - masked_consensus: Төстэй байдлаар ажилладаг боловч байрлал бүрийн давтамж нь 50% - иас бага бол "-"
    # үгүй бол цагаан толгойн тэмдэгтийг гаргадаг. 
    def masked_consensus(self):
        """ returns the sequence motif obtained with the symbol that occurrs in at least 50% of the input sequences"""
        res = ""
        for j in range(self.size):
            maxcol = self.counts[0][j]
            maxcoli = 0
            for i in range(1, len(self.alphabet) ):
                if self.counts[i][j] > maxcol: 
                    maxcol = self.counts[i][j]
                    maxcoli = i
            if maxcol > len(self.seqs) / 2:
                res += self.alphabet[maxcoli]        
            else:
                res += "-"
        return res

    # - probability_sequence болон probability_all_positions
    #   - Тухайн байрлалд илрэх бүх тэмдэгтийг магадлуудын үржвэр
    #   - Урт дараалал дээр илрэх N урттай дэд дараалал бүр дээр мотиф илрэх магадлал
    #   - Эхний индексээс |S| - N + 1 байрлалд хүртэл скан хийнэ.
    #   - Дэд дараалал бүрийн магадлалыг жагсаалтад хадаглана.
    def probability_sequence(self, seq):
        res = 1.0
        for i in range(self.size):
            lin = self.alphabet.index(seq[i])
            res *= self.pwm[lin][i]
        return res
    
    def probability_all_positions(self, seq):
        res = []
        for k in range(len(seq)-self.size+1):
            res.append(self.probability_sequence(seq))
        return res

    # most_probable_sequence 
    # - Мотифт тохирох хамгийн өндөр магадлалтай дэд дарааллыг олно.
    # - Энэ нь мотифыг шинэчилэх, сайжруулах боломжийг олгоно.
    def most_probable_sequence(self, seq):
        """ Returns the index of the most probable sub-sequence of the input sequence"""
        maximum = -1.0
        maxind = -1
        for k in range(len(seq)-self.size):
            p = self.probability_sequence(seq[k:k+ self.size])
            if(p > maximum):
                maximum = p
                maxind = k
        return maxind

    # create_motif
    # - Оролтын дарааллуудыг сканнердаж, MyMotifs классын обьектыг буцаана
    # - Шинэ мотифыг байгуулах хамгийн боломжит дэд дарааллуудыг сонгоно.
    def create_motif(self, seqs):
        l = []
        for s in seqs: 
            ind = self.most_probable_sequence(s.seq)
            subseq = MySeq ( s[ind:(ind+self.size)], s.get_seq_biotype() )
            l.append(subseq)
        
        return MyMotifs(l) 

In [22]:
def test():
    # - Эхлээд мотиф үүсгэх 8 дарааллыг авч тоо ширхэг болон 
    # давтамжийн матрицыг дүрслэнэ.
    # - Дундаж болон максласан дундаж мотифыг олно.
    # - Магадлалын тооцооллыг хэд хэдэн оролтын дараалал дээр гүйцэтгэнэ.
    seq1 = MySeq("AAAGTT")
    seq2 = MySeq("CACGTG")
    seq3 = MySeq("TTGGGT")
    seq4 = MySeq("GACCGT")
    seq5 = MySeq("AACCAT")
    seq6 = MySeq("AACCCT")
    seq7 = MySeq("AAACCT")
    seq8 = MySeq("GAACCT")
    lseqs = [seq1, seq2, seq3, seq4, seq5, seq6, seq7, seq8]
    motifs = MyMotifs(lseqs) # object 
    print ("Counts matrix: ")
    print_matrix(motifs.counts)
    print ("PWM") #(Probabilistic Weight Matrics - PWM)
    print_matrix(motifs.pwm)
    print ("Sequence alphabet")
    print(motifs.alphabet)
    
    print()
    print("probability_seq: ")
    print(motifs.probability_sequence("AAACCT"))
    print(motifs.probability_sequence("ATACAG"))
    print()
    print("most_probable_sequence: ")
    print(motifs.most_probable_sequence("CTATAAACCTTACATC"))

if __name__ == '__main__':
    test()

Counts matrix: 
[4, 7, 3, 0, 1, 0]
[1, 0, 4, 5, 3, 0]
[2, 0, 1, 3, 2, 1]
[1, 1, 0, 0, 2, 7]
PWM
[0.5, 0.875, 0.375, 0.0, 0.125, 0.0]
[0.125, 0.0, 0.5, 0.625, 0.375, 0.0]
[0.25, 0.0, 0.125, 0.375, 0.25, 0.125]
[0.125, 0.125, 0.0, 0.0, 0.25, 0.875]
Sequence alphabet
ACGT

probability_seq: 
0.0336456298828125
0.0002288818359375

most_probable_sequence: 
4


## Motif Finding

In [20]:
class MotifFinding:
    
    def __init__(self, size = 8, seqs = None):
        self.motif_size = size
        if (seqs != None):
            self.seqs = seqs
            self.alphabet = seqs[0].alphabet()
        else:
            self.seqs = []
                    
    def __len__ (self):
        return len(self.seqs)
    
    def __getitem__(self, n):
        return self.seqs[n]
    
    def seq_size (self, i):
        return len(self.seqs[i])
    
    def read_file(self, fic, t):
        for s in open(fic, "r"):
            self.seqs.append(MySeq(s.strip().upper(),t))
        self.alphabet = self.seqs[0].alphabet()
           
    # create_motif_from_indexes
    # - Мотифын загварыг бүтээхэд ашигласан дэд дарааллын анхдагч байрлалыг агуулсан индексүүдийн 
    # жагсаалтыг авч MyMotifs төрлийн магадлалын мотиф үүсгэдэг.
    def create_motif_from_indexes(self, indexes):
        pseqs = []
        for i,ind in enumerate(indexes):
            pseqs.append( MySeq(self.seqs[i][ind:(ind+self.motif_size)], self.seqs[i].get_seq_biotype()) )
        return MyMotifs(pseqs)
        
        
    # SCORES (include in deterministic motif finding)
    def score(self, s):
        score = 0
        motif = self.create_motif_from_indexes(s)
        motif.do_counts()
        mat = motif.counts
        for j in range(len(mat[0])):
            maxcol = mat[0][j]
            for  i in range(1, len(mat)):
                if mat[i][j] > maxcol: 
                    maxcol = mat[i][j]
            score += maxcol
        return score
   
    def score_multiplicative(self, s):
        score = 1.0
        motif = self.create_motif_from_indexes(s)
        motif.create_pwm()
        mat = motif.pwm
        for j in range(len(mat[0])):
            maxcol = mat[0][j]
            for  i in range(1, len(mat)):
                if mat[i][j] > maxcol: 
                    maxcol = mat[i][j]
            score *= maxcol
        return score     
       
    # Probabilistic Motif Finding  
    # heuristic stochastic
    # Оролтын дарааллууд дээрх санамсаргүй сонгосон анхдагч байрлалууд болон харгалзах 
    # PWM (үржүүлсэн оноо) - ээр мотиф үүсгэдэг.
    def heuristic_stochastic (self):
        from random import randint
        s = [0]* len(self.seqs) 
        for k in range(len(s)):
            s[k] = randint(0, self.seq_size(k)- self.motif_size)
        motif = self.create_motif_from_indexes(s)
        motif.create_pwm()
        sc = self.score_multiplicative(s)
        bestsol = s
        improve = True
        while(improve):
            for k in range(len(s)):
                s[k] = motif.most_probable_sequence(self.seqs[k])
            if self.score_multiplicative(s) > sc: 
                sc = self.score_multiplicative(s)
                bestsol = sc
                motif = self.create_motif_from_indexes(s)
                motif.create_pwm()
            else: improve = False    
        return bestsol


    # GIBS SAMPLING
    # Мотиф загварыг санамсаргүй байдлаар сонгосон дэд дараллаар эхлүүлж, 
    # улмаар анхны загвартай харьцуулсан оноог тооцдог.
    # давталт бүрийн хувьд:
    # - мотифын аль нэг илрэлийг шинэчлэх эсэх магадлалаар локал хайлт хийдэг
    # - Өгөгдсөн оролтын дарааллуудаас мотифыг загварчилахад ашигласан дэд дарааллыг арилгана.
    # - Сүүлд нь өөр дэд дарааллаар солихыг оролдох. Мотифын оноог тооцоолох, Сайжруулалтыг хадаглах эсэхийг шийддэх.
    def gibbs (self, iterations = 100):
        from random import randint
        s = []
        for i in range(len(self.seqs)):
            s.append(randint(0, len(self.seqs[0]) - self.motif_size - 1))
        best_s = list(s)
        best_score = self.score_multiplicative(s)
        for it in range(iterations):
            # randomly pick a sequence
            seq_idx = randint(0, len(self.seqs)-1) 
            seq_sel = self.seqs[seq_idx]
            s.pop(seq_idx)
            removed = self.seqs.pop(seq_idx)
            motif = self.create_motif_from_indexes(s)            
            motif.create_pwm()
            self.seqs.insert(seq_idx, removed)
            r = motif.probability_all_positions(seq_sel)
            pos = self.roulette(r)
            s.insert(seq_idx, pos)
            score = self.score_multiplicative(s)
            if score > best_score:
                best_score = score
                best_s = list(s)
        return best_s
    
    def roulette(self, f):
        from random import random
        tot = 0.0
        for x in f: tot += (0.01 + x) 
        val = random() * tot
        acum = 0.0
        idx = 0
        while acum < val:
            acum += (f[idx] + 0.01)
            idx += 1
        return idx-1 

In [25]:
def test1():  
    sm = MotifFinding()
    sm.read_file("./files/exampleMotifs.txt","dna")
    sol = [25,20,2,55,59]
    sa = sm.score(sol)
    print("score: ")
    print(sa)
    scm = sm.score_multiplicative(sol)
    print()
    print("score_multiplicatice:")
    print(scm)


test1()
print()

score: 
30

score_multiplicatice:
0.08847360000000001



In [27]:
def test2():
    mf = MotifFinding()
    mf.read_file("./files/exampleMotifs.txt","dna")
    print("Heuristic stochastic")
    sol = mf.heuristic_stochastic()
    print ("Solution: " , sol)
    print ("Score:" , mf.score(sol))
    print ("Score mult:" , mf.score_multiplicative(sol))
    print("Consensus:", mf.create_motif_from_indexes(sol).consensus())
    sol2 = mf.gibbs(10000)
    print ("Score:" , mf.score(sol2))
    print ("Score mult:" , mf.score_multiplicative(sol2))
    print("Consensus:", mf.create_motif_from_indexes(sol2).consensus())

test2()

Heuristic stochastic
Solution:  0.01990656


TypeError: 'float' object is not iterable

## Bio Python

In [30]:
from Bio.Seq import Seq
from Bio import motifs
from Bio.Alphabet import IUPAC

instances = []
instances.append(Seq("TATAA",IUPAC.unambiguous_dna))
instances.append(Seq("TATTA",IUPAC.unambiguous_dna))
instances.append(Seq("TTTAT",IUPAC.unambiguous_dna))
instances.append(Seq("TATAC",IUPAC.unambiguous_dna))

m = motifs.create(instances, IUPAC.unambiguous_dna)

print(type(m))
print(m)
print(len(m))
print(m.consensus)
print(m.pwm)
print(m.counts)
print(m.pssm)

m.weblogo(",/files/mymotif.png")

pwm = m.counts.normalize(pseudocounts=0.5)
pssm = pwm.log_odds()
print(pwm)
print(pssm)

# exact matches of the instances
test_seq=Seq("TTTTATACACTGCATATAACAACCCAAGCATTATAA", IUPAC.unambiguous_dna)

for pos, seq in m.instances.search(test_seq):
    print (pos, " ", seq)
    
# using PSSM to score matches
for position, score in pssm.search(test_seq, threshold=4.0):
    print ("Position %d: score = %5.3f" % (position, score) )
    
# scores for all positions
print(pssm.calculate(test_seq))



ImportError: Bio.Alphabet has been removed from Biopython. In many cases, the alphabet can simply be ignored and removed from scripts. In a few cases, you may need to specify the ``molecule_type`` as an annotation on a SeqRecord for your script to work correctly. Please see https://biopython.org/wiki/Alphabet for more information.