# 생물정보학 및 실습 1 - Term Project (Free Analysis) 2/3
생물정보학 및 실습 1   
서울대학교 협동과정 생물정보학전공 2022년 1학기

In [57]:
from collections import Counter, defaultdict
import math
import os
import sys
import pickle
import time

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from scipy import stats
import pysam
import hmmlearn
from hmmlearn import base, hmm

## 1. Load Dataset and Denoise

In [42]:
dfOctadecamers = pd.read_table('../stats/octadecamers.txt')
dfOctadecamers

Unnamed: 0,octadecamer,counts
0,GTGGGTAAGAGCACCCGA,18
1,CTTCTGGAGTGTCTGAAG,14
2,AAAAAAAAAAAAAAAAAA,14
3,CATAATTTGTGGTAGTGG,11
4,TTTTTTTTTTTTTTTTTT,8
...,...,...
24724,AGTAATTTGTGCGGGTTT,1
24725,GGTCAAGAGAGTATGAAC,1
24726,ACAGAGCAGTGGTTCTGC,1
24727,CTTAGAACGTGGATGCCA,1


In [43]:
dfOctadecamers = dfOctadecamers[dfOctadecamers.octadecamer.str[8] == 'G'].reset_index(drop=True)
dfOctadecamers

Unnamed: 0,octadecamer,counts
0,GTGGGTAAGAGCACCCGA,18
1,CTTCTGGAGTGTCTGAAG,14
2,CATAATTTGTGGTAGTGG,11
3,AAATATAAGAGTTCGGTT,5
4,CTTCTGGTGTGTCTGAAG,5
...,...,...
22844,AGTAATTTGTGCGGGTTT,1
22845,GGTCAAGAGAGTATGAAC,1
22846,ACAGAGCAGTGGTTCTGC,1
22847,CTTAGAACGTGGATGCCA,1


## 2. Prepare Training and Test Set

In [44]:
dctOctadecamers = dfOctadecamers.set_index('octadecamer')['counts'].to_dict()
dctOctadecamers['GTGGGTAAGAGCACCCGA']

18

In [45]:
lstOctadecamers = []
for seq, counts in dctOctadecamers.items():
    lstOctadecamers += [seq] * counts

In [47]:
lstOctadecamers[:10]

['GTGGGTAAGAGCACCCGA',
 'GTGGGTAAGAGCACCCGA',
 'GTGGGTAAGAGCACCCGA',
 'GTGGGTAAGAGCACCCGA',
 'GTGGGTAAGAGCACCCGA',
 'GTGGGTAAGAGCACCCGA',
 'GTGGGTAAGAGCACCCGA',
 'GTGGGTAAGAGCACCCGA',
 'GTGGGTAAGAGCACCCGA',
 'GTGGGTAAGAGCACCCGA']

In [48]:
len(lstOctadecamers)

23257

In [51]:
sTraining = ''.join(lstOctadecamers[:20000])
sTest     = ''.join(lstOctadecamers[20000:])

In [61]:
dctEncoding = {'A':0, 'C':1, 'G':2, 'T':3}

In [63]:
arrTraining = np.array([[dctEncoding[x]] for x in sTraining])
arrTest     = np.array([[dctEncoding[x]] for x in sTest])

In [64]:
arrTest

array([[3],
       [2],
       [3],
       ...,
       [1],
       [3],
       [2]])

In [54]:
lstTrainingLen = [len(x) for x in lstOctadecamers[:20000]]
lstTestLen     = [len(x) for x in lstOctadecamers[20000:]]
len(lstTestLen)

3257

## 3. Model init

In [68]:
model = hmm.MultinomialHMM(n_components=7, random_state=99, init_params='')

In [67]:
help(model)

Help on MultinomialHMM in module hmmlearn.hmm object:

class MultinomialHMM(hmmlearn.base.BaseHMM)
 |  Hidden Markov Model with multinomial (discrete) emissions.
 |  
 |  Attributes
 |  ----------
 |  n_features : int
 |      Number of possible symbols emitted by the model (in the samples).
 |  
 |  monitor_ : ConvergenceMonitor
 |      Monitor object used to check the convergence of EM.
 |  
 |  startprob_ : array, shape (n_components, )
 |      Initial state occupation distribution.
 |  
 |  transmat_ : array, shape (n_components, n_components)
 |      Matrix of transition probabilities between states.
 |  
 |  emissionprob_ : array, shape (n_components, n_features)
 |      Probability of emitting a given symbol when in each state.
 |  
 |  Examples
 |  --------
 |  >>> from hmmlearn.hmm import MultinomialHMM
 |  >>> MultinomialHMM(n_components=2)  #doctest: +ELLIPSIS
 |  MultinomialHMM(algorithm='viterbi',...
 |  
 |  Method resolution order:
 |      MultinomialHMM
 |      hmmlearn.

In [78]:
model.n_features = 4
model.startprob_ = np.array([1,] + [0,] * 6)
model.transmat_  = np.array([
    [0.800, 0.100, 0.020, 0.020, 0.020, 0.020, 0.020],
    [0.000, 0.050, 0.900, 0.013, 0.013, 0.012, 0.012],
    [0.005, 0.000, 0.050, 0.900, 0.015, 0.015, 0.015],
    [0.015, 0.000, 0.000, 0.050, 0.900, 0.020, 0.015],
    [0.020, 0.000, 0.000, 0.000, 0.050, 0.900, 0.030],
    [0.050, 0.000, 0.000, 0.000, 0.000, 0.050, 0.900],
    [0.950, 0.000, 0.000, 0.000, 0.000, 0.000, 0.050],
])
model.emissionprob_ = np.array([[0.25,] * 4] * 7)

In [79]:
model.transmat_.sum(axis=1)

array([1., 1., 1., 1., 1., 1., 1.])

## 4. Training

In [81]:
model.n_iter = 1000
model.tol = 0.01
model.verbose = True
class ThresholdMonitor(base.ConvergenceMonitor):
  @property
  def converged(self):
    return (self.iter == self.n_iter or self.history[-1] >= self.tol)
model.monitor_ = ThresholdMonitor(model.n_iter, model.tol, model.verbose)
model.fit(arrTraining, lstTrainingLen)

         1     -476136.9003             +nan
         2     -476013.4784        +123.4219
         3     -475896.5391        +116.9394
         4     -475785.2833        +111.2557
         5     -475678.9913        +106.2921
         6     -475577.0152        +101.9761
         7     -475478.7753         +98.2399
         8     -475383.7572         +95.0181
         9     -475291.5106         +92.2467
        10     -475201.6486         +89.8620
        11     -475113.8486         +87.8000
        12     -475027.8527         +85.9959
        13     -474943.4683         +84.3844
        14     -474860.5687         +82.8996
        15     -474779.0922         +81.4766
        16     -474699.0396         +80.0525
        17     -474620.4706         +78.5690
        18     -474543.4962         +76.9744
        19     -474468.2696         +75.2266
        20     -474394.9737         +73.2959
        21     -474323.8071         +71.1666
        22     -474254.9681         +68.8390
        23

       184     -472155.5209          +0.1974
       185     -472155.3289          +0.1920
       186     -472155.1421          +0.1868
       187     -472154.9605          +0.1817
       188     -472154.7838          +0.1767
       189     -472154.6119          +0.1719
       190     -472154.4447          +0.1672
       191     -472154.2821          +0.1626
       192     -472154.1240          +0.1581
       193     -472153.9702          +0.1538
       194     -472153.8206          +0.1496
       195     -472153.6752          +0.1454
       196     -472153.5338          +0.1414
       197     -472153.3962          +0.1375
       198     -472153.2625          +0.1337
       199     -472153.1324          +0.1301
       200     -472153.0060          +0.1265
       201     -472152.8830          +0.1230
       202     -472152.7634          +0.1196
       203     -472152.6472          +0.1163
       204     -472152.5341          +0.1130
       205     -472152.4242          +0.1099
       206

       367     -472147.4701          +0.0110
       368     -472147.4593          +0.0109
       369     -472147.4485          +0.0108
       370     -472147.4379          +0.0106
       371     -472147.4274          +0.0105
       372     -472147.4170          +0.0104
       373     -472147.4068          +0.0102
       374     -472147.3967          +0.0101
       375     -472147.3867          +0.0100
       376     -472147.3769          +0.0099
       377     -472147.3671          +0.0097
       378     -472147.3575          +0.0096
       379     -472147.3480          +0.0095
       380     -472147.3387          +0.0094
       381     -472147.3295          +0.0092
       382     -472147.3203          +0.0091
       383     -472147.3113          +0.0090
       384     -472147.3025          +0.0089
       385     -472147.2937          +0.0088
       386     -472147.2850          +0.0086
       387     -472147.2765          +0.0085
       388     -472147.2681          +0.0084
       389

       550     -472146.6769          +0.0013
       551     -472146.6756          +0.0013
       552     -472146.6744          +0.0013
       553     -472146.6731          +0.0012
       554     -472146.6719          +0.0012
       555     -472146.6707          +0.0012
       556     -472146.6695          +0.0012
       557     -472146.6683          +0.0012
       558     -472146.6671          +0.0012
       559     -472146.6660          +0.0012
       560     -472146.6648          +0.0011
       561     -472146.6637          +0.0011
       562     -472146.6626          +0.0011
       563     -472146.6615          +0.0011
       564     -472146.6604          +0.0011
       565     -472146.6593          +0.0011
       566     -472146.6583          +0.0011
       567     -472146.6572          +0.0011
       568     -472146.6562          +0.0010
       569     -472146.6551          +0.0010
       570     -472146.6541          +0.0010
       571     -472146.6531          +0.0010
       572

       733     -472146.5851          +0.0001
       734     -472146.5850          +0.0001
       735     -472146.5849          +0.0001
       736     -472146.5847          +0.0001
       737     -472146.5846          +0.0001
       738     -472146.5845          +0.0001
       739     -472146.5844          +0.0001
       740     -472146.5843          +0.0001
       741     -472146.5842          +0.0001
       742     -472146.5841          +0.0001
       743     -472146.5840          +0.0001
       744     -472146.5839          +0.0001
       745     -472146.5838          +0.0001
       746     -472146.5837          +0.0001
       747     -472146.5836          +0.0001
       748     -472146.5835          +0.0001
       749     -472146.5834          +0.0001
       750     -472146.5834          +0.0001
       751     -472146.5833          +0.0001
       752     -472146.5832          +0.0001
       753     -472146.5831          +0.0001
       754     -472146.5830          +0.0001
       755

       916     -472146.5778          +0.0000
       917     -472146.5778          +0.0000
       918     -472146.5778          +0.0000
       919     -472146.5778          +0.0000
       920     -472146.5777          +0.0000
       921     -472146.5777          +0.0000
       922     -472146.5777          +0.0000
       923     -472146.5777          +0.0000
       924     -472146.5777          +0.0000
       925     -472146.5777          +0.0000
       926     -472146.5777          +0.0000
       927     -472146.5777          +0.0000
       928     -472146.5777          +0.0000
       929     -472146.5777          +0.0000
       930     -472146.5777          +0.0000
       931     -472146.5777          +0.0000
       932     -472146.5777          +0.0000
       933     -472146.5777          +0.0000
       934     -472146.5777          +0.0000
       935     -472146.5777          +0.0000
       936     -472146.5776          +0.0000
       937     -472146.5776          +0.0000
       938

MultinomialHMM(algorithm='viterbi', implementation='log', init_params='',
        n_components=7, n_iter=1000, params='ste',
        random_state=RandomState(MT19937) at 0x7FEA1B235570,
        startprob_prior=1.0, tol=0.01, transmat_prior=1.0, verbose=True)

In [82]:
with open('../data/models/hmm-2206090217.pk', 'wb') as fPk:
    pickle.dump(model, fPk)

## 5. Test

In [83]:
model.monitor_.converged

False

In [84]:
model.transmat_

array([[5.22808488e-001, 4.77191512e-001, 1.17353098e-015,
        3.28922020e-049, 3.35637736e-061, 1.79320514e-017,
        6.64167013e-014],
       [0.00000000e+000, 5.25391513e-001, 4.74608487e-001,
        1.14527914e-028, 4.25542964e-031, 2.54080620e-094,
        9.02192173e-067],
       [9.83138397e-083, 0.00000000e+000, 3.97514902e-001,
        6.02485098e-001, 5.02619614e-043, 1.17833513e-033,
        7.41095575e-138],
       [2.66986903e-111, 0.00000000e+000, 0.00000000e+000,
        2.68876058e-001, 7.31123942e-001, 1.38726823e-033,
        1.30407394e-031],
       [5.27407131e-045, 0.00000000e+000, 0.00000000e+000,
        0.00000000e+000, 4.13103772e-001, 5.80452382e-001,
        6.44384596e-003],
       [1.10308866e-038, 0.00000000e+000, 0.00000000e+000,
        0.00000000e+000, 0.00000000e+000, 2.26207871e-001,
        7.73792129e-001],
       [8.71008336e-001, 0.00000000e+000, 0.00000000e+000,
        0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
        1.28991664

In [90]:
float(5.22808488e-001)

0.522808488

In [93]:
score = model.score(arrTest, lstTestLen)

In [94]:
score

-76788.57168100735

## 6. Predict

In [99]:
sOctadecamers   = ''.join(lstOctadecamers)
lstLen          = [len(x) for x in lstOctadecamers]
arrOctadecamers = np.array([[dctEncoding[x],] for x in sOctadecamers])
arrOctadecamers[:10]

array([[2],
       [3],
       [2],
       [2],
       [2],
       [3],
       [0],
       [0],
       [2],
       [0]])

In [100]:
states = model.predict(arrOctadecamers, lstLen)

In [103]:
states.ndim

1

In [106]:
states.shape[0] / 18

23257.0

In [109]:
states = states.reshape(23257, 18)

In [113]:
states[10000:10050]

array([[0, 0, 0, 1, 1, 2, 3, 3, 4, 4, 4, 4, 5, 5, 6, 0, 0, 0],
       [0, 0, 1, 1, 2, 3, 4, 5, 6, 6, 0, 0, 0, 1, 2, 3, 3, 4],
       [0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 5, 5, 6, 6, 0, 0, 1, 2],
       [0, 0, 0, 1, 2, 3, 3, 4, 4, 4, 5, 6, 0, 1, 1, 2, 3, 3],
       [0, 1, 2, 3, 3, 4, 5, 5, 6, 0, 1, 1, 2, 3, 4, 5, 6, 0],
       [0, 1, 2, 3, 4, 5, 6, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 2],
       [0, 1, 2, 3, 4, 5, 5, 5, 6, 0, 1, 2, 3, 4, 5, 6, 6, 0],
       [0, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 3, 4, 4, 5, 6, 0, 1],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 4, 4, 5, 6, 0, 0],
       [0, 1, 1, 1, 2, 2, 3, 4, 4, 5, 6, 6, 0, 0, 1, 2, 3, 3],
       [0, 0, 0, 0, 1, 1, 2, 3, 4, 5, 5, 5, 6, 0, 0, 1, 1, 2],
       [0, 0, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 5, 6, 0],
       [0, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 5, 6, 0, 0],
       [0, 1, 1, 2, 2, 3, 4, 5, 6, 0, 0, 0, 0, 0, 1, 1, 1, 2],
       [0, 0, 1, 1, 2, 3, 3, 3, 4, 5, 5, 6, 0, 1, 2, 3, 4, 5],
       [0, 1, 1, 2, 2, 3, 4, 4, 4, 5, 5, 6, 0, 0, 1, 2,