# 생물정보학 및 실습 1 - Term Project (Free Analysis) 2/3
생물정보학 및 실습 1   
서울대학교 협동과정 생물정보학전공 2022년 1학기

In [1]:
from collections import Counter, defaultdict
import math
import os
import sys
import pickle
import time

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from scipy import stats
import pysam
import hmmlearn
from hmmlearn import base, hmm

## 1. Load Dataset and Denoise

In [9]:
dfHexamers = pd.read_table('../stats/hexamers-cres12.txt')
dfHexamers

Unnamed: 0,hexamer,counts
0,AAGGAG,591
1,AAGAAG,426
2,AAGAGA,368
3,AAGAGG,363
4,GAGGAG,354
...,...,...
2088,GAAACT,1
2089,GTTAGA,1
2090,GTCAGC,1
2091,GGACAC,1


In [10]:
dfHexamers = dfHexamers[(dfHexamers.counts >= 10) & (dfHexamers.hexamer.str[2] == 'G')].reset_index(drop=True)
dfHexamers

Unnamed: 0,hexamer,counts
0,AAGGAG,591
1,AAGAAG,426
2,AAGAGA,368
3,AAGAGG,363
4,GAGGAG,354
...,...,...
404,CTGGAA,10
405,GAGGTC,10
406,AAGGCC,10
407,ATGGTT,10


## 2. Prepare Training and Test Set

In [11]:
dctHexamers = dfHexamers.set_index('hexamer')['counts'].to_dict()
dctHexamers['AAGGAG']

591

In [12]:
lstHexamers = []
for seq, counts in dctHexamers.items():
    lstHexamers += [seq] * counts

In [53]:
np.random.shuffle(lstHexamers)

In [54]:
lstHexamers[:10]

['CAGTGC',
 'AAGGGG',
 'CAGGTG',
 'GTGTGT',
 'CTGGTG',
 'GAGGAC',
 'AAGAGG',
 'GAGGGA',
 'CAGAGT',
 'AGGACT']

In [55]:
len(lstHexamers)

21274

In [56]:
sTraining = ''.join(lstHexamers[:20000])
sTest     = ''.join(lstHexamers[20000:])

In [57]:
dctEncoding = {'A':0, 'C':1, 'G':2, 'T':3}

In [58]:
arrTraining = np.array([[dctEncoding[x]] for x in sTraining])
arrTest     = np.array([[dctEncoding[x]] for x in sTest])

In [59]:
arrTest

array([[3],
       [3],
       [2],
       ...,
       [2],
       [0],
       [2]])

In [60]:
lstTrainingLen = [len(x) for x in lstHexamers[:20000]]
lstTestLen     = [len(x) for x in lstHexamers[20000:]]
len(lstTrainingLen), len(lstTestLen)

(20000, 1274)

## 3. Model init

In [61]:
model = hmm.MultinomialHMM(n_components=7, random_state=99, init_params='e')

In [67]:
help(model)

Help on MultinomialHMM in module hmmlearn.hmm object:

class MultinomialHMM(hmmlearn.base.BaseHMM)
 |  Hidden Markov Model with multinomial (discrete) emissions.
 |  
 |  Attributes
 |  ----------
 |  n_features : int
 |      Number of possible symbols emitted by the model (in the samples).
 |  
 |  monitor_ : ConvergenceMonitor
 |      Monitor object used to check the convergence of EM.
 |  
 |  startprob_ : array, shape (n_components, )
 |      Initial state occupation distribution.
 |  
 |  transmat_ : array, shape (n_components, n_components)
 |      Matrix of transition probabilities between states.
 |  
 |  emissionprob_ : array, shape (n_components, n_features)
 |      Probability of emitting a given symbol when in each state.
 |  
 |  Examples
 |  --------
 |  >>> from hmmlearn.hmm import MultinomialHMM
 |  >>> MultinomialHMM(n_components=2)  #doctest: +ELLIPSIS
 |  MultinomialHMM(algorithm='viterbi',...
 |  
 |  Method resolution order:
 |      MultinomialHMM
 |      hmmlearn.

In [62]:
model.n_features = 4
model.startprob_ = np.array([1,] + [0,] * 6)
model.transmat_  = np.array([
    [0.000, 1.000, 0.000, 0.000, 0.000, 0.000, 0.000],
    [0.001, 0.050, 0.930, 0.007, 0.006, 0.004, 0.002],
    [0.003, 0.000, 0.050, 0.930, 0.007, 0.006, 0.004],
    [0.005, 0.000, 0.000, 0.050, 0.930, 0.008, 0.007],
    [0.010, 0.000, 0.000, 0.000, 0.050, 0.930, 0.010],
    [0.020, 0.000, 0.000, 0.000, 0.000, 0.050, 0.930],
    [0.950, 0.000, 0.000, 0.000, 0.000, 0.000, 0.050],
])

In [63]:
model.transmat_.sum(axis=1)

array([1., 1., 1., 1., 1., 1., 1.])

## 4. Training

In [64]:
model.n_iter = 1000
model.tol = 0.01
model.verbose = True
class ThresholdMonitor(base.ConvergenceMonitor):
  @property
  def converged(self):
    return (self.iter == self.n_iter or self.history[-1] >= self.tol)
model.monitor_ = ThresholdMonitor(model.n_iter, model.tol, model.verbose)
model.fit(arrTraining, lstTrainingLen)

         1     -189727.4628             +nan
         2     -131266.4676      +58460.9951
         3     -126287.6988       +4978.7689
         4     -122959.0024       +3328.6963
         5     -121168.7127       +1790.2897
         6     -120183.6372        +985.0755
         7     -119511.1636        +672.4736
         8     -119033.3484        +477.8152
         9     -118659.3513        +373.9971
        10     -118307.5736        +351.7777
        11     -117947.9189        +359.6547
        12     -117586.3849        +361.5340
        13     -117241.5318        +344.8531
        14     -116931.6100        +309.9218
        15     -116670.1041        +261.5058
        16     -116463.8563        +206.2479
        17     -116311.6848        +152.1714
        18     -116205.6072        +106.0776
        19     -116134.5677         +71.0394
        20     -116087.9879         +46.5798
        21     -116057.5321         +30.4558
        22     -116037.3418         +20.1903
        23

       184     -115767.4610          +1.3743
       185     -115766.0761          +1.3849
       186     -115764.6808          +1.3953
       187     -115763.2754          +1.4054
       188     -115761.8603          +1.4152
       189     -115760.4357          +1.4245
       190     -115759.0022          +1.4335
       191     -115757.5601          +1.4421
       192     -115756.1100          +1.4502
       193     -115754.6522          +1.4577
       194     -115753.1875          +1.4647
       195     -115751.7163          +1.4712
       196     -115750.2393          +1.4770
       197     -115748.7571          +1.4822
       198     -115747.2702          +1.4868
       199     -115745.7795          +1.4908
       200     -115744.2855          +1.4940
       201     -115742.7889          +1.4966
       202     -115741.2904          +1.4985
       203     -115739.7908          +1.4996
       204     -115738.2906          +1.5001
       205     -115736.7907          +1.4999
       206

       367     -115528.5059          +0.7076
       368     -115527.8239          +0.6820
       369     -115527.1661          +0.6578
       370     -115526.5313          +0.6348
       371     -115525.9181          +0.6132
       372     -115525.3253          +0.5928
       373     -115524.7516          +0.5737
       374     -115524.1958          +0.5558
       375     -115523.6568          +0.5390
       376     -115523.1334          +0.5234
       377     -115522.6246          +0.5088
       378     -115522.1294          +0.4952
       379     -115521.6468          +0.4826
       380     -115521.1761          +0.4708
       381     -115520.7163          +0.4597
       382     -115520.2668          +0.4495
       383     -115519.8269          +0.4399
       384     -115519.3960          +0.4309
       385     -115518.9736          +0.4225
       386     -115518.5591          +0.4145
       387     -115518.1520          +0.4070
       388     -115517.7521          +0.4000
       389

       550     -115483.0237          +0.0827
       551     -115482.9423          +0.0814
       552     -115482.8621          +0.0802
       553     -115482.7831          +0.0790
       554     -115482.7052          +0.0778
       555     -115482.6286          +0.0767
       556     -115482.5530          +0.0755
       557     -115482.4786          +0.0744
       558     -115482.4053          +0.0733
       559     -115482.3331          +0.0722
       560     -115482.2619          +0.0712
       561     -115482.1918          +0.0701
       562     -115482.1226          +0.0691
       563     -115482.0545          +0.0681
       564     -115481.9874          +0.0671
       565     -115481.9213          +0.0661
       566     -115481.8561          +0.0652
       567     -115481.7918          +0.0643
       568     -115481.7285          +0.0633
       569     -115481.6660          +0.0624
       570     -115481.6045          +0.0615
       571     -115481.5438          +0.0607
       572

       733     -115477.3610          +0.0095
       734     -115477.3516          +0.0094
       735     -115477.3423          +0.0093
       736     -115477.3331          +0.0092
       737     -115477.3240          +0.0091
       738     -115477.3150          +0.0090
       739     -115477.3061          +0.0089
       740     -115477.2972          +0.0088
       741     -115477.2884          +0.0088
       742     -115477.2798          +0.0087
       743     -115477.2712          +0.0086
       744     -115477.2627          +0.0085
       745     -115477.2542          +0.0084
       746     -115477.2459          +0.0084
       747     -115477.2376          +0.0083
       748     -115477.2294          +0.0082
       749     -115477.2213          +0.0081
       750     -115477.2133          +0.0080
       751     -115477.2053          +0.0080
       752     -115477.1974          +0.0079
       753     -115477.1896          +0.0078
       754     -115477.1819          +0.0077
       755

       916     -115476.5281          +0.0018
       917     -115476.5262          +0.0018
       918     -115476.5244          +0.0018
       919     -115476.5226          +0.0018
       920     -115476.5208          +0.0018
       921     -115476.5191          +0.0018
       922     -115476.5173          +0.0018
       923     -115476.5156          +0.0017
       924     -115476.5138          +0.0017
       925     -115476.5121          +0.0017
       926     -115476.5104          +0.0017
       927     -115476.5087          +0.0017
       928     -115476.5071          +0.0017
       929     -115476.5054          +0.0017
       930     -115476.5038          +0.0016
       931     -115476.5022          +0.0016
       932     -115476.5005          +0.0016
       933     -115476.4989          +0.0016
       934     -115476.4973          +0.0016
       935     -115476.4958          +0.0016
       936     -115476.4942          +0.0016
       937     -115476.4927          +0.0016
       938

MultinomialHMM(algorithm='viterbi', implementation='log', init_params='e',
        n_components=7, n_iter=1000, params='ste',
        random_state=RandomState(MT19937) at 0x7F64370C9888,
        startprob_prior=1.0, tol=0.01, transmat_prior=1.0, verbose=True)

In [65]:
with open('../data/models/hmm-2206091326.pk', 'wb') as fPk:
    pickle.dump(model, fPk)

## 5. Test

In [66]:
model.monitor_.converged

False

In [67]:
model.transmat_

array([[0.00000000e+000, 1.00000000e+000, 0.00000000e+000,
        0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
        0.00000000e+000],
       [0.00000000e+000, 0.00000000e+000, 9.99925549e-001,
        0.00000000e+000, 7.37016286e-158, 0.00000000e+000,
        7.44513550e-005],
       [1.35025998e-001, 0.00000000e+000, 6.20380859e-008,
        4.85517192e-001, 3.51189947e-001, 5.24610405e-004,
        2.77421918e-002],
       [5.92699303e-003, 0.00000000e+000, 0.00000000e+000,
        5.52022566e-016, 6.58419384e-014, 2.21694451e-023,
        9.94073007e-001],
       [1.22382912e-004, 0.00000000e+000, 0.00000000e+000,
        0.00000000e+000, 2.47659524e-001, 7.51151409e-001,
        1.06668332e-003],
       [1.06655451e-001, 0.00000000e+000, 0.00000000e+000,
        0.00000000e+000, 0.00000000e+000, 5.47841819e-002,
        8.38560367e-001],
       [9.25662929e-001, 0.00000000e+000, 0.00000000e+000,
        0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
        7.43370712

In [68]:
float(5.22808488e-001)

0.522808488

In [69]:
score = model.score(arrTest, lstTestLen)

In [70]:
score

-7332.602193293957

## 6. Predict

In [80]:
sHexamers   = ''.join(lstHexamers)
lstLen          = [len(x) for x in lstHexamers]
arrHexamers = np.array([[dctEncoding[x],] for x in sHexamers])
arrHexamers[:10]

array([[1],
       [0],
       [2],
       [3],
       [2],
       [1],
       [0],
       [0],
       [2],
       [2]])

In [81]:
states = model.predict(arrHexamers, lstLen)

In [82]:
states.ndim

1

In [84]:
states.shape[0] / 6

21274.0

In [85]:
states = states.reshape(21274, 6)

In [86]:
states[10000:10050]

array([[0, 1, 2, 3, 6, 0],
       [0, 1, 2, 4, 5, 6],
       [0, 1, 2, 3, 6, 0],
       [0, 1, 2, 3, 6, 0],
       [0, 1, 2, 3, 6, 0],
       [0, 1, 2, 3, 6, 0],
       [0, 1, 2, 4, 5, 6],
       [0, 1, 2, 3, 6, 0],
       [0, 1, 2, 3, 6, 0],
       [0, 1, 2, 3, 6, 0],
       [0, 1, 2, 4, 4, 5],
       [0, 1, 2, 3, 6, 0],
       [0, 1, 2, 3, 6, 0],
       [0, 1, 2, 3, 6, 0],
       [0, 1, 2, 4, 5, 6],
       [0, 1, 2, 3, 6, 0],
       [0, 1, 2, 4, 5, 6],
       [0, 1, 2, 3, 6, 0],
       [0, 1, 2, 3, 6, 0],
       [0, 1, 2, 3, 6, 0],
       [0, 1, 2, 4, 5, 6],
       [0, 1, 2, 4, 5, 6],
       [0, 1, 2, 3, 6, 0],
       [0, 1, 2, 4, 5, 6],
       [0, 1, 2, 3, 6, 0],
       [0, 1, 2, 3, 6, 0],
       [0, 1, 2, 3, 6, 0],
       [0, 1, 2, 3, 6, 0],
       [0, 1, 2, 4, 5, 6],
       [0, 1, 2, 3, 6, 0],
       [0, 1, 2, 4, 4, 5],
       [0, 1, 2, 3, 6, 0],
       [0, 1, 2, 4, 5, 6],
       [0, 1, 2, 4, 5, 6],
       [0, 1, 2, 4, 5, 0],
       [0, 1, 2, 4, 5, 6],
       [0, 1, 2, 3, 6, 0],
 