# 생물정보학 및 실습 1 - Term Project (Free Analysis) 2/3
생물정보학 및 실습 1   
서울대학교 협동과정 생물정보학전공 2022년 1학기

In [1]:
from collections import Counter, defaultdict
import math
import os
import sys
import pickle
import time

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from scipy import stats
import pysam
import hmmlearn
from hmmlearn import base, hmm

## 1. Load Dataset and Denoise

In [2]:
dfOctadecamers = pd.read_table('../stats/octadecamers.txt')
dfOctadecamers

Unnamed: 0,octadecamer,counts
0,GTGGGTAAGAGCACCCGA,18
1,CTTCTGGAGTGTCTGAAG,14
2,AAAAAAAAAAAAAAAAAA,14
3,CATAATTTGTGGTAGTGG,11
4,TTTTTTTTTTTTTTTTTT,8
...,...,...
24724,AGTAATTTGTGCGGGTTT,1
24725,GGTCAAGAGAGTATGAAC,1
24726,ACAGAGCAGTGGTTCTGC,1
24727,CTTAGAACGTGGATGCCA,1


In [3]:
dfOctadecamers = dfOctadecamers[dfOctadecamers.octadecamer.str[8] == 'G'].reset_index(drop=True)
dfOctadecamers

Unnamed: 0,octadecamer,counts
0,GTGGGTAAGAGCACCCGA,18
1,CTTCTGGAGTGTCTGAAG,14
2,CATAATTTGTGGTAGTGG,11
3,AAATATAAGAGTTCGGTT,5
4,CTTCTGGTGTGTCTGAAG,5
...,...,...
22844,AGTAATTTGTGCGGGTTT,1
22845,GGTCAAGAGAGTATGAAC,1
22846,ACAGAGCAGTGGTTCTGC,1
22847,CTTAGAACGTGGATGCCA,1


## 2. Prepare Training and Test Set

In [4]:
dctOctadecamers = dfOctadecamers.set_index('octadecamer')['counts'].to_dict()
dctOctadecamers['GTGGGTAAGAGCACCCGA']

18

In [5]:
lstOctadecamers = []
for seq, counts in dctOctadecamers.items():
    lstOctadecamers += [seq] * counts

In [6]:
lstOctadecamers[:10]

['GTGGGTAAGAGCACCCGA',
 'GTGGGTAAGAGCACCCGA',
 'GTGGGTAAGAGCACCCGA',
 'GTGGGTAAGAGCACCCGA',
 'GTGGGTAAGAGCACCCGA',
 'GTGGGTAAGAGCACCCGA',
 'GTGGGTAAGAGCACCCGA',
 'GTGGGTAAGAGCACCCGA',
 'GTGGGTAAGAGCACCCGA',
 'GTGGGTAAGAGCACCCGA']

In [7]:
len(lstOctadecamers)

23257

In [8]:
sTraining = ''.join(lstOctadecamers[:20000])
sTest     = ''.join(lstOctadecamers[20000:])

In [9]:
dctEncoding = {'A':0, 'C':1, 'G':2, 'T':3}

In [10]:
arrTraining = np.array([[dctEncoding[x]] for x in sTraining])
arrTest     = np.array([[dctEncoding[x]] for x in sTest])

In [11]:
arrTest

array([[3],
       [2],
       [3],
       ...,
       [1],
       [3],
       [2]])

In [12]:
lstTrainingLen = [len(x) for x in lstOctadecamers[:20000]]
lstTestLen     = [len(x) for x in lstOctadecamers[20000:]]
len(lstTestLen)

3257

## 3. Model init

In [13]:
model = hmm.MultinomialHMM(n_components=7, random_state=99, init_params='')

In [14]:
model.n_features = 4
model.startprob_ = np.array([1,] + [0,] * 6)
model.transmat_  = np.array([
    [0.800, 0.100, 0.020, 0.020, 0.020, 0.020, 0.020],
    [0.000, 0.050, 0.900, 0.013, 0.013, 0.012, 0.012],
    [0.005, 0.000, 0.050, 0.900, 0.015, 0.015, 0.015],
    [0.015, 0.000, 0.000, 0.050, 0.900, 0.020, 0.015],
    [0.020, 0.000, 0.000, 0.000, 0.050, 0.900, 0.030],
    [0.050, 0.000, 0.000, 0.000, 0.000, 0.050, 0.900],
    [0.950, 0.000, 0.000, 0.000, 0.000, 0.000, 0.050],
])
model.emissionprob_ = np.array([[0.25,] * 4] * 7)

In [15]:
model.transmat_.sum(axis=1)

array([1., 1., 1., 1., 1., 1., 1.])

## 4. Learning

In [16]:
model.n_iter = 1000
model.tol = 0.01
model.verbose = True
class ThresholdMonitor(base.ConvergenceMonitor):
  @property
  def converged(self):
    return (self.iter == self.n_iter or self.history[-1] >= self.tol)
model.monitor_ = ThresholdMonitor(model.n_iter, model.tol, model.verbose)
model.fit(arrTest, lstTestLen)

         1      -81272.8932             +nan
         2      -79384.0278       +1888.8654
         3      -79370.5323         +13.4955
         4      -79355.2590         +15.2732
         5      -79337.1280         +18.1310
         6      -79315.0363         +22.0917
         7      -79287.8644         +27.1719
         8      -79254.5393         +33.3251
         9      -79214.1439         +40.3954
        10      -79166.0418         +48.1021
        11      -79109.9552         +56.0866
        12      -79045.9289         +64.0262
        13      -78974.1499         +71.7790
        14      -78894.6842         +79.4657
        15      -78807.2937         +87.3906
        16      -78711.5383         +95.7554
        17      -78607.3066        +104.2317
        18      -78495.7066        +111.6000
        19      -78379.9101        +115.7965
        20      -78265.2436        +114.6666
        21      -78158.0033        +107.2402
        22      -78063.3817         +94.6216
        23

       184      -76774.7242          +0.1294
       185      -76774.5981          +0.1260
       186      -76774.4752          +0.1229
       187      -76774.3554          +0.1199
       188      -76774.2384          +0.1170
       189      -76774.1241          +0.1143
       190      -76774.0124          +0.1117
       191      -76773.9033          +0.1092
       192      -76773.7965          +0.1068
       193      -76773.6920          +0.1045
       194      -76773.5898          +0.1023
       195      -76773.4896          +0.1001
       196      -76773.3915          +0.0981
       197      -76773.2954          +0.0961
       198      -76773.2011          +0.0943
       199      -76773.1087          +0.0924
       200      -76773.0180          +0.0907
       201      -76772.9291          +0.0890
       202      -76772.8417          +0.0873
       203      -76772.7560          +0.0858
       204      -76772.6717          +0.0842
       205      -76772.5890          +0.0828
       206

       367      -76767.8987          +0.0037
       368      -76767.8951          +0.0036
       369      -76767.8916          +0.0035
       370      -76767.8882          +0.0034
       371      -76767.8849          +0.0033
       372      -76767.8816          +0.0033
       373      -76767.8784          +0.0032
       374      -76767.8753          +0.0031
       375      -76767.8723          +0.0030
       376      -76767.8694          +0.0030
       377      -76767.8665          +0.0029
       378      -76767.8636          +0.0028
       379      -76767.8609          +0.0028
       380      -76767.8582          +0.0027
       381      -76767.8556          +0.0026
       382      -76767.8530          +0.0026
       383      -76767.8505          +0.0025
       384      -76767.8481          +0.0024
       385      -76767.8457          +0.0024
       386      -76767.8433          +0.0023
       387      -76767.8410          +0.0023
       388      -76767.8388          +0.0022
       389

       550      -76767.7290          +0.0004
       551      -76767.7286          +0.0004
       552      -76767.7281          +0.0004
       553      -76767.7277          +0.0005
       554      -76767.7272          +0.0005
       555      -76767.7267          +0.0005
       556      -76767.7263          +0.0005
       557      -76767.7258          +0.0005
       558      -76767.7253          +0.0005
       559      -76767.7248          +0.0005
       560      -76767.7243          +0.0005
       561      -76767.7238          +0.0005
       562      -76767.7233          +0.0005
       563      -76767.7227          +0.0005
       564      -76767.7222          +0.0005
       565      -76767.7217          +0.0005
       566      -76767.7211          +0.0006
       567      -76767.7205          +0.0006
       568      -76767.7200          +0.0006
       569      -76767.7194          +0.0006
       570      -76767.7188          +0.0006
       571      -76767.7182          +0.0006
       572

       733      -76767.5244          +0.0007
       734      -76767.5237          +0.0007
       735      -76767.5230          +0.0007
       736      -76767.5223          +0.0007
       737      -76767.5216          +0.0007
       738      -76767.5209          +0.0007
       739      -76767.5203          +0.0007
       740      -76767.5196          +0.0007
       741      -76767.5190          +0.0006
       742      -76767.5183          +0.0006
       743      -76767.5177          +0.0006
       744      -76767.5171          +0.0006
       745      -76767.5165          +0.0006
       746      -76767.5159          +0.0006
       747      -76767.5154          +0.0006
       748      -76767.5148          +0.0006
       749      -76767.5142          +0.0006
       750      -76767.5137          +0.0005
       751      -76767.5132          +0.0005
       752      -76767.5127          +0.0005
       753      -76767.5121          +0.0005
       754      -76767.5116          +0.0005
       755

       916      -76767.4880          +0.0000
       917      -76767.4880          +0.0000
       918      -76767.4880          +0.0000
       919      -76767.4880          +0.0000
       920      -76767.4879          +0.0000
       921      -76767.4879          +0.0000
       922      -76767.4879          +0.0000
       923      -76767.4879          +0.0000
       924      -76767.4879          +0.0000
       925      -76767.4879          +0.0000
       926      -76767.4878          +0.0000
       927      -76767.4878          +0.0000
       928      -76767.4878          +0.0000
       929      -76767.4878          +0.0000
       930      -76767.4878          +0.0000
       931      -76767.4878          +0.0000
       932      -76767.4878          +0.0000
       933      -76767.4877          +0.0000
       934      -76767.4877          +0.0000
       935      -76767.4877          +0.0000
       936      -76767.4877          +0.0000
       937      -76767.4877          +0.0000
       938

MultinomialHMM(algorithm='viterbi', implementation='log', init_params='',
        n_components=7, n_iter=1000, params='ste',
        random_state=RandomState(MT19937) at 0x7FE213B85570,
        startprob_prior=1.0, tol=0.01, transmat_prior=1.0, verbose=True)

In [17]:
model.startprob_

array([1., 0., 0., 0., 0., 0., 0.])

In [21]:
model.transmat_

array([[5.27194799e-001, 4.72805187e-001, 1.40481252e-008,
        5.23130745e-058, 1.97836669e-064, 2.86235378e-020,
        1.70874994e-018],
       [0.00000000e+000, 5.14157635e-001, 4.85842365e-001,
        2.60897858e-029, 1.51740054e-040, 2.85644202e-089,
        1.54499206e-065],
       [3.87170572e-084, 0.00000000e+000, 4.04874311e-001,
        5.95125689e-001, 2.10309641e-042, 6.78566360e-033,
        1.29377196e-130],
       [2.04457147e-120, 0.00000000e+000, 0.00000000e+000,
        2.87269800e-001, 7.12730200e-001, 3.28504717e-034,
        1.40882401e-030],
       [9.00920648e-045, 0.00000000e+000, 0.00000000e+000,
        0.00000000e+000, 4.07157219e-001, 5.69053076e-001,
        2.37897044e-002],
       [1.93334468e-039, 0.00000000e+000, 0.00000000e+000,
        0.00000000e+000, 0.00000000e+000, 2.26999492e-001,
        7.73000508e-001],
       [8.61679490e-001, 0.00000000e+000, 0.00000000e+000,
        0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
        1.38320510

In [22]:
model.emissionprob_

array([[3.59328064e-01, 1.72555994e-01, 2.34108838e-01, 2.34007103e-01],
       [1.78951304e-01, 1.09761995e-01, 3.83058197e-01, 3.28228504e-01],
       [2.81822328e-01, 4.75966920e-01, 1.39029861e-01, 1.03180891e-01],
       [4.32934223e-01, 3.13497607e-02, 1.85762071e-02, 5.17139809e-01],
       [7.64899557e-02, 1.26876135e-01, 7.85145963e-01, 1.14879467e-02],
       [5.63642149e-01, 2.29617365e-02, 7.55989545e-02, 3.37797160e-01],
       [4.95764549e-03, 2.19552555e-28, 9.95042355e-01, 1.96090510e-55]])