In [1]:
import pandas as pd
import numpy as np
import time 

from pomegranate import State, DiscreteDistribution, HiddenMarkovModel
from sklearn.model_selection import train_test_split
from utils import load_saved_gfp_data, load_saved_mutated_gfp_data, one_hot_decode, one_hot_encode, normalize, count_substring_mismatch

In [2]:
print("Loading data...")
start_time = time.time()
X_train, X_test, y_train, y_test = load_saved_gfp_data()
mutated_df = load_saved_mutated_gfp_data()
print("Finished loading data in {0:.2f} seconds".format(time.time() - start_time))

Loading data...
Finished loading data in 3.90 seconds


In [51]:
def convert_to_pomegranate_shape(X_train_sequences): 
    lst = []
    for sequence in X_train_sequences:
        lst.append([c for c in sequence])
    return np.array(lst)
data = convert_to_pomegranate_shape(X_train_sequences)
data.shape

array([['A', 'G', 'C', ..., 'T', 'G', 'A'],
       ['A', 'G', 'C', ..., 'T', 'G', 'A'],
       ['A', 'G', 'C', ..., 'T', 'G', 'A'],
       ...,
       ['A', 'G', 'C', ..., 'T', 'G', 'A'],
       ['A', 'G', 'C', ..., 'T', 'G', 'A'],
       ['A', 'G', 'C', ..., 'T', 'G', 'A']], dtype='<U1')

In [52]:
model = HiddenMarkovModel.from_samples(DiscreteDistribution, n_components=5, X=data, max_iterations = 100, verbose=True)

x = [i for i in "ACTG"]
np.testing.assert_almost_equal(1, sum([model.probability(i) for i in x]))
for i in "ACTG":
    for j in "ACTG": 
        print("Symbol {0} with probability: {1:.3f}".format(i + j, model.probability(i + j)))
json = model.to_json()


[1] Improvement: 2729.529208475433	Time (s): 0.2089
[2] Improvement: 30.693405409969273	Time (s): 0.2086
[3] Improvement: 25.052473565461696	Time (s): 0.2286
[4] Improvement: 20.977391579712275	Time (s): 0.2236
[5] Improvement: 18.636600419849856	Time (s): 0.229
[6] Improvement: 17.600828106806148	Time (s): 0.2107
[7] Improvement: 17.391817551484564	Time (s): 0.2211
[8] Improvement: 17.65883097317419	Time (s): 0.2177
[9] Improvement: 18.177440533254412	Time (s): 0.2241
[10] Improvement: 18.808297113442677	Time (s): 0.2179
[11] Improvement: 19.462921818878385	Time (s): 0.2128
[12] Improvement: 20.083538723250967	Time (s): 0.2216
[13] Improvement: 20.6334770268586	Time (s): 0.2279
[14] Improvement: 21.093763508688426	Time (s): 0.2052
[15] Improvement: 21.462528693751665	Time (s): 0.214
[16] Improvement: 21.754860109620495	Time (s): 0.1978
[17] Improvement: 22.001576128328452	Time (s): 0.2147
[18] Improvement: 22.2461833893758	Time (s): 0.2208
[19] Improvement: 22.540050305178738	Time (s)

In [61]:
jss = model.to_json()
import json
with open('data.json', 'w') as outfile:
    json.dump(jss, outfile)
with open('data.json', 'r') as f:
    qss = json.load(f)
assert(qss == jss)
x = HiddenMarkovModel.from_json(qss)

False

In [3]:
X_train_sequences = one_hot_decode(X_train)
X_train_sequences = X_train_sequences[0:100]
a_count = sum([seq.count("A") for seq in X_train_sequences])
c_count = sum([seq.count("C") for seq in X_train_sequences])
t_count = sum([seq.count("T") for seq in X_train_sequences])
g_count = sum([seq.count("G") for seq in X_train_sequences])
print("A Count: {0}\nC Count: {1}\nT Count: {2}\nG Count: {3}".format(a_count, c_count, t_count, g_count))

A Count: 17498
C Count: 23705
T Count: 10186
G Count: 20011


In [37]:
hidden_state_size = 5
base_pair_lst = ["A", "C", "T", "G"]
dists = []
for _ in range(hidden_state_size): 
    emission_probs = np.random.random(4)
    emission_probs = emission_probs / emission_probs.sum()
    dists.append(DiscreteDistribution(dict(zip(base_pair_lst, emission_probs))))
trans_mat = np.random.random((hidden_state_size, hidden_state_size))
trans_mat = trans_mat / trans_mat.sum(axis = 1, keepdims = 1)
print(trans_mat)
starts = np.random.random((hidden_state_size))
print(starts)
print(model)
starts = starts / starts.sum()
# testing initializations
np.testing.assert_almost_equal(starts.sum(), 1)
np.testing.assert_array_almost_equal(np.ones(hidden_state_size), trans_mat.sum(axis = 1))
model = HiddenMarkovModel.from_matrix(trans_mat, dists, starts)
model.bake()
history = model.fit(X_train_sequences, 
                    algorithm = 'baum-welch', 
                    return_history = True, 
                    verbose = True,
                    max_iterations = 100,
                    n_jobs = 1)

[[0.06826557 0.1653473  0.4006184  0.32524956 0.04051917]
 [0.13868028 0.30978831 0.08781665 0.31674373 0.14697104]
 [0.0675592  0.18468565 0.32748842 0.34537547 0.07489126]
 [0.25022633 0.18824969 0.16783266 0.29132924 0.10236208]
 [0.07205194 0.46538696 0.06776236 0.15126656 0.24353219]]
[0.99830984 0.06446566 0.31982703 0.86107203 0.26533274]
None:{
    "class" : "State",
    "distribution" : {
        "class" : "Distribution",
        "dtype" : "str",
        "name" : "DiscreteDistribution",
        "parameters" : [
            {
                "A" : 0.3025210084033603,
                "C" : 0.22642857142858633,
                "T" : 0.13071428571427499,
                "G" : 0.34033613445378025
            }
        ],
        "frozen" : false
    },
    "name" : "s0",
    "weight" : 1.0
}{
    "class" : "State",
    "distribution" : {
        "class" : "Distribution",
        "dtype" : "str",
        "name" : "DiscreteDistribution",
        "parameters" : [
            {
       

In [39]:
print(model.dense_transition_matrix())
print(model)
print(history)

[[9.20536339e-041 5.94092075e-002 9.40590792e-001 1.43685796e-045
  9.21823605e-089 0.00000000e+000 0.00000000e+000]
 [5.55639144e-044 1.37046264e-122 9.67361944e-085 1.00000000e+000
  3.12569962e-090 0.00000000e+000 0.00000000e+000]
 [2.54998077e-029 1.32894826e-070 3.55826506e-050 1.00000000e+000
  2.26495349e-045 0.00000000e+000 0.00000000e+000]
 [8.58607665e-001 1.68036409e-047 3.81221354e-031 4.35731055e-076
  1.41392335e-001 0.00000000e+000 0.00000000e+000]
 [2.57002870e-057 9.04028933e-001 9.59710674e-002 2.98444870e-058
  2.07031765e-089 0.00000000e+000 0.00000000e+000]
 [1.00000000e+000 0.00000000e+000 0.00000000e+000 0.00000000e+000
  7.60458031e-054 0.00000000e+000 0.00000000e+000]
 [0.00000000e+000 0.00000000e+000 0.00000000e+000 0.00000000e+000
  0.00000000e+000 0.00000000e+000 0.00000000e+000]]
None:{
    "class" : "State",
    "distribution" : {
        "class" : "Distribution",
        "dtype" : "str",
        "name" : "DiscreteDistribution",
        "parameters" : [
  

In [50]:
random_sample = "".join(model.sample(n = 1, length = 714)[0])
random_sample

'ACGGTCGTGATAGGCCACTATCCCAGCTGCAACGACCTCCAATGCATGTACGAGTGGAGCATGAAATTCACCCCGTACTAGATGCACGGCAATTACTTCAGGCATCTCAACCTCCCGGTCAAAGACCTCGGTAACTTCACCAAGCACAACTTCGAGACCAGCACGTACCGCGTCCCCTTGGTGCTGTAGGACGGCGACAATGAGAGCCACGACTGCTACGACTACAACCCACAGCCCCGGGACAAGCGCCCGTTCATGGTCAAGATGGTGCCCCGCGTCGTGAAGTCCGACAGCTTCATGAGGAGCCGCCTTTGCCTATAGTGGGGCAGCGTCGTCCAGCCGCCCCAGACCGGCGACCCCCGCCACGACCTGTAGGACAAGAATTGGCTCGACGAGGAGATCCACATGGACGAGGTGTTGAACGGCAAGAACGACAAGCACGAGATGGAGAGGCCCTTGCTCCTCGACCACAACCTCGACAGGGTCCACATGATGGTCCAGTTGTACACGTCCATCACCGAGTTCCACGGGCGCACCACCCTCCAGGGCGGGGTCAACCTCGAGAACGCCACCGCGCTCGTCACGATCCGCGACTAGAACCACTCCCTCGAGGGCCTCCACGCTGAGGCGGTCGTGTTGACCCAGTGGCACAACCTCGACCACGGCGTCAAGTTCAGCAACGACATCGTGCACAACGCGTGCTACCCCATCGGT'

In [86]:
"""
Num of Cores, Hidden State Size, Num of Sequences, Iterations, Time 
1, 3, 100, , 5
3, 3, 100, 200, 5
3, 10, 100, 1709, 398
"""

''

In [120]:
wild_type = pd.read_csv("./data/gfp_data.csv")["nucSequence"].values[0]
seq2 = pd.read_csv("./data/gfp_data.csv")["nucSequence"].values[1]
count_substring_mismatch(wild_type, random_sample)

458

In [121]:
count_substring_mismatch(wild_type, seq2)

1

In [None]:
## TODO: write test for random sample make sure it makes sense
## TODO: write test for probabilities make sure random samples have lower probability than wild type