In [35]:
import pandas as pd
import numpy as np
from hmmlearn import hmm
from sklearn.metrics import accuracy_score

In [3]:
df = pd.read_csv('../training_data/yeast_3-mers.csv')
df.head()

Unnamed: 0,3mer,coding
0,CCA,0
1,CAC,0
2,ACA,0
3,CAC,0
4,ACC,0


In [39]:
# Need to converting observations to numeric
unique_3mers = df['3mer'].unique()
mer_to_idx = {mer: i for i, mer in enumerate(unique_3mers)}
X = np.array([mer_to_idx[mer] for mer in df['3mer']]).reshape(-1, 1)

In [33]:

# Creating HMM with 2 states (coding vs. non coding)
n_states = 2
n_features = len(unique_3mers) # Number of features is 64

model = hmm.CategoricalHMM(
    n_components=n_states,
    n_features=n_features,
    random_state=42,
    n_iter=100 
)

# Fitting the model using EM algorithm
model.fit(X, lengths=[len(X)])

# Check what we've got
print("Initial probabilities:", model.startprob_)
print("\nTransition matrix:\n", model.transmat_)
print("\nEmission probabilities shape:", model.emissionprob_.shape)

Initial probabilities: [1. 0.]

Transition matrix:
 [[4.13450034e-01 5.86549966e-01]
 [1.00000000e+00 4.33100928e-27]]

Emission probabilities shape: (2, 64)

Predicted states: [0 1 0 1 0 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0]


In [36]:
# Predict using the HMMs
predicted_states = model.predict(X)

accuracy = accuracy_score(df['coding'].values.reshape(-1,1), predicted_states)
print(f'Accuracy:{accuracy}')

# Accuracy seems pretty low

Accuracy:0.4500987611933534


In [38]:
# check the log probability of that path based on the learning.
logprob, best_path = model.decode(X, algorithm="viterbi")
print("Log probability of the Viterbi path:", logprob)
accuracy = accuracy_score(df['coding'].values.reshape(-1,1), best_path)
print(f'Accuracy:{accuracy}')

Log probability of the Viterbi path: -48140418.692857005
Accuracy:0.4500987611933534
