In [21]:
import pandas as pd
import numpy as np
from hmmlearn import hmm
from sklearn.metrics import accuracy_score

In [22]:
# Read the labelled yeast 3-mer table; the cell's last expression shows the
# first rows as a quick sanity check of the columns.
df = pd.read_csv(filepath_or_buffer='../training_data/yeast_3-mers.csv')
df.head()

Unnamed: 0,3mer,coding
0,CCA,0
1,CAC,0
2,ACA,0
3,CAC,0
4,ACC,0


In [23]:
# Encode every distinct 3-mer as an integer symbol, numbered in order of
# first appearance in the table.
unique_3mers = df['3mer'].unique()
mer_to_idx = dict(zip(unique_3mers, range(len(unique_3mers))))

# Observation sequence as a single-feature column vector of symbol indices.
encoded_mers = [mer_to_idx[mer] for mer in df['3mer']]
X = np.array(encoded_mers).reshape(-1, 1)

# Ground-truth state label for each position, taken from the 'coding' column
# (the summary below counts labels 0 as non-coding and 1 as coding).
y = df['coding'].values

print(f"Total 3-mers: {len(X)}")
print(f"Class distribution: Non-coding={np.sum(y==0)}, Coding={np.sum(y==1)}")

Total 3-mers: 12157103
Class distribution: Non-coding=3403065, Coding=8754038


In [24]:
# Ordered 80/20 split: the first 80% of positions form the training segment
# and the remainder the test segment. Nothing is shuffled, so each segment
# stays a contiguous run of the original sequence.
split_idx = int(0.8 * len(X))

X_train, y_train = X[:split_idx], y[:split_idx]
X_test, y_test = X[split_idx:], y[split_idx:]

print(f"Train size: {len(X_train)}, Test size: {len(X_test)}")

# Majority-class baseline: the accuracy obtained by always predicting the
# more frequent of the two labels in the test segment.
baseline_accuracy = max(np.mean(y_test == label) for label in (0, 1))

Train size: 9725682, Test size: 2431421


In [25]:
# Estimate HMM parameters by supervised counting over the labelled training
# sequence (y_train holds the state labels, X_train[:, 0] the symbol indices).
n_states = 2
n_features = len(unique_3mers)

# Initial state probabilities: empirical state frequencies over the whole
# training sequence. Estimating them from an arbitrary prefix (previously the
# first 1000 labels) gives probability exactly 0 to any state absent from that
# prefix, which prevents Viterbi from starting a decoded path in that state --
# a problem for a held-out slice that begins somewhere else in the sequence.
state_counts = np.bincount(y_train, minlength=n_states).astype(float)
if state_counts.sum() > 0:
    initial_probs = state_counts / state_counts.sum()
else:
    # No training labels at all: fall back to a uniform start distribution.
    initial_probs = np.full(n_states, 1.0 / n_states)

# Transition counts: one increment per adjacent (current, next) label pair.
# np.add.at accumulates repeated index pairs correctly and replaces a Python
# loop over every training position.
transition_matrix = np.zeros((n_states, n_states))
np.add.at(transition_matrix, (y_train[:-1], y_train[1:]), 1)

# Row-normalise; a state that never appears as a "current" state has no
# observed transitions and gets a uniform row instead of a division by zero.
row_sums = transition_matrix.sum(axis=1, keepdims=True)
transition_matrix = np.divide(
    transition_matrix,
    row_sums,
    out=np.full_like(transition_matrix, 1.0 / n_states),
    where=row_sums > 0,
)

# Emission counts: one increment per (state, observed symbol) pair.
emission_matrix = np.zeros((n_states, n_features))
np.add.at(emission_matrix, (y_train, X_train[:, 0]), 1)

# Small pseudo-count so no symbol has zero emission probability under any
# state, then normalise each state's row to sum to 1.
emission_matrix += 1e-6
emission_matrix /= emission_matrix.sum(axis=1, keepdims=True)

In [26]:
# Build a two-state categorical HMM and assign the count-based parameters
# directly; no fitting call is made on the model in this cell.
model = hmm.CategoricalHMM(
    n_components=n_states,
    n_features=n_features,
    random_state=42,
)
model.startprob_, model.transmat_, model.emissionprob_ = (
    initial_probs,
    transition_matrix,
    emission_matrix,
)

# Viterbi-decode the held-out segment and score the state path against the
# true labels.
logprob, predictions = model.decode(X_test, algorithm="viterbi")

accuracy = accuracy_score(y_test, predictions)

print(f"Test Accuracy: {accuracy*100:.2f}%")
print(f"Baseline: {baseline_accuracy*100:.2f}%")

# Decode the training segment as well: a training score far above the test
# score would point to overfitting.
_, train_predictions = model.decode(X_train, algorithm="viterbi")
train_accuracy = accuracy_score(y_train, train_predictions)
print(f"Train Accuracy: {train_accuracy*100:.2f}%")

Test Accuracy: 79.76%
Baseline: 70.30%
Train Accuracy: 79.09%
