In [1]:
import numpy as np

In [2]:
np.set_printoptions(suppress=True, precision=4)
np.set_printoptions(linewidth=100, threshold=np.inf)
np.set_printoptions(formatter={'int': '{:5d}'.format})

### Create HMM training dataset

In [60]:
num_states = 8
num_observations = 8
train_length = 5000
test_length = 50

In [61]:
transition_probs = np.random.dirichlet(np.ones(num_states) * 1, size=num_states)
emission_probs = np.random.dirichlet(np.ones(num_observations) * 2, size=num_states)
initial_state_dist = np.ones(num_states) / num_states

In [62]:
transition_probs, emission_probs

(array([[0.1097, 0.1999, 0.0571, 0.0213, 0.0663, 0.1007, 0.2501, 0.1949],
        [0.2329, 0.2754, 0.1432, 0.0207, 0.0027, 0.0313, 0.093 , 0.2008],
        [0.1971, 0.2997, 0.2024, 0.0066, 0.1085, 0.1113, 0.0559, 0.0185],
        [0.0246, 0.1841, 0.0158, 0.2438, 0.1029, 0.0929, 0.2034, 0.1323],
        [0.1195, 0.0736, 0.0811, 0.1586, 0.1553, 0.2585, 0.0199, 0.1334],
        [0.0482, 0.0847, 0.2676, 0.1191, 0.1433, 0.271 , 0.0259, 0.0403],
        [0.1097, 0.0223, 0.2019, 0.0439, 0.0381, 0.0443, 0.1647, 0.3751],
        [0.0002, 0.0479, 0.0306, 0.048 , 0.1427, 0.4083, 0.2216, 0.1008]]),
 array([[0.0971, 0.1318, 0.1552, 0.0544, 0.2   , 0.095 , 0.167 , 0.0994],
        [0.096 , 0.341 , 0.0229, 0.0133, 0.049 , 0.1898, 0.2422, 0.0458],
        [0.0147, 0.1546, 0.2585, 0.3291, 0.044 , 0.0649, 0.0344, 0.0997],
        [0.0037, 0.0068, 0.1421, 0.1003, 0.0526, 0.2036, 0.1184, 0.3725],
        [0.0758, 0.206 , 0.0859, 0.0061, 0.1954, 0.1299, 0.2165, 0.0844],
        [0.169 , 0.0531, 0.0684, 0.1

In [53]:
def int_to_one_hot(n, length=num_observations):
    if n >= length or n < 0:
        raise ValueError("n must be within the range [0, length-1].")
    one_hot_vector = [0] * length
    one_hot_vector[n] = 1
    return one_hot_vector

In [54]:
print(int_to_one_hot(4))

[0, 0, 0, 0, 1, 0, 0, 0, 0, 0]


In [55]:
# Simulate the HMM and generate a long sequence 
def simulate_hmm(seq_len, start_prob, trans_prob, emis_prob):
    observation_sequence = []
    state_sequence = []
    
    current_state = np.random.choice(num_states, p=start_prob)
    for _ in range(seq_len):
        state_sequence.append(current_state)
        observation = np.random.choice(num_observations, p=emis_prob[current_state])
        # print(observation, int_to_one_hot(observation))
        observation_sequence.append(int_to_one_hot(observation))
        current_state = np.random.choice(num_states, p=trans_prob[current_state])
    
    return observation_sequence, state_sequence

In [63]:
train_obs, train_hid = simulate_hmm(train_length, initial_state_dist, transition_probs, emission_probs)
test_obs, test_hid = simulate_hmm(test_length, initial_state_dist, transition_probs, emission_probs)

In [64]:
train_obs

[[1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
 [0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
 [0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
 [0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
 [0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
 [0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
 [0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
 [0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
 [0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
 [0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
 [0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
 [0, 0, 0,

In [65]:
file_path = f"../../../data/hmm_train_dataset(state-{num_states}_obs-{num_observations}_length-{train_length}).npz"
np.savez(file_path, train_observations=np.array(train_obs), train_hidden_states=np.array(train_hid), train_length=train_length, test_observations=np.array(test_obs), test_hidden_states=np.array(test_hid), test_length=test_length)