In [1]:
import numpy as np

In [2]:
# Configure array printing in a single call: no scientific notation, 4 decimal
# places, 100-character lines, no summarization of long arrays, and integers
# right-aligned in a 5-character field.
np.set_printoptions(
    suppress=True,
    precision=4,
    linewidth=100,
    threshold=np.inf,
    formatter={'int': '{:5d}'.format},
)

### Create HMM training and test datasets

In [13]:
# Sizes of the HMM and of the sequences sampled from it.
num_states = 10          # number of hidden states
num_observations = 500   # number of distinct observation symbols
train_length = 5000      # time steps in the training sequence
test_length = 500        # test sequence length stored alongside the dataset

# Seed NumPy's global RNG so that the randomly drawn HMM parameters and the
# simulated sequences are reproducible on Restart & Run All.
random_seed = 0
np.random.seed(random_seed)

In [4]:
# Randomly sample the HMM parameters; each row drawn below is a probability vector.
# Transition matrix: one Dirichlet(1, ..., 1) draw per current state.
transition_probs = np.random.dirichlet(np.ones(num_states), size=num_states)
# Emission matrix: one Dirichlet(2, ..., 2) draw over the observation symbols per state.
emission_probs = np.random.dirichlet(np.full(num_observations, 2.0), size=num_states)
# Uniform distribution over the initial hidden state.
initial_state_dist = np.full(num_states, 1.0 / num_states)

In [5]:
# Show the sampled transition matrix; row i is used as the next-state distribution for state i.
transition_probs

array([[0.0626, 0.2056, 0.0385, 0.0248, 0.0344, 0.1093, 0.4691, 0.0399, 0.01  , 0.0058],
       [0.037 , 0.1543, 0.0358, 0.0338, 0.0007, 0.0506, 0.0186, 0.1352, 0.4422, 0.0919],
       [0.0349, 0.0916, 0.0084, 0.0699, 0.3041, 0.0683, 0.1126, 0.0252, 0.0336, 0.2513],
       [0.0769, 0.0916, 0.0635, 0.187 , 0.1108, 0.2428, 0.0181, 0.1525, 0.03  , 0.0268],
       [0.0069, 0.0559, 0.0175, 0.0715, 0.1058, 0.1149, 0.2578, 0.0948, 0.0554, 0.2197],
       [0.1134, 0.0122, 0.1936, 0.0445, 0.082 , 0.2256, 0.2373, 0.0284, 0.0552, 0.0078],
       [0.0082, 0.0619, 0.0095, 0.0873, 0.0067, 0.1257, 0.0069, 0.2802, 0.2354, 0.1782],
       [0.1487, 0.2586, 0.0429, 0.1066, 0.1015, 0.0006, 0.0371, 0.1756, 0.0289, 0.0994],
       [0.0573, 0.048 , 0.0933, 0.0214, 0.0103, 0.048 , 0.2972, 0.1664, 0.2278, 0.0303],
       [0.4567, 0.0012, 0.1385, 0.0723, 0.0014, 0.0977, 0.0702, 0.0101, 0.0709, 0.081 ]])

In [6]:
def simulate_hmm(seq_len, start_prob, trans_prob, emis_prob):
    """Sample one hidden-state path and its observations from a discrete HMM.

    The number of states and of observation symbols are taken from the shapes
    of the probability arrays passed in, so the function works for any model
    size and does not rely on notebook-level globals. Sampling uses NumPy's
    global random state (``np.random``).

    Parameters
    ----------
    seq_len : int
        Number of time steps to generate.
    start_prob : array-like, shape (n_states,)
        Distribution of the first hidden state.
    trans_prob : array-like, shape (n_states, n_states)
        Row ``i`` is the next-state distribution given current state ``i``.
    emis_prob : array-like, shape (n_states, n_symbols)
        Row ``i`` is the observation distribution given hidden state ``i``.

    Returns
    -------
    state_sequence : list
        Hidden state index at each time step.
    observation_sequence : list
        Observation symbol index emitted at each time step.

    Note the return order: hidden states first, observations second.
    """
    start_prob = np.asarray(start_prob)
    trans_prob = np.asarray(trans_prob)
    emis_prob = np.asarray(emis_prob)

    # Derive the model dimensions from the parameters themselves.
    n_states = trans_prob.shape[0]
    n_symbols = emis_prob.shape[1]

    state_sequence = []
    observation_sequence = []

    current_state = np.random.choice(n_states, p=start_prob)
    for _ in range(seq_len):
        state_sequence.append(current_state)
        # Emit a symbol from the current state's emission distribution.
        observation_sequence.append(np.random.choice(n_symbols, p=emis_prob[current_state]))
        # Move to the next hidden state.
        current_state = np.random.choice(n_states, p=trans_prob[current_state])

    return state_sequence, observation_sequence

In [15]:
# simulate_hmm returns (hidden states, observations) in that order, so unpack accordingly.
train_hid, train_obs = simulate_hmm(train_length, initial_state_dist, transition_probs, emission_probs)
# Generate the test sequence with test_length steps, matching the test_length value saved with the dataset.
test_hid, test_obs = simulate_hmm(test_length, initial_state_dist, transition_probs, emission_probs)

In [16]:
# The output file name encodes the model size and the training sequence length.
file_path = f"../../../data/hmm_train_dataset(state-{num_states}_obs-{num_observations}_length-{train_length}).npz"

# Write everything to a single .npz archive; each keyword becomes the name of
# an array stored in the archive.
np.savez(
    file_path,
    train_observations=np.array(train_obs),
    train_hidden_states=np.array(train_hid),
    train_length=train_length,
    test_observations=np.array(test_obs),
    test_hidden_states=np.array(test_hid),
    test_length=test_length,
)