In [3]:
import numpy as np

In [4]:
# NumPy display settings for the whole notebook: fixed-point floats with four
# decimals, 100-character rows, arrays always printed in full (never
# summarised with "..."), and integers right-aligned in a 5-character field.
np.set_printoptions(
    suppress=True,
    precision=4,
    linewidth=100,
    threshold=np.inf,
    formatter={'int': '{:5d}'.format},
)

### Create HMM training and test datasets

In [12]:
# Dimensions of the synthetic HMM and of the sequences sampled from it.
num_states = 10          # number of hidden states
num_observations = 500   # number of distinct observation symbols
train_length = 10_000    # time steps in the training sequence
test_length = 500        # intended number of time steps in the test sequence

In [5]:
# Sample random HMM parameters. Each row of the transition and emission
# matrices is an independent draw from a symmetric Dirichlet (concentration 1
# for transitions, 2 for emissions); the initial state is uniform.
# NOTE(review): no random seed is set in the visible cells, so these draws
# (and everything derived from them) change on every run -- confirm intended.
transition_probs = np.random.dirichlet(alpha=np.ones(num_states), size=num_states)
emission_probs = np.random.dirichlet(alpha=np.full(num_observations, 2.0), size=num_states)
initial_state_dist = np.full(num_states, 1.0 / num_states)

In [6]:
# Show the sampled transition matrix; row i is used as the next-state
# distribution when the chain is in state i.
transition_probs

array([[0.045 , 0.0007, 0.1839, 0.1207, 0.0337, 0.0269, 0.0921, 0.1006, 0.2484, 0.1481],
       [0.0449, 0.0154, 0.0813, 0.0741, 0.1922, 0.1145, 0.0252, 0.0559, 0.1629, 0.2337],
       [0.1105, 0.055 , 0.0325, 0.0069, 0.1945, 0.1051, 0.0961, 0.3111, 0.0739, 0.0145],
       [0.1066, 0.0468, 0.061 , 0.1443, 0.0716, 0.1653, 0.0015, 0.054 , 0.1752, 0.1738],
       [0.0297, 0.1671, 0.255 , 0.0598, 0.0562, 0.0937, 0.0548, 0.1955, 0.0285, 0.0597],
       [0.0081, 0.111 , 0.2089, 0.0799, 0.0987, 0.1475, 0.1374, 0.06  , 0.0278, 0.1208],
       [0.0544, 0.0363, 0.1572, 0.2436, 0.1501, 0.0224, 0.09  , 0.0489, 0.0409, 0.1562],
       [0.0249, 0.0173, 0.0336, 0.1705, 0.0055, 0.0478, 0.0022, 0.445 , 0.2058, 0.0474],
       [0.055 , 0.0315, 0.1065, 0.0011, 0.3801, 0.0522, 0.1855, 0.1153, 0.047 , 0.0258],
       [0.0448, 0.0786, 0.035 , 0.0315, 0.1329, 0.2844, 0.1709, 0.0304, 0.1867, 0.0048]])

In [7]:
# Simulate the HMM and generate a long sequence
def simulate_hmm(seq_len, start_prob, trans_prob, emis_prob, rng=None):
    """Sample a hidden-state path and its emitted observations from a discrete HMM.

    The number of states and symbols is taken from the shapes of the
    probability arrays, so the function does not depend on notebook globals.

    Parameters
    ----------
    seq_len : int
        Number of time steps to generate. A value <= 0 yields two empty lists.
    start_prob : array-like, shape (n_states,)
        Distribution of the first hidden state.
    trans_prob : array-like, shape (n_states, n_states)
        Row ``i`` is the distribution of the next state given current state ``i``.
    emis_prob : array-like, shape (n_states, n_symbols)
        Row ``i`` is the distribution over observation symbols in state ``i``.
    rng : object with a ``choice(a, p=...)`` method, optional
        Random source, e.g. ``np.random.default_rng(seed)``. When omitted the
        global ``np.random`` state is used, as before.

    Returns
    -------
    (state_sequence, observation_sequence) : tuple of two lists
        Hidden states come FIRST, observations second; both have ``seq_len``
        integer entries.

    Raises
    ------
    ValueError
        If the shapes of the probability arrays are inconsistent.
    """
    start_prob = np.asarray(start_prob, dtype=float)
    trans_prob = np.asarray(trans_prob, dtype=float)
    emis_prob = np.asarray(emis_prob, dtype=float)

    if start_prob.ndim != 1:
        raise ValueError("start_prob must be one-dimensional")
    n_states = start_prob.shape[0]
    if trans_prob.shape != (n_states, n_states):
        raise ValueError("trans_prob must have shape (n_states, n_states)")
    if emis_prob.ndim != 2 or emis_prob.shape[0] != n_states:
        raise ValueError("emis_prob must have shape (n_states, n_symbols)")
    n_symbols = emis_prob.shape[1]

    # Fall back to the legacy global generator so existing calls behave the same.
    sampler = np.random if rng is None else rng

    observation_sequence = []
    state_sequence = []

    current_state = sampler.choice(n_states, p=start_prob)
    for _ in range(seq_len):
        state_sequence.append(current_state)
        observation = sampler.choice(n_symbols, p=emis_prob[current_state])
        observation_sequence.append(observation)
        # The transition is drawn after every step (including the last) to keep
        # the same sequence of random draws as the original implementation.
        current_state = sampler.choice(n_states, p=trans_prob[current_state])

    return state_sequence, observation_sequence

In [13]:
# simulate_hmm returns (hidden states, observations) in that order, so unpack
# states first. The test sequence is drawn with test_length so that it matches
# the test_length value stored alongside it in the saved dataset.
train_hid, train_obs = simulate_hmm(train_length, initial_state_dist, transition_probs, emission_probs)
test_hid, test_obs = simulate_hmm(test_length, initial_state_dist, transition_probs, emission_probs)

In [14]:
# Write both sequences and their nominal lengths to one .npz archive whose
# file name records the HMM size and the training length.
file_path = f"../../../data/hmm_train_dataset(state-{num_states}_obs-{num_observations}_length-{train_length}).npz"
np.savez(
    file_path,
    train_observations=np.array(train_obs),
    train_hidden_states=np.array(train_hid),
    train_length=train_length,
    test_observations=np.array(test_obs),
    test_hidden_states=np.array(test_hid),
    test_length=test_length,
)