In [1]:
import numpy as np
import tqdm

### Create Synthetic Dataset on Predefined HMM

In [17]:
np.set_printoptions(suppress=True, precision=4)
np.set_printoptions(linewidth=100, threshold=np.inf)
np.set_printoptions(formatter={'int': '{:5d}'.format})

In [62]:
num_states = 10
num_observations = 500
size = 50000

In [63]:
transition_probs = np.random.dirichlet(np.ones(num_states), size=num_states)
emission_probs = np.random.dirichlet(np.ones(num_observations), size=num_states)
initial_state_dist = np.ones(num_states) / num_states

In [64]:
transition_probs, emission_probs

(array([[0.2263, 0.0619, 0.0997, 0.1683, 0.0372, 0.012 , 0.0884, 0.0824, 0.0513, 0.1726],
        [0.0834, 0.2673, 0.1002, 0.1859, 0.0201, 0.1596, 0.0255, 0.0959, 0.0504, 0.0118],
        [0.0221, 0.1873, 0.0807, 0.0274, 0.0166, 0.1011, 0.0262, 0.2694, 0.1997, 0.0695],
        [0.0049, 0.0647, 0.0086, 0.0535, 0.0257, 0.2058, 0.1258, 0.2122, 0.2639, 0.0346],
        [0.0537, 0.0643, 0.0442, 0.0755, 0.4416, 0.032 , 0.0631, 0.1622, 0.0235, 0.0399],
        [0.1531, 0.0334, 0.0446, 0.238 , 0.2541, 0.0235, 0.0171, 0.0617, 0.1093, 0.0654],
        [0.0736, 0.0961, 0.0058, 0.0843, 0.0381, 0.1714, 0.0993, 0.0516, 0.2553, 0.1245],
        [0.0971, 0.0073, 0.2484, 0.0612, 0.203 , 0.0134, 0.0007, 0.004 , 0.3043, 0.0605],
        [0.0587, 0.2304, 0.0295, 0.0032, 0.0398, 0.0496, 0.2744, 0.1638, 0.0493, 0.1012],
        [0.0831, 0.0374, 0.1841, 0.324 , 0.0129, 0.0501, 0.0281, 0.1144, 0.1274, 0.0384]]),
 array([[0.0001, 0.0081, 0.0016, 0.0042, 0.0016, 0.0008, 0.001 , 0.0018, 0.0049, 0.0004, 0.0009,
 

In [65]:
# Simulate the HMM and generate strings (observation sequences)
def simulate_hmm(num_sequences, min_length, max_length, start_prob, trans_prob, emis_prob):
    sequences = []
    hidden_states = []
    for _ in range(num_sequences):
        sequence_length = np.random.randint(min_length, max_length+1)
        current_state = np.random.choice(num_states, p=start_prob)
        observation_sequence = []
        state_sequence = []
        for _ in range(sequence_length):
            state_sequence.append(current_state)
            observation = np.random.choice(num_observations, p=emis_prob[current_state])
            observation_sequence.append(observation)
            current_state = np.random.choice(num_states, p=trans_prob[current_state])
        sequences.append(observation_sequence)
        hidden_states.append(state_sequence)
    
    return sequences, hidden_states

In [73]:
syn_sequences, syn_hidden_states = simulate_hmm(
    num_sequences=size,
    min_length=10,
    max_length=30,
    start_prob=initial_state_dist,
    trans_prob=transition_probs,
    emis_prob=emission_probs
)

In [67]:
def add_noise_to_states(hidden_states, number_states, flip_prob=0.5):
    noisy_hidden_states = []
    for sequence in hidden_states:
        noisy_sequence = []
        for state in sequence:
            if np.random.rand() < flip_prob:
                # Flip the state to a different random state
                possible_states = list(range(number_states))
                possible_states.remove(state)  # Remove the current state from possibilities
                new_state = np.random.choice(possible_states)
                noisy_sequence.append(new_state)
            else:
                noisy_sequence.append(state)
        noisy_hidden_states.append(noisy_sequence)
    return noisy_hidden_states

In [74]:
noisy_level = 0.5
noisy_hidden_states = add_noise_to_states(syn_hidden_states, num_states, flip_prob=noisy_level)

In [76]:
print(transition_probs)
for seq in syn_sequences[:5]:
    print('[' + ', '.join(map(str, seq)) + ']')
for index in range(len(noisy_hidden_states[:5])):
    print('[' + ', '.join(map(str, syn_hidden_states[:5][index])) + ']')
    print('[' + ', '.join(map(str, noisy_hidden_states[:5][index])) + ']')
    print("--------------------")

[[0.2263 0.0619 0.0997 0.1683 0.0372 0.012  0.0884 0.0824 0.0513 0.1726]
 [0.0834 0.2673 0.1002 0.1859 0.0201 0.1596 0.0255 0.0959 0.0504 0.0118]
 [0.0221 0.1873 0.0807 0.0274 0.0166 0.1011 0.0262 0.2694 0.1997 0.0695]
 [0.0049 0.0647 0.0086 0.0535 0.0257 0.2058 0.1258 0.2122 0.2639 0.0346]
 [0.0537 0.0643 0.0442 0.0755 0.4416 0.032  0.0631 0.1622 0.0235 0.0399]
 [0.1531 0.0334 0.0446 0.238  0.2541 0.0235 0.0171 0.0617 0.1093 0.0654]
 [0.0736 0.0961 0.0058 0.0843 0.0381 0.1714 0.0993 0.0516 0.2553 0.1245]
 [0.0971 0.0073 0.2484 0.0612 0.203  0.0134 0.0007 0.004  0.3043 0.0605]
 [0.0587 0.2304 0.0295 0.0032 0.0398 0.0496 0.2744 0.1638 0.0493 0.1012]
 [0.0831 0.0374 0.1841 0.324  0.0129 0.0501 0.0281 0.1144 0.1274 0.0384]]
[465, 115, 373, 42, 192, 301, 469, 156, 288, 317, 417, 7, 243, 213, 328, 408, 228, 408]
[355, 409, 273, 332, 258, 167, 255, 177, 199, 388, 213, 65, 84, 402, 187, 101, 182, 275, 288, 288, 163]
[251, 142, 252, 271, 463, 273, 368, 401, 482, 78, 268, 400, 86, 103, 17, 404,

In [70]:
for i in range(len(syn_hidden_states)):
    for j in range(len(syn_hidden_states[i])):
        syn_hidden_states[i][j] += 1
        noisy_hidden_states[i][j] += 1
        
for i in range(len(syn_hidden_states)):
    syn_hidden_states[i].insert(0, 0)
    noisy_hidden_states[i].insert(0, 0)
    
    syn_sequences[i].insert(0, -1)

In [71]:
for seq in syn_sequences[:5]:
    print('[' + ', '.join(map(str, seq)) + ']')
for index in range(len(noisy_hidden_states[:5])):
    print('[' + ', '.join(map(str, syn_hidden_states[:5][index])) + ']')
    print('[' + ', '.join(map(str, noisy_hidden_states[:5][index])) + ']')
    print("--------------------")

[-1, 225, 359, 283, 475, 78, 179, 201, 497, 322, 397, 170, 360, 22, 352, 416, 96, 435, 30, 398, 308]
[-1, 110, 437, 367, 463, 180, 61, 104, 184, 337, 240, 475, 417, 196, 203, 478, 310, 435, 291, 292, 77, 170, 50, 225, 128, 98]
[-1, 210, 200, 225, 66, 297, 199, 100, 135, 30, 91, 321, 241]
[-1, 227, 38, 146, 374, 78, 256, 379, 346, 444, 253, 131, 408, 43, 282, 339, 91, 236, 459, 442, 462, 429, 288, 129, 229, 282]
[-1, 476, 412, 93, 134, 218, 322, 451, 120, 151, 278, 267, 237, 101, 125, 479, 380, 354, 148, 418, 475, 378, 113, 254, 25, 416, 442, 216]
[0, 6, 5, 8, 10, 4, 8, 3, 2, 4, 8, 3, 9, 10, 4, 4, 9, 5, 5, 1, 4]
[0, 6, 5, 2, 10, 6, 8, 6, 2, 4, 8, 4, 9, 3, 4, 4, 9, 3, 3, 1, 6]
--------------------
[0, 3, 9, 5, 5, 5, 7, 7, 7, 9, 6, 4, 8, 4, 8, 9, 7, 6, 8, 10, 3, 3, 10, 8, 4, 8]
[0, 6, 9, 8, 9, 4, 7, 1, 7, 9, 10, 4, 3, 4, 8, 5, 1, 6, 9, 9, 1, 6, 10, 8, 4, 1]
--------------------
[0, 2, 4, 6, 5, 6, 5, 9, 7, 10, 6, 2, 2]
[0, 2, 4, 1, 5, 3, 5, 9, 1, 10, 6, 2, 2]
--------------------
[0, 7, 8,

In [75]:
file_path = f"../data/hmm_synthetic_dataset(noise-{noisy_level}_state-{num_states}_obs-{num_observations}_size-{size}).npz"
seq_object = np.array(syn_sequences, dtype=object)
hid_object = np.array(syn_hidden_states, dtype=object)
noisy_hid_object = np.array(noisy_hidden_states, dtype=object)
trans_object = transition_probs
emis_object = emission_probs
np.savez(file_path, observation=seq_object, real_hidden=hid_object, noisy_hidden=noisy_hid_object, real_trans=trans_object, emis=emis_object , noisy_level=noisy_level)

In [2]:
loaded_npz = np.load("../data/hmm_synthetic_dataset.npz", allow_pickle=True)
observations = list(loaded_npz['observation'])
hidden_states = list(loaded_npz['real_hidden'])
noi_hidden_states = list(loaded_npz['noisy_hidden'])
transition_dist = np.vstack(loaded_npz['real_trans'])

FileNotFoundError: [Errno 2] No such file or directory: '../data/hmm_synthetic_dataset.npz'

In [3]:
print(transition_dist)
for seq in observations[:5]:
    print('[' + ', '.join(map(str, seq)) + ']')
for index in range(len(hidden_states[:5])):
    print('[' + ', '.join(map(str, hidden_states[:5][index])) + ']')
    print('[' + ', '.join(map(str, noi_hidden_states[:5][index])) + ']')
    print("--------------------")

NameError: name 'transition_dist' is not defined

### Initailise my model with pre-defined params

In [45]:
split = 5000
param_observed = observations[:split]
param_hidden = hidden_states[:split]
train_observed = observations[split:]
train_hidden = hidden_states[split:]

In [41]:
trans_count = np.zeros((num_states, num_states), dtype='int')
emis_count = np.zeros((num_observations, num_states), dtype='int')

In [88]:
for i in range(split):
    for t in range(len(param_hidden[i])):
        emis_count[param_observed[i][t], param_hidden[i][t]] += 1
        if t > 0: 
            trans_count[param_hidden[i][t], param_hidden[i][t - 1]] += 1

In [60]:
read_npz = np.load("../data/noise-0.5_iter-300_state-10_size-50000_timestamp-0121_065808_result.npz")
distance = read_npz['result']

In [61]:
distance

array([3.3506, 3.3362, 3.3476, 3.3389, 3.3153, 3.3417, 3.3399, 3.3498, 3.3624, 3.3622, 3.3164,
       3.304 , 3.2982, 3.2846, 3.3118, 3.3038, 3.2682, 3.2504, 3.2656, 3.2778, 3.2616, 3.2701,
       3.2789, 3.2693, 3.2571, 3.2239, 3.2193, 3.2221, 3.251 , 3.2168, 3.2407, 3.2068, 3.2423,
       3.2131, 3.2025, 3.1882, 3.1518, 3.1939, 3.157 , 3.1286, 3.1196, 3.0794, 3.0922, 3.0766,
       3.0504, 3.0317, 3.0761, 3.0634, 3.0471, 3.0182, 2.9836, 2.9932, 2.9678, 2.9618, 2.9478,
       2.963 , 2.9729, 2.9692, 3.0059, 3.0148, 3.0034, 2.9748, 2.9862, 2.9987, 2.9379, 2.9668,
       2.9786, 2.9594, 2.9768, 2.9485, 2.9581, 2.9023, 2.9207, 2.889 , 2.8646, 2.8715, 2.8395,
       2.8607, 2.8792, 2.8844, 2.8536, 2.8883, 2.8993, 2.8947, 2.9122, 2.9455, 2.9548, 2.9274,
       2.94  , 2.9549, 2.959 , 2.9829, 2.9873, 2.9609, 2.9932, 2.9972, 3.0214, 3.0065, 3.0283,
       3.0086, 3.0556, 3.0562, 3.1182, 3.116 , 3.0997, 3.0787, 3.094 , 3.0953, 3.1177, 3.1215,
       3.1104, 3.0887, 3.1259, 3.1679, 3.1138, 3.1