In [2]:
import numpy as np
import tqdm

### Create a Synthetic Dataset from a Predefined HMM

In [3]:
# Configure NumPy array printing in a single call: fixed-point floats with 4 decimals,
# 100-character lines, no summarisation of long arrays, and integers right-aligned
# in a 5-character field.
np.set_printoptions(
    suppress=True,
    precision=4,
    linewidth=100,
    threshold=np.inf,
    formatter={'int': '{:5d}'.format},
)

In [4]:
# Problem size: number of hidden states, size of the observation alphabet,
# and number of sequences to generate.
num_states, num_observations, size = 10, 500, 50000

In [5]:
# Seed NumPy's global generator so the sampled HMM parameters (and everything drawn from
# np.random later in the notebook) are reproducible after a kernel restart.
RANDOM_SEED = 0
np.random.seed(RANDOM_SEED)

# One row per state: the distribution over next states, drawn from a flat Dirichlet(1, ..., 1).
transition_probs = np.random.dirichlet(np.ones(num_states), size=num_states)
# One row per state: the distribution over observation symbols, drawn from Dirichlet(2, ..., 2).
emission_probs = np.random.dirichlet(np.full(num_observations, 2.0), size=num_states)
# Every hidden state is equally likely at the start of a sequence.
initial_state_dist = np.full(num_states, 1.0 / num_states)

In [6]:
# Sanity check: one emission row per hidden state, one column per observation symbol.
np.shape(emission_probs)

(10, 500)

In [18]:
# Simulate the HMM and generate strings (observation sequences)
def simulate_hmm(num_sequences, min_length, max_length, start_prob, trans_prob, emis_prob):
    """Sample observation sequences and their hidden-state paths from an HMM.

    The number of states and of observation symbols is read from the shapes of the
    probability arguments, so the function works for whichever HMM is passed in and
    does not depend on notebook-level variables.

    Parameters
    ----------
    num_sequences : int
        Number of sequences to generate.
    min_length, max_length : int
        Inclusive bounds on the length of each sequence; the length is drawn
        uniformly from this range.
    start_prob : array-like, shape (n_states,)
        Distribution of the first hidden state.
    trans_prob : array-like, shape (n_states, n_states)
        Row ``i`` is the distribution of the next state given current state ``i``.
    emis_prob : array-like, shape (n_states, n_symbols)
        Row ``i`` is the distribution of the observation emitted in state ``i``.

    Returns
    -------
    sequences : list of list
        One list of observation indices per generated sequence.
    hidden_states : list of list
        The hidden-state indices that produced each observation, aligned with
        ``sequences``.

    Raises
    ------
    ValueError
        If the shapes of the three probability arguments are inconsistent.
    """
    start_prob = np.asarray(start_prob)
    trans_prob = np.asarray(trans_prob)
    emis_prob = np.asarray(emis_prob)

    if start_prob.ndim != 1:
        raise ValueError("start_prob must be one-dimensional")
    n_states = start_prob.shape[0]
    if trans_prob.shape != (n_states, n_states):
        raise ValueError("trans_prob must have shape (n_states, n_states)")
    if emis_prob.ndim != 2 or emis_prob.shape[0] != n_states:
        raise ValueError("emis_prob must have shape (n_states, n_symbols)")
    n_symbols = emis_prob.shape[1]

    sequences = []
    hidden_states = []
    for _ in range(num_sequences):
        sequence_length = np.random.randint(min_length, max_length + 1)
        current_state = np.random.choice(n_states, p=start_prob)
        observation_sequence = []
        state_sequence = []
        for _ in range(sequence_length):
            state_sequence.append(current_state)
            observation = np.random.choice(n_symbols, p=emis_prob[current_state])
            observation_sequence.append(observation)
            # The next state is drawn even after the final step; this keeps the
            # sequence of random draws the same as in the original implementation.
            current_state = np.random.choice(n_states, p=trans_prob[current_state])
        sequences.append(observation_sequence)
        hidden_states.append(state_sequence)

    return sequences, hidden_states

array([[0.0297, 0.0982, 0.0105, 0.0253, 0.0812, 0.0148, 0.0749, 0.0151, 0.5495, 0.1009],
       [0.3441, 0.1869, 0.261 , 0.0018, 0.1289, 0.0529, 0.    , 0.002 , 0.0133, 0.009 ],
       [0.0034, 0.0987, 0.0876, 0.    , 0.0016, 0.0115, 0.0006, 0.0058, 0.6721, 0.1187],
       [0.7578, 0.1083, 0.0302, 0.    , 0.0029, 0.0162, 0.0495, 0.0123, 0.0148, 0.008 ],
       [0.01  , 0.1662, 0.1885, 0.0278, 0.0315, 0.0499, 0.0674, 0.1576, 0.2684, 0.0328],
       [0.0044, 0.2439, 0.0039, 0.2448, 0.0014, 0.0016, 0.0888, 0.3746, 0.0344, 0.0022],
       [0.0472, 0.1738, 0.0164, 0.0002, 0.    , 0.0075, 0.0092, 0.3522, 0.2906, 0.103 ],
       [0.2798, 0.0052, 0.0524, 0.0077, 0.0794, 0.    , 0.0236, 0.1097, 0.0106, 0.4317],
       [0.057 , 0.0087, 0.0309, 0.0129, 0.031 , 0.494 , 0.1351, 0.1599, 0.0163, 0.0542],
       [0.1163, 0.0115, 0.001 , 0.0042, 0.0005, 0.0484, 0.2185, 0.0342, 0.0655, 0.4998]])

In [33]:
# Draw `size` sequences of 10 to 30 steps each from the HMM defined above.
syn_sequences, syn_hidden_states = simulate_hmm(
    size,                 # num_sequences
    10,                   # min_length
    30,                   # max_length
    initial_state_dist,   # start_prob
    transition_probs,     # trans_prob
    emission_probs,       # emis_prob
)

In [34]:
def add_noise_to_states(hidden_states, number_states, flip_prob=0.5):
    """Return a copy of ``hidden_states`` in which labels are randomly corrupted.

    Each state is independently replaced, with probability ``flip_prob``, by a
    label drawn uniformly from the other ``number_states - 1`` labels. The input
    sequences are not modified.
    """
    def _maybe_flip(state):
        # Keep the label unless the uniform draw falls below flip_prob.
        if not (np.random.rand() < flip_prob):
            return state
        # Pick uniformly among every label except the current one.
        alternatives = list(range(number_states))
        alternatives.remove(state)
        return np.random.choice(alternatives)

    return [[_maybe_flip(state) for state in sequence] for sequence in hidden_states]

In [35]:
# Probability with which each hidden-state label is replaced by a different label.
noisy_level = 0.5
noisy_hidden_states = add_noise_to_states(
    hidden_states=syn_hidden_states,
    number_states=num_states,
    flip_prob=noisy_level,
)

In [36]:
# Preview: the transition matrix, then the first five observation sequences,
# then each true state path next to its noisy counterpart.
print(transition_probs)
for observed in syn_sequences[:5]:
    print(f"[{', '.join(str(symbol) for symbol in observed)}]")
for index, noisy_path in enumerate(noisy_hidden_states[:5]):
    true_path = syn_hidden_states[:5][index]
    print(f"[{', '.join(str(state) for state in true_path)}]")
    print(f"[{', '.join(str(state) for state in noisy_path)}]")
    print("--------------------")

[[0.1821 0.1229 0.4174 0.0546 0.0237 0.0057 0.1313 0.0064 0.0105 0.0455]
 [0.1898 0.2178 0.2813 0.0187 0.0038 0.0134 0.0224 0.0376 0.1665 0.0486]
 [0.1055 0.0926 0.081  0.0609 0.0659 0.0855 0.0666 0.1091 0.0799 0.253 ]
 [0.2165 0.0123 0.1887 0.0021 0.0846 0.0509 0.1874 0.035  0.1293 0.0933]
 [0.0666 0.059  0.1515 0.0023 0.2476 0.1113 0.0792 0.0281 0.2419 0.0126]
 [0.0831 0.2488 0.0738 0.032  0.0412 0.1616 0.0046 0.2375 0.11   0.0075]
 [0.0906 0.1309 0.1675 0.0205 0.0609 0.0875 0.0421 0.0103 0.1993 0.1904]
 [0.1333 0.1623 0.0229 0.0267 0.0462 0.3118 0.0646 0.0209 0.1404 0.0709]
 [0.0811 0.0338 0.103  0.3329 0.0224 0.0137 0.0702 0.0619 0.0401 0.241 ]
 [0.0036 0.015  0.0067 0.04   0.4144 0.0035 0.3599 0.0789 0.074  0.004 ]]
[441, 368, 13, 108, 169, 66, 359, 106, 344, 274, 195, 68, 34, 115, 54, 224, 28, 86, 487, 75, 225, 94]
[270, 202, 290, 492, 236, 212, 452, 290, 451, 387, 358, 404, 453, 67]
[149, 496, 46, 291, 153, 42, 474, 253, 7, 39, 277, 416, 50, 371]
[447, 112, 234, 87, 98, 160, 463

In [11]:
# Re-index hidden states from 0..num_states-1 to 1..num_states so that 0 can serve as a
# dedicated start state, then prepend that start state (and a -1 start observation) to
# every sequence. The lists are modified in place.
#
# Observations produced by the simulator are non-negative, so a leading -1 means the
# sequence has already been processed. Such sequences are skipped, which makes re-running
# this cell a no-op instead of shifting the labels and prepending the markers a second time.
for obs_seq, true_seq, noisy_seq in zip(syn_sequences, syn_hidden_states, noisy_hidden_states):
    if len(obs_seq) > 0 and obs_seq[0] == -1:
        continue

    # Slice assignment keeps the same list objects, as the original in-place update did.
    true_seq[:] = [state + 1 for state in true_seq]
    noisy_seq[:] = [state + 1 for state in noisy_seq]

    true_seq.insert(0, 0)
    noisy_seq.insert(0, 0)
    obs_seq.insert(0, -1)

In [12]:
# Preview the first five sequences after the start markers were added:
# observations first, then each true state path followed by its noisy version.
for observed in syn_sequences[:5]:
    print('[' + ', '.join(str(symbol) for symbol in observed) + ']')
for index, noisy_path in enumerate(noisy_hidden_states[:5]):
    true_path = syn_hidden_states[:5][index]
    for path in (true_path, noisy_path):
        print('[' + ', '.join(str(state) for state in path) + ']')
    print("--------------------")

[-1, 360, 235, 376, 443, 78, 315, 262, 398, 215, 37, 246, 165, 181, 246, 304, 303, 220, 469, 131]
[-1, 495, 408, 99, 409, 396, 302, 422, 176, 224, 307, 28, 219, 331, 252, 391, 448]
[-1, 293, 460, 240, 370, 362, 118, 100, 335, 472, 228, 170, 499, 217, 397, 371, 50, 222, 207, 254, 382, 413, 71, 139, 81]
[-1, 159, 479, 45, 474, 94, 13, 225, 117, 315, 25, 431, 168, 330]
[-1, 109, 432, 17, 138, 323, 46, 422, 477, 497, 340, 334, 147, 453]
[0, 4, 10, 7, 9, 4, 3, 3, 9, 10, 7, 1, 3, 8, 6, 8, 9, 4, 9, 4]
[0, 1, 10, 10, 6, 4, 10, 3, 9, 3, 5, 9, 9, 8, 1, 6, 7, 3, 9, 4]
--------------------
[0, 6, 9, 4, 7, 9, 4, 1, 3, 9, 6, 8, 6, 1, 3, 6, 2]
[0, 6, 9, 9, 4, 9, 4, 1, 7, 1, 2, 3, 6, 1, 3, 6, 7]
--------------------
[0, 6, 6, 6, 5, 9, 10, 5, 9, 4, 7, 4, 3, 2, 10, 7, 10, 5, 9, 6, 6, 8, 10, 5, 5]
[0, 7, 6, 6, 2, 9, 10, 2, 7, 8, 6, 4, 7, 2, 10, 9, 10, 2, 1, 6, 6, 2, 10, 9, 5]
--------------------
[0, 2, 9, 3, 10, 5, 5, 1, 7, 6, 8, 6, 2, 2]
[0, 2, 9, 9, 9, 5, 4, 7, 2, 6, 2, 6, 9, 3]
--------------------
[

In [37]:
def _ragged_object_array(sequences):
    """Pack a list of sequences into a 1-D object array holding one sequence per element.

    ``np.array(sequences, dtype=object)`` yields a 2-D array whenever every sequence
    happens to have the same length; filling a pre-allocated array keeps the stored
    layout the same regardless of the sequence lengths.
    """
    packed = np.empty(len(sequences), dtype=object)
    for idx, sequence in enumerate(sequences):
        packed[idx] = sequence
    return packed


# The file name records the settings used to generate the dataset.
file_path = f"../data/hmm_synthetic_dataset(noise-{noisy_level}_state-{num_states}_obs-{num_observations}_size-{size}).npz"
seq_object = _ragged_object_array(syn_sequences)
hid_object = _ragged_object_array(syn_hidden_states)
noisy_hid_object = _ragged_object_array(noisy_hidden_states)
trans_object = transition_probs
emis_object = emission_probs
np.savez(
    file_path,
    observation=seq_object,
    real_hidden=hid_object,
    noisy_hidden=noisy_hid_object,
    real_trans=trans_object,
    emis=emis_object,
    noisy_level=noisy_level,
)

In [41]:
# Reload the dataset from `file_path`, the archive written by the save cell above, so the
# data read here is the data this notebook generated rather than a separately named file.
# allow_pickle=True is needed because the sequences are stored as object arrays; only load
# archives you created yourself, since unpickling an untrusted file can run arbitrary code.
loaded_npz = np.load(file_path, allow_pickle=True)
observations = list(loaded_npz['observation'])
hidden_states = list(loaded_npz['real_hidden'])
noi_hidden_states = list(loaded_npz['noisy_hidden'])
transition_dist = np.vstack(loaded_npz['real_trans'])

(5, 101)

In [3]:
# Preview the reloaded data: transition matrix, first five observation sequences,
# then each true state path followed by its noisy version.
print(transition_dist)
for observed in observations[:5]:
    print(f"[{', '.join(map(str, observed))}]")
for index, noisy_path in enumerate(noi_hidden_states[:5]):
    true_path = hidden_states[:5][index]
    print(f"[{', '.join(map(str, true_path))}]")
    print(f"[{', '.join(map(str, noisy_path))}]")
    print("--------------------")

NameError: name 'transition_dist' is not defined

### Initialise my model with pre-defined params

In [45]:
# The first `split` sequences go to the param_* lists (used for the count tables below);
# the remaining sequences go to the train_* lists.
split = 5000
param_observed, train_observed = observations[:split], observations[split:]
param_hidden, train_hidden = hidden_states[:split], hidden_states[split:]

In [41]:
# Integer count tables filled by the counting loop:
#   trans_count[current_state, previous_state]
#   emis_count[observation, state]
# NOTE(review): the dataset-building cells shift states to 1..num_states and add a start
# state 0. If the loaded data uses that encoding, state index `num_states` is out of range
# for these shapes; confirm against the loaded file.
trans_count = np.zeros(shape=(num_states, num_states), dtype=int)
emis_count = np.zeros(shape=(num_observations, num_states), dtype=int)

In [88]:
# Count emissions and transitions over the parameter-estimation sequences.
#
# The tables are zeroed first, so re-running this cell recomputes the counts instead of
# adding a second copy on top of the previous run.
trans_count.fill(0)
emis_count.fill(0)

# Iterate over the sequences actually present; this also copes with datasets that hold
# fewer than `split` sequences.
for i in range(len(param_hidden)):
    states = param_hidden[i]
    symbols = param_observed[i]
    for t in range(len(states)):
        # Negative observations (the -1 start marker prepended when the dataset is built)
        # are not real symbols. Used as an index they would wrap around to the last row
        # of emis_count and be counted as the highest-numbered symbol, so skip them.
        if symbols[t] >= 0:
            emis_count[symbols[t], states[t]] += 1
        if t > 0:
            trans_count[states[t], states[t - 1]] += 1

In [60]:
# Open the saved result archive and pull out its `result` array.
read_npz = np.load(
    "../data/"
    "noise-0.5_iter-300_state-10_size-50000_timestamp-0121_065808_result.npz"
)
distance = read_npz["result"]

In [61]:
# Display the `result` array loaded from the result archive.
distance

array([3.3506, 3.3362, 3.3476, 3.3389, 3.3153, 3.3417, 3.3399, 3.3498, 3.3624, 3.3622, 3.3164,
       3.304 , 3.2982, 3.2846, 3.3118, 3.3038, 3.2682, 3.2504, 3.2656, 3.2778, 3.2616, 3.2701,
       3.2789, 3.2693, 3.2571, 3.2239, 3.2193, 3.2221, 3.251 , 3.2168, 3.2407, 3.2068, 3.2423,
       3.2131, 3.2025, 3.1882, 3.1518, 3.1939, 3.157 , 3.1286, 3.1196, 3.0794, 3.0922, 3.0766,
       3.0504, 3.0317, 3.0761, 3.0634, 3.0471, 3.0182, 2.9836, 2.9932, 2.9678, 2.9618, 2.9478,
       2.963 , 2.9729, 2.9692, 3.0059, 3.0148, 3.0034, 2.9748, 2.9862, 2.9987, 2.9379, 2.9668,
       2.9786, 2.9594, 2.9768, 2.9485, 2.9581, 2.9023, 2.9207, 2.889 , 2.8646, 2.8715, 2.8395,
       2.8607, 2.8792, 2.8844, 2.8536, 2.8883, 2.8993, 2.8947, 2.9122, 2.9455, 2.9548, 2.9274,
       2.94  , 2.9549, 2.959 , 2.9829, 2.9873, 2.9609, 2.9932, 2.9972, 3.0214, 3.0065, 3.0283,
       3.0086, 3.0556, 3.0562, 3.1182, 3.116 , 3.0997, 3.0787, 3.094 , 3.0953, 3.1177, 3.1215,
       3.1104, 3.0887, 3.1259, 3.1679, 3.1138, 3.1