In [1]:
import numpy as np
import tqdm

### Create Synthetic Dataset on Predefined HMM

In [4]:
np.set_printoptions(suppress=True, precision=4)
np.set_printoptions(linewidth=100, threshold=np.inf)
np.set_printoptions(formatter={'int': '{:5d}'.format})

In [64]:
num_states = 20
num_observations = 5000
size = 200000

In [65]:
transition_probs = np.random.dirichlet(np.ones(num_states), size=num_states)
emission_probs = np.random.dirichlet(np.ones(num_observations), size=num_states)
initial_state_dist = np.ones(num_states) / num_states

In [66]:
transition_probs, emission_probs

(array([[0.1444, 0.0642, 0.0586, 0.0179, 0.0066, 0.0146, 0.1305, 0.0004, 0.0371, 0.0727, 0.0359,
         0.0195, 0.0187, 0.1366, 0.0021, 0.0188, 0.0836, 0.073 , 0.0599, 0.005 ],
        [0.0893, 0.0038, 0.0214, 0.0116, 0.0395, 0.0195, 0.1215, 0.0671, 0.0255, 0.025 , 0.0156,
         0.0295, 0.0271, 0.1754, 0.0408, 0.0715, 0.1151, 0.0403, 0.0265, 0.034 ],
        [0.061 , 0.0171, 0.0089, 0.0905, 0.0208, 0.0271, 0.0267, 0.0136, 0.1206, 0.065 , 0.1037,
         0.1947, 0.014 , 0.0115, 0.0927, 0.0059, 0.054 , 0.0321, 0.0194, 0.0209],
        [0.1428, 0.0289, 0.0779, 0.0268, 0.1097, 0.0192, 0.0192, 0.0204, 0.0042, 0.0034, 0.0928,
         0.0871, 0.1107, 0.07  , 0.0753, 0.0018, 0.0342, 0.0469, 0.0022, 0.0265],
        [0.0041, 0.0476, 0.0582, 0.0111, 0.0776, 0.0725, 0.0181, 0.1924, 0.0038, 0.025 , 0.0176,
         0.0197, 0.0341, 0.003 , 0.0595, 0.2401, 0.0518, 0.0307, 0.0176, 0.0154],
        [0.0497, 0.0669, 0.0147, 0.0155, 0.0296, 0.0301, 0.0158, 0.0298, 0.025 , 0.0228, 0.0438,
        

In [8]:
# Simulate the HMM and generate strings (observation sequences)
def simulate_hmm(num_sequences, min_length, max_length, start_prob, trans_prob, emis_prob):
    sequences = []
    hidden_states = []
    for _ in range(num_sequences):
        sequence_length = np.random.randint(min_length, max_length+1)
        current_state = np.random.choice(num_states, p=start_prob)
        observation_sequence = []
        state_sequence = []
        for _ in range(sequence_length):
            state_sequence.append(current_state)
            observation = np.random.choice(num_observations, p=emis_prob[current_state])
            observation_sequence.append(observation)
            current_state = np.random.choice(num_states, p=trans_prob[current_state])
        sequences.append(observation_sequence)
        hidden_states.append(state_sequence)
    
    return sequences, hidden_states

In [67]:
syn_sequences, syn_hidden_states = simulate_hmm(
    num_sequences=size,
    min_length=10,
    max_length=30,
    start_prob=initial_state_dist,
    trans_prob=transition_probs,
    emis_prob=emission_probs
)

In [10]:
def add_noise_to_states(hidden_states, number_states, flip_prob=0.5):
    noisy_hidden_states = []
    for sequence in hidden_states:
        noisy_sequence = []
        for state in sequence:
            if np.random.rand() < flip_prob:
                # Flip the state to a different random state
                possible_states = list(range(number_states))
                possible_states.remove(state)  # Remove the current state from possibilities
                new_state = np.random.choice(possible_states)
                noisy_sequence.append(new_state)
            else:
                noisy_sequence.append(state)
        noisy_hidden_states.append(noisy_sequence)
    return noisy_hidden_states

In [68]:
noisy_level = 0.3
noisy_hidden_states = add_noise_to_states(syn_hidden_states, num_states, flip_prob=noisy_level)

In [69]:
print(transition_probs)
for seq in syn_sequences[:5]:
    print('[' + ', '.join(map(str, seq)) + ']')
for index in range(len(noisy_hidden_states[:5])):
    print('[' + ', '.join(map(str, syn_hidden_states[:5][index])) + ']')
    print('[' + ', '.join(map(str, noisy_hidden_states[:5][index])) + ']')
    print("--------------------")

[[0.1444 0.0642 0.0586 0.0179 0.0066 0.0146 0.1305 0.0004 0.0371 0.0727 0.0359 0.0195 0.0187
  0.1366 0.0021 0.0188 0.0836 0.073  0.0599 0.005 ]
 [0.0893 0.0038 0.0214 0.0116 0.0395 0.0195 0.1215 0.0671 0.0255 0.025  0.0156 0.0295 0.0271
  0.1754 0.0408 0.0715 0.1151 0.0403 0.0265 0.034 ]
 [0.061  0.0171 0.0089 0.0905 0.0208 0.0271 0.0267 0.0136 0.1206 0.065  0.1037 0.1947 0.014
  0.0115 0.0927 0.0059 0.054  0.0321 0.0194 0.0209]
 [0.1428 0.0289 0.0779 0.0268 0.1097 0.0192 0.0192 0.0204 0.0042 0.0034 0.0928 0.0871 0.1107
  0.07   0.0753 0.0018 0.0342 0.0469 0.0022 0.0265]
 [0.0041 0.0476 0.0582 0.0111 0.0776 0.0725 0.0181 0.1924 0.0038 0.025  0.0176 0.0197 0.0341
  0.003  0.0595 0.2401 0.0518 0.0307 0.0176 0.0154]
 [0.0497 0.0669 0.0147 0.0155 0.0296 0.0301 0.0158 0.0298 0.025  0.0228 0.0438 0.0659 0.1426
  0.0663 0.2259 0.0328 0.0158 0.0223 0.0629 0.0216]
 [0.0371 0.0577 0.0529 0.0041 0.0101 0.0391 0.0124 0.0452 0.0413 0.0883 0.099  0.0207 0.0423
  0.079  0.0872 0.1027 0.0618 0.0171 0

In [70]:
file_path = f"../data/hmm_synthetic_dataset(noise-{noisy_level}_state-{num_states}_obs-{num_observations}_size-{size}).npz"
seq_object = np.array(syn_sequences, dtype=object)
hid_object = np.array(syn_hidden_states, dtype=object)
noisy_hid_object = np.array(noisy_hidden_states, dtype=object)
trans_object = transition_probs
emis_object = emission_probs
np.savez(file_path, observation=seq_object, real_hidden=hid_object, noisy_hidden=noisy_hid_object, real_trans=trans_object, emis=emis_object , noisy_level=noisy_level)

In [2]:
loaded_npz = np.load("../data/hmm_synthetic_dataset.npz", allow_pickle=True)
observations = list(loaded_npz['observation'])
hidden_states = list(loaded_npz['real_hidden'])
noi_hidden_states = list(loaded_npz['noisy_hidden'])
transition_dist = np.vstack(loaded_npz['real_trans'])

FileNotFoundError: [Errno 2] No such file or directory: '../data/hmm_synthetic_dataset.npz'

In [3]:
print(transition_dist)
for seq in observations[:5]:
    print('[' + ', '.join(map(str, seq)) + ']')
for index in range(len(hidden_states[:5])):
    print('[' + ', '.join(map(str, hidden_states[:5][index])) + ']')
    print('[' + ', '.join(map(str, noi_hidden_states[:5][index])) + ']')
    print("--------------------")

NameError: name 'transition_dist' is not defined

### Initailise my model with pre-defined params

In [45]:
split = 5000
param_observed = observations[:split]
param_hidden = hidden_states[:split]
train_observed = observations[split:]
train_hidden = hidden_states[split:]

In [41]:
trans_count = np.zeros((num_states, num_states), dtype='int')
emis_count = np.zeros((num_observations, num_states), dtype='int')

In [88]:
for i in range(split):
    for t in range(len(param_hidden[i])):
        emis_count[param_observed[i][t], param_hidden[i][t]] += 1
        if t > 0: 
            trans_count[param_hidden[i][t], param_hidden[i][t - 1]] += 1

In [75]:
read_npz = np.load("../data/noise-0.5_iter-150_timestamp-0108_071414_result.npz")
distance = read_npz['result']

In [76]:
distance

array([1.5531, 1.551 , 1.5534, 1.5185, 1.5098, 1.5009, 1.4714, 1.4881, 1.4526, 1.4545, 1.4377,
       1.4448, 1.4297, 1.4079, 1.4389, 1.4283, 1.4096, 1.3499, 1.3546, 1.3276, 1.3314, 1.3105,
       1.317 , 1.325 , 1.3136, 1.3008, 1.2995, 1.3041, 1.2796, 1.2539, 1.2513, 1.2356, 1.2104,
       1.1972, 1.1715, 1.1761, 1.1776, 1.1725, 1.1636, 1.1544, 1.174 , 1.1456, 1.1508, 1.1482,
       1.1336, 1.1058, 1.0957, 1.0913, 1.0945, 1.0835])