In [3]:
import numpy as np
import tqdm

### Create my synthetic dataset

In [4]:
np.set_printoptions(suppress=True, precision=4)
np.set_printoptions(linewidth=100, threshold=np.inf)
np.set_printoptions(formatter={'int': '{:5d}'.format})

In [5]:
num_states = 10
num_observations = 100

In [6]:
transition_probs = np.random.dirichlet(np.ones(num_states), size=num_states)
emission_probs = np.random.dirichlet(np.ones(num_observations), size=num_states)
initial_state_dist = np.ones(num_states) / num_states

In [7]:
transition_probs, emission_probs

(array([[0.1851, 0.3632, 0.0797, 0.0335, 0.0389, 0.049 , 0.0035, 0.0981, 0.0865, 0.0625],
        [0.1029, 0.064 , 0.0194, 0.0925, 0.3036, 0.0824, 0.0247, 0.0134, 0.2101, 0.087 ],
        [0.136 , 0.0061, 0.2452, 0.0169, 0.2428, 0.0198, 0.1776, 0.0269, 0.0139, 0.1147],
        [0.0929, 0.0821, 0.1675, 0.0098, 0.1082, 0.0807, 0.1139, 0.1401, 0.1957, 0.0092],
        [0.0166, 0.0852, 0.0489, 0.0615, 0.0514, 0.2068, 0.0625, 0.0203, 0.4396, 0.0073],
        [0.1463, 0.0244, 0.0181, 0.1109, 0.0449, 0.0525, 0.4599, 0.058 , 0.0687, 0.0164],
        [0.1638, 0.2945, 0.1397, 0.0152, 0.0392, 0.0079, 0.2076, 0.0678, 0.0199, 0.0443],
        [0.0623, 0.1885, 0.022 , 0.0774, 0.1774, 0.142 , 0.0637, 0.1139, 0.1479, 0.0051],
        [0.0101, 0.1721, 0.0459, 0.1131, 0.0685, 0.0556, 0.1991, 0.019 , 0.1833, 0.1334],
        [0.1738, 0.1015, 0.0237, 0.0564, 0.4082, 0.1302, 0.0559, 0.    , 0.039 , 0.0113]]),
 array([[0.0242, 0.0072, 0.0084, 0.0003, 0.0024, 0.004 , 0.0288, 0.0019, 0.006 , 0.0144, 0.0039,
 

In [14]:
# Simulate the HMM and generate strings (observation sequences)
def simulate_hmm(num_sequences, min_length, max_length, start_prob, trans_prob, emis_prob):
    sequences = []
    hidden_states = []
    for _ in range(num_sequences):
        sequence_length = np.random.randint(min_length, max_length+1)
        current_state = np.random.choice(num_states, p=start_prob)
        observation_sequence = []
        state_sequence = []
        for _ in range(sequence_length):
            state_sequence.append(current_state)
            observation = np.random.choice(num_observations, p=emis_prob[current_state])
            observation_sequence.append(observation)
            current_state = np.random.choice(num_states, p=trans_prob[current_state])
        sequences.append(observation_sequence)
        hidden_states.append(state_sequence)
    
    return sequences, hidden_states

In [34]:
syn_sequences, syn_hidden_states = simulate_hmm(
    num_sequences=50000,
    min_length=10,
    max_length=30,
    start_prob=initial_state_dist,
    trans_prob=transition_probs,
    emis_prob=emission_probs
)

In [35]:
def add_noise_to_states(hidden_states, flip_prob=0.1):
    noisy_hidden_states = []
    for sequence in hidden_states:
        noisy_sequence = []
        for state in sequence:
            if np.random.rand() < flip_prob:
                # Flip the state to a different random state
                possible_states = list(range(num_states))
                possible_states.remove(state)  # Remove the current state from possibilities
                new_state = np.random.choice(possible_states)
                noisy_sequence.append(new_state)
            else:
                noisy_sequence.append(state)
        noisy_hidden_states.append(noisy_sequence)
    return noisy_hidden_states

In [36]:
noisy_hidden_states = add_noise_to_states(syn_hidden_states, 0.1)

In [37]:
for seq in syn_sequences[:5]:
    print('[' + ', '.join(map(str, seq)) + ']')
for index in range(len(noisy_hidden_states[:5])):
    print('[' + ', '.join(map(str, syn_hidden_states[:5][index])) + ']')
    print('[' + ', '.join(map(str, noisy_hidden_states[:5][index])) + ']')
    print("--------------------")

[27, 80, 74, 75, 54, 27, 39, 67, 42, 74, 75, 67, 66, 53, 98, 5, 36, 54, 85, 70, 9, 74, 50, 36, 8, 0, 65, 66, 54]
[92, 13, 42, 81, 69, 17, 24, 88, 4, 67, 90, 88, 58, 23]
[4, 4, 73, 32, 28, 76, 97, 57, 52, 16, 81, 53, 8, 91, 0, 57, 14, 74, 50, 17, 53, 76, 2, 74, 14, 94, 95, 52]
[31, 96, 91, 52, 0, 12, 19, 5, 88, 97, 52, 32, 98, 15]
[13, 95, 19, 85, 77, 77, 68, 6, 58, 96, 14]
[2, 4, 8, 5, 4, 8, 6, 1, 1, 9, 1, 5, 6, 1, 8, 6, 2, 4, 8, 1, 9, 4, 8, 9, 4, 1, 4, 7, 4]
[2, 4, 8, 5, 4, 8, 6, 1, 1, 9, 1, 5, 6, 1, 8, 6, 1, 4, 8, 1, 9, 4, 8, 9, 4, 1, 4, 5, 4]
--------------------
[2, 2, 6, 1, 4, 8, 4, 8, 9, 6, 6, 7, 4, 4]
[2, 2, 6, 1, 3, 8, 4, 8, 9, 6, 6, 7, 4, 4]
--------------------
[3, 7, 1, 1, 4, 8, 1, 4, 5, 6, 1, 8, 3, 2, 3, 4, 8, 9, 4, 8, 8, 3, 1, 9, 8, 1, 4, 8]
[3, 7, 5, 1, 4, 8, 1, 4, 5, 6, 1, 8, 1, 2, 3, 4, 8, 9, 4, 8, 8, 3, 1, 6, 5, 1, 4, 0]
--------------------
[4, 8, 9, 4, 2, 4, 8, 1, 8, 1, 3, 4, 1, 4]
[4, 8, 9, 4, 2, 4, 8, 1, 8, 1, 3, 4, 1, 4]
--------------------
[1, 4, 2, 0, 1, 4, 6, 

In [38]:
file_path = "../data/hmm_synthetic_dataset.npz"
seq_object = np.array(syn_sequences, dtype=object)
hid_object = np.array(syn_hidden_states, dtype=object)
noisy_hid_object = np.array(noisy_hidden_states, dtype=object)
np.savez(file_path, observation=seq_object, real_hidden=hid_object, noisy_hidden=noisy_hid_object)

In [31]:
loaded_npz = np.load("../data/hmm_synthetic_dataset.npz", allow_pickle=True)
observations = list(loaded_npz['observation'])
hidden_states = list(loaded_npz['real_hidden'])
noi_hidden_states = list(loaded_npz['noisy_hidden'])

In [32]:
for seq in observations[:5]:
    print('[' + ', '.join(map(str, seq)) + ']')
for index in range(len(hidden_states[:5])):
    print('[' + ', '.join(map(str, hidden_states[:5][index])) + ']')
    print('[' + ', '.join(map(str, noi_hidden_states[:5][index])) + ']')
    print("--------------------")

[44, 47, 52, 52, 12, 2, 58, 16, 26, 80, 54, 76, 73, 12, 22, 0, 41, 35, 94, 39, 5, 90, 54]
[50, 11, 73, 45, 98, 62, 14, 49, 83, 89, 63, 49, 10, 53, 80]
[54, 42, 69, 94, 70, 54, 68, 2, 77, 10, 53, 39, 33]
[52, 64, 39, 88, 98, 75, 65, 53, 55, 64, 93, 41]
[62, 69, 16, 31, 40, 81, 12, 14, 92, 65, 52, 89, 10, 15, 81, 25, 52, 43, 1, 0, 79, 91, 17]
[8, 6, 1, 3, 8, 1, 4, 5, 2, 4, 5, 7, 0, 1, 2, 2, 6, 7, 8, 9, 8, 6, 1]
[8, 6, 1, 3, 8, 1, 3, 7, 2, 4, 5, 7, 0, 1, 2, 2, 6, 7, 8, 9, 8, 6, 1]
--------------------
[5, 6, 1, 6, 1, 5, 8, 3, 6, 6, 6, 0, 0, 1, 4]
[2, 6, 1, 6, 1, 5, 8, 3, 6, 6, 6, 0, 6, 1, 4]
--------------------
[5, 0, 1, 4, 5, 0, 1, 6, 2, 4, 6, 6, 2]
[5, 0, 1, 4, 5, 0, 1, 6, 6, 4, 6, 6, 2]
--------------------
[7, 1, 4, 4, 8, 1, 5, 6, 3, 0, 4, 8]
[7, 1, 4, 4, 8, 1, 5, 6, 3, 7, 4, 8]
--------------------
[9, 4, 5, 3, 7, 1, 8, 6, 1, 8, 3, 5, 7, 8, 5, 6, 2, 6, 1, 4, 5, 7, 2]
[9, 4, 5, 3, 7, 1, 8, 6, 1, 4, 7, 5, 7, 8, 5, 6, 2, 6, 1, 4, 5, 7, 3]
--------------------


### Initailise my model with pre-defined params

In [45]:
split = 5000
param_observed = observations[:split]
param_hidden = hidden_states[:split]
train_observed = observations[split:]
train_hidden = hidden_states[split:]

In [41]:
trans_count = np.zeros((num_states, num_states), dtype='int')
emis_count = np.zeros((num_observations, num_states), dtype='int')

In [88]:
for i in range(split):
    for t in range(len(param_hidden[i])):
        emis_count[param_observed[i][t], param_hidden[i][t]] += 1
        if t > 0: 
            trans_count[param_hidden[i][t], param_hidden[i][t - 1]] += 1

In [1]:
trans_count, emis_count

NameError: name 'trans_count' is not defined

### Test my model on the synthetic dataset

In [30]:
len(dataset[2])

28

In [110]:
%load_ext autoreload
%autoreload 2

In [2]:
from hdp_hmm import HDPHMM
from direct_assign_gibbs_pos import DirectAssignmentPOS
model = HDPHMM()
sampler = DirectAssignmentPOS(model, dataset, num_observations)
sampler.K = 10
sampler.transition_count = trans_count.copy()
sampler.token_state_matrix = emis_count.copy()
sampler.hidden_states = [np.zeros(seq_len, dtype='int') for seq_len in sampler.seq_length]
for i in range(9):
    sampler.model.update_beta_with_new_state()
    
iterations = 5
for iteration in tqdm.tqdm(range(iterations), desc="training sampler:"):
    for index in range(len(train_observed)):
        for t in range(1, sampler.seq_length[index] - 1):
            sampler.sample_hidden_states_on_last_next_state(index, t)
            
        sampler.sample_hidden_states_on_last_state(index, sampler.seq_length[index] - 1)
        
    sampler.update_K()
    # print("hidden states after update K:", sampler.hidden_states[:5])
    print("new K: ", sampler.K)
    sampler.sample_m()
    sampler.sample_beta()
    sampler.sample_alpha()
    sampler.sample_gamma()

NameError: name 'dataset' is not defined