In [146]:
import numpy as np
import tqdm

### Create Synthetic Dataset on Predefined HMM

In [147]:
# Notebook-wide NumPy display settings: fixed-point floats with 4 decimals,
# 100-character lines, no summarisation of large arrays, and integers
# right-aligned in a 5-character column.
np.set_printoptions(
    suppress=True,
    precision=4,
    linewidth=100,
    threshold=np.inf,
    formatter={'int': '{:5d}'.format},
)

In [202]:
# Size of the synthetic HMM: number of hidden states and size of the
# observation alphabet.
num_states = 20
num_observations = 5000

# Seed NumPy's global RNG so the parameter draws, the simulated sequences and
# the injected label noise come out the same on every Restart & Run All.
random_seed = 0
np.random.seed(random_seed)

In [203]:
# Random HMM parameters. Each row is one draw from a Dirichlet with all-ones
# concentration, so every row is a probability vector that sums to 1.
#   transition_probs: (num_states, num_states); simulate_hmm reads row i as
#                     the next-state distribution of state i
#   emission_probs:   (num_states, num_observations); row i is the emission
#                     distribution of state i
transition_probs = np.random.dirichlet(np.ones(num_states), size=num_states)
emission_probs = np.random.dirichlet(np.ones(num_observations), size=num_states)
# Uniform distribution over the initial hidden state.
initial_state_dist = np.ones(num_states) / num_states

In [204]:
# Show both parameter matrices as the cell output. With threshold=np.inf set
# in the print options, nothing is summarised, so this output is very large.
transition_probs, emission_probs

(array([[0.3184, 0.0076, 0.0223, 0.0204, 0.0015, 0.0026, 0.0021, 0.0258, 0.0401, 0.0619, 0.0322,
         0.0403, 0.0188, 0.0027, 0.061 , 0.0169, 0.0102, 0.1726, 0.0398, 0.1028],
        [0.0894, 0.0011, 0.1111, 0.0587, 0.0323, 0.0116, 0.0362, 0.0881, 0.0659, 0.002 , 0.0422,
         0.035 , 0.0307, 0.1189, 0.0635, 0.0524, 0.0214, 0.0379, 0.0328, 0.0686],
        [0.0148, 0.0148, 0.0161, 0.0272, 0.0262, 0.0963, 0.0205, 0.1177, 0.0552, 0.1014, 0.0359,
         0.0658, 0.151 , 0.1496, 0.0009, 0.0453, 0.0038, 0.0157, 0.0378, 0.0038],
        [0.0357, 0.0488, 0.0271, 0.0396, 0.028 , 0.0077, 0.1633, 0.0063, 0.0951, 0.0064, 0.0958,
         0.0046, 0.0131, 0.1053, 0.0271, 0.0845, 0.029 , 0.1377, 0.0154, 0.0297],
        [0.0034, 0.0498, 0.0043, 0.0362, 0.0671, 0.0025, 0.0354, 0.0374, 0.0218, 0.0044, 0.0189,
         0.1147, 0.0512, 0.0652, 0.037 , 0.0487, 0.0715, 0.1756, 0.1362, 0.0185],
        [0.0037, 0.0002, 0.0339, 0.1066, 0.0481, 0.1379, 0.0193, 0.0691, 0.0344, 0.0048, 0.017 ,
        

In [205]:
# Simulate the HMM and generate strings (observation sequences)
def simulate_hmm(num_sequences, min_length, max_length, start_prob, trans_prob, emis_prob):
    """Sample observation sequences and their hidden-state paths from an HMM.

    The number of states and the size of the observation alphabet are taken
    from the shapes of ``trans_prob`` and ``emis_prob``. The function does not
    read any notebook-level variable, so it can be used with any HMM.

    Parameters
    ----------
    num_sequences : int
        How many sequences to generate.
    min_length, max_length : int
        Inclusive bounds of the uniformly drawn sequence length.
    start_prob : array-like, shape (n_states,)
        Distribution of the first hidden state.
    trans_prob : array-like, shape (n_states, n_states)
        Row ``i`` is the distribution of the next state given state ``i``.
    emis_prob : array-like, shape (n_states, n_symbols)
        Row ``i`` is the emission distribution of state ``i``.

    Returns
    -------
    (sequences, hidden_states) : tuple of two lists
        ``sequences[k]`` is the list of emitted symbol indices of sequence
        ``k`` and ``hidden_states[k]`` is the matching list of state indices.
        Both inner lists have the same length.
    """
    trans_prob = np.asarray(trans_prob)
    emis_prob = np.asarray(emis_prob)
    # Sizes come from the arguments, not from globals, so a stale
    # `num_states` left in the kernel cannot silently corrupt the sampling.
    n_states = trans_prob.shape[0]
    n_symbols = emis_prob.shape[1]

    sequences = []
    hidden_states = []
    for _ in range(num_sequences):
        # randint's upper bound is exclusive, hence the +1
        sequence_length = np.random.randint(min_length, max_length + 1)
        current_state = np.random.choice(n_states, p=start_prob)
        observation_sequence = []
        state_sequence = []
        for _ in range(sequence_length):
            state_sequence.append(current_state)
            observation = np.random.choice(n_symbols, p=emis_prob[current_state])
            observation_sequence.append(observation)
            # Move to the next state only after emitting from the current one.
            current_state = np.random.choice(n_states, p=trans_prob[current_state])
        sequences.append(observation_sequence)
        hidden_states.append(state_sequence)

    return sequences, hidden_states

In [206]:
# Generate the synthetic corpus: 100,000 sequences, each 10 to 30 steps long
# (both bounds inclusive), sampled from the HMM parameters drawn above.
syn_sequences, syn_hidden_states = simulate_hmm(
    num_sequences=100000,
    min_length=10,
    max_length=30,
    start_prob=initial_state_dist,
    trans_prob=transition_probs,
    emis_prob=emission_probs
)

In [207]:
def add_noise_to_states(hidden_states, number_states, flip_prob=0.5):
    """Return a copy of ``hidden_states`` with randomly corrupted labels.

    Each state is replaced, with probability ``flip_prob``, by a state drawn
    uniformly from the other ``number_states - 1`` labels in
    ``0 .. number_states - 1``. The input sequences are not modified.
    """
    def corrupt(state):
        # One uniform draw per label decides whether it is flipped at all.
        if not (np.random.rand() < flip_prob):
            return state
        # Candidates are every label except the true one.
        alternatives = list(range(number_states))
        alternatives.remove(state)
        return np.random.choice(alternatives)

    return [[corrupt(state) for state in sequence] for sequence in hidden_states]

In [208]:
# Per-label flip probability for the noisy copy of the hidden states; the same
# value is embedded in the saved file name and stored inside the archive.
noisy_level = 0.3
noisy_hidden_states = add_noise_to_states(syn_hidden_states, num_states, flip_prob=noisy_level)

In [209]:
# Sanity check: the transition matrix, the first five observation sequences,
# then each of the first five true state paths above its noisy counterpart.
print(transition_probs)
for seq in syn_sequences[:5]:
    print(f"[{', '.join(str(symbol) for symbol in seq)}]")
for index in range(len(noisy_hidden_states[:5])):
    print(f"[{', '.join(str(label) for label in syn_hidden_states[:5][index])}]")
    print(f"[{', '.join(str(label) for label in noisy_hidden_states[:5][index])}]")
    print("--------------------")

[[0.3184 0.0076 0.0223 0.0204 0.0015 0.0026 0.0021 0.0258 0.0401 0.0619 0.0322 0.0403 0.0188
  0.0027 0.061  0.0169 0.0102 0.1726 0.0398 0.1028]
 [0.0894 0.0011 0.1111 0.0587 0.0323 0.0116 0.0362 0.0881 0.0659 0.002  0.0422 0.035  0.0307
  0.1189 0.0635 0.0524 0.0214 0.0379 0.0328 0.0686]
 [0.0148 0.0148 0.0161 0.0272 0.0262 0.0963 0.0205 0.1177 0.0552 0.1014 0.0359 0.0658 0.151
  0.1496 0.0009 0.0453 0.0038 0.0157 0.0378 0.0038]
 [0.0357 0.0488 0.0271 0.0396 0.028  0.0077 0.1633 0.0063 0.0951 0.0064 0.0958 0.0046 0.0131
  0.1053 0.0271 0.0845 0.029  0.1377 0.0154 0.0297]
 [0.0034 0.0498 0.0043 0.0362 0.0671 0.0025 0.0354 0.0374 0.0218 0.0044 0.0189 0.1147 0.0512
  0.0652 0.037  0.0487 0.0715 0.1756 0.1362 0.0185]
 [0.0037 0.0002 0.0339 0.1066 0.0481 0.1379 0.0193 0.0691 0.0344 0.0048 0.017  0.0042 0.0326
  0.0274 0.0003 0.0917 0.1063 0.0083 0.0602 0.194 ]
 [0.0023 0.1113 0.1085 0.0073 0.1333 0.0265 0.0762 0.0094 0.0131 0.008  0.0977 0.0288 0.0026
  0.0786 0.0293 0.0201 0.0029 0.085  0

In [210]:
# Persist the synthetic HMM dataset. The noise level is part of the file name.
file_path = f"../data/hmm_synthetic_dataset(noise-{noisy_level}).npz"
# Sequences have different lengths, so they are stored as object arrays
# (loading them back therefore requires allow_pickle=True).
seq_object = np.array(syn_sequences, dtype=object)
hid_object = np.array(syn_hidden_states, dtype=object)
noisy_hid_object = np.array(noisy_hidden_states, dtype=object)
trans_object = transition_probs
emis_object = emission_probs
# Archive keys: observation, real_hidden, noisy_hidden, real_trans, emis, noisy_level.
np.savez(file_path, observation=seq_object, real_hidden=hid_object, noisy_hidden=noisy_hid_object, real_trans=trans_object, emis=emis_object , noisy_level=noisy_level)

In [43]:
# Reload a saved HMM dataset and unpack it into plain Python lists.
# NOTE(review): this path has no "(noise-...)" suffix, unlike the file written
# by the save cell above, so it reads a different (presumably older) archive.
# Confirm which file is intended.
# NOTE(review): allow_pickle=True unpickles the object arrays, which can run
# arbitrary code. Only load archives produced by this notebook.
loaded_npz = np.load("../data/hmm_synthetic_dataset.npz", allow_pickle=True)
observations = list(loaded_npz['observation'])
hidden_states = list(loaded_npz['real_hidden'])
noi_hidden_states = list(loaded_npz['noisy_hidden'])
transition_dist = np.vstack(loaded_npz['real_trans'])

In [37]:
# Same preview as after generation, but for the data read back from disk:
# transition matrix, five observation sequences, then true/noisy state pairs.
print(transition_dist)
for seq in observations[:5]:
    print('[', ', '.join(str(symbol) for symbol in seq), ']', sep='')
for index in range(len(hidden_states[:5])):
    print('[', ', '.join(str(label) for label in hidden_states[:5][index]), ']', sep='')
    print('[', ', '.join(str(label) for label in noi_hidden_states[:5][index]), ']', sep='')
    print("--------------------")

[[0.0828 0.0835 0.2565 0.0532 0.1432 0.1047 0.0355 0.0749 0.0597 0.1059]
 [0.2622 0.0788 0.0438 0.0515 0.2498 0.128  0.0694 0.0793 0.0056 0.0315]
 [0.3887 0.0121 0.066  0.0311 0.1557 0.0581 0.1044 0.0123 0.08   0.0917]
 [0.002  0.0137 0.0416 0.2126 0.0617 0.0089 0.3625 0.1019 0.0024 0.1927]
 [0.1317 0.1843 0.0463 0.0339 0.023  0.2441 0.0081 0.0168 0.1834 0.1285]
 [0.068  0.0984 0.0047 0.1357 0.1073 0.0184 0.1956 0.1457 0.172  0.0542]
 [0.2729 0.0094 0.0608 0.2362 0.0922 0.0745 0.1118 0.0079 0.0578 0.0767]
 [0.0392 0.1629 0.1814 0.018  0.1053 0.0753 0.0035 0.1197 0.2539 0.0409]
 [0.0439 0.0656 0.0169 0.3232 0.0293 0.1207 0.0119 0.0515 0.0976 0.2395]
 [0.0633 0.1669 0.0989 0.0001 0.119  0.0354 0.0203 0.1823 0.2566 0.0573]]
[99, 44, 31, 31, 19, 81, 21, 60, 12, 31, 60, 53, 22, 93, 10, 64]
[0, 83, 13, 38, 86, 59, 37, 21, 25, 29, 85, 58, 83, 95, 33, 12, 5, 99, 81, 1, 9, 13, 32, 73, 60, 51, 29]
[43, 97, 87, 18, 11, 34, 37, 82, 51, 28, 50]
[23, 43, 33, 82, 76, 31, 51, 75, 87, 62, 86, 24, 52, 1

### Initialise my model with pre-defined params

In [45]:
# Split the loaded data: the first `split` sequences (with their true hidden
# states) feed the count-based initialisation below, the rest form the
# training portion.
split = 5000
param_observed = observations[:split]
param_hidden = hidden_states[:split]
train_observed = observations[split:]
train_hidden = hidden_states[split:]

In [41]:
# Integer count tables filled by the counting loop in the next cell.
#   trans_count: (num_states, num_states), indexed [state at t, state at t-1]
#   emis_count:  (num_observations, num_states), indexed [observation, state]
# NOTE(review): num_states / num_observations are the globals from the HMM
# config cell. The archive loaded above may have been generated with other
# sizes (the printed transition matrix is 10x10). Confirm they match.
trans_count = np.zeros((num_states, num_states), dtype='int')
emis_count = np.zeros((num_observations, num_states), dtype='int')

In [88]:
# Tally emissions and transitions over the labelled initialisation split.
# Counts are added in place, so re-running this cell without first re-creating
# the zero tables doubles every count.
for i in range(split):
    for t, state in enumerate(param_hidden[i]):
        # emission table is indexed [observation, state]
        emis_count[param_observed[i][t], state] += 1
        if t:
            # transition table is indexed [current state, previous state]
            trans_count[state, param_hidden[i][t - 1]] += 1

In [75]:
# Load the array stored under the 'result' key of a saved run.
# NOTE(review): judging by the file name this is a noise-0.5, 150-iteration
# run. What the values measure is not defined in this notebook; confirm.
read_npz = np.load("../data/noise-0.5_iter-150_timestamp-0108_071414_result.npz")
distance = read_npz['result']

In [76]:
# Display the loaded result array.
distance

array([1.5531, 1.551 , 1.5534, 1.5185, 1.5098, 1.5009, 1.4714, 1.4881, 1.4526, 1.4545, 1.4377,
       1.4448, 1.4297, 1.4079, 1.4389, 1.4283, 1.4096, 1.3499, 1.3546, 1.3276, 1.3314, 1.3105,
       1.317 , 1.325 , 1.3136, 1.3008, 1.2995, 1.3041, 1.2796, 1.2539, 1.2513, 1.2356, 1.2104,
       1.1972, 1.1715, 1.1761, 1.1776, 1.1725, 1.1636, 1.1544, 1.174 , 1.1456, 1.1508, 1.1482,
       1.1336, 1.1058, 1.0957, 1.0913, 1.0945, 1.0835])

### Create Synthetic Dataset on Penn Treebank

In [1]:
import nltk
from nltk.corpus.reader import ConllCorpusReader

In [30]:
# Path to the CoNLL-U file
file_path = '../data/ptb/penn-train.conllu'

def read_sentences_with_pos_tags(file_path):
    """Read a CoNLL-U file into a list of tagged sentences.

    Parameters
    ----------
    file_path : str
        Path of a UTF-8 encoded, tab-separated CoNLL-U file.

    Returns
    -------
    list of list of (word, upos, xpos)
        One inner list per sentence. ``word`` is the lower-cased second
        column, ``upos`` the fourth column and ``xpos`` the fifth column of
        each token line.

    Notes
    -----
    * A blank line ends the current sentence. The last sentence is also
      returned when the file does not end with a blank line.
    * Lines starting with ``#`` are ignored and do not end a sentence.
    * Token lines with fewer than five tab-separated fields are skipped.
    """
    sentences_with_pos_tags = []
    current_sentence = []

    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            if not line.strip():
                # Blank line: sentence boundary.
                if current_sentence:
                    sentences_with_pos_tags.append(current_sentence)
                    current_sentence = []
                continue

            if line.startswith('#'):
                # Comment / metadata line
                continue

            # Drop the line ending so it cannot leak into the last field.
            fields = line.rstrip('\n').split('\t')
            if len(fields) > 4:  # need columns up to XPOS (index 4)
                word = fields[1].lower()  # Word form is the second field
                upos = fields[3]  # Universal POS tag is the fourth field
                xpos = fields[4]  # Language specific POS tag is the fifth field
                current_sentence.append((word, upos, xpos))

    # Flush the final sentence when the file lacks a trailing blank line.
    if current_sentence:
        sentences_with_pos_tags.append(current_sentence)

    return sentences_with_pos_tags

In [34]:
# Parse the training file, report how many sentences were read, and show the
# first three as a spot check.
sentences_pos_tags = read_sentences_with_pos_tags(file_path)
print(len(sentences_pos_tags))
sentences_pos_tags[:3]

39832


[[('in', 'ADP', 'IN'),
  ('an', 'DET', 'DT'),
  ('oct.', 'PROPN', 'NNP'),
  ('19', 'NUM', 'CD'),
  ('review', 'NOUN', 'NN'),
  ('of', 'ADP', 'IN'),
  ('``', 'PUNCT', '``'),
  ('the', 'DET', 'DT'),
  ('misanthrope', 'NOUN', 'NN'),
  ("''", 'PUNCT', "''"),
  ('at', 'ADP', 'IN'),
  ('chicago', 'PROPN', 'NNP'),
  ("'s", 'PART', 'POS'),
  ('goodman', 'PROPN', 'NNP'),
  ('theatre', 'PROPN', 'NNP'),
  ('-lrb-', 'PUNCT', '-LRB-'),
  ('``', 'PUNCT', '``'),
  ('revitalized', 'VERB', 'VBN'),
  ('classics', 'NOUN', 'NNS'),
  ('take', 'VERB', 'VBP'),
  ('the', 'DET', 'DT'),
  ('stage', 'NOUN', 'NN'),
  ('in', 'ADP', 'IN'),
  ('windy', 'PROPN', 'NNP'),
  ('city', 'PROPN', 'NNP'),
  (',', 'PUNCT', ','),
  ("''", 'PUNCT', "''"),
  ('leisure', 'NOUN', 'NN'),
  ('&', 'CONJ', 'CC'),
  ('arts', 'NOUN', 'NNS'),
  ('-rrb-', 'PUNCT', '-RRB-'),
  (',', 'PUNCT', ','),
  ('the', 'DET', 'DT'),
  ('role', 'NOUN', 'NN'),
  ('of', 'ADP', 'IN'),
  ('celimene', 'PROPN', 'NNP'),
  (',', 'PUNCT', ','),
  ('played', 'VE

In [32]:
from collections import Counter
# Add UNK and UNK_tag to dataset
def replace_low_frequency_words(sentences_with_pos_tags, filter_count=1):
    """Collapse rare words into a single unknown token.

    Word frequencies are counted over all sentences. Every token whose word
    occurs fewer than ``filter_count`` times becomes
    ``('UNK', 'UNK_TAG', 'UNK_TAG')``. All other tokens are kept as
    ``(word, upos, xpos)``. A new list of sentences is returned and the input
    is left untouched.
    """
    frequency = Counter()
    for tagged_sentence in sentences_with_pos_tags:
        frequency.update(word for word, _upos, _xpos in tagged_sentence)

    unknown_token = ('UNK', 'UNK_TAG', 'UNK_TAG')
    return [
        [
            unknown_token if frequency[word] < filter_count else (word, upos, xpos)
            for word, upos, xpos in tagged_sentence
        ]
        for tagged_sentence in sentences_with_pos_tags
    ]

In [166]:
# Replace every word seen fewer than 20 times with the UNK token (its tags
# become UNK_TAG) and show the first three sentences.
filtered_sentences = replace_low_frequency_words(sentences_pos_tags, filter_count=20)
filtered_sentences[:3]

[[('in', 'ADP', 'IN'),
  ('an', 'DET', 'DT'),
  ('oct.', 'PROPN', 'NNP'),
  ('19', 'NUM', 'CD'),
  ('review', 'NOUN', 'NN'),
  ('of', 'ADP', 'IN'),
  ('``', 'PUNCT', '``'),
  ('the', 'DET', 'DT'),
  ('UNK', 'UNK_TAG', 'UNK_TAG'),
  ("''", 'PUNCT', "''"),
  ('at', 'ADP', 'IN'),
  ('chicago', 'PROPN', 'NNP'),
  ("'s", 'PART', 'POS'),
  ('UNK', 'UNK_TAG', 'UNK_TAG'),
  ('UNK', 'UNK_TAG', 'UNK_TAG'),
  ('-lrb-', 'PUNCT', '-LRB-'),
  ('``', 'PUNCT', '``'),
  ('UNK', 'UNK_TAG', 'UNK_TAG'),
  ('UNK', 'UNK_TAG', 'UNK_TAG'),
  ('take', 'VERB', 'VBP'),
  ('the', 'DET', 'DT'),
  ('stage', 'NOUN', 'NN'),
  ('in', 'ADP', 'IN'),
  ('UNK', 'UNK_TAG', 'UNK_TAG'),
  ('city', 'PROPN', 'NNP'),
  (',', 'PUNCT', ','),
  ("''", 'PUNCT', "''"),
  ('UNK', 'UNK_TAG', 'UNK_TAG'),
  ('&', 'CONJ', 'CC'),
  ('UNK', 'UNK_TAG', 'UNK_TAG'),
  ('-rrb-', 'PUNCT', '-RRB-'),
  (',', 'PUNCT', ','),
  ('the', 'DET', 'DT'),
  ('role', 'NOUN', 'NN'),
  ('of', 'ADP', 'IN'),
  ('UNK', 'UNK_TAG', 'UNK_TAG'),
  (',', 'PUNCT', ',

In [127]:
from collections import defaultdict

def create_vocab_index(sentences_with_pos_tags):
    """Build integer indexes for words, universal tags and specific tags.

    Returns a tuple ``(word_to_index, upos_to_index, xpos_to_index)`` of plain
    dicts. Ids are assigned in order of first appearance. Word ids start at 0,
    and both tag indexes start at 1.
    """
    def build_index(items, start_index=0):
        # Give each previously unseen item the next free id.
        index_of = {}
        for item in items:
            if item not in index_of:
                index_of[item] = len(index_of) + start_index
        return index_of

    def column(position):
        # Flatten all sentences and keep one field of every (word, upos, xpos).
        return [
            (word, upos, xpos)[position]
            for sentence in sentences_with_pos_tags
            for word, upos, xpos in sentence
        ]

    word_index = build_index(column(0), start_index=0)
    upos_index = build_index(column(1), start_index=1)
    xpos_index = build_index(column(2), start_index=1)
    return word_index, upos_index, xpos_index

In [167]:
# Build the three lookup tables from the UNK-filtered corpus.
word_to_index, upos_to_index, xpos_to_index = create_vocab_index(filtered_sentences)
# Display the first few items of each index as an example
list(word_to_index.items())[1000:1005], list(upos_to_index.items())[:5], list(xpos_to_index.items())[:5]

([('shown', 1000),
  ('table', 1001),
  ('yesterday', 1002),
  ('edition', 1003),
  ('average', 1004)],
 [('ADP', 1), ('DET', 2), ('PROPN', 3), ('NUM', 4), ('NOUN', 5)],
 [('IN', 1), ('DT', 2), ('NNP', 3), ('CD', 4), ('NN', 5)])

In [168]:
# Vocabulary size, number of universal tags, number of language-specific tags.
len(word_to_index), len(upos_to_index), len(xpos_to_index)

(4110, 18, 46)

In [169]:
def convert_to_indexes(filtered_sentences_tags, word_to_index, upos_to_index, xpos_to_index):
    """Map tagged sentences to parallel lists of integer ids.

    Sentences with five or fewer tokens are dropped. For every remaining
    sentence the universal tags, specific tags and words are looked up in the
    given dictionaries.

    Returns ``(hidden_states_universal, hidden_states_specific, observations)``,
    three lists with one id list per kept sentence, in input order.
    """
    universal_ids, specific_ids, word_ids = [], [], []

    for sentence in filtered_sentences_tags:
        if len(sentence) > 5:
            universal_ids.append([upos_to_index[upos] for _word, upos, _xpos in sentence])
            specific_ids.append([xpos_to_index[xpos] for _word, _upos, xpos in sentence])
            word_ids.append([word_to_index[word] for word, _upos, _xpos in sentence])

    return universal_ids, specific_ids, word_ids

In [170]:
# Encode the filtered corpus as id sequences. Sentences of five tokens or
# fewer are dropped by convert_to_indexes.
# Note: `observations` here replaces the HMM observations loaded earlier.
hidden_states_universal, hidden_states_specific, observations = convert_to_indexes(
    filtered_sentences, word_to_index, upos_to_index, xpos_to_index)

In [171]:
# For the first five sentences print universal tag ids, specific tag ids and
# word ids, one line each.
for index in range(len(observations[:5])):
    print(f"[{', '.join(str(tag) for tag in hidden_states_universal[:5][index])}]")
    print(f"[{', '.join(str(tag) for tag in hidden_states_specific[:5][index])}]")
    print(f"[{', '.join(str(word_id) for word_id in observations[:5][index])}]")
    print('-----------------------------')

[1, 2, 3, 4, 5, 1, 6, 2, 7, 6, 1, 3, 8, 7, 7, 6, 6, 7, 7, 9, 2, 5, 1, 7, 3, 6, 6, 7, 10, 7, 6, 6, 2, 5, 1, 7, 6, 9, 1, 7, 7, 6, 11, 7, 9, 1, 7, 7, 6]
[1, 2, 3, 4, 5, 1, 6, 2, 7, 8, 1, 3, 9, 7, 7, 10, 6, 7, 7, 11, 2, 5, 1, 7, 3, 12, 8, 7, 13, 7, 14, 12, 2, 5, 1, 7, 12, 15, 1, 7, 7, 12, 16, 7, 15, 17, 7, 7, 18]
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 8, 8, 13, 6, 8, 8, 14, 7, 15, 0, 8, 16, 17, 9, 8, 18, 8, 19, 17, 7, 20, 5, 8, 17, 21, 22, 8, 8, 17, 23, 8, 24, 25, 8, 8, 26]
-----------------------------
[7, 3, 3, 3, 9, 12, 9, 12, 3, 5, 8, 9, 13, 1, 1, 7, 5, 1, 4, 6]
[7, 3, 20, 3, 16, 21, 19, 22, 3, 23, 17, 24, 25, 1, 1, 7, 23, 1, 4, 18]
[8, 29, 30, 31, 32, 33, 34, 35, 36, 37, 25, 38, 39, 10, 40, 8, 30, 0, 41, 26]
-----------------------------
[2, 5, 5, 5, 13, 5, 9, 7, 5, 1, 2, 3]
[2, 5, 5, 5, 25, 5, 16, 7, 23, 1, 2, 3]
[7, 42, 43, 44, 45, 46, 47, 8, 30, 0, 7, 36]
-----------------------------
[3, 7, 6, 5, 10, 13, 5, 5, 6, 9, 12, 7, 5, 1, 2, 5, 5, 5, 1, 3, 10, 3, 6, 10, 1, 13, 13, 5, 6]

In [143]:
def add_noise_to_states_ptb(hidden_states, number_states, flip_prob=0.5):
    """Return a noisy copy of 1-based tag sequences.

    Each tag is replaced, with probability ``flip_prob``, by a tag drawn
    uniformly from the other labels in ``1 .. number_states``. The input
    sequences are not modified.
    """
    corrupted_sequences = []
    for tags in hidden_states:
        corrupted = []
        for tag in tags:
            keep = not (np.random.rand() < flip_prob)
            if keep:
                corrupted.append(tag)
                continue
            # Labels are 1-based here; exclude the true one before sampling.
            others = list(range(1, number_states + 1))
            others.remove(tag)
            corrupted.append(np.random.choice(others))
        corrupted_sequences.append(corrupted)
    return corrupted_sequences

In [172]:
# Corrupt both tag layers with the same flip probability. The label count for
# each layer is the size of its index, and tag ids start at 1.
ptb_noisy_level = 0.3
noisy_hidden_states_universal = add_noise_to_states_ptb(hidden_states_universal, len(upos_to_index), flip_prob=ptb_noisy_level)
noisy_hidden_states_specific = add_noise_to_states_ptb(hidden_states_specific, len(xpos_to_index), flip_prob=ptb_noisy_level)

In [173]:
# Prepend a start marker to every sequence: tag id 0 for all four tag
# sequences and -1 for the word-id sequence.
# The lists are modified in place, so running this cell a second time inserts
# a second marker.
for i in range(len(hidden_states_universal)):
    for tag_sequences in (
        hidden_states_universal,
        noisy_hidden_states_universal,
        hidden_states_specific,
        noisy_hidden_states_specific,
    ):
        tag_sequences[i].insert(0, 0)

    observations[i].insert(0, -1)

In [174]:
# First five sentences: true universal tags above their noisy version.
for index in range(len(observations[:5])):
    print('[', ', '.join(str(tag) for tag in hidden_states_universal[:5][index]), ']', sep='')
    print('[', ', '.join(str(tag) for tag in noisy_hidden_states_universal[:5][index]), ']', sep='')
    print('-----------------------------')

[0, 1, 2, 3, 4, 5, 1, 6, 2, 7, 6, 1, 3, 8, 7, 7, 6, 6, 7, 7, 9, 2, 5, 1, 7, 3, 6, 6, 7, 10, 7, 6, 6, 2, 5, 1, 7, 6, 9, 1, 7, 7, 6, 11, 7, 9, 1, 7, 7, 6]
[0, 12, 15, 3, 4, 5, 1, 14, 2, 3, 16, 14, 3, 7, 3, 7, 6, 6, 7, 9, 9, 4, 5, 1, 7, 8, 6, 16, 17, 10, 7, 6, 6, 2, 14, 6, 7, 6, 9, 1, 5, 7, 6, 1, 7, 9, 1, 7, 12, 6]
-----------------------------
[0, 7, 3, 3, 3, 9, 12, 9, 12, 3, 5, 8, 9, 13, 1, 1, 7, 5, 1, 4, 6]
[0, 7, 3, 3, 8, 13, 15, 9, 12, 4, 14, 8, 9, 13, 1, 1, 7, 5, 1, 4, 6]
-----------------------------
[0, 2, 5, 5, 5, 13, 5, 9, 7, 5, 1, 2, 3]
[0, 2, 9, 9, 5, 13, 5, 16, 7, 5, 1, 2, 3]
-----------------------------
[0, 3, 7, 6, 5, 10, 13, 5, 5, 6, 9, 12, 7, 5, 1, 2, 5, 5, 5, 1, 3, 10, 3, 6, 10, 1, 13, 13, 5, 6]
[0, 3, 7, 1, 13, 10, 13, 15, 15, 6, 9, 8, 7, 5, 1, 2, 1, 2, 5, 7, 17, 10, 3, 7, 10, 9, 13, 18, 6, 5]
-----------------------------
[0, 3, 3, 3, 9, 12, 5, 1, 4, 5, 1, 4, 5, 2, 5, 6]
[0, 3, 3, 3, 16, 10, 13, 1, 4, 5, 1, 4, 5, 16, 5, 17]
-----------------------------


In [175]:
# First five sentences: true language-specific tags above their noisy version.
for index in range(len(observations[:5])):
    print('[', ', '.join(str(tag) for tag in hidden_states_specific[:5][index]), ']', sep='')
    print('[', ', '.join(str(tag) for tag in noisy_hidden_states_specific[:5][index]), ']', sep='')
    print('-----------------------------')

[0, 1, 2, 3, 4, 5, 1, 6, 2, 7, 8, 1, 3, 9, 7, 7, 10, 6, 7, 7, 11, 2, 5, 1, 7, 3, 12, 8, 7, 13, 7, 14, 12, 2, 5, 1, 7, 12, 15, 1, 7, 7, 12, 16, 7, 15, 17, 7, 7, 18]
[0, 34, 29, 3, 4, 5, 43, 31, 30, 7, 8, 1, 3, 9, 7, 43, 2, 6, 7, 7, 13, 2, 44, 1, 25, 3, 12, 19, 26, 13, 7, 40, 12, 2, 5, 1, 7, 8, 14, 1, 7, 7, 12, 16, 7, 15, 17, 7, 7, 18]
-----------------------------
[0, 7, 3, 20, 3, 16, 21, 19, 22, 3, 23, 17, 24, 25, 1, 1, 7, 23, 1, 4, 18]
[0, 7, 3, 20, 3, 16, 21, 7, 22, 21, 23, 2, 24, 35, 1, 1, 40, 23, 1, 4, 18]
-----------------------------
[0, 2, 5, 5, 5, 25, 5, 16, 7, 23, 1, 2, 3]
[0, 2, 5, 5, 5, 42, 5, 18, 7, 23, 37, 2, 3]
-----------------------------
[0, 3, 7, 12, 5, 13, 25, 5, 5, 12, 16, 21, 7, 5, 1, 2, 5, 5, 5, 1, 3, 13, 3, 12, 13, 1, 25, 25, 23, 18]
[0, 3, 7, 12, 5, 11, 24, 32, 36, 12, 16, 33, 7, 35, 15, 15, 5, 32, 5, 1, 3, 13, 3, 12, 13, 1, 25, 20, 25, 28]
-----------------------------
[0, 3, 3, 3, 16, 22, 5, 17, 4, 23, 1, 4, 23, 2, 5, 18]
[0, 36, 3, 3, 16, 9, 5, 17, 4, 23, 12,

In [176]:
# First five word-id sequences, each now starting with the -1 start marker.
for index in range(len(observations[:5])):
    print(f"[{', '.join(str(word_id) for word_id in observations[:5][index])}]")
    print('-----------------------------')

[-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 8, 8, 13, 6, 8, 8, 14, 7, 15, 0, 8, 16, 17, 9, 8, 18, 8, 19, 17, 7, 20, 5, 8, 17, 21, 22, 8, 8, 17, 23, 8, 24, 25, 8, 8, 26]
-----------------------------
[-1, 8, 29, 30, 31, 32, 33, 34, 35, 36, 37, 25, 38, 39, 10, 40, 8, 30, 0, 41, 26]
-----------------------------
[-1, 7, 42, 43, 44, 45, 46, 47, 8, 30, 0, 7, 36]
-----------------------------
[-1, 48, 8, 17, 49, 50, 51, 52, 53, 17, 32, 54, 8, 55, 56, 7, 42, 43, 44, 0, 57, 50, 58, 17, 50, 0, 59, 60, 61, 26]
-----------------------------
[-1, 62, 63, 31, 64, 35, 65, 25, 66, 67, 68, 69, 67, 70, 71, 26]
-----------------------------


In [178]:
# Persist the Penn Treebank dataset. The noise level is part of the file name.
file_path = f"../data/PennTreebank_synthetic_dataset(noise-{ptb_noisy_level}).npz"
# Sentences have different lengths, so every collection is stored as an
# object array (loading therefore requires allow_pickle=True).
obs_object = np.array(observations, dtype=object)
uni_hid_object = np.array(hidden_states_universal, dtype=object)
noisy_uni_hid_object = np.array(noisy_hidden_states_universal, dtype=object)
spc_hid_object = np.array(hidden_states_specific, dtype=object)
noisy_spc_hid_object = np.array(noisy_hidden_states_specific, dtype=object)
np.savez(
    file_path,
    # Legacy fields, kept so existing loaders do not break. They hold the
    # synthetic-HMM globals (whatever is currently in the kernel) and do NOT
    # describe this corpus. Use the PTB-derived sizes below instead.
    num_states=num_states,
    num_obs=num_observations,
    # Sizes derived from the PTB indexes themselves. Tag ids run from 1 to the
    # tag count, and id 0 is the start marker added to every sequence.
    num_upos_tags=len(upos_to_index),
    num_xpos_tags=len(xpos_to_index),
    vocab_size=len(word_to_index),
    observation=obs_object,
    real_hidden_universal=uni_hid_object,
    noisy_hidden_universal=noisy_uni_hid_object,
    real_hidden_specific=spc_hid_object,
    # (sic) key spelling kept unchanged so existing readers still find it
    noisy_hidden_specifc=noisy_spc_hid_object,
    noisy_level=ptb_noisy_level,
)

In [126]:
# Number of sentences kept after dropping those with five or fewer tokens.
len(observations)

38885