In [69]:
import pickle
from transformers import AutoTokenizer
import numpy as np

model_name = "hmm-100-test-tokenizer-500-maxlength-500.pkl"
fname = f"/n/holyscratch01/sham_lab/summer_2024/models/{model_name}"
max_length = 500
tokenizer_name = "tokenizer-500"

with open(fname, "rb") as f:
    hmm = pickle.load(f)

In [70]:
tokenizer = AutoTokenizer.from_pretrained(f"../olmo_data/tokenizers/{tokenizer_name}")
tokenizer.pad_token = tokenizer.eos_token

In [71]:
hmm.emissionprob_.shape

(100, 69)

In [72]:
sentence1 = "Once upon a time there was a little boy named Ben. Ben loved to explore the world around him."
sentence2 = "One day, a boy named Tim went to the park to play. He"
sentence3 = """Once upon a time, there was a smooth vase. It was very pretty. The vase lived in a small house with a girl named Lily and her mom.
One day, Lily and her mom went outside to play. Before they left, her mom said, "Lily, please close the door." Lily closed the door and they played all day.
When they came back, they saw the smooth vase on the floor. It was broken! Lily"""
sentence4 = """Once upon a time there was a little boy named Ben. Ben loved to explore the world around him. He saw many amazing things, like beautiful vases that were on display in a store. One day, Ben was walking through the"""

examples = [
    sentence1,
    sentence2,
    sentence3,
    sentence4,
    """Once upon a time there was a little boy named Ben. Ben loved to explore the world around him. He saw many amazing things, like beautiful vases that were on display in a store. One day, Ben was walking through""",
    """Once upon a time there was a little boy named Ben. Ben loved to explore the world around him. He saw many amazing things, like beautiful vases that were on display in a store. One day, Ben was walking""",
    """Once upon a time there was a little boy named Ben. Ben loved to explore the world around him. He saw many amazing things, like beautiful vases that were on display in a store. One day, Ben was""",
    """Once upon a time there was a little boy named Ben. Ben loved to explore the world around him. He saw many amazing things, like beautiful vases that were on display in a store. One day, Ben""",
    """Once upon a time there was a""",
    """Once upon a time there was a little"""
]

In [73]:
tokenized_examples = tokenizer(examples, truncation=True, padding='max_length', max_length=max_length)
tokenized_examples

{'input_ids': [[31, 56, 45, 47, 3, 63, 58, 57, 56, 3, 43, 3, 62, 51, 55, 47, 3, 62, 50, 47, 60, 47, 3, 65, 43, 61, 3, 43, 3, 54, 51, 62, 62, 54, 47, 3, 44, 57, 67, 3, 56, 43, 55, 47, 46, 3, 18, 47, 56, 9, 3, 18, 47, 56, 3, 54, 57, 64, 47, 46, 3, 62, 57, 3, 47, 66, 58, 54, 57, 60, 47, 3, 62, 50, 47, 3, 65, 57, 60, 54, 46, 3, 43, 60, 57, 63, 56, 46, 3, 50, 51, 55, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [74]:
# get the length of each tokenized example
tokenized_lengths = [x.index(0) for x in tokenized_examples['input_ids']]
tokenized_lengths

[93, 53, 365, 212, 208, 200, 192, 188, 28, 35]

In [75]:
seq = tokenized_examples['input_ids']
seq = list(np.array(seq).flatten())
seq = [[x] for x in seq]

In [76]:
_, ss = hmm.decode(seq, lengths=[max_length]*len(examples))
ss

array([86, 18, 89, ..., 99, 99,  0])

In [77]:
# last hidden states
last_hidden_states = []
for tokenized_example, tokenized_length in zip(tokenized_examples['input_ids'], tokenized_lengths):
    last_hidden_states.append(ss[tokenized_length-1])

print(last_hidden_states)

[99, 99, 99, 99, 99, 99, 99, 99, 99, 99]


In [78]:
arr = []
for i in range(len(examples)):
    arr.append(ss[max_length*i: max_length*(i+1)])
    print(arr[i])

[86 18 89 12 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99
 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99
 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99
 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99
 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99
 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99
 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99
 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99
 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99
 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99
 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99
 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99
 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99
 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99

In [79]:
hmm.transmat_.shape

(100, 100)

In [80]:
def predict(hidden_state: int, state_transitions, emission_matrix, num_emissions, samp=True):
    emissions = []
    for _ in range(num_emissions):
        probabilities = state_transitions[hidden_state]
        if samp:
            next_state = np.random.choice(len(probabilities), p=probabilities)
        else:
            next_state = np.argmax(probabilities)
        e_probs = emission_matrix[next_state]
        if samp:
            emission = np.random.choice(len(e_probs), p=e_probs)
        else:
            emission = np.argmax(e_probs)
        emissions.append(emission)
        hidden_state = next_state
    return emissions

In [81]:
for i ,state in enumerate(last_hidden_states):
    new_ems = predict(state, hmm.transmat_, hmm.emissionprob_, 10)
    print("Input\n")
    print(tokenizer.decode(tokenized_examples['input_ids'][i][:tokenized_lengths[i]]))
    print("Output\n")
    print(tokenizer.decode(new_ems))
    print("---------------")

Input

Once upon a time there was a little boy named Ben. Ben loved to explore the world around him.
Output

erttledekn
---------------
Input

One day, a boy named Tim went to the park to play. He
Output

owasbiadaa
---------------
Input

Once upon a time, there was a smooth vase. It was very pretty. The vase lived in a small house with a girl named Lily and her mom.One day, Lily and her mom went outside to play. Before they left, her mom said, "Lily, please close the door." Lily closed the door and they played all day.When they came back, they saw the smooth vase on the floor. It was broken! Lily
Output

oweshrg.Th
---------------
Input

Once upon a time there was a little boy named Ben. Ben loved to explore the world around him. He saw many amazing things, like beautiful vases that were on display in a store. One day, Ben was walking through the
Output

odyawi"She
---------------
Input

Once upon a time there was a little boy named Ben. Ben loved to explore the world around him. He s

In [82]:
for i ,state in enumerate(last_hidden_states):
    new_ems = predict(state, hmm.transmat_, hmm.emissionprob_, 10, samp=False)
    print("Input\n")
    print(tokenizer.decode(tokenized_examples['input_ids'][i][:tokenized_lengths[i]]))
    print("Output\n")
    print(tokenizer.decode(new_ems))
    print("---------------")

Input

Once upon a time there was a little boy named Ben. Ben loved to explore the world around him.
Output

ouen,thesh
---------------
Input

One day, a boy named Tim went to the park to play. He
Output

ouen,thesh
---------------
Input

Once upon a time, there was a smooth vase. It was very pretty. The vase lived in a small house with a girl named Lily and her mom.One day, Lily and her mom went outside to play. Before they left, her mom said, "Lily, please close the door." Lily closed the door and they played all day.When they came back, they saw the smooth vase on the floor. It was broken! Lily
Output

ouen,thesh
---------------
Input

Once upon a time there was a little boy named Ben. Ben loved to explore the world around him. He saw many amazing things, like beautiful vases that were on display in a store. One day, Ben was walking through the
Output

ouen,thesh
---------------
Input

Once upon a time there was a little boy named Ben. Ben loved to explore the world around him. He s