In [2]:
import pickle
from transformers import AutoTokenizer
import numpy as np

model_name = "hmm-400-test-tokenizer-828-maxlength-500-rows-1000.pkl"
fname = f"/n/holyscratch01/sham_lab/summer_2024/models/{model_name}"
max_length = 500
tokenizer_name = "tokenizer-828"

with open(fname, "rb") as f:
    hmm = pickle.load(f)

In [3]:
tokenizer = AutoTokenizer.from_pretrained(f"../olmo_data/tokenizers/{tokenizer_name}")
tokenizer.pad_token = tokenizer.eos_token

In [4]:
hmm.emissionprob_.shape

(400, 88)

In [5]:
sentence1 = "Once upon a time there was a little boy named Ben. Ben loved to explore the world around him."
sentence2 = "One day, a boy named Tim went to the park to play. He"
sentence3 = """Once upon a time, there was a smooth vase. It was very pretty. The vase lived in a small house with a girl named Lily and her mom.
One day, Lily and her mom went outside to play. Before they left, her mom said, "Lily, please close the door." Lily closed the door and they played all day.
When they came back, they saw the smooth vase on the floor. It was broken! Lily"""
sentence4 = """Once upon a time there was a little boy named Ben. Ben loved to explore the world around him. He saw many amazing things, like beautiful vases that were on display in a store. One day, Ben was walking through the"""

examples = [
    sentence1,
    sentence2,
    sentence3,
    sentence4,
    """Once upon a time there was a little boy named Ben. Ben loved to explore the world around him. He saw many amazing things, like beautiful vases that were on display in a store. One day, Ben was walking through""",
    """Once upon a time there was a little boy named Ben. Ben loved to explore the world around him. He saw many amazing things, like beautiful vases that were on display in a store. One day, Ben was walking""",
    """Once upon a time there was a little boy named Ben. Ben loved to explore the world around him. He saw many amazing things, like beautiful vases that were on display in a store. One day, Ben was""",
    """Once upon a time there was a little boy named Ben. Ben loved to explore the world around him. He saw many amazing things, like beautiful vases that were on display in a store. One day, Ben""",
    """Once upon a time there was a""",
    """Once upon a time there was a little"""
]

In [6]:
tokenized_examples = tokenizer(examples, truncation=True, padding='max_length', max_length=max_length)
tokenized_examples

{'input_ids': [[46, 71, 60, 62, 87, 78, 73, 72, 71, 87, 58, 87, 77, 66, 70, 62, 87, 77, 65, 62, 75, 62, 87, 80, 58, 76, 87, 58, 87, 69, 66, 77, 77, 69, 62, 87, 59, 72, 82, 87, 71, 58, 70, 62, 61, 87, 33, 62, 71, 14, 87, 33, 62, 71, 87, 69, 72, 79, 62, 61, 87, 77, 72, 87, 62, 81, 73, 69, 72, 75, 62, 87, 77, 65, 62, 87, 80, 72, 75, 69, 61, 87, 58, 75, 72, 78, 71, 61, 87, 65, 66, 70, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [7]:
# get the length of each tokenized example
tokenized_lengths = [x.index(0) for x in tokenized_examples['input_ids']]
tokenized_lengths

[93, 53, 365, 212, 208, 200, 192, 188, 28, 35]

In [8]:
seq = tokenized_examples['input_ids']
seq = list(np.array(seq).flatten())
seq = [[x] for x in seq]

In [9]:
_, ss = hmm.decode(seq, lengths=[max_length]*len(examples))
ss

array([ 58,  76,  94, ..., 124, 124, 124])

In [10]:
# last hidden states
last_hidden_states = []
for tokenized_example, tokenized_length in zip(tokenized_examples['input_ids'], tokenized_lengths):
    last_hidden_states.append(ss[tokenized_length-1])

print(last_hidden_states)

[65, 193, 124, 124, 124, 124, 124, 124, 253, 184]


In [11]:
arr = []
for i in range(len(examples)):
    arr.append(ss[max_length*i: max_length*(i+1)])
    print(arr[i])

[ 58  76  94 125 230 378  46 196 352 195 119  55 123 208  95 283  45 322
 296 138  74  41   0  56 122 134 114 253 205 189  84 295 390 148 184 299
 136 326 163  57 344 175  24 308 385 147 189  84 177  38  45 364 193 303
  57 126 166 332  47 301 365 277  34  45 251 203 167 102 276 223 283 317
 277 369 184 299 136 326 154 338 179 140 391 371 213 298 235 180 347 220
 372 361  65 124 124 124 124 124 124 124 124 124 124 124 124 124 124 124
 124 124 124 124 124 124 124 124 124 124 124 124 124 124 124 124 124 124
 124 124 124 124 124 124 124 124 124 124 124 124 124 124 124 124 124 124
 124 124 124 124 124 124 124 124 124 124 124 124 124 124 124 124 124 124
 124 124 124 124 124 124 124 124 124 124 124 124 124 124 124 124 124 124
 124 124 124 124 124 124 124 124 124 124 124 124 124 124 124 124 124 124
 124 124 124 124 124 124 124 124 124 124 124 124 124 124 124 124 124 124
 124 124 124 124 124 124 124 124 124 124 124 124 124 124 124 124 124 124
 124 124 124 124 124 124 124 124 124 124 124 124 12

In [12]:
hmm.transmat_.shape

(400, 400)

In [13]:
def predict(hidden_state: int, state_transitions, emission_matrix, num_emissions, samp=True):
    emissions = []
    for _ in range(num_emissions):
        probabilities = state_transitions[hidden_state]
        if samp:
            next_state = np.random.choice(len(probabilities), p=probabilities)
        else:
            next_state = np.argmax(probabilities)
        e_probs = emission_matrix[next_state]
        if samp:
            emission = np.random.choice(len(e_probs), p=e_probs)
        else:
            emission = np.argmax(e_probs)
        emissions.append(emission)
        hidden_state = next_state
    return emissions

In [14]:
for i ,state in enumerate(last_hidden_states):
    new_ems = predict(state, hmm.transmat_, hmm.emissionprob_, 10)
    print("Input\n")
    print(tokenizer.decode(tokenized_examples['input_ids'][i][:tokenized_lengths[i]]))
    print("Output\n")
    print(tokenizer.decode(new_ems))
    print("---------------")

Input

Once upon a time there was a little boy named Ben. Ben loved to explore the world around him.
Output

<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|>
---------------
Input

One day, a boy named Tim went to the park to play. He
Output

no, ars a
---------------
Input

Once upon a time, there was a smooth vase. It was very pretty. The vase lived in a small house with a girl named Lily and her mom.One day, Lily and her mom went outside to play. Before they left, her mom said, "Lily, please close the door." Lily closed the door and they played all day.When they came back, they saw the smooth vase on the floor. It was broken! Lily
Output

<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|>
---------------
Input

Once upon a time there was a little boy named Ben. Ben loved to explore the world around him. He saw many amazing 

In [15]:
for i ,state in enumerate(last_hidden_states):
    new_ems = predict(state, hmm.transmat_, hmm.emissionprob_, 40, samp=True)
    print("Input\n")
    print(tokenizer.decode(tokenized_examples['input_ids'][i][:tokenized_lengths[i]]))
    print("Output\n")
    print(tokenizer.decode(new_ems))
    print("---------------")

Input

Once upon a time there was a little boy named Ben. Ben loved to explore the world around him.
Output

<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|>
---------------
Input

One day, a boy named Tim went to the park to play. He
Output

ougle it said emt ths rwndy"'s friead n
---------------
Input

Once upon a time, there was a smooth vase. It was very pretty. The vase lived in a small house with a girl named Lily and her mom.One day, Lily and her mom went outside to play. Before they left, her mom said, "Lily, pleas