In [16]:
from typing import List, Optional, Tuple
import numpy as np
from mealymarkov import MarkovMealyModel
import os
from dotenv import load_dotenv
load_dotenv()

# Output locations for the generated datasets, read from the environment
# populated by load_dotenv() above. os.getenv returns None when a variable is
# not set, so these must be checked before they are used as file paths.
ozz_FILE_PATH = os.getenv('100_SAVE_PATH')
zir_FILE_PATH = os.getenv('ZIR_SAVE_PATH')

# Example small model (n=3 states, V=2 tokens) that satisfies the constraints.
n = 3
V = 2

# We construct T^0 and T^1 so that T^0 + T^1 is row-stochastic (rows sum to 1).
T0 = np.array([
    [0, 1, 0],
    [0, 0, 1],
    [0, 0, 0.5]
])

T1 = np.array([
    [0, 0, 0],
    [0, 0, 0],
    [0.5, 0, 0]
])

# Enforce the invariants stated above so that a typo in either matrix fails
# here, next to the definition, rather than somewhere inside the model.
assert T0.shape == T1.shape == (n, n), "each T^x must be an n x n matrix"
assert np.allclose((T0 + T1).sum(axis=1), 1.0), "rows of T^0 + T^1 must sum to 1"

model = MarkovMealyModel(n=n, V=V, T_list=[T0, T1])

# By specification the default eta^0 is uniform
print("Initial eta^0 =", model.eta0)

# Fixed seed so the printed trajectory is the same on every run.
tokens, states = model.sample_sequence(max_new_tokens=30, seed=42)

print("Generated tokens:", tokens)
print("States (eta^t) traversed:")
for i, s in enumerate(states):
    # Round only for display; the stored states are left untouched.
    print(f"t={i} ->", np.round(s, 4))

Initial eta^0 = [0.33333333 0.33333333 0.33333333]
Generated tokens: [0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1]
States (eta^t) traversed:
t=0 -> [0.3333 0.3333 0.3333]
t=1 -> [0.  0.4 0.6]
t=2 -> [0. 0. 1.]
t=3 -> [1. 0. 0.]
t=4 -> [0. 1. 0.]
t=5 -> [0. 0. 1.]
t=6 -> [1. 0. 0.]
t=7 -> [0. 1. 0.]
t=8 -> [0. 0. 1.]
t=9 -> [0. 0. 1.]
t=10 -> [0. 0. 1.]
t=11 -> [0. 0. 1.]
t=12 -> [1. 0. 0.]
t=13 -> [0. 1. 0.]
t=14 -> [0. 0. 1.]
t=15 -> [0. 0. 1.]
t=16 -> [0. 0. 1.]
t=17 -> [1. 0. 0.]
t=18 -> [0. 1. 0.]
t=19 -> [0. 0. 1.]
t=20 -> [1. 0. 0.]
t=21 -> [0. 1. 0.]
t=22 -> [0. 0. 1.]
t=23 -> [1. 0. 0.]
t=24 -> [0. 1. 0.]
t=25 -> [0. 0. 1.]
t=26 -> [0. 0. 1.]
t=27 -> [0. 0. 1.]
t=28 -> [0. 0. 1.]
t=29 -> [0. 0. 1.]
t=30 -> [1. 0. 0.]


In [18]:
import numpy as np
import json
# Generate the training set for the process that produces "100*" (as discussed
# in the previous meeting) and write it as JSON to the path in ozz_FILE_PATH.
n = 3
V = 2
num_training_samples = 100
# Base value for the per-sample seeds; kept non-zero so that no sample is drawn
# with seed 0, in case the sampler treats a falsy seed as "unseeded".
BASE_SEED = 42

# We construct T^0 and T^1 so that T^0 + T^1 is row-stochastic (rows sum to 1).
T0 = np.array([
    [0, 1, 0],
    [0, 0, 1],
    [0, 0, 0.5]
])

T1 = np.array([
    [0, 0, 0],
    [0, 0, 0],
    [0.5, 0, 0]
])

# Fail early with a readable message: os.getenv returns None for an unset
# variable, and open(None) would otherwise raise an opaque TypeError only
# after all the sampling work has been done.
if not ozz_FILE_PATH:
    raise RuntimeError("100_SAVE_PATH is not set; cannot save the 100* sequences")

model = MarkovMealyModel(n=n, V=V, T_list=[T0, T1])

sequences = {}
for i in range(num_training_samples):
    # A distinct fixed seed per sample makes the saved dataset reproducible
    # across kernel restarts while still giving each sequence its own draw.
    tokens, _states = model.sample_sequence(max_new_tokens=50, seed=BASE_SEED + i)
    sequences[i] = tokens

# json.dump turns the integer keys into strings ("0", "1", ...) in the file.
with open(ozz_FILE_PATH, 'w') as fp:
    json.dump(sequences, fp, indent=4)

In [19]:

# Generate the training set for the process that produces ZIR (as discussed in
# the previous meeting) and write it as JSON to the path in zir_FILE_PATH.
n = 3
V = 2
num_training_samples = 100
# Base value for the per-sample seeds; kept non-zero so that no sample is drawn
# with seed 0, in case the sampler treats a falsy seed as "unseeded".
BASE_SEED = 42

# We construct T^0 and T^1 so that T^0 + T^1 is row-stochastic (rows sum to 1).
T0 = np.array([
    [0, 1, 0],
    [0, 0, 0],
    [0.5, 0, 0]
])

T1 = np.array([
    [0, 0, 0],
    [0, 0, 1],
    [0.5, 0, 0]
])

# Fail early with a readable message: os.getenv returns None for an unset
# variable, and open(None) would otherwise raise an opaque TypeError only
# after all the sampling work has been done.
if not zir_FILE_PATH:
    raise RuntimeError("ZIR_SAVE_PATH is not set; cannot save the ZIR sequences")

model = MarkovMealyModel(n=n, V=V, T_list=[T0, T1])

sequences = {}
for i in range(num_training_samples):
    # A distinct fixed seed per sample makes the saved dataset reproducible
    # across kernel restarts while still giving each sequence its own draw.
    tokens, _states = model.sample_sequence(max_new_tokens=50, seed=BASE_SEED + i)
    sequences[i] = tokens

# json.dump turns the integer keys into strings ("0", "1", ...) in the file.
with open(zir_FILE_PATH, 'w') as fp:
    json.dump(sequences, fp, indent=4)

In [27]:
import numpy as np
import torch
from toy_model import train_model, finetune_model, MarkovData

# Transition matrices of the ZIR process (same values as in the ZIR
# data-generation cell), one matrix per token.
T0 = np.array([[0, 1, 0], [0, 0, 0], [0.5, 0, 0]])
T1 = np.array([[0, 0, 0], [0, 0, 1], [0.5, 0, 0]])

# Dataset settings, gathered in one place so they are easy to tune.
data_config = dict(
    n_gen=1000,
    gen_len=50,
    n_states=3,
    d_vocab=2,
    T_list=[T0, T1],
)
dataset = MarkovData(**data_config)

# Architecture and optimisation settings handed to train_model.
train_config = dict(
    n_layers=6,
    d_model=8,
    d_head=8,
    d_mlp=32,
    attn_only=True,
    n_epochs=10,
    lr=5e-3,
    batch_size=100,
    save_every=1000,
    print_every=1000,
    save_dir=None,  # None so that the model is not saved
)
model = train_model(dataset=dataset, **train_config)

Moving model to device:  cpu


  0%|          | 0/10 [00:00<?, ?it/s]

0it [00:00, ?it/s]

Epoch 1 Samples 100 Step 0 Loss 0.7909963130950928


0it [00:00, ?it/s]

Epoch 2 Samples 100 Step 0 Loss 0.6801583170890808


0it [00:00, ?it/s]

Epoch 3 Samples 100 Step 0 Loss 0.6521185636520386


0it [00:00, ?it/s]

Epoch 4 Samples 100 Step 0 Loss 0.6397430300712585


0it [00:00, ?it/s]

Epoch 5 Samples 100 Step 0 Loss 0.6304200291633606


0it [00:00, ?it/s]

Epoch 6 Samples 100 Step 0 Loss 0.6220318675041199


0it [00:00, ?it/s]

Epoch 7 Samples 100 Step 0 Loss 0.6140809059143066


0it [00:00, ?it/s]

Epoch 8 Samples 100 Step 0 Loss 0.6064600348472595


0it [00:00, ?it/s]

Epoch 9 Samples 100 Step 0 Loss 0.5993781685829163


0it [00:00, ?it/s]

Epoch 10 Samples 100 Step 0 Loss 0.5930272936820984


In [28]:
# Draw a fresh sequence from the dataset's generating model and print, token by
# token, what actually came next versus what the trained model predicted.
sample, states = dataset.model.sample_sequence(max_new_tokens=40)

# Inference only: no_grad keeps autograd from recording the forward pass,
# matching how the later evaluation cell calls the model.
with torch.no_grad():
    preds = model(torch.tensor(sample, dtype=torch.int64)).argmax(dim=-1).flatten().tolist()

# The loop pairs preds[k] with sample[k + 1]: the first token has no prediction
# to compare against and the last prediction has no observed successor.
for s, pred in zip(sample[1:], preds[:-1]):
    print(f'Actual: {s}, Predicted: {pred}')

Actual: 0, Predicted: 0
Actual: 0, Predicted: 1
Actual: 1, Predicted: 1
Actual: 1, Predicted: 1
Actual: 0, Predicted: 1
Actual: 1, Predicted: 1
Actual: 0, Predicted: 1
Actual: 0, Predicted: 1
Actual: 1, Predicted: 1
Actual: 1, Predicted: 1
Actual: 0, Predicted: 1
Actual: 1, Predicted: 1
Actual: 0, Predicted: 1
Actual: 0, Predicted: 1
Actual: 1, Predicted: 1
Actual: 1, Predicted: 0
Actual: 0, Predicted: 1
Actual: 1, Predicted: 1
Actual: 1, Predicted: 1
Actual: 0, Predicted: 0
Actual: 1, Predicted: 1
Actual: 0, Predicted: 0
Actual: 0, Predicted: 1
Actual: 1, Predicted: 1
Actual: 1, Predicted: 1
Actual: 0, Predicted: 0
Actual: 1, Predicted: 1
Actual: 1, Predicted: 0
Actual: 0, Predicted: 0
Actual: 1, Predicted: 1
Actual: 0, Predicted: 0
Actual: 0, Predicted: 1
Actual: 1, Predicted: 1
Actual: 0, Predicted: 0
Actual: 0, Predicted: 1
Actual: 1, Predicted: 1
Actual: 0, Predicted: 1
Actual: 0, Predicted: 1
Actual: 1, Predicted: 1


In [29]:
# Fine-tune the model on the same dataset for 5 epochs without saving it.
# Add additional arguments as needed.
# NOTE: this rebinds `model`, so re-running the cell fine-tunes the already
# fine-tuned model again.
model = finetune_model(
    model,
    dataset,
    n_epochs=5,
    save_dir=None,
)

Moving model to device:  cpu
Moving model to device:  cpu


  0%|          | 0/5 [00:00<?, ?it/s]

0it [00:00, ?it/s]

Epoch 1 Samples 64 Step 0 Loss 0.5873494148254395


0it [00:00, ?it/s]

Epoch 2 Samples 64 Step 0 Loss 0.5708927512168884


0it [00:00, ?it/s]

Epoch 3 Samples 64 Step 0 Loss 0.5536504983901978


0it [00:00, ?it/s]

Epoch 4 Samples 64 Step 0 Loss 0.53603595495224


0it [00:00, ?it/s]

Epoch 5 Samples 64 Step 0 Loss 0.5192384719848633


In [32]:
# Three hand-written length-10 prompts; we look at the logits the model emits
# at the final position of each one.
prompts = torch.tensor(
    [
        [0, 1, 1, 0, 1, 0, 0, 1, 1, 0],
        [1, 0, 1, 1, 0, 1, 0, 0, 1, 1],
        [1, 0, 0, 1, 0, 0, 1, 0, 0, 1],
    ],
    dtype=torch.int64,
)

with torch.no_grad():
    model.eval()  # note: the model stays in eval mode after this cell
    logits = model(prompts)

# Keep only the last position of every prompt.
last_logits = logits[:, -1, :]
print(last_logits)
print(last_logits.argmax(dim=-1))
# Ground truth values: [1, 0, R]
# (R presumably marks the random position of ZIR, where either token is valid.)

tensor([[-0.4864, -1.4095],
        [ 0.7281, -0.7400],
        [-0.4133,  0.4519]])
tensor([0, 0, 1])
