In [27]:
from transformer_lens import HookedTransformer, HookedTransformerConfig
# from transformer_lens.train import HookedTransformerTrainConfig, train
from datasets import Dataset, DatasetDict
import numpy as np
import torch as t
import os
from sae_lens import LanguageModelSAERunnerConfig, SAETrainingRunner

device = t.device("cuda" if t.cuda.is_available() else "cpu")

# Create huggingface data with othello games

In [2]:
os.environ["HF_TOKEN"] = "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"

In [3]:
def add_last_missing_move(data):
    rows = data["tokens"]
    for i, row in enumerate(rows):
        # Add number between 1 and 60 that is not present in the tokens as last move
        all_numbers = set(range(1, 61))
        # Take the number that is not in the intersection of all_numbers and tokens
        last_number = list(all_numbers - set(row))[0]
        # Update the row in place
        rows[i].append(last_number)
    return data


In [4]:
tokenized_data = t.tensor(np.load("data/board_seqs_int_small.npy"), dtype=t.long)
tokenized_data = tokenized_data[:, :59] # remove XX at the end

data_dict = {"tokens": tokenized_data.tolist()}

# shrink to only first 100
data_dict["tokens"] = data_dict["tokens"][:100]
add_last_missing_move(data_dict)

# Create a Hugging Face dataset
dataset = Dataset.from_dict(data_dict)
dataset.set_format(type="torch", columns=["tokens"])


train_test_split = dataset.train_test_split(test_size=0.2)
train_dataset = train_test_split['train']
test_dataset = train_test_split['test']

dataset_dict = DatasetDict({
    'train': train_dataset,
    'test': test_dataset
})

# upload to Hugging Face
access_token = os.environ["HF_TOKEN"]
dataset_dict.push_to_hub("Thijmen/othello_dataset", "main", token=access_token)

# print a row of data
print(dataset[0])

Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 27.66ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  1.10it/s]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 736.23ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  1.90it/s]


{'tokens': tensor([20, 21, 28, 23, 13,  5, 34, 19, 16, 43, 14, 30, 22, 40, 47, 48, 33, 54,
        52, 42, 41, 49, 50, 39, 29, 58, 38, 36, 18, 12, 57, 56, 55, 24, 44, 10,
        59, 26, 11, 37, 25, 27, 31,  2, 17, 32,  9, 35, 45, 60,  3, 46,  6,  4,
        53,  1, 15,  7, 51,  8])}


# Model and training setup

In [28]:
model_cfg = HookedTransformerConfig(
    n_layers = 6,
    d_model = 128,
    d_head = 64,
    n_heads = 8,
    d_mlp = 128*4,
    d_vocab = 61,
    n_ctx = 59,
    act_fn="gelu",
    normalization_type="LN",
    device=device,
)

model = HookedTransformer(model_cfg).to(device)


Moving model to device:  cuda


# Train model and save locally

In [6]:
# train(model, train_cfg, dataset)
# t.save(model.state_dict(), f"othello_gpt.pth")

# Save model and verify that model can be loaded using TransformerLens

In [32]:
model = HookedTransformer(model_cfg).to(device)
model.load_and_process_state_dict(t.load(f"othello_gpt.pth"))
print(model.state_dict().keys())
print(model.tokenizer) # model has no tokenizer

Moving model to device:  cuda
odict_keys(['embed.W_E', 'pos_embed.W_pos', 'blocks.0.ln1.w', 'blocks.0.ln1.b', 'blocks.0.ln2.w', 'blocks.0.ln2.b', 'blocks.0.attn.W_Q', 'blocks.0.attn.W_O', 'blocks.0.attn.b_Q', 'blocks.0.attn.b_O', 'blocks.0.attn.W_K', 'blocks.0.attn.W_V', 'blocks.0.attn.b_K', 'blocks.0.attn.b_V', 'blocks.0.attn.mask', 'blocks.0.attn.IGNORE', 'blocks.0.mlp.W_in', 'blocks.0.mlp.b_in', 'blocks.0.mlp.W_out', 'blocks.0.mlp.b_out', 'blocks.1.ln1.w', 'blocks.1.ln1.b', 'blocks.1.ln2.w', 'blocks.1.ln2.b', 'blocks.1.attn.W_Q', 'blocks.1.attn.W_O', 'blocks.1.attn.b_Q', 'blocks.1.attn.b_O', 'blocks.1.attn.W_K', 'blocks.1.attn.W_V', 'blocks.1.attn.b_K', 'blocks.1.attn.b_V', 'blocks.1.attn.mask', 'blocks.1.attn.IGNORE', 'blocks.1.mlp.W_in', 'blocks.1.mlp.b_in', 'blocks.1.mlp.W_out', 'blocks.1.mlp.b_out', 'blocks.2.ln1.w', 'blocks.2.ln1.b', 'blocks.2.ln2.w', 'blocks.2.ln2.b', 'blocks.2.attn.W_Q', 'blocks.2.attn.W_O', 'blocks.2.attn.b_Q', 'blocks.2.attn.b_O', 'blocks.2.attn.W_K', 'bloc

# Upload model to huggingface

I upload the othello_pth.pth to huggingface manually and created a config.json for it

See: https://huggingface.co/Thijmen/othello-GPT-model


```
{
  "n_layers": 6,
  "d_model": 128,
  "d_head": 64,
  "n_heads": 8,
  "d_mlp": 512,
  "d_vocab": 61,
  "n_ctx": 59,
  "act_fn": "gelu",
  "normalization_type": "LN",
  "attn_only": false,
  "architecture": "mingpt",
  "model_type": "gpt2"
}
```

# Added my model to offical list in the TransformerLens library

In the file transformerlens/loading_from_pretrained.py

# Training SAE

Getting error as model does not seem to be in the right format. I should probably train it with huggingface library? Not sure what I need

In [8]:
from sae_lens import LanguageModelSAERunnerConfig, SAETrainingRunner


# train_cfg = HookedTransformerTrainConfig(
#     lr=1e-4,
#     batch_size=512,
#     num_epochs=1,
#     device=device,
#     wandb=True,
#     wandb_project_name="OthelloTraining",
# )

In [42]:
total_training_steps = 100  # probably we should do more
batch_size = 512
total_training_tokens = total_training_steps * batch_size

lr_warm_up_steps = 0
lr_decay_steps = total_training_steps // 5  # 20% of training
l1_warm_up_steps = total_training_steps // 20  # 5% of training

thijmen_data = "Thijmen/othello_dataset"
cfg = LanguageModelSAERunnerConfig(
    # Data Generating Function (Model + Training Distibuion)
    model_name="my-own-othello-model",  # added this to offical list in TransformerLens library
    hook_name="blocks.0.hook_mlp_out",  # A valid hook point (see more details here: https://neelnanda-io.github.io/TransformerLens/generated/demos/Main_Demo.html#Hook-Points)
    hook_layer=0,  # Only one layer in the model.
    d_in=128,  # the width of the mlp output.
    dataset_path="taufeeque/othellogpt",  # my own dataset which i created in this file and uploaded to HF.
    is_dataset_tokenized=True, # dataset is tokenized, although i saw this flag is not in use anymore
    streaming=False,  # we could pre-download the token dataset if it was small.
    prepend_bos=False,
    # SAE Parameters
    mse_loss_normalization=None,  # We won't normalize the mse loss,
    expansion_factor=1,  # the width of the SAE. Larger will result in better stats but slower training.
    b_dec_init_method="zeros",  # The geometric median can be used to initialize the decoder weights.
    apply_b_dec_to_input=False,  # We won't apply the decoder weights to the input.
    normalize_sae_decoder=False,
    scale_sparsity_penalty_by_decoder_norm=True,
    decoder_heuristic_init=True,
    init_encoder_as_decoder_transpose=True,
    # normalize_activations=True,
    # Training Parameters
    lr=5e-5,  # lower the better, we'll go fairly high to speed up the tutorial.
    adam_beta1=0.9,  # adam params (default, but once upon a time we experimented with these.)
    adam_beta2=0.999,
    lr_scheduler_name="constant",  # constant learning rate with warmup. Could be better schedules out there.
    lr_warm_up_steps=lr_warm_up_steps,  # this can help avoid too many dead features initially.
    lr_decay_steps=lr_decay_steps,  # this will help us avoid overfitting.
    l1_coefficient=5,  # will control how sparse the feature activations are
    l1_warm_up_steps=l1_warm_up_steps,  # this can help avoid too many dead features initially.
    lp_norm=1.0,  # the L1 penalty (and not a Lp for p < 1)
    train_batch_size_tokens=batch_size,
    context_size=3,  # will control the lenght of the prompts we feed to the model. Larger is better but slower. so for the tutorial we'll use a short one.
    # Activation Store Parameters
    n_batches_in_buffer=64,  # controls how many activations we store / shuffle.
    training_tokens=total_training_tokens,  # 100 million tokens is quite a few, but we want to see good stats. Get a coffee, come back.
    store_batch_size_prompts=16,
    # Resampling protocol
    use_ghost_grads=False,  # we don't use ghost grads anymore.
    feature_sampling_window=1000,  # this controls our reporting of feature sparsity stats
    dead_feature_window=1000,  # would effect resampling or ghost grads if we were using it.
    dead_feature_threshold=1e-4,  # would effect resampling or ghost grads if we were using it.
    # WANDB
    log_to_wandb=True,  # always use wandb unless you are just testing code.
    wandb_project="sae-othello",  # the project name in wandb.
    wandb_log_frequency=30,
    eval_every_n_wandb_logs=20,
    # Misc
    # device=device,
    seed=42,
    n_checkpoints=0,
    checkpoint_path="checkpoints",
    dtype="float32"
)
# look at the next cell to see some instruction for what to do while this is running.
sparse_autoencoder = SAETrainingRunner(cfg).run()

Run name: 128-L1-5-LR-5e-05-Tokens-5.120e+04
n_tokens_per_buffer (millions): 0.003072
Lower bound: n_contexts_per_buffer (millions): 0.001024
Total training steps: 100
Total wandb updates: 3
n_tokens_per_feature_sampling_window (millions): 1.536
n_tokens_per_dead_feature_window (millions): 1.536
We will reset the sparsity calculation 0 times.
Number tokens in sparsity calculation window: 5.12e+05
This happend
{'n_layers': 6, 'd_model': 128, 'd_mlp': 512, 'd_head': 64, 'n_heads': 8, 'n_ctx': 59, 'd_vocab': 61, 'act_fn': 'gelu', 'attn_only': False, 'normalization_type': 'LNPre'}
Loaded pretrained model my-own-othello-model into HookedTransformer


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mthijmen-nijdam[0m ([33mfact_ai[0m). Use [1m`wandb login --relogin`[0m to force relogin


100| MSE Loss 115.223 | L1 0.101: 100%|██████████| 51200/51200 [00:12<00:00, 3984.65it/s]


0,1
details/current_l1_coefficient,▁▁▁
details/current_learning_rate,██▁
details/n_training_tokens,▁▅█
losses/ghost_grad_loss,▁▁▁
losses/l1_loss,█▁█
losses/mse_loss,▁▂█
losses/overall_loss,▁▂█
metrics/explained_variance,█▁▄
metrics/explained_variance_std,▁█▅
metrics/l0,█▁▁

0,1
details/current_l1_coefficient,5.0
details/current_learning_rate,3e-05
details/n_training_tokens,46080.0
losses/ghost_grad_loss,0.0
losses/l1_loss,0.01867
losses/mse_loss,115.9123
losses/overall_loss,116.00567
metrics/explained_variance,-1.28853
metrics/explained_variance_std,0.50842
metrics/l0,1.88086


In [44]:
sparse_autoencoder.save_model("othello_sae")

In [34]:
dict = cfg.to_dict()
print(dict)
sae = sparse_autoencoder
# write to json
import json
with open("cfgs/sae_config.json", "w") as f:
    json.dump(dict, f)

{'model_name': 'my-own-othello-model', 'model_class_name': 'HookedTransformer', 'hook_name': 'blocks.0.hook_mlp_out', 'hook_eval': 'NOT_IN_USE', 'hook_layer': 0, 'hook_head_index': None, 'dataset_path': 'taufeeque/othellogpt', 'streaming': False, 'is_dataset_tokenized': True, 'context_size': 3, 'use_cached_activations': False, 'cached_activations_path': None, 'd_in': 128, 'd_sae': 128, 'b_dec_init_method': 'zeros', 'expansion_factor': 1, 'activation_fn': 'relu', 'normalize_sae_decoder': False, 'noise_scale': 0.0, 'from_pretrained_path': None, 'apply_b_dec_to_input': False, 'decoder_orthogonal_init': False, 'decoder_heuristic_init': True, 'init_encoder_as_decoder_transpose': True, 'n_batches_in_buffer': 64, 'training_tokens': 4096000, 'finetuning_tokens': 0, 'store_batch_size_prompts': 16, 'train_batch_size_tokens': 4096, 'normalize_activations': 'none', 'device': 'cpu', 'act_store_device': 'cpu', 'seed': 42, 'dtype': 'float32', 'prepend_bos': False, 'autocast': False, 'autocast_lm': Fa

In [37]:
from transformer_lens.utils import tokenize_and_concatenate
from datasets import load_dataset
token_dataset = load_dataset(
    path = "taufeeque/othellogpt",
    split="train",
    streaming=False,
)



# print(dataset[0])

In [26]:
cfg.to_json("cfgs/")

In [41]:
sae.eval()  # prevents error if we're expecting a dead neuron mask for who grads
import torch

with torch.no_grad():
    # activation store can give us tokens.
    batch_tokens = token_dataset[:32]["tokens"]
    print(batch_tokens)
    _, cache = model.run_with_cache(batch_tokens, prepend_bos=False)

    # Use the SAE
    feature_acts = sae.encode(cache[sae.cfg.hook_name])
    sae_out = sae.decode(feature_acts)

    # save some room
    del cache

    # ignore the bos token, get the number of features that activated in each token, averaged accross batch and position
    l0 = (feature_acts[:, 1:] > 0).float().sum(-1).detach()
    print("average l0", l0.mean().item())
    px.histogram(l0.flatten().cpu().numpy()).show()

[[20, 21, 34, 19, 13, 40, 47, 28, 12, 41, 35, 5, 10, 43, 3, 23, 6, 26, 42, 54, 52, 51, 59, 36, 39, 44, 11, 49, 4, 27, 32, 38, 33, 1, 25, 60, 48, 17, 50, 57, 29, 58, 55, 14, 22, 46, 30, 24, 9, 31, 16, 8, 56, 7, 45, 18, 15, 2, 37, 53], [34, 42, 27, 19, 50, 51, 52, 35, 41, 48, 49, 56, 36, 58, 11, 60, 20, 40, 57, 30, 24, 28, 59, 2, 21, 44, 22, 12, 13, 26, 4, 14, 15, 23, 5, 43, 33, 32, 10, 8, 16, 47, 55, 54, 46, 45, 37, 7, 53, 31, 39, 9, 29, 38, 25, 18, 3, 6, 17, 1], [20, 21, 22, 13, 14, 19, 5, 15, 29, 11, 16, 8, 28, 6, 27, 33, 4, 3, 18, 7, 2, 12, 41, 10, 32, 23, 9, 37, 24, 40, 39, 50, 25, 17, 59, 47, 1, 42, 51, 36, 54, 55, 35, 30, 34, 43, 52, 26, 48, 60, 56, 46, 45, 57, 38, 58, 31, 49, 44, 53], [34, 28, 21, 12, 23, 42, 13, 22, 33, 39, 41, 14, 43, 50, 29, 27, 40, 49, 4, 24, 16, 20, 57, 48, 19, 30, 18, 51, 32, 52, 5, 26, 47, 9, 60, 37, 31, 35, 17, 38, 7, 6, 58, 3, 45, 56, 2, 59, 44, 8, 36, 15, 11, 25, 46, 10, 55, 53, 1, 54], [20, 19, 41, 21, 11, 28, 10, 40, 39, 49, 34, 38, 14, 13, 29, 27, 48

AssertionError: Must provide a tokenizer if passing a string to the model