In [6]:
import importlib
import transformer_lens
importlib.reload(transformer_lens)
from transformer_lens import HookedTransformer, HookedTransformerConfig
from transformer_lens.train import HookedTransformerTrainConfig, train
from datasets import Dataset
import numpy as np
import torch as t
import os
from sae_lens import LanguageModelSAERunnerConfig, SAETrainingRunner

device = t.device("cuda" if t.cuda.is_available() else "cpu")

# Create huggingface data with othello games

In [7]:
os.environ["HF_TOKEN"] = "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"

In [8]:
tokenized_data = t.tensor(np.load("data/board_seqs_int_small.npy"), dtype=t.long)
tokenized_data = tokenized_data[:, :59] # remove XX at the end

data_dict = {"tokens": tokenized_data.tolist()}

# Create a Hugging Face dataset
dataset = Dataset.from_dict(data_dict)
dataset.set_format(type="torch", columns=["tokens"])

# upload to Hugging Face
access_token = os.environ["HF_TOKEN"]
dataset.push_to_hub("Thijmen/othello_dataset", "main", token=access_token)

# print a row of data
print(dataset[0])

Creating parquet from Arrow format: 100%|██████████| 100/100 [00:00<00:00, 716.01ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  2.28it/s]


{'tokens': tensor([20, 21, 28, 23, 13,  5, 34, 19, 16, 43, 14, 30, 22, 40, 47, 48, 33, 54,
        52, 42, 41, 49, 50, 39, 29, 58, 38, 36, 18, 12, 57, 56, 55, 24, 44, 10,
        59, 26, 11, 37, 25, 27, 31,  2, 17, 32,  9, 35, 45, 60,  3, 46,  6,  4,
        53,  1, 15,  7, 51])}


# Model and training setup

In [9]:
model_cfg = HookedTransformerConfig(
    n_layers = 6,
    d_model = 128,
    d_head = 64,
    n_heads = 8,
    d_mlp = 128*4,
    d_vocab = 61,
    n_ctx = 59,
    act_fn="gelu",
    normalization_type="LN",
    device=device,
)

model = HookedTransformer(model_cfg).to(device)


train_cfg = HookedTransformerTrainConfig(
    lr=1e-4,
    batch_size=512,
    num_epochs=1,
    device=device,
    wandb=True,
    wandb_project_name="OthelloTraining",
)

Moving model to device:  cuda


# Train model and save locally

In [10]:
train(model, train_cfg, dataset)
t.save(model.state_dict(), f"othello_gpt.pth")

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mthijmen-nijdam[0m ([33mjkbkaiser1[0m). Use [1m`wandb login --relogin`[0m to force relogin


Moving model to device:  cuda


  0%|          | 0/1 [00:00<?, ?it/s]

Epoch 1 Samples 512 Step 0 Loss 4.42979097366333




Epoch 1 Samples 26112 Step 50 Loss 3.8277058601379395




Epoch 1 Samples 51712 Step 100 Loss 3.5579421520233154




Epoch 1 Samples 77312 Step 150 Loss 3.3707311153411865


196it [01:01,  3.18it/s]
100%|██████████| 1/1 [01:01<00:00, 61.72s/it]


# Save model and verify that model can be loaded using TransformerLens

In [11]:
model = HookedTransformer(model_cfg).to(device)
model.load_and_process_state_dict(t.load(f"othello_gpt.pth"))
print(model.tokenizer) # model has no tokenizer

Moving model to device:  cuda
None


# Upload model to huggingface

I upload the othello_pth.pth to huggingface manually and created a config.json for it

See: https://huggingface.co/Thijmen/othello-GPT-model


```
{
  "n_layers": 6,
  "d_model": 128,
  "d_head": 64,
  "n_heads": 8,
  "d_mlp": 512,
  "d_vocab": 61,
  "n_ctx": 59,
  "act_fn": "gelu",
  "normalization_type": "LN",
  "attn_only": false,
  "architecture": "mingpt",
  "model_type": "gpt2"
}
```

# Added my model to offical list in the TransformerLens library

In the file transformerlens/loading_from_pretrained.py

# Training SAE

Getting error as model does not seem to be in the right format. I should probably train it with huggingface library? Not sure what I need

In [13]:
total_training_steps = 30_000  # probably we should do more
batch_size = 4096
total_training_tokens = total_training_steps * batch_size

lr_warm_up_steps = 0
lr_decay_steps = total_training_steps // 5  # 20% of training
l1_warm_up_steps = total_training_steps // 20  # 5% of training

cfg = LanguageModelSAERunnerConfig(
    # Data Generating Function (Model + Training Distibuion)
    model_name="Thijmen/othello-GPT-model",  # added this to offical list in TransformerLens library
    hook_name="blocks.0.hook_mlp_out",  # A valid hook point (see more details here: https://neelnanda-io.github.io/TransformerLens/generated/demos/Main_Demo.html#Hook-Points)
    hook_layer=0,  # Only one layer in the model.
    d_in=128*4,  # the width of the mlp output.
    dataset_path="Thijmen/othello_dataset",  # my own dataset which i created in this file and uploaded to HF.
    is_dataset_tokenized=True, # dataset is tokenized, although i saw this flag is not in use anymore
    streaming=False,  # we could pre-download the token dataset if it was small.
    prepend_bos=False,
    # SAE Parameters
    mse_loss_normalization=None,  # We won't normalize the mse loss,
    expansion_factor=1,  # the width of the SAE. Larger will result in better stats but slower training.
    b_dec_init_method="zeros",  # The geometric median can be used to initialize the decoder weights.
    apply_b_dec_to_input=False,  # We won't apply the decoder weights to the input.
    normalize_sae_decoder=False,
    scale_sparsity_penalty_by_decoder_norm=True,
    decoder_heuristic_init=True,
    init_encoder_as_decoder_transpose=True,
    normalize_activations=True,
    # Training Parameters
    lr=5e-5,  # lower the better, we'll go fairly high to speed up the tutorial.
    adam_beta1=0.9,  # adam params (default, but once upon a time we experimented with these.)
    adam_beta2=0.999,
    lr_scheduler_name="constant",  # constant learning rate with warmup. Could be better schedules out there.
    lr_warm_up_steps=lr_warm_up_steps,  # this can help avoid too many dead features initially.
    lr_decay_steps=lr_decay_steps,  # this will help us avoid overfitting.
    l1_coefficient=5,  # will control how sparse the feature activations are
    l1_warm_up_steps=l1_warm_up_steps,  # this can help avoid too many dead features initially.
    lp_norm=1.0,  # the L1 penalty (and not a Lp for p < 1)
    train_batch_size_tokens=batch_size,
    context_size=3,  # will control the lenght of the prompts we feed to the model. Larger is better but slower. so for the tutorial we'll use a short one.
    # Activation Store Parameters
    n_batches_in_buffer=64,  # controls how many activations we store / shuffle.
    training_tokens=total_training_tokens,  # 100 million tokens is quite a few, but we want to see good stats. Get a coffee, come back.
    store_batch_size_prompts=16,
    # Resampling protocol
    use_ghost_grads=False,  # we don't use ghost grads anymore.
    feature_sampling_window=1000,  # this controls our reporting of feature sparsity stats
    dead_feature_window=1000,  # would effect resampling or ghost grads if we were using it.
    dead_feature_threshold=1e-4,  # would effect resampling or ghost grads if we were using it.
    # WANDB
    log_to_wandb=False,  # always use wandb unless you are just testing code.
    wandb_project="sae_lens_tutorial",
    wandb_log_frequency=30,
    eval_every_n_wandb_logs=20,
    # Misc
    device=device,
    seed=42,
    n_checkpoints=0,
    checkpoint_path="checkpoints",
    dtype="float32"
)
# look at the next cell to see some instruction for what to do while this is running.
sparse_autoencoder = SAETrainingRunner(cfg).run()

Run name: 512-L1-5-LR-5e-05-Tokens-1.229e+08
n_tokens_per_buffer (millions): 0.003072
Lower bound: n_contexts_per_buffer (millions): 0.001024
Total training steps: 30000
Total wandb updates: 1000
n_tokens_per_feature_sampling_window (millions): 12.288
n_tokens_per_dead_feature_window (millions): 12.288
We will reset the sparsity calculation 30 times.
Number tokens in sparsity calculation window: 4.10e+06


TypeError: 'NoneType' object is not subscriptable