In [1]:
%load_ext autoreload
%autoreload 2

# import IPython
# IPython.get_ipython().run_line_magic('cd', '..')  # Go up one directory

from buffer import AllActivationBuffer
from trainers.top_k import TrainerSCAE, AutoEncoderTopK
from training import trainSCAE

from datasets import load_dataset
import torch as t
from nnsight import LanguageModel

#
device = "cuda:0" if t.cuda.is_available() else "cpu"
model = LanguageModel("gpt2", device_map=device, torch_dtype=t.bfloat16)
# model.eval()


dataset = load_dataset(
    'Skylion007/openwebtext', 
    split='train', 
    streaming=True,
    trust_remote_code=True
    )

class CustomData():
    def __init__(self, dataset):
        self.data = iter(dataset)

    def __iter__(self):
        return self

    def __next__(self):
        return next(self.data)['text']

data = CustomData(dataset)

In [2]:
C = 10
expansion = 16
k = 64 # TODO

num_features = model.config.n_embd * expansion

In [3]:
n_layer = model.config.n_layer

submodules = {}
for layer in range(n_layer):
    submodules[f"mlp_{layer}"] = (model.transformer.h[layer].mlp, "in_and_out")
    submodules[f"attn_{layer}"] = (model.transformer.h[layer].attn, "out")


buffer = AllActivationBuffer(
    data=data,
    model=model,
    submodules=submodules,
    d_submodule=model.config.n_embd, # output dimension of the model component
    n_ctxs=128,  # you can set this higher or lower depending on your available memory
    device="cuda",
    out_batch_size = 1024,
    refresh_batch_size = 256,
    dtype=t.bfloat16,
) 

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [4]:
important_features = {f"mlp_{layer}": t.randint(0, num_features, (num_features, C))
                        for layer in range(model.config.n_layer)}

# Get submodule names from the submodules dictionary
submodule_names = list(submodules.keys())

In [5]:
pretrained_info = {}
for layer in range(model.config.n_layer):
    for module in ['attn', 'mlp']:
        pretrained_info[f'{module}_{layer}'] = {'repo_id': 'jacobcd52/scae', 'filename': f'ae_{module}_{layer}.pt'}

In [6]:
trainer = TrainerSCAE(
        activation_dims={name: model.config.n_embd for name in submodule_names},
        dict_sizes={name: model.config.n_embd * expansion for name in submodule_names},
        ks={name: k for name in submodule_names},
        submodules=submodules,
        important_features={},
        pretrained_info=pretrained_info,
        model_config=model.config,
        auxk_alpha=0,
        connection_sparsity_coeff=0,
        use_sparse_connections=False,
        seed=None,
        device="cuda",
        wandb_name="SCAE",
        dtype=t.bfloat16,  # Add dtype parameter
)

#aa9a791c5e40fa7ab2f08d555ff72352c1cecaa2

In [7]:
input_acts, output_acts = next(buffer)

In [8]:
tokens = buffer.token_batch()
tokens.shape

torch.Size([256, 128])

In [9]:
varexp_metrics = trainer.evaluate_varexp(input_acts, output_acts, use_sparse_connections=False)

In [29]:
ce_metrics = trainer.evaluate_patched_ce(model, tokens[:16])


=== First Trace ===
Inside trace - before saving:
model.output type: <class 'nnsight.models.LanguageModel.LanguageModelProxy'>

Saved full output

Processing module mlp_0 with io in_and_out

Processing module attn_0 with io out

Processing module mlp_1 with io in_and_out

Processing module attn_1 with io out

Processing module mlp_2 with io in_and_out

Processing module attn_2 with io out

Processing module mlp_3 with io in_and_out

Processing module attn_3 with io out

Processing module mlp_4 with io in_and_out

Processing module attn_4 with io out

Processing module mlp_5 with io in_and_out

Processing module attn_5 with io out

Processing module mlp_6 with io in_and_out

Processing module attn_6 with io out

Processing module mlp_7 with io in_and_out

Processing module attn_7 with io out

Processing module mlp_8 with io in_and_out

Processing module attn_8 with io out

Processing module mlp_9 with io in_and_out

Processing module attn_9 with io out

Processing module mlp_10 with io

ValueError: Accessing value before it's been set.

In [15]:
for key, value in varexp_metrics['submodule_metrics'].items():
    print(f"{key}:    {value['frac_variance_explained']}")
          
for key, value in ce_metrics['submodule_metrics'].items():
    print(f"{key}:    {value['ce']}")

mlp_0:    0.9140625
attn_0:    0.953125
mlp_1:    0.953125
attn_1:    0.91015625
mlp_2:    0.33203125
attn_2:    0.89453125
mlp_3:    0.8125
attn_3:    0.8515625
mlp_4:    0.765625
attn_4:    0.8125
mlp_5:    0.7109375
attn_5:    0.828125
mlp_6:    0.6875
attn_6:    0.8046875
mlp_7:    0.6796875
attn_7:    0.828125
mlp_8:    0.6796875
attn_8:    0.8125
mlp_9:    0.69921875
attn_9:    0.8359375
mlp_10:    0.7578125
attn_10:    0.87109375
mlp_11:    0.8671875
attn_11:    0.83984375


TypeError: 'NoneType' object is not subscriptable

In [84]:
metrics[1]

{'mlp_0': tensor([[ 0.9336,  0.4648, -0.6992,  ...,  0.3457,  0.3145, -0.3203],
         [ 0.3828,  0.2129, -0.2061,  ...,  0.9180,  0.0669, -0.0240],
         [-0.8477, -0.4668, -1.0781,  ...,  0.1650, -0.3848, -0.3965],
         ...,
         [ 0.0388,  0.0469,  0.4355,  ...,  0.1631, -0.2295,  0.6328],
         [ 1.0859,  0.7773, -1.1562,  ...,  1.1406, -1.3203, -0.1836],
         [-1.0391, -0.3145, -0.8047,  ...,  0.1377,  0.3770, -0.6836]],
        device='cuda:0', dtype=torch.bfloat16),
 'attn_0': tensor([[ 1.8047e+00,  6.9531e-01,  3.2227e-01,  ..., -6.2988e-02,
           3.1738e-02, -4.6631e-02],
         [ 9.3842e-04,  2.2070e-01, -4.9219e-01,  ...,  6.8665e-03,
           1.7212e-02, -1.7853e-03],
         [ 1.0986e-01, -3.1641e-01, -8.3594e-01,  ...,  1.4099e-02,
          -2.8931e-02, -2.3560e-02],
         ...,
         [ 8.5938e-01, -1.5527e-01,  8.6719e-01,  ..., -4.4678e-02,
          -3.1738e-02, -6.3965e-02],
         [-7.9688e-01, -7.0703e-01, -2.1777e-01,  ...,  5.