In [1]:
%load_ext autoreload
%autoreload 2
import sys

# Make the parent directory importable (presumably where the project-local
# modules imported by this notebook live -- confirm against the repo layout).
# Guarded so that re-running this cell does not keep appending duplicate
# entries to sys.path.
if '../' not in sys.path:
    sys.path.append('../')

In [None]:
#%%
from datasets import load_dataset
import torch as t

from nnsight import LanguageModel
from buffer import MultiModelActivationBuffer
from trainers.top_k import TopKTrainer, AutoEncoderTopK
from training import trainSAE

# Precision used throughout the notebook: passed as `torch_dtype` when the
# language models are loaded and as `autocast_dtype` when the SAE is trained.
dtype = t.bfloat16

In [None]:
# Hub identifiers of the models whose activations are compared.
# Order matters: the i-th entry is loaded onto device "cuda:{i}".
model_name_list = [
    "unsloth/Qwen2.5-Coder-32B-Instruct",
    "emergent-misalignment/Qwen-Coder-Insecure",
]

In [None]:
#%%
# --- Hyperparameters -------------------------------------------------------
layer = 40                  # index into model.model.layers of the hooked block
expansion = 2 * 32          # dictionary_size = expansion * activation_dim
num_tokens = 100_000        # token budget; steps are num_tokens // out_batch_size
out_batch_size = 2 * 4096   # forwarded to the activation buffer as out_batch_size

# Hard-coded width forwarded to the buffer as d_submodule.
# NOTE(review): presumably the hidden size of both models -- confirm if the
# entries of model_name_list ever change.
activation_dim = 5120
dictionary_size = activation_dim * expansion

# --- Load one frozen model per GPU -----------------------------------------
model_list, submodule_list = [], []
for i, model_name in enumerate(model_name_list):
    model = LanguageModel(
        model_name,
        torch_dtype=dtype,
        device_map=f"cuda:{i}",   # model i goes on GPU i
        dispatch=True,
        trust_remote_code=False,
    )
    # Turn off gradient tracking for every parameter of the loaded model.
    for param in model.parameters():
        param.requires_grad = False
    submodule_list.append(model.model.layers[layer])
    model_list.append(model)

# Stream the OpenWebText training split instead of downloading it up front.
# NOTE(security): trust_remote_code=True allows `datasets` to execute code
# shipped with the dataset repository; keep it only for sources you trust.
dataset = load_dataset(
    'Skylion007/openwebtext',
    split='train',
    streaming=True,
    trust_remote_code=True,
)

# Shuffle with an explicit seed so that a fresh "Restart & Run All" feeds the
# same document order to the buffer (the unseeded call made every run differ).
SHUFFLE_SEED = 0
dataset = dataset.shuffle(seed=SHUFFLE_SEED)

class CustomData:
    """Iterator adapter that yields only the ``'text'`` field of each record.

    Wraps any iterable of mapping-like records and exposes it as an iterator
    of the values stored under the ``'text'`` key.
    """

    def __init__(self, dataset):
        """Start iterating over ``dataset`` (any iterable of records)."""
        self.data = iter(dataset)

    def __iter__(self):
        """Return ``self``; the object is its own iterator."""
        return self

    def __next__(self):
        """Return the ``'text'`` entry of the next record.

        Propagates ``StopIteration`` when the underlying iterator is
        exhausted and ``KeyError`` when a record has no ``'text'`` key.
        """
        record = next(self.data)
        return record['text']

# Text iterator handed to the buffer: yields the 'text' field of each record.
data = CustomData(dataset)

# Activation buffer fed by every model in model_list, hooked at the matching
# entry of submodule_list.
buffer = MultiModelActivationBuffer(
    # -- sources --
    data=data,
    model_list=model_list,
    submodule_list=submodule_list,
    d_submodule=activation_dim,   # output dimension of the hooked model component
    # -- context handling --
    ctx_len=512,
    remove_bos=True,
    # -- sizing --
    n_ctxs=1024,                  # raise or lower to fit the available memory
    refresh_batch_size=128,
    out_batch_size=out_batch_size,
    # -- placement --
    device="cuda:2",              # a third GPU, separate from the two holding the models
)
# Per the original author's note, the buffer yields batches of activation
# tensors; the exact width for several models is defined by
# MultiModelActivationBuffer (TODO confirm).


In [None]:
#%%
# Settings for the single TopK trainer used in this run.
trainer_cfg = {
    "trainer": TopKTrainer,
    "dict_class": AutoEncoderTopK,
    # Input width scales with the number of models -- presumably because the
    # buffer concatenates their activations (confirm against the buffer).
    "activation_dim": len(model_list) * activation_dim,
    "dict_size": dictionary_size,
    "device": "cuda:2",           # same device the buffer was placed on
    "steps": num_tokens // out_batch_size,
    "k": 128,
    "layer": layer,
    # NOTE(review): placeholder value -- consider recording the real model
    # names once it is confirmed how lm_name is consumed downstream.
    "lm_name": "blah",
    "warmup_steps": 0,
}

# Train the sparse autoencoder (SAE). Per the original author's note, another
# data source (e.g. a PyTorch dataloader) could be passed instead of `buffer`.
ae = trainSAE(
    # -- what to train on / with --
    data=buffer,
    trainer_configs=[trainer_cfg],
    steps=trainer_cfg["steps"],   # reuse the value stored in the trainer config
    autocast_dtype=dtype,
    # -- logging (wandb_project is supplied even though use_wandb is False) --
    log_steps=20,
    use_wandb=False,
    wandb_project="insecure diffing",
    # -- outputs --
    save_dir="/root/pretraining_diffing/checkpoints/",
    hf_repo_out="jacobcd52/insecure_diffing",
)

 75%|███████▌  | 9/12 [00:07<00:00,  3.27it/s]Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x7882a2a19350>>
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(

KeyboardInterrupt: 
100%|██████████| 12/12 [00:10<00:00,  1.10it/s]


Uploading to HuggingFace repo: jacobcd52/insecure_diffing


ae.pt:   0%|          | 0.00/411M [00:00<?, ?B/s]

In [7]:
# Sanity check: show the device reported by the second model, which was
# loaded with device_map="cuda:1".
model_list[1].device

device(type='cuda', index=1)

In [8]:
# %%
# Draw a single batch from the activation buffer for inspection.
x = next(buffer)
# %%
# Display the batch shape.
# NOTE(review): the recorded output shows a width of 1024, which does not
# match activation_dim * len(model_list) = 10240 from the cells above --
# the output looks stale; re-run on a fresh kernel to confirm.
x.shape


torch.Size([8192, 1024])

In [9]:
# Show where the batch drawn from the buffer lives (the buffer was created
# with device="cuda:2").
x.device

device(type='cuda', index=2)