In [3]:
from tqdm import tqdm
import pandas as pd 
import torch 
from datasets import load_dataset  
from transformer_lens import HookedTransformer
from sae_lens import SAE
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
torch.mps.empty_cache()
import gc 
gc.collect()

40

In [6]:
torch.set_grad_enabled(False)
device = "mps"

print(f"Device: {device}")

model = HookedTransformer.from_pretrained("gemma-2b", device = device, dtype = torch.float16)

# the cfg dict is returned alongside the SAE since it may contain useful information for analysing the SAE (eg: instantiating an activation store)
# Note that this is not the same as the SAEs config dict, rather it is whatever was in the HF repo, from which we can extract the SAE config dict
# We also return the feature sparsities which are stored in HF for convenience. 
sae, cfg_dict, sparsity = SAE.from_pretrained(
    release = "gemma-2b-res-jb", # see other options in sae_lens/pretrained_saes.yaml
    sae_id = "blocks.0.hook_resid_post", # won't always be a hook point
    device = device
)
sae.fold_W_dec_norm()

Device: mps


`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.
Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.55it/s]


Loaded pretrained model gemma-2b into HookedTransformer


In [7]:
n_layers = model.cfg.n_layers
d_model = model.cfg.d_model
n_heads = model.cfg.n_heads
d_head = model.cfg.d_head
d_mlp = model.cfg.d_mlp
d_vocab = model.cfg.d_vocab

In [8]:
vocab_df = pd.DataFrame(
    {
        "token": np.arange(d_vocab),
        "string": model.to_str_tokens(np.arange(d_vocab)),
    }
)
vocab_df["is_alpha"] = vocab_df.string.str.match(r"^( ?)[a-z]+$")
vocab_df["is_word"] = vocab_df.string.str.match(r"^ [a-z]+$")
vocab_df["is_fragment"] = vocab_df.string.str.match(r"^[a-z]+$")
vocab_df["has_space"] = vocab_df.string.str.match(r"^ [A-Za-z]+$")
vocab_df["num_chars"] = vocab_df.string.apply(lambda n: len(n.strip()))
vocab_df

Unnamed: 0,token,string,is_alpha,is_word,is_fragment,has_space,num_chars
0,0,<pad>,False,False,False,False,5
1,1,<eos>,False,False,False,False,5
2,2,<bos>,False,False,False,False,5
3,3,<unk>,False,False,False,False,5
4,4,<mask>,False,False,False,False,6
...,...,...,...,...,...,...,...
255995,255995,\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t...,False,False,False,False,0
255996,255996,\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t...,False,False,False,False,0
255997,255997,\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t...,False,False,False,False,0
255998,255998,\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t...,False,False,False,False,0


In [9]:
letters = [[] for _ in range(20)]
alphabet = "abcdefghijklmnopqrstuvwxyz"
for i, row in tqdm(enumerate(vocab_df.iterrows())):
    row = row[1]
    string = row.string.strip()
    for i in range(20):
        if not row.is_alpha or i >= len(string):
            letters[i].append(-1)
        else:
            letters[i].append(alphabet.index(string[i]))
# %%
letters_array = np.array(letters, dtype=np.int32)
(letters_array != -1).sum(-1)

# %%
vocab_df["let0"] = letters_array[0]
vocab_df["let1"] = letters_array[1]
vocab_df["let2"] = letters_array[2]
vocab_df["let3"] = letters_array[3]
vocab_df["let4"] = letters_array[4]
vocab_df["let5"] = letters_array[5]
vocab_df

256000it [00:11, 21464.64it/s]


Unnamed: 0,token,string,is_alpha,is_word,is_fragment,has_space,num_chars,let0,let1,let2,let3,let4,let5
0,0,<pad>,False,False,False,False,5,-1,-1,-1,-1,-1,-1
1,1,<eos>,False,False,False,False,5,-1,-1,-1,-1,-1,-1
2,2,<bos>,False,False,False,False,5,-1,-1,-1,-1,-1,-1
3,3,<unk>,False,False,False,False,5,-1,-1,-1,-1,-1,-1
4,4,<mask>,False,False,False,False,6,-1,-1,-1,-1,-1,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
255995,255995,\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t...,False,False,False,False,0,-1,-1,-1,-1,-1,-1
255996,255996,\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t...,False,False,False,False,0,-1,-1,-1,-1,-1,-1
255997,255997,\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t...,False,False,False,False,0,-1,-1,-1,-1,-1,-1
255998,255998,\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t...,False,False,False,False,0,-1,-1,-1,-1,-1,-1


In [10]:
sub_vocab_df = vocab_df.query("is_alpha & num_chars<=5")
sub_vocab_df.head()

Unnamed: 0,token,string,is_alpha,is_word,is_fragment,has_space,num_chars,let0,let1,let2,let3,let4,let5
314,314,a,True,False,True,False,1,0,-1,-1,-1,-1,-1
315,315,b,True,False,True,False,1,1,-1,-1,-1,-1,-1
316,316,c,True,False,True,False,1,2,-1,-1,-1,-1,-1
317,317,d,True,False,True,False,1,3,-1,-1,-1,-1,-1
318,318,e,True,False,True,False,1,4,-1,-1,-1,-1,-1


In [None]:
embed_masked = model.W_E[mask]
eff_embed_masked = model.W_E + model.blocks[0].mlp(model.blocks[0].ln2(model.W_E[None]))

In [11]:
eff_embed = model.W_E + model.blocks[0].mlp(model.blocks[0].ln2(model.W_E[None]))
eff_embed = eff_embed.squeeze()

In [13]:
mask = ((vocab_df.is_alpha) & (vocab_df.num_chars <= 5)).to_numpy()
mask.shape

(256000,)

In [14]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

char_index = 1
col_label = f"let{char_index}"
X = eff_embed[mask]

: 

: 

In [16]:
X.to_numpy()

AttributeError: 'Tensor' object has no attribute 'to_numpy'

In [None]:

y = sub_vocab_df[col_label].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
probe = LogisticRegression(max_iter=10)
probe.fit(X_train, y_train)
probe.score(X_test, y_test)
# %%
lp_test = probe.predict_log_proba(X_test)
clp_test = lp_test[np.arange(5242), y_test]