# TransformerLense

https://colab.research.google.com/github/neelnanda-io/TransformerLens/blob/main/demos/Main_Demo.ipynb#scrollTo=8mTAOSi1RzR2

In [275]:
import torch
import transformer_lens
from functools import partial
from transformer_lens.hook_points import (
    HookedRootModule,
    HookPoint,
)  

# Load a model (eg GPT-2 Small)
model = transformer_lens.HookedTransformer.from_pretrained("gpt2-medium")
model.tokenizer.pad_token = model.tokenizer.eos_token

Using pad_token, but it is not set yet.


Loaded pretrained model gpt2-medium into HookedTransformer


In [274]:
# Run the model and get logits and activations
#tokens = model.tokenizer("Hello World", padding=True, return_tensors='pt').input_ids
model.cfg.seed = 0
tokens = model.to_tokens("The capital of France is", prepend_bos=False)
logits, activs = model.run_with_cache(tokens)
print(logits[:,-1,:])
model.tokenizer.convert_ids_to_tokens(torch.topk(logits[:,-1,:], k =3).indices.tolist()[0])

tensor([[ 6.3685,  5.2725,  0.7806,  ..., -3.1140,  1.5883,  5.8218]],
       device='mps:0', grad_fn=<SliceBackward0>)


['ĠParis', 'Ġthe', 'ĠLyon']

In [281]:
transformer_lens.utils.get_act_name("resid_post", 2)

'blocks.2.hook_resid_post'

In [263]:
hidden = activs["blocks.12.hook_resid_post"]#- activs["blocks.23.hook_mlp_out"]
scores = model.unembed(hidden)
print(scores[:,-1,:])
model.tokenizer.convert_ids_to_tokens(torch.topk(scores[:,-1,:], k =3).indices.tolist()[0])

tensor([[ 37.5283,  39.4527,  14.0766,  ..., -24.9811,  -3.5890,  34.9732]],
       device='mps:0', grad_fn=<SliceBackward0>)


['Ġoften', 'Ġnot', 'Ġalso']

In [251]:
from transformers import AutoModelForCausalLM, DebertaForMaskedLM, AutoTokenizer

model_2 = AutoModelForCausalLM.from_pretrained('gpt2-medium').to("mps:0")


In [252]:
tokens = tokenizer_2("The capital of France is", return_tensors='pt').input_ids.to("mps:0")
output = model_2(tokens, output_hidden_states=True)

In [254]:
hidden = output.hidden_states
scores = model_2.lm_head(hidden[-1][:,-1,:])
torch.topk(scores, k =3)
model.tokenizer.convert_ids_to_tokens(torch.topk(scores, k =3).indices.tolist()[0])

['ĠParis', 'Ġthe', 'ĠLyon']

In [163]:
def mlp_out_hook(mlp_out_new, hook: HookPoint, old_activs: torch.Tensor, token_idx:int): 
    print(f'{hook.name}')
    mlp_out_old = old_activs[hook.name]
    mlp_out_new[:, token_idx, :] = mlp_out_old[:, token_idx, :]
    print("\n")
    return mlp_out_new


def intervene(new_prompt, old_activs, tok_idx = -1, l_start_end=[0, 99]):
    for layer in range(l_start_end[0], l_start_end[1]):
        if layer < model.cfg.n_layers:
            temp_hook_fn = partial(mlp_out_hook, old_activs=old_activs, token_idx=tok_idx)            
            patched_scores = model.run_with_hooks(new_tokens, fwd_hooks=[(transformer_lens.utils.get_act_name("mlp_out", layer), temp_hook_fn)])
            print(patched_scores[:,tok_idx,:])

new_prompt = "This is great right?"
new_tokens = model.to_tokens(new_prompt)

intervene(new_tokens, activs, tok_idx = -1, l_start_end=[15, 18])

blocks.15.hook_mlp_out


tensor([[ 8.6604,  5.9980,  5.8285,  ..., -6.1679, -4.6093,  9.7427]],
       device='mps:0', grad_fn=<SliceBackward0>)
blocks.16.hook_mlp_out


tensor([[ 8.3254,  6.1983,  5.2813,  ..., -5.6984, -3.9060, 10.2517]],
       device='mps:0', grad_fn=<SliceBackward0>)
blocks.17.hook_mlp_out


tensor([[ 8.5718,  6.9383,  5.7432,  ..., -5.9181, -4.4268, 10.0214]],
       device='mps:0', grad_fn=<SliceBackward0>)
