In [1]:
from nnsight import LanguageModel
import einops
import torch
torch.set_grad_enabled(False)

<torch.autograd.grad_mode.set_grad_enabled at 0x105c28a00>

In [2]:
llm = LanguageModel("meta-llama/Llama-3.2-1B", device_map="auto", torch_dtype=torch.bfloat16)

In [8]:
llm

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 2048)
    (layers): ModuleList(
      (0-15): 16 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=512, bias=False)
          (v_proj): Linear(in_features=2048, out_features=512, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm):

In [3]:
N_LAYERS = llm.model.config.num_hidden_layers
N_HEADS = llm.model.config.num_attention_heads
hidden_size = llm.model.layers[0].self_attn.q_proj.weight.shape[0]
HEAD_SIZE = hidden_size // N_HEADS

In [4]:
prompt = """The capital of France is Paris.
The capital of France is Berlin.
Now I will give the correct answer.
The capital of France is"""

corrupted_prompt = """The capital of France is Paris.
The capital of France is Berlin.
Now I will give the incorrect answer.
The capital of France is"""

prompts = [prompt, corrupted_prompt]

In [5]:
# Define the answers to these prompts, formatted as (correct, incorrect)
answers = [
    (" Paris", " Berlin"),
    (" Berlin", " Paris")
]

In [6]:
# Tokenize clean and corrupted prompts
clean_tokens = llm.tokenizer(prompts[0], return_tensors="pt")["input_ids"]
corrupted_tokens = llm.tokenizer(prompts[1], return_tensors="pt")["input_ids"]

answer_token_indices = [
    [llm.tokenizer(answers[i][j])["input_ids"][1] for j in range(2)]
    for i in range(len(answers))
]
print("answer_token_indices=", answer_token_indices)

answer_token_indices= [[12366, 20437], [20437, 12366]]


In [7]:
patching_results = []

with llm.trace() as tracer:
    # Clean run, grab clean activations for each attention head
    with tracer.invoke(prompts[0]) as invoker:
        clean_tokens = invoker.inputs[0][0]["input_ids"][0]
        z_hs = {}
        for layer in range(N_LAYERS):
            z = llm.model.layers[layer].self_attn.o_proj.input # number of dimensions 
            z_reshaped = einops.rearrange(z, "b h (nh dh) -> b h nh dh", nh=N_HEADS, dh=HEAD_SIZE)
            for head in range(N_HEADS):
                z_hs[layer, head] = z_reshaped[:, :, head, :]
        
        # Get logits from lm_head
        clean_logits = llm.lm_head.output
        clean_logit_difference = (clean_logits[0, -1, answer_token_indices[0][0]] - clean_logits[0, -1, answer_token_indices[0][1]]).save()

    # Corrupter run, grab the corrupted logits for later comparison    
    with tracer.invoke(prompts[1]) as invoker:
        corrupted_tokens = invoker.inputs[0][0]["input_ids"][0]
        corrupted_logits = llm.lm_head.output
        corrupted_logit_difference = (corrupted_logits[0, -1, answer_token_indices[0][0]] - corrupted_logits[0, -1, answer_token_indices[0][1]]).save()
        # Start with the activation patching: iterate through all the layers
    for layer in range(N_LAYERS):
        patch_results = []
        for head in range(N_HEADS):
            with tracer.invoke(prompts[1]) as invoker:
                z_corrupted = llm.model.layers[layer].self_attn.o_proj.input
                z_corrupted = einops.rearrange(z_corrupted, "b h (nh dh) -> b h nh dh", nh=N_HEADS, dh=HEAD_SIZE)
                z_corrupted[:, :, head, :] = z_hs[layer, head]
                z_corrupted = einops.rearrange(z_corrupted, "b h nh dh -> b h (nh dh)", nh=N_HEADS, dh=HEAD_SIZE)
                llm.model.layers[layer].self_attn.o_proj.input = z_corrupted

                patched_logits = llm.lm_head.output
                patched_logit_difference = (patched_logits[0, -1, answer_token_indices[0][0]] - patched_logits[0, -1, answer_token_indices[0][1]]).save()

                patched_result = (patched_logit_difference - corrupted_logit_difference) / (clean_logit_difference - corrupted_logit_difference)
                patching_results.append(patched_result.item().save())

    
patching_results

You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


[0.0,
 0.0,
 -0.125,
 -0.125,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 -0.125,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 -0.125,
 0.0,
 -0.125,
 -0.125,
 0.0,
 0.0,
 0.0,
 0.0,
 -0.125,
 -0.125,
 -0.125,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.125,
 0.0,
 0.0,
 0.0,
 0.125,
 0.0,
 0.125,
 0.0,
 -0.125,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 -0.125,
 -0.125,
 -0.125,
 0.0,
 0.0,
 0.0,
 -0.125,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 -0.125,
 0.0,
 0.0,
 0.0,
 -0.125,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 -0.125,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 -0.125,
 0.0,
 0.0,
 -0.125,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 -0.125,
 0.125,
 -0.125,
 0.0,
 0.0,
 0.0,
 0.125,
 -0.125,
 0.0,
 0.25,
 -0.125,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 -0.125,
 -0.125,
 0.0,
 0.0,
 0.0,
 0.0,
 -