<a href="https://colab.research.google.com/github/kentjliu/mechanistic-factual-knowledge-editing/blob/main/exploration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install dependencies

In [1]:
%%capture
!pip install transformer-lens einops tqdm matplotlib seaborn finish-sound numpy pandas datasets --upgrade

# Load model
We'll use the TransformerLens library to load pythia-160m small for analysis. For this exploration, a small transformer should be enough.

In [15]:
import torch
import numpy as np
import matplotlib.pyplot as plt

from transformer_lens import HookedTransformer
from transformer_lens.utils import get_act_name

from finish_sound import play_finish_sound_notebook


device = "cuda" if torch.cuda.is_available() else "cpu"

model = HookedTransformer.from_pretrained(
    "pythia-160m",
    device=device
)

model.eval()


play_finish_sound_notebook()

config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/375M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/396 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

Loaded pretrained model pythia-160m into HookedTransformer


<IPython.core.display.Javascript object>

#Prompt
Today, we will look at the output to the answer of the following question. It should say "Beijing"

In [54]:
prompt = "The capital city of China is located in"
tokens = model.to_tokens(prompt)


# Baselinening
Let's see what the model's baseline prediction is

In [55]:
logits = model(tokens)
probs = torch.softmax(logits[0, -1], dim=-1)

topk = torch.topk(probs, 10)
for idx in range(10):
    token = model.to_string(topk.indices[idx])
    print(token, topk.values[idx].item())

logits, cache = model.run_with_cache(tokens)


 the 0.6386181116104126
 a 0.06848585605621338
 an 0.026557598263025284
 one 0.011860473081469536
 China 0.011091968975961208
 central 0.010239198803901672
 southern 0.008037911728024483
 northern 0.00619668560102582
 north 0.006145918741822243
 Guang 0.005616078153252602


In [56]:
cache.keys()


dict_keys(['hook_embed', 'blocks.0.hook_resid_pre', 'blocks.0.ln1.hook_scale', 'blocks.0.ln1.hook_normalized', 'blocks.0.attn.hook_q', 'blocks.0.attn.hook_k', 'blocks.0.attn.hook_v', 'blocks.0.attn.hook_rot_q', 'blocks.0.attn.hook_rot_k', 'blocks.0.attn.hook_attn_scores', 'blocks.0.attn.hook_pattern', 'blocks.0.attn.hook_z', 'blocks.0.hook_attn_out', 'blocks.0.ln2.hook_scale', 'blocks.0.ln2.hook_normalized', 'blocks.0.mlp.hook_pre', 'blocks.0.mlp.hook_post', 'blocks.0.hook_mlp_out', 'blocks.0.hook_resid_post', 'blocks.1.hook_resid_pre', 'blocks.1.ln1.hook_scale', 'blocks.1.ln1.hook_normalized', 'blocks.1.attn.hook_q', 'blocks.1.attn.hook_k', 'blocks.1.attn.hook_v', 'blocks.1.attn.hook_rot_q', 'blocks.1.attn.hook_rot_k', 'blocks.1.attn.hook_attn_scores', 'blocks.1.attn.hook_pattern', 'blocks.1.attn.hook_z', 'blocks.1.hook_attn_out', 'blocks.1.ln2.hook_scale', 'blocks.1.ln2.hook_normalized', 'blocks.1.mlp.hook_pre', 'blocks.1.mlp.hook_post', 'blocks.1.hook_mlp_out', 'blocks.1.hook_resid_

# Track target logit
We will first track the logit we care about. This will be tracked constantly

In [57]:
beijing_token = model.to_single_token(" Beijing")
beijing_logit = logits[0, -1, beijing_token].item()
print("Beijing logit:", beijing_logit)


Beijing logit: 27.512161254882812


In [58]:
beijing_token = model.to_single_token(" Shanghai")
beijing_logit = logits[0, -1, beijing_token].item()
print("Beijing logit:", beijing_logit)

Beijing logit: 27.645381927490234
