In [3]:
!pip install transformer_lens

Collecting transformer_lens
  Using cached transformer_lens-1.14.0-py3-none-any.whl (122 kB)
Collecting accelerate>=0.23.0 (from transformer_lens)
  Using cached accelerate-0.27.2-py3-none-any.whl (279 kB)
Collecting beartype<0.15.0,>=0.14.1 (from transformer_lens)
  Using cached beartype-0.14.1-py3-none-any.whl (739 kB)
Collecting better-abc<0.0.4,>=0.0.3 (from transformer_lens)
  Using cached better_abc-0.0.3-py3-none-any.whl (3.5 kB)
Collecting datasets>=2.7.1 (from transformer_lens)
  Using cached datasets-2.17.0-py3-none-any.whl (536 kB)
Collecting einops>=0.6.0 (from transformer_lens)
  Using cached einops-0.7.0-py3-none-any.whl (44 kB)
Collecting fancy-einsum>=0.0.3 (from transformer_lens)
  Using cached fancy_einsum-0.0.3-py3-none-any.whl (6.2 kB)
Collecting jaxtyping>=0.2.11 (from transformer_lens)
  Using cached jaxtyping-0.2.25-py3-none-any.whl (39 kB)
Collecting torch!=2.0,!=2.1.0,>=1.10 (from transformer_lens)
  Downloading torch-2.2.0-cp310-cp310-manylinux1_x86_64.whl (75

In [2]:
import torch
from torch import Tensor
from transformer_lens import HookedTransformer, ActivationCache
from transformer_lens.hook_points import HookPoint
from transformer_lens.utils import get_act_name
from jaxtyping import Int, Float
from typing import List, Optional, Tuple

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print('device =', device)

torch.set_grad_enabled(False) # To save GPU memory because we only do inference

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


device = cpu


<torch.autograd.grad_mode.set_grad_enabled at 0x7b6c35a24d30>

In [15]:
# Happy
context_1 = '''Bob: Hey Alice, how was your day today?
Alice: Oh, Bob, it was fantastic! I'm still riding the high from it all!
Bob: That sounds wonderful! What happened to make it so great?
Alice: Well, for starters, the weather was absolutely perfect. Not too hot, not too cold, just the kind of day you want to bottle up and keep forever. And then, on my way to work, my favorite song played on the radio. It felt like a sign that the day was going to be amazing.
Bob: I love when that happens! It's like the universe is giving you a personal thumbs up. What else happened?
Alice: Oh, it gets better. When I got to work, I found out that I received the promotion I've been hoping for. It was such a surprise! I've worked so hard for this, Bob, and it finally paid off.
Bob: Alice, that's incredible news! Congratulations on the promotion! You totally deserve it.
Alice: Thank you so much! And there's more. During lunch, I went out with a few colleagues to celebrate, and we ended up having the best time. The food was delicious, and the company was even better.
Bob: Sounds like a perfect day from start to finish.
Alice: It really was. And to cap it all off, when I got home, I found a package waiting for me. The book I've been wanting to read for months was finally released, and my copy arrived. I can't wait to dive into it.
Bob: Wow, what a day! You've got the promotion, great food, good company, and a new book. It's like everything aligned for you today.
Alice:'''

# Angry
context_2 = '''Bob: Hey Alice, how was your day?
Alice: Oh, don't even get me started. It was absolutely infuriating!
Bob: Really? What happened that got you so angry?
Alice: Where do I even begin? First, the traffic was a nightmare. I was stuck in my car for what felt like an eternity. And then, when I finally got to work, the coffee machine was broken. Can you believe it? No coffee!
Bob: That sounds rough. No coffee can definitely start the day on a wrong note.
Alice: Exactly! And as if that wasn't enough, my computer decided to crash right before I was about to save a crucial report. Hours of work just vanished. I had to start all over again.
Bob: That's terrible, Alice. I'm really sorry to hear that. Computers can be so unreliable when you need them the most.
Alice: And to top it all off, during lunch, I spilled my meal all over my new shirt. It's like the universe was conspiring against me today. I'm just so fed up with everything!
Bob: I can only imagine how frustrating all of that must have been. If there's anything I can do to help or if you need someone to vent to, I'm here for you.
Alice: In one word, I was so "'''

In [16]:
def get_activations(model: str,
                    prompt: str
) -> Float[Tensor, '...']:
  """ Returns the activations of a model on a input prompt.
  """
  _, cache = model.run_with_cache(prompt)
  return cache


def patch_activations(
    target_model: HookedTransformer,
    source_model: HookedTransformer,
    source_position: int,
    target_position: int,
    layer: int,
    target_prompt: str,
    source_cache: ActivationCache,
    activation_type: str = 'resid_pre'
):
    """ Patches an activation vector into the target model.
    """

    source_cache = source_cache[activation_type, layer]

    def hook_fn(target_activations: Float[Tensor, '...'],
                hook: HookPoint
    ) -> Float[Tensor, '...']:
        target_activations[:,target_position,:] = source_cache[:,source_position,:]
        return target_activations


    target_logits = target_model.run_with_hooks(
        target_prompt,
        return_type="logits",
        fwd_hooks=[
            (get_act_name(activation_type, layer), hook_fn)
        ]
    )

    predicted_tokens = target_logits.argmax(dim=-1).squeeze()[:-1]

    return predicted_tokens

In [17]:
model = HookedTransformer.from_pretrained("pythia-31m", device=device)
_, cache = model.run_with_cache(context_1)

layers = [i for i in range(0, model.cfg.n_layers)]

for layer in layers:
  predicted_tokens = patch_activations(target_model=model,
                                    source_model=model,
                                    source_position=-1,
                                    target_position=-1,
                                    layer=layer,
                                    target_prompt=context_2,
                                    source_cache=cache)

  next_str_token = model.to_str_tokens(predicted_tokens[-1])
  print('Layer:', layer, '-', next_str_token)

Loaded pretrained model pythia-31m into HookedTransformer
Layer: 0 - [' scared']
Layer: 1 - [' scared']
Layer: 2 - [' scared']
Layer: 3 - [' scared']
Layer: 4 - [' scared']
Layer: 5 - [' scared']


In [18]:
# Happy
context_1_ = '''Bob: Hey Alice, how was your day today?
Alice: Oh, Bob, it was fantastic! I'm still riding the high from it all!
Bob: That sounds wonderful! What happened to make it so great?
Alice: Well, for starters, the weather was absolutely perfect. Not too hot, not too cold, just the kind of day you want to bottle up and keep forever. And then, on my way to work, my favorite song played on the radio. It felt like a sign that the day was going to be amazing.
Bob: I love when that happens! It's like the universe is giving you a personal thumbs up. What else happened?
Alice: Oh, it gets better. When I got to work, I found out that I received the promotion I've been hoping for. It was such a surprise! I've worked so hard for this, Bob, and it finally paid off.
Bob: Alice, that's incredible news! Congratulations on the promotion! You totally deserve it.
Alice: Thank you so much! And there's more. During lunch, I went out with a few colleagues to celebrate, and we ended up having the best time. The food was delicious, and the company was even better.
Bob: Sounds like a perfect day from start to finish.
Alice: It really was. And to cap it all off, when I got home, I found a package waiting for me. The book I've been wanting to read for months was finally released, and my copy arrived. I can't wait to dive into it.
Bob: Wow, what a day! You've got the promotion, great food, good company, and a new book. It's like everything aligned for you today.
Alice: In one word, I was so "'''

# Angry
context_2_ = '''Bob: Hey Alice, how was your day?
Alice: Oh, don't even get me started. It was absolutely infuriating!
Bob: Really? What happened that got you so angry?
Alice: Where do I even begin? First, the traffic was a nightmare. I was stuck in my car for what felt like an eternity. And then, when I finally got to work, the coffee machine was broken. Can you believe it? No coffee!
Bob: That sounds rough. No coffee can definitely start the day on a wrong note.
Alice: Exactly! And as if that wasn't enough, my computer decided to crash right before I was about to save a crucial report. Hours of work just vanished. I had to start all over again.
Bob: That's terrible, Alice. I'm really sorry to hear that. Computers can be so unreliable when you need them the most.
Alice: And to top it all off, during lunch, I spilled my meal all over my new shirt. It's like the universe was conspiring against me today. I'm just so fed up with everything!
Bob: I can only imagine how frustrating all of that must have been. If there's anything I can do to help or if you need someone to vent to, I'm here for you.
Alice:"'''

In [19]:
model = HookedTransformer.from_pretrained("pythia-70m", device=device)
_, cache = model.run_with_cache(context_2_)

layers = [i for i in range(0, model.cfg.n_layers)]

for layer in layers:
  predicted_tokens = patch_activations(target_model=model,
                                    source_model=model,
                                    source_position=-1,
                                    target_position=-1,
                                    layer=layer,
                                    target_prompt=context_1_,
                                    source_cache=cache)

  next_str_token = model.to_str_tokens(predicted_tokens[-1])
  print('Layer:', layer, '-', next_str_token)

Loaded pretrained model pythia-70m into HookedTransformer
Layer: 0 - [' excited']
Layer: 1 - [' excited']
Layer: 2 - [' excited']
Layer: 3 - [' excited']
Layer: 4 - [' excited']
Layer: 5 - [' excited']
