# Imports

In [1]:
from collections import OrderedDict
import torch
from nnsight import NNsight, LanguageModel

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import pandas as pd
import polars as pl
import pickle
from einops import einsum, rearrange, reduce

# Initial nnsight play around

In [None]:


input_size = 5
hidden_dims = 10
output_size = 2

net = torch.nn.Sequential(
    OrderedDict(
        [
            ("layer1", torch.nn.Linear(input_size, hidden_dims)),
            ("layer2", torch.nn.Linear(hidden_dims, output_size)),
        ]
    )
).requires_grad_(False)

In [None]:


tiny_model = NNsight(net)

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
print(tiny_model)


Sequential(
  (layer1): Linear(in_features=5, out_features=10, bias=True)
  (layer2): Linear(in_features=10, out_features=2, bias=True)
)


In [7]:
# random input
input = torch.rand((1, input_size))

In [17]:
with tiny_model.trace(input) as tracer:
    
    l1_out = tiny_model.layer1.output.save()

    tracer.stop()
    

In [19]:
print("L1-output", l1_out)

L1-output tensor([[ 0.6403, -0.2536,  0.3415,  0.6441, -1.2578, -1.2412, -0.3749,  0.0718,
          0.5762,  0.4214]])


In [21]:
llm = LanguageModel("openai-community/gpt2", device_map = "auto", dispatch=True)

print(llm)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
  (generator): Generator(
    (streamer): Streamer()
  )
)


In [None]:
with llm.trace() as tracer:
    barrier = tracer.barrier(2)
    with tracer.invoke("The Eiffel Tower is in the city of"):
        embeddings = llm.transformer.wte.output
        barrier()
    with tracer.invoke("_ _ _ _"):
        barrier()
        llm.transformer.wte.output = embeddings # we overwrite the embeddings with 
        token_ids_intervention = llm.lm_head.output.argmax(dim=-1).save()
    with tracer.invoke("_ _ _ _"):
        token_ids_original = llm.lm_head.output.argmax(dim=-1).save()

print("original prediction shape", token_ids_original[0][-1].shape)
print("Original prediction:", llm.tokenizer.decode(token_ids_original[0][-1]))

print("modified prediction shape", token_ids_intervention[0][-1].shape)
print("Modified prediction:", llm.tokenizer.decode(token_ids_intervention[0][-1]))

original prediction shape torch.Size([])
Original prediction:  _
modified prediction shape torch.Size([])
Modified prediction:  London


In [28]:
prompt = 'The Eiffel Tower is in the city of'
layers = llm.transformer.h
n_new_tokens = 50
with llm.generate(prompt, max_new_tokens = n_new_tokens) as tracer:
    hidden_states = list().save()

    with tracer.all():
        layers[0].output[0][:] = 0
        hidden_states.append(layers[-1].output)
print("Hidden state length: ", len(hidden_states))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
You have set `compile_config`, but we are unable to meet the criteria for compilation. Compilation will be skipped.


Hidden state length:  50


In [31]:
with llm.trace("The Eiffel Tower is in the city of") as tracer:
    hs11 = llm.transformer.h[11].output[0][:,-1,:].save()

with llm.edit() as llm_edited:
    llm.transformer.h[11].output[0][:,-1,:] = hs11 #overwrite hidden state at 11th layer final token

with llm.trace("Vatican is located in the city of") as tracer:
    original_tokens = llm.lm_head.output.argmax(dim=-1).save()

with llm_edited.trace("Vatican is located in the city of") as tracer:
    modified_tokens = llm.lm_head.output.argmax(dim=-1).save()


In [32]:
print("\nOriginal Prediction: ", llm.tokenizer.decode(original_tokens[0][-1]))
print("Modified Prediction: ", llm.tokenizer.decode(modified_tokens[0][-1]))


Original Prediction:   Rome
Modified Prediction:   Paris


In [33]:
llm.clear_edits()

In [35]:
with llm.trace("The Vatican is in the city of"):
    modified_tokens = llm.lm_head.output.argmax(dim=-1).save()

print("edits cleared:", llm.tokenizer.decode(modified_tokens[0][-1]))

edits cleared:  Rome


In [72]:
with llm.trace() as tracer:
    barrier = tracer.barrier(2)
    with tracer.invoke("London Bridge is in the city of"):
        h = llm.transformer.h[-1].output[0].save()#[:,-1,:]
        print(hidden_states, type(hidden_states))
        barrier()
    
    with tracer.invoke("The Eiffel Tower is in the city of"):
        barrier()
        print(llm.transformer.h[-1].output[0][:], type(llm.transformer.h[-1].output[0][:]))
        print(h)
        llm.transformer.h[-1].output[0][:] = h
        
        output = llm.output.save()
        
    

[(tensor([[[-2.5750,  0.7950, -2.2809,  ..., -4.4176, -1.3597, -3.2755],
         [-2.5750,  0.7950, -2.2809,  ..., -4.4176, -1.3597, -3.2755],
         [-2.5750,  0.7950, -2.2809,  ..., -4.4176, -1.3597, -3.2755],
         ...,
         [-2.5750,  0.7950, -2.2809,  ..., -4.4176, -1.3597, -3.2755],
         [-2.5750,  0.7951, -2.2809,  ..., -4.4176, -1.3597, -3.2755],
         [-2.5750,  0.7951, -2.2809,  ..., -4.4176, -1.3597, -3.2755]]],
       device='cuda:0'),), (tensor([[[-2.5750e+00,  7.9505e-01, -2.2809e+00,  1.3458e+00,  2.1648e+00,
           1.1182e+00,  1.8016e+00,  2.1401e-01, -2.7571e+00,  1.5230e+00,
           4.6846e+00,  5.9023e-01, -3.7853e+00, -3.9914e-01, -6.1569e-01,
          -1.4771e-01,  1.2602e+00,  5.3066e-02,  1.5328e+00,  1.9189e+00,
           3.0607e+00,  8.5428e-01, -1.6071e+00, -3.3766e+00, -1.8221e+00,
           1.9469e+00, -1.8780e+00, -3.1536e+00, -1.1957e+00, -2.2716e+00,
           1.6627e+00,  2.7978e+00,  1.2668e+00, -9.9926e-01,  1.9046e+00,
   

In [66]:
llm.tokenizer.decode(torch.argmax(output.logits[0,-1,:]))

','

# Qwen-3-1.7B linear probe

1. Load dataset of angry vs sad
2. Get final token activations all layers
3. Generate the activations datasets
4. Compute mean for the different samples
5. Compute dot products, and measure accuracy on test set

In [None]:
llm = LanguageModel("meta-llama/Llama-3.2-8B-Instruct", device_map = "auto", dispatch=True)

## Loading dataset

In [651]:
'''pickle_path = '/workspace/mechint-examples/merged_training.pkl'
emotions_df = pl.from_pandas(pd.read_pickle(pickle_path))'''

"pickle_path = '/workspace/mechint-examples/merged_training.pkl'\nemotions_df = pl.from_pandas(pd.read_pickle(pickle_path))"

In [652]:
emotions_df = pl.DataFrame()
for emotion in ['happiness', 'sadness']:
    df = pl.from_pandas(pd.read_csv(f'/workspace/mechint-examples/{emotion}.txt', sep='\t', header=None)).rename({'0':'text'}).with_columns(emotions = pl.lit(emotion))
    emotions_df = pl.concat([emotions_df,df])

In [653]:
emotions_df.group_by(pl.col('emotions')).agg(pl.count())

(Deprecated in version 0.20.5)
  emotions_df.group_by(pl.col('emotions')).agg(pl.count())


emotions,count
str,u32
"""sadness""",871
"""happiness""",848


In [654]:
EMOTIONS = ['happiness','sadness']
full_emotions = emotions_df.filter(pl.col('emotions').is_in(EMOTIONS))

In [655]:
N_SAMPLES = min(1000,len(full_emotions))
N_SAMPLES_PER_EMOTIONS = N_SAMPLES// len(EMOTIONS)
N_TRAIN = int((N_SAMPLES * 0.8))

dataset_df = pl.DataFrame()
for emotion in EMOTIONS:
    dataset_df = pl.concat([dataset_df, full_emotions.filter(pl.col('emotions').eq(emotion)).head(N_SAMPLES_PER_EMOTIONS)])
dataset_df = dataset_df.sample(fraction=1,shuffle=True,seed=42)

In [656]:
#dataset_df = dataset_df.with_columns(text = pl.when(pl.col('emotions').eq('joy')).then(pl.col('text').str.to_uppercase()).otherwise(pl.col('text')))

In [657]:
train_df = dataset_df[:N_TRAIN]
test_df = dataset_df[N_TRAIN:]

In [658]:
emotion_to_value = {emotion: i for i, emotion in enumerate(sorted(dataset_df['emotions'].unique().to_list()))}

In [659]:
emotion_to_value

{'happiness': 0, 'sadness': 1}

## Get final token activations

In [660]:
from torch.utils.data import Dataset, DataLoader

In [661]:
class EmotionsDataset(Dataset):
    def __init__(self, df: pl.DataFrame):
        self.df = df
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx: int):
        row = self.df[idx]
        X = row['text'].item()
        y = emotion_to_value[row['emotions'].item()]
        return X,y

### batch size = 1

In [553]:
trainEmotionsDataset= EmotionsDataset(train_df)
trainLoader = DataLoader(trainEmotionsDataset)

In [554]:
testEmotionsDataset= EmotionsDataset(test_df)
testLoader = DataLoader(testEmotionsDataset)

In [54]:
final_token_activations = []
for X,y in trainLoader:
    input = llm.tokenizer.tokenize(X[0])
    with llm.trace(X) as tracer:
        activations = torch.stack([layer.output[0][0,-1,:] for layer in llm.transformer.h])
        activations.save()
    final_token_activations.append((activations,y[0]))
    break

NNsightException: 

Traceback (most recent call last):
  File "/tmp/ipykernel_26408/781421246.py", line 7, in <module>
    activations = torch.stack([layer.output[0][0,-1,:] for layer in llm.transformer.h])
  File "/workspace/venv/lib/python3.11/site-packages/nnsight/intervention/envoy.py", line 1048, in __getattr__
    raise AttributeError(f"{self} has no attribute {name}")

AttributeError: Qwen3ForCausalLM(
  (model): Qwen3Model(
    (embed_tokens): Embedding(151936, 2560)
    (layers): ModuleList(
      (0-35): 36 x Qwen3DecoderLayer(
        (self_attn): Qwen3Attention(
          (q_proj): Linear(in_features=2560, out_features=4096, bias=False)
          (k_proj): Linear(in_features=2560, out_features=1024, bias=False)
          (v_proj): Linear(in_features=2560, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=2560, bias=False)
          (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
          (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
        )
        (mlp): Qwen3MLP(
          (gate_proj): Linear(in_features=2560, out_features=9728, bias=False)
          (up_proj): Linear(in_features=2560, out_features=9728, bias=False)
          (down_proj): Linear(in_features=9728, out_features=2560, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)
        (post_attention_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)
      )
    )
    (norm): Qwen3RMSNorm((2560,), eps=1e-06)
    (rotary_emb): Qwen3RotaryEmbedding()
  )
  (lm_head): Linear(in_features=2560, out_features=151936, bias=False)
  (generator): Generator(
    (streamer): Streamer()
  )
) has no attribute transformer

### batch size > 1

The thing to ensure here is that we look at the correct token for everything in our batch. Processes 1,000 GPT-2 prompts in <5 seconds on a RTX 4000 (only 20p per hour)!

In [662]:
batch_size = 16
trainEmotionsDataset= EmotionsDataset(train_df)
testEmotionsDataset= EmotionsDataset(test_df)
trainLoader = DataLoader(trainEmotionsDataset, batch_size=batch_size)
testLoader = DataLoader(testEmotionsDataset, batch_size=batch_size)

In [663]:
len(testEmotionsDataset)

200

In [692]:
def get_final_token_activations_dataset(loader: DataLoader):
    final_token_activations=[]
    llm.eval()
    for X, y in loader:

        X = [llm.tokenizer.apply_chat_template([[
            {"role": "user", "content": input_str}
        ]],tokenize=False, add_generation_prompt=True)[0] for input_str in X]

        seq_idxs = [len(llm.tokenizer.tokenize(input_str)) - 1 for input_str in X]
        with torch.no_grad():
            with llm.trace(X) as tracer:
                activations= rearrange(
                    torch.stack([layer.output[range(len(seq_idxs)),seq_idxs,:] for layer in llm.model.layers]),#llm.transformer.h]),
                    'l b h -> b l h'
                )
                final_token_activations.append((activations,y))

    dataset = (torch.concat([x[0] for x in final_token_activations]), torch.concat([x[1] for x in final_token_activations]))
    return dataset

In [693]:
train_dataset = get_final_token_activations_dataset(trainLoader)

In [694]:
test_dataset = get_final_token_activations_dataset(testLoader)

In [695]:
len(testLoader)

13

## Compute the mean

In [696]:
eq_0_mask = (train_dataset[1] == 0)

In [697]:
first_mean = train_dataset[0][eq_0_mask].mean(dim=0)
second_mean = train_dataset[0][~eq_0_mask].mean(dim=0)

In [698]:
more_first_vec = first_mean - second_mean

In [699]:
test_dataset[0].shape

torch.Size([200, 24, 896])

In [700]:
more_first_vec_normed = more_first_vec/ more_first_vec.norm(dim=1,keepdim=True)

In [701]:
eval_dataset = test_dataset
scores = einsum(
    eval_dataset[0], more_first_vec_normed, 'b l h, l h -> b l'
)/eval_dataset[0].shape[-1]
scores = scores - scores.median(0).values

In [702]:
is_first_preds = scores > 0

In [703]:
is_first_actual = (eval_dataset[1] == 0)

In [704]:
is_correct = (is_first_preds.to('cuda') == is_first_actual.to('cuda').unsqueeze(-1))

In [727]:
is_correct.type(torch.float32).mean(0)

tensor([0.5600, 0.5000, 0.5000, 0.5400, 0.5300, 0.5200, 0.5650, 0.5700, 0.6100,
        0.5900, 0.5950, 0.5850, 0.6000, 0.6050, 0.6050, 0.6200, 0.6550, 0.6150,
        0.6050, 0.6350, 0.6200, 0.6100, 0.5950, 0.5850], device='cuda:0')

In [705]:
best_layer = torch.argmax(reduce(is_correct.type(torch.float32), 'b l -> l', 'mean')).cpu().item()
best_layer

16

### Logistic regression

In [678]:
import torch.nn as nn

In [679]:
class LogisticClassifier(nn.Module):
    def __init__(self, input_dim: int, output_dim: int, hidden_dim: int = 100):
        super().__init__()
        self.lin1 = nn.Linear(input_dim, output_dim)
    
    def forward(self,x: torch.Tensor):
        return torch.sigmoid(self.lin1(x)) #using a hidden MLP cos why not

In [680]:
b, l, h = train_dataset[0].shape

In [681]:
from torch.optim import Adam
from torch.nn import BCELoss

from torch.nn.functional import one_hot

In [682]:
logisticClassifier = LogisticClassifier(h, 2).to('cuda')

In [683]:
adam = Adam(logisticClassifier.parameters(),lr=1e-5)
loss_fn = BCELoss()

In [381]:
N_EPOCHS = 1000
for i in range(N_EPOCHS):
    logisticClassifier.zero_grad()
    preds = logisticClassifier(train_dataset[0][:,-1,:].to('cuda'))
    loss = loss_fn(preds, one_hot(train_dataset[1],num_classes=2).type(torch.float32).to('cuda'))
    loss.backward()
    adam.step()
    if i % 100 == 0:
        print(loss)
    

tensor(4.8468, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
tensor(3.6128, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
tensor(3.3661, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
tensor(3.2531, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
tensor(3.1929, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
tensor(3.1606, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
tensor(3.1433, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
tensor(3.1333, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
tensor(3.1269, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
tensor(3.1227, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)


In [383]:

eval_dataset = test_dataset
preds = logisticClassifier(eval_dataset[0][:,-1,:].to('cuda')).argmax(-1)

In [384]:
is_correct = (test_dataset[1].to('cuda') == preds)

In [385]:
is_correct.type(torch.float32).mean()

tensor(0.6250, device='cuda:0')

### Steering

In [728]:
best_layer = -5

In [729]:
steering_vec = more_first_vec[best_layer]

In [730]:
input_str = "I am feeling okay"
formatted_input = llm.tokenizer.apply_chat_template([[
            {"role": "user", "content": input_str}
        ]], tokenize=False, add_generation_prompt=True)[0] #+ "<|im_start|>assistant\n"

In [740]:
n_new_tokens = 100
with llm.generate(formatted_input,max_new_tokens = n_new_tokens) as tracer:
    llm.model.layers[best_layer].output += -2*steering_vec
    out = llm.generator.output.save()

Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


In [739]:
llm.tokenizer.decode(torch.tensor(out[0]))

  llm.tokenizer.decode(torch.tensor(out[0]))


"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nI am feeling okay<|im_end|>\n<|im_start|>assistant\nHello user!, I'm here to help. Let's get started.ContentLoaded\nContentLoaded\nContentLoaded\nContentLoaded\nContentLoaded\nContentLoaded\nContentLoaded\nContentLoaded\nContentLoaded\nContentLoaded\nContentLoaded\nContentLoaded\nContentLoaded\nContentLoaded\nContentLoaded\nContentLoaded\nContentLoaded\nContentLoaded\nContentLoaded\nContentLoaded\nContentLoaded\nContentLoaded\nContentLoaded\nContentLoaded\nContentLoaded\nContentLoaded\nContentLoaded\nContentLoaded\nContentLoaded\nContentLoaded\nContentLoaded\nContentLoaded\nContentLoaded\nContentLoaded\nContentLoaded\nContentLoaded\nContentLoaded\nContentLoaded\nContentLoaded\nContentLoaded\nContentLoaded\nContentLoaded\nContentLoaded\n"

In [733]:
with llm.generate(formatted_input,max_new_tokens = n_new_tokens) as tracer:
    llm.model.layers[best_layer].output -= 0.5*steering_vec
    out = llm.generator.output.save()

Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


In [734]:
llm.tokenizer.decode(torch.tensor(out[0]))

  llm.tokenizer.decode(torch.tensor(out[0]))


"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nI am feeling okay<|im_end|>\n<|im_start|>assistant\nHello user!, I'm here to help. Let's get started.ipmap\nonica\nI'm sorry, but I don't understand what you're asking. Could you please provide more context or clarify your question?ipmap\nonica\nI'm sorry, but I don't understand what you're asking. Could you please provide more context or clarify your question?ipmap\nonica\nI'm sorry, but I don't understand what you're asking. Could you please provide more context or clarify your question"

In [735]:
steering_vec

tensor([-1.7464e-01,  3.6001e-01,  5.2012e-02,  4.2715e-01,  1.9865e-01,
        -1.4457e-01, -1.3388e-01,  5.3447e-02,  5.6951e-02,  2.2008e-02,
         2.4717e-01, -3.3482e-02, -2.3308e-01, -3.4218e-01, -2.6414e-01,
         1.2887e-01, -6.3357e-02,  1.9625e-01,  3.1991e-01, -1.0931e-01,
        -4.2174e-01,  4.8927e-02,  1.3206e-01, -8.2161e-02,  1.4244e-01,
         1.4718e-02, -2.6552e-01,  3.8802e-02,  6.4257e-02,  1.3811e-01,
        -4.0593e-02,  3.0087e-01, -1.7993e-01,  2.2970e-01,  1.6279e-01,
         2.8279e-01,  9.6116e-02,  1.0085e-01,  4.3998e-02, -8.9865e-02,
         1.2083e-02,  1.5377e-01, -5.2678e-02, -2.4458e-02, -3.2035e-02,
         2.6851e-01,  5.1165e-02,  1.7813e-01, -3.0208e-01,  2.3052e-02,
         6.8036e-02, -4.5112e-01,  6.3080e-02, -6.4853e-01, -3.7048e-02,
         3.9507e-01,  4.5689e-01,  3.5338e-01,  1.7486e+00, -1.0049e-02,
        -1.5347e-02,  1.0500e-01, -7.1542e+00,  2.5279e-01, -1.9645e-01,
        -1.4710e-01,  2.0355e-01, -1.6328e-01,  1.8