# Imports

In [74]:
from collections import OrderedDict
import torch
from nnsight import NNsight, LanguageModel

# Initial nnsight play around

In [None]:


input_size = 5
hidden_dims = 10
output_size = 2

net = torch.nn.Sequential(
    OrderedDict(
        [
            ("layer1", torch.nn.Linear(input_size, hidden_dims)),
            ("layer2", torch.nn.Linear(hidden_dims, output_size)),
        ]
    )
).requires_grad_(False)

In [None]:


tiny_model = NNsight(net)

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
print(tiny_model)


Sequential(
  (layer1): Linear(in_features=5, out_features=10, bias=True)
  (layer2): Linear(in_features=10, out_features=2, bias=True)
)


In [7]:
# random input
input = torch.rand((1, input_size))

In [17]:
with tiny_model.trace(input) as tracer:
    
    l1_out = tiny_model.layer1.output.save()

    tracer.stop()
    

In [19]:
print("L1-output", l1_out)

L1-output tensor([[ 0.6403, -0.2536,  0.3415,  0.6441, -1.2578, -1.2412, -0.3749,  0.0718,
          0.5762,  0.4214]])


In [21]:
llm = LanguageModel("openai-community/gpt2", device_map = "auto", dispatch=True)

print(llm)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
  (generator): Generator(
    (streamer): Streamer()
  )
)


In [None]:
with llm.trace() as tracer:
    barrier = tracer.barrier(2)
    with tracer.invoke("The Eiffel Tower is in the city of"):
        embeddings = llm.transformer.wte.output
        barrier()
    with tracer.invoke("_ _ _ _"):
        barrier()
        llm.transformer.wte.output = embeddings # we overwrite the embeddings with 
        token_ids_intervention = llm.lm_head.output.argmax(dim=-1).save()
    with tracer.invoke("_ _ _ _"):
        token_ids_original = llm.lm_head.output.argmax(dim=-1).save()

print("original prediction shape", token_ids_original[0][-1].shape)
print("Original prediction:", llm.tokenizer.decode(token_ids_original[0][-1]))

print("modified prediction shape", token_ids_intervention[0][-1].shape)
print("Modified prediction:", llm.tokenizer.decode(token_ids_intervention[0][-1]))

original prediction shape torch.Size([])
Original prediction:  _
modified prediction shape torch.Size([])
Modified prediction:  London


In [28]:
prompt = 'The Eiffel Tower is in the city of'
layers = llm.transformer.h
n_new_tokens = 50
with llm.generate(prompt, max_new_tokens = n_new_tokens) as tracer:
    hidden_states = list().save()

    with tracer.all():
        layers[0].output[0][:] = 0
        hidden_states.append(layers[-1].output)
print("Hidden state length: ", len(hidden_states))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
You have set `compile_config`, but we are unable to meet the criteria for compilation. Compilation will be skipped.


Hidden state length:  50


In [31]:
with llm.trace("The Eiffel Tower is in the city of") as tracer:
    hs11 = llm.transformer.h[11].output[0][:,-1,:].save()

with llm.edit() as llm_edited:
    llm.transformer.h[11].output[0][:,-1,:] = hs11 #overwrite hidden state at 11th layer final token

with llm.trace("Vatican is located in the city of") as tracer:
    original_tokens = llm.lm_head.output.argmax(dim=-1).save()

with llm_edited.trace("Vatican is located in the city of") as tracer:
    modified_tokens = llm.lm_head.output.argmax(dim=-1).save()


In [32]:
print("\nOriginal Prediction: ", llm.tokenizer.decode(original_tokens[0][-1]))
print("Modified Prediction: ", llm.tokenizer.decode(modified_tokens[0][-1]))


Original Prediction:   Rome
Modified Prediction:   Paris


In [33]:
llm.clear_edits()

In [35]:
with llm.trace("The Vatican is in the city of"):
    modified_tokens = llm.lm_head.output.argmax(dim=-1).save()

print("edits cleared:", llm.tokenizer.decode(modified_tokens[0][-1]))

edits cleared:  Rome


In [72]:
with llm.trace() as tracer:
    barrier = tracer.barrier(2)
    with tracer.invoke("London Bridge is in the city of"):
        h = llm.transformer.h[-1].output[0].save()#[:,-1,:]
        print(hidden_states, type(hidden_states))
        barrier()
    
    with tracer.invoke("The Eiffel Tower is in the city of"):
        barrier()
        print(llm.transformer.h[-1].output[0][:], type(llm.transformer.h[-1].output[0][:]))
        print(h)
        llm.transformer.h[-1].output[0][:] = h
        
        output = llm.output.save()
        
    

[(tensor([[[-2.5750,  0.7950, -2.2809,  ..., -4.4176, -1.3597, -3.2755],
         [-2.5750,  0.7950, -2.2809,  ..., -4.4176, -1.3597, -3.2755],
         [-2.5750,  0.7950, -2.2809,  ..., -4.4176, -1.3597, -3.2755],
         ...,
         [-2.5750,  0.7950, -2.2809,  ..., -4.4176, -1.3597, -3.2755],
         [-2.5750,  0.7951, -2.2809,  ..., -4.4176, -1.3597, -3.2755],
         [-2.5750,  0.7951, -2.2809,  ..., -4.4176, -1.3597, -3.2755]]],
       device='cuda:0'),), (tensor([[[-2.5750e+00,  7.9505e-01, -2.2809e+00,  1.3458e+00,  2.1648e+00,
           1.1182e+00,  1.8016e+00,  2.1401e-01, -2.7571e+00,  1.5230e+00,
           4.6846e+00,  5.9023e-01, -3.7853e+00, -3.9914e-01, -6.1569e-01,
          -1.4771e-01,  1.2602e+00,  5.3066e-02,  1.5328e+00,  1.9189e+00,
           3.0607e+00,  8.5428e-01, -1.6071e+00, -3.3766e+00, -1.8221e+00,
           1.9469e+00, -1.8780e+00, -3.1536e+00, -1.1957e+00, -2.2716e+00,
           1.6627e+00,  2.7978e+00,  1.2668e+00, -9.9926e-01,  1.9046e+00,
   

In [66]:
llm.tokenizer.decode(torch.argmax(output.logits[0,-1,:]))

','

# GPT-2 linear probe

1. Load dataset of angry vs sad
2. Get final token activations all layers
3. Generate the activations datasets
4. Compute mean for the different samples
5. Compute dot products, and measure accuracy on test set

## Loading dataset

In [78]:
import pandas as pd
import polars as pl
import pickle

In [None]:
pickle_path = '/workspace/mechint-examples/merged_training.pkl'
emotions_df = pl.from_pandas(pd.read_pickle(pickle_path))

In [88]:
EMOTIONS = ['joy','anger']
full_emotions = emotions_df.filter(pl.col('emotions').is_in(EMOTIONS))

In [107]:
N_SAMPLES = 1000
N_SAMPLES_PER_EMOTIONS = N_SAMPLES// len(EMOTIONS)
N_TRAIN = int((N_SAMPLES * 0.8))

dataset_df = pl.DataFrame()
for emotion in EMOTIONS:
    dataset_df = pl.concat([dataset_df, full_emotions.filter(pl.col('emotions').eq(emotion)).head(N_SAMPLES_PER_EMOTIONS)])
dataset_df = dataset_df.sample(fraction=1,shuffle=True,seed=42)

In [108]:
train_df = dataset_df[:N_TRAIN]
test_df = dataset_df[N_TRAIN:]

## Get final token activations