# Activation Steering Experiment Notebook

**Design choices in this implementation:**
- **Model:** GPT‑2 (as a proxy for Gemma‑2‑9b). Replace with your target model if available.
- **Dataset:** SST‑2 from GLUE (using “positive” vs. “negative” sentiment). Note that “negative” is used as the non‐positive baseline.
- **Token Unembedding:** The GPT‑2 embedding matrix (tied to the LM head) is used.
- **Intervention:** A simplified approach—altering the averaged hidden state and decoding the next token.



In [1]:
!pip install mlflow torch datasets transformers scikit-learn numpy

Collecting mlflow
  Downloading mlflow-2.21.2-py3-none-any.whl.metadata (30 kB)
Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting mlflow-skinny==2.21.2 (from mlflow)
  Downloading mlflow_skinny-2.21.2-py3-none-any.whl.metadata (31 kB)
Collecting alembic!=1.10.0,<2 (from mlflow)
  Downloading alembic-1.15.2-py3-none-any.whl.metadata (7.3 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==2.21.2->mlflow)
  Downloading databricks_sdk-0.49.0-py3-none-any.whl.metadata (38 kB)
Collecting fastapi<1 (from mlflow-skinny==2.21.2->mlflow)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn<1 (from mlflow-sk

In [2]:
import numpy as np
import torch
import random
import mlflow
import mlflow.pytorch
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM

np.random.seed(42)
torch.manual_seed(42)
random.seed(42)


## Load Dataset: SST-2

We use the SST-2 dataset from GLUE. This dataset provides sentences labeled as positive (1) or negative (0).
Note: We treat the negative class as our non-positive baseline.



In [3]:
dataset = load_dataset("glue", "sst2")
n_samples = 200
train_sentences = dataset['train']['sentence'][:n_samples]
train_labels = dataset['train']['label'][:n_samples]
print(f"using {len(train_sentences)} samples for the experiment.")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/35.3k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/3.11M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/72.8k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/148k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/67349 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/872 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1821 [00:00<?, ? examples/s]

using 200 samples for the experiment.


In [4]:
model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, output_hidden_states=True)
model.eval()

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]



generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

## Extract Hidden States

For each sentence, we tokenize and pass it through GPT-2. We extract the final layer’s hidden states and
average across the sequence length to obtain a single representation per sentence.

In [5]:
def extract_hidden_state(text):
    inputs = tokenizer(text, return_tensors="pt")
    outputs = model(**inputs)
    # outputs.hidden_states is a tuple (one per layer); take the last layer and average over tokens
    hidden = outputs.hidden_states[-1].mean(dim=1).squeeze().detach().numpy()
    return hidden

hidden_states = []
for text in train_sentences:
    try:
        h = extract_hidden_state(text)
        hidden_states.append(h)
    except Exception as e:
        print(f"error extracting hidden state for text: {text} - {e}")
hidden_states = np.array(hidden_states)
labels = np.array(train_labels)
print("extracted hidden states shape:", hidden_states.shape)



extracted hidden states shape: (200, 768)


## Train the Linear Probe

We train a logistic regression classifier on the extracted hidden states to classify sentiment.
The learned weight vector \(C\) (from the logistic regression coefficients) is used as the concept direction.


In [6]:
clf = LogisticRegression(max_iter=1000)
clf.fit(hidden_states, labels)
C = clf.coef_.flatten()
print("Trained linear probe. Steering vector C shape:", C.shape)
mlflow.log_metric("C_norm", np.linalg.norm(C))


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Trained linear probe. Steering vector C shape: (768,)


## Token Unembedding Extraction

We extract token unembedding vectors from the GPT-2 embedding matrix. In GPT-2 the token embeddings (model.transformer.wte.weight)
are tied to the LM head and serve as our unembedding space. We select positive sentiment words:
["positive", "good", "great", "amazing", "excellent"].



In [7]:
positive_tokens = ["positive", "good", "great", "amazing", "excellent"]
W_pos_vectors = []
for token in positive_tokens:
    token_id = tokenizer.encode(token)[0]
    vec = model.transformer.wte.weight[token_id].detach().numpy()
    W_pos_vectors.append(vec)
W_pos_vectors = np.stack(W_pos_vectors)
print("collected positive token unembeddings shape:", W_pos_vectors.shape)

collected positive token unembeddings shape: (5, 768)


## Aggregate Token Unembeddings

We aggregate the positive token unembedding vectors by computing the mean and extracting the first principal component.
This gives us candidate vectors for \(W_{pos}\).



In [8]:
W_pos_mean = np.mean(W_pos_vectors, axis=0)
pca = PCA(n_components=1)
pca.fit(W_pos_vectors)
W_pos_pc1 = pca.components_[0]

## Compare Steering Directions

1.   List item
2.   List item



We compute the cosine similarity between the learned concept vector \(C\) and the aggregated unembedding vectors.
A higher similarity suggests that the steering direction aligns with token unembeddings for positive sentiment.



In [9]:
def cosine_similarity(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

sim_mean = cosine_similarity(C, W_pos_mean)
sim_pc1 = cosine_similarity(C, W_pos_pc1)
print("cosine sim between c & w_pos_mean:", sim_mean)
print("cosine sim between c & w_pos_pc1:", sim_pc1)
mlflow.log_metric("cosine_similarity_mean", sim_mean)
mlflow.log_metric("cosine_similarity_pc1", sim_pc1)

cosine sim between c & w_pos_mean: 0.04471475106322046
cosine sim between c & w_pos_pc1: -0.04797194282251875


In [10]:
class caa_hooked_model(torch.nn.Module):
    def __init__(self, model, steering_vector, layer_idx, alpha=1.0):
        super().__init__()
        self.model = model
        self.alpha = alpha
        # ensure steering vector is a torch tensor on model device
        self.steering_vector = torch.tensor(steering_vector, dtype=torch.float32).to(next(model.parameters()).device)
        self.layer_idx = layer_idx
        self.hook_handle = None

    def hook_fn(self, module, input, output):
        # add steering vector to residual stream
        return output + self.alpha * self.steering_vector

    def add_hook(self):
        self.hook_handle = self.model.transformer.h[self.layer_idx].register_forward_hook(self.hook_fn)

    def remove_hook(self):
        if self.hook_handle is not None:
            self.hook_handle.remove()
            self.hook_handle = None

    def forward(self, *args, **kwargs):
        self.add_hook()
        out = self.model(*args, **kwargs)
        self.remove_hook()
        return out

## Activation Steering Interventions

We simulate an intervention by modifying the hidden state of a sample sentence.
The intervention is applied by adding a scaled steering vector to the averaged hidden state.
We then compute logits using the LM head (i.e. the embedding matrix) and decode the token with the highest logit.


In [11]:
def intervene_and_generate(text, steering_vector, alpha=1.0, layer_idx=6):
    hooked_model = caa_hooked_model(model, steering_vector, layer_idx, alpha)
    inputs = tokenizer(text, return_tensors="pt")
    outputs = hooked_model(**inputs, output_hidden_states=True)
    # get last token hidden state from final layer
    last_hidden = outputs.hidden_states[-1][:, -1, :]
    logits = last_hidden @ model.transformer.wte.weight.T
    next_token_id = torch.argmax(logits, dim=-1).item()
    next_token = tokenizer.decode([next_token_id])
    return next_token

sample_text = "the movie was"
gen_C = intervene_and_generate(sample_text, C, alpha=1.0, layer_idx=6)
gen_Wpos = intervene_and_generate(sample_text, W_pos_mean, alpha=1.0, layer_idx=6)
print("generated token with steering c:", gen_C)
print("generated token with steering w_pos_mean:", gen_Wpos)
mlflow.log_param("generated_token_C", gen_C)
mlflow.log_param("generated_token_Wpos", gen_Wpos)

TypeError: can only concatenate tuple (not "Tensor") to tuple

In [None]:
def compute_caa(text, concept_vector, alpha=1.0, layer_idx=6):
    inputs = tokenizer(text, return_tensors="pt")
    # baseline (no intervention)
    with torch.no_grad():
        base_out = model(**inputs, output_hidden_states=True)
        base_hidden = base_out.hidden_states[-1][:, -1, :]
        base_logits = base_hidden @ model.transformer.wte.weight.T
        base_probs = torch.softmax(base_logits, dim=-1)
    # with intervention using caa hook
    hooked_model = caa_hooked_model(model, concept_vector, layer_idx, alpha)
    with torch.no_grad():
        hooked_model.add_hook()
        mod_out = model(**inputs, output_hidden_states=True)
        hooked_model.remove_hook()
        mod_hidden = mod_out.hidden_states[-1][:, -1, :]
        mod_logits = mod_hidden @ model.transformer.wte.weight.T
        mod_probs = torch.softmax(mod_logits, dim=-1)
    diffs = {}
    for token in positive_tokens:
        token_id = tokenizer.encode(token)[0]
        diffs[token] = mod_probs[0, token_id].item() - base_probs[0, token_id].item()
    return diffs

caa_diffs = compute_caa(sample_text, C, alpha=1.0, layer_idx=6)
print("caa diffs:", caa_diffs)

In [None]:
def generate_rollout(prompt, steering_vector=None, alpha=1.0, layer_idx=6, length=50):
    gen_toks = []
    cur_prompt = prompt
    for i in range(length):
        if steering_vector is not None:
            hooked_model = caa_hooked_model(model, steering_vector, layer_idx, alpha)
            inputs = tokenizer(cur_prompt, return_tensors="pt")
            outputs = hooked_model(**inputs, output_hidden_states=True)
        else:
            inputs = tokenizer(cur_prompt, return_tensors="pt")
            outputs = model(**inputs, output_hidden_states=True)
        last_hidden = outputs.hidden_states[-1][:, -1, :]
        logits = last_hidden @ model.transformer.wte.weight.T
        next_id = torch.argmax(logits, dim=-1).item()
        next_tok = tokenizer.decode([next_id])
        gen_toks.append(next_tok)
        cur_prompt += next_tok
    return "".join(gen_toks)

In [None]:
baseline_rollout = generate_rollout("the movie was", steering_vector=None, length=50)
intervened_rollout = generate_rollout("the movie was", steering_vector=C, alpha=1.0, layer_idx=6, length=50)
print("baseline rollout:\n", baseline_rollout)
print("\nintervened rollout (w/ c):\n", intervened_rollout)
