In [1]:
from datasets import load_dataset
import transformer_lens
from tqdm import tqdm

import argparse
import transformer_lens
import torch
import time

import os
import sys

sys.path.append("../")
from shared.utils import LogFeatureDensityHistogram, get_datetime_string

from shared import (
    SparseAutoencoder,
    SparseAutoencoderConfig,
    save_weights_with_description,
    load_weights_with_description,
)

device = "cuda:0"
device1 = "cuda:1"

In [None]:
# Find the sentences with the highest feature activations


In [2]:
embed_dims = 768
sae = SparseAutoencoder(SparseAutoencoderConfig(d_model=embed_dims, d_sparse=8 * embed_dims, sparsity_alpha=0)).to(device1)

In [3]:
sae_weights = load_weights_with_description(os.path.join("./weights", "sae_weights.pth"))
sae.load_state_dict(sae_weights)

<All keys matched successfully>

In [4]:
def prune_stored_sentences(sentences, top_k = 50):
  # assume sentences is (feature activation, sentence)
  # Keep the lowest and highest top_k feature activations
  sentences.sort()

  if len(sentences) < 2 * top_k:
    return sentences

  new_sentences = sentences[:top_k] + sentences[-top_k:]
  return new_sentences



In [5]:
mlp_layer = 5
gpt_model = transformer_lens.HookedTransformer.from_pretrained("gpt2").to(device)

# Load in the dataset
ds = load_dataset("JeanKaddour/minipile")


Loaded pretrained model gpt2 into HookedTransformer
Moving model to device:  cuda:0


In [6]:
feature_sentences = [[] for i in range(8 * embed_dims)]

In [19]:
data_collection = None
mode = "test"
pbar = tqdm(range(len(ds[mode])), desc="Processing")
count = 0
for sample in ds[mode]:
  pbar.update(1)
  count += 1
  if count > 50:
    break

  # Break into sentences and find the activations
  sample_sentences = sample["text"].split(".")
  for sent in sample_sentences:

    logits, activations = gpt_model.run_with_cache(sent)

    mlp_out = activations.cache_dict[f"blocks.{mlp_layer}.hook_mlp_out"].to(device1) # example MLP output, shape: (1, # samples, # dim)

    y, f, loss, reconstruction_loss = sae(mlp_out, True)

    # print(f.shape)

    f_by_feature = torch.max(f, dim = 1).squeeze(0)
    # print(f_by_feature.shape)

    for i in range(f_by_feature.shape[0]):
      feature_sentences[i].append((f_by_feature[i], sent))
      feature_sentences[i] = prune_stored_sentences(feature_sentences[i])





Processing:   0%|          | 1/10000 [04:39<775:55:35, 279.36s/it]


KeyboardInterrupt: 

In [7]:
import ipywidgets as widgets
from IPython.display import display, HTML
from transformers import GPT2Tokenizer
import json
import os

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
output_area = widgets.Output()

def clean_token(token):
    return token.replace("Ġ", " ")

def highlight_text(text, activations):
    tokens = tokenizer.tokenize(text)
    activations = activations[1:]
    max_activation = max(activations)
    highlighted_text = ""

    for i in range(len(tokens)):
        token = clean_token(tokens[i])
        opacity = activations[i] / max_activation if max_activation > 0 else 0
        highlighted_text += f'<span style="background-color: rgba(255, 0, 255, {opacity})">{token}</span>'
    return highlighted_text

def on_text_submit(change):
    input_text = text_input.value
    feature_index = int(feature_index_input.value)
    if input_text == "":
        return

    # Load feature description
    feature_file_path = os.path.join("./features", f"feature_{feature_index}.json")
    with open(feature_file_path, "r") as feature_file:
        feature_data = json.load(feature_file)
        feature_description.value = feature_data.get("label", "No description available.")

    logits, activations = gpt_model.run_with_cache(input_text)
    mlp_out = activations.cache_dict[f"blocks.{mlp_layer}.hook_mlp_out"].to(device1)
    y, f, loss, reconstruction_loss = sae(mlp_out, True)
    highlighted_html = highlight_text(input_text, f[0, :, feature_index])
    output_area.clear_output()
    with output_area:
        display(HTML(highlighted_html))

text_input = widgets.Textarea(
    value='',
    placeholder='Enter text here...',
    description='Input:',
    disabled=False
)

feature_index_input = widgets.Text(
    value='1',
    placeholder='Enter feature index here...',
    description='Feature Index:',
    disabled=False
)
feature_description = widgets.Textarea(
    value='',
    placeholder='Feature description will appear here...',
    description='Description:',
    disabled=True,
    layout=widgets.Layout(width='auto', height='auto')
)

text_input.observe(on_text_submit, names='value')
feature_index_input.observe(on_text_submit, names='value')
display(text_input)
display(feature_index_input)
display(feature_description)
display(output_area)



Textarea(value='', description='Input:', placeholder='Enter text here...')

Text(value='1', description='Feature Index:', placeholder='Enter feature index here...')

Textarea(value='', description='Description:', disabled=True, layout=Layout(height='auto', width='auto'), plac…

Output()

ValueError: invalid literal for int() with base 10: ''

In [2]:
import os

os.environ["OPENAI_API_KEY"] = ""

from neuron_explainer.activations.activation_records import calculate_max_activation
from neuron_explainer.activations.activations import ActivationRecordSliceParams, load_neuron
from neuron_explainer.explanations.calibrated_simulator import UncalibratedNeuronSimulator
from neuron_explainer.explanations.explainer import TokenActivationPairExplainer
from neuron_explainer.explanations.prompt_builder import PromptFormat
from neuron_explainer.explanations.scoring import simulate_and_score
from neuron_explainer.explanations.simulator import ExplanationNeuronSimulator




In [3]:
saved_samples = torch.load("./activating_tokens.pt")

In [4]:
activating_tokens_pair = saved_samples["activating_tokens"]
activating_tokens = [[activating_sample for (activation, activating_sample) in activating_tokens_pair[i]] for i in range(len(activating_tokens_pair))]


In [6]:
activating_tokens[0]

[ActivationRecord(dataclass_name='ActivationRecord', tokens=['BM', 'W', ' L', 'j', 'ub', 'l', 'j', 'ana', ' Open', '\n', '\n', 'The', ' BMW', ' L', 'j', 'ub', 'l', 'j', 'ana', ' Open', ' is', ' a', ' professional', ' tennis', ' tournament', ' played', ' on', ' outdoor', ' red', ' clay', ' courts', '.', ' It', ' is', ' currently', ' part', ' of', ' the', ' Association', ' of', ' Tennis', ' Profession', 'als', ' (', 'AT', 'P', ')', ' Challenger', ' Tour', '.', ' It', ' is', ' held', ' annually', ' in', ' L', 'j', 'ub', 'l', 'j', 'ana', ',', ' Slovenia', ' (', 'former', ' Yugoslavia', '),', ' since', ' 1990', '.', '\n', '\n', 'Past', ' finals', '\n', '\n', 'Sing', 'les', '\n', '\n', 'Dou', 'bles', '\n', '\n', 'External', ' links', '\n', 'Site', ' about', ' Sloven', 'ian', ' Tennis', '\n', 'IT', 'F', ' Search', '\n', '\n', 'Category', ':', 'AT', 'P', ' Challenger', ' Tour', '\n', 'Category', ':', 'Cl', 'ay', ' court', ' tennis', ' tournaments', '\n', 'Category', ':', 'T', 'ennis', ' tourna

In [18]:
EXPLAINER_MODEL_NAME = "gpt-4o-mini"
SIMULATOR_MODEL_NAME = "davinci-002"

activation_records = activating_tokens[0][20:40] + activating_tokens[0][80:100]
valid_activation_records = activating_tokens[0][45:60]

explainer = TokenActivationPairExplainer(
    model_name=EXPLAINER_MODEL_NAME,
    prompt_format=PromptFormat.HARMONY_V4,
    max_concurrent=1,
)

explanations = await explainer.generate_explanations(
    all_activation_records=activation_records,
    max_activation=calculate_max_activation(activation_records),
    num_samples=1,
)
explanation = explanations[0]

# Simulate and score the explanation.
simulator = UncalibratedNeuronSimulator(
    ExplanationNeuronSimulator(
        SIMULATOR_MODEL_NAME,
        explanation,
        max_concurrent=1,
        prompt_format=PromptFormat.INSTRUCTION_FOLLOWING,
    )
)
scored_simulation = await simulate_and_score(simulator, valid_activation_records)
print(f"score={scored_simulation.get_preferred_score():.2f}")

Prompt is too long: 102159 + 60 > 4097
Prompt is too long: 102047 + 60 > 4097
Prompt is too long: 101882 + 60 > 4097
Prompt is too long: 101792 + 60 > 4097
Prompt is too long: 101792 + 60 > 4097


  - np.mean(np.square(np.array(real_activations) - np.array(predicted_activations)))
  - np.mean(np.abs(np.array(real_activations) - np.array(predicted_activations)))
  c /= stddev[:, None]
  c /= stddev[None, :]


ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

In [7]:
activation_records = activating_tokens[0]

In [8]:
max_activation = calculate_max_activation(activation_records)
for activation_record in activation_records:
  text = ""
  for i in range(len(activation_record.tokens)):
    text += activation_record.tokens[i]
    if activation_record.activations[i] > 0:
      text += f"[{int(10 * activation_record.activations[i] / max_activation)}]"
  activation_record.merged_text = text


In [9]:
save_result = {
  "description": "test",
  "text": [activation_record.merged_text for activation_record in activation_records]
}

In [14]:
import json
with open(
  os.path.join("/home/nickj/sae/auto-ed-coder/decoder/feature1", f"feature_test.json"), "w"
) as json_file:
  json.dump(save_result, json_file, indent=4)

In [17]:
calculate_max_activation(activation_records)

0.0010288155