# CircuitsVis Demonstration

## Setup/Imports

__Note:__ To run Jupyter directly within this repo, you may need to run `poetry run pip install jupyter`.

In [1]:
# Enable python import reloading
%load_ext autoreload
%autoreload 2

# Imports
import numpy as np
from circuitsvis.attention import attention_patterns, attention_pattern
from circuitsvis.activations import text_neuron_activations
from circuitsvis.examples import hello
from circuitsvis.tokens import colored_tokens
from circuitsvis.topk_tokens import topk_tokens
from circuitsvis.topk_samples import topk_samples

## Built In Visualizations

### Activations

#### Text Neuron Activations (single sample)

In [2]:
tokens = ['Hi', ' and', ' welcome', ' to', ' the', ' Attention', ' Patterns', ' example']
n_layers = 3
n_neurons_per_layer = 4
activations = np.random.normal(size=(len(tokens), n_layers, n_neurons_per_layer))
activations = np.exp(activations) / np.exp(activations).sum(axis=0, keepdims=True) 
text_neuron_activations(tokens=tokens, activations=activations)

#### Text Neuron Activations (multiple samples)

In [3]:
tokens = [['Hi', ' and', ' welcome', ' to', ' the', ' Attention', ' Patterns', ' example'], ['This', ' is', ' another', ' example', ' of', ' colored', ' tokens'], ['And', ' here', ' another', ' example', ' of', ' colored', ' tokens', ' with', ' more', ' words.'], ['This', ' is', ' another', ' example', ' of', ' tokens.']]
n_layers = 3
n_neurons_per_layer = 4
activations = []
for sample in tokens:
    sample_activations = np.random.normal(size=(len(sample), n_layers, n_neurons_per_layer)) * 5
    activations.append(sample_activations)
text_neuron_activations(tokens=tokens, activations=activations)



### Attention

#### Attention Pattern (single head)

In [None]:
tokens = ['Hi', ' and', ' welcome', ' to', ' the', ' Attention', ' Patterns', ' example']
attention = np.tril(np.random.normal(loc=0.3, scale=0.2, size=(8,8)))
attention_pattern(tokens=tokens, attention=attention)

#### Attention Patterns

In [None]:
tokens = ['Hi', ' and', ' welcome', ' to', ' the', ' Attention', ' Patterns', ' example']
attention = np.tril(np.random.normal(loc=0.3, scale=0.2, size=(12,8,8)))
attention_patterns(tokens=tokens, attention=attention)

### Tokens

#### Colored Tokens

In [None]:
tokens = ['Hi', ' and', ' welcome', ' to', ' the', ' Colored', ' Tokens', ' example']
values = np.random.normal(size=(len(tokens))).tolist()
colored_tokens(tokens, values)

### Topk Tokens Table

In [None]:
tokens = [['Hi', ' and', ' welcome', ' to', ' the', ' Attention', ' Patterns', ' example'], ['This', ' is', ' another', ' example']]
n_layers = 3
n_neurons_per_layer = 25
activations = []  # list of samples of shape (n_layers, n_tokens, n_neurons)
for sample in tokens:
    sample_activations = np.random.normal(size=(n_layers, len(sample), n_neurons_per_layer))
    sample_activations = np.exp(sample_activations) / np.exp(sample_activations).sum(axis=1, keepdims=True)
    activations.append(sample_activations)

# Assume we have an arbitrary selection of layers
layer_labels = [2, 7, 9]
topk_tokens(tokens=tokens, activations=activations, max_k=7, first_dimension_name="Layer", third_dimension_name="Neuron", first_dimension_labels=layer_labels)

### Topk Samples

In [2]:
# Assume single layer
tokens = [
    [
        ['Hi', ' and', ' welcome', ' to', ' the', ' topksamples', ' example'], ['This', ' is', ' another', ' example', ' of', ' colored', ' tokens'], ['Yet', ' another', ' example', ' of', ' colored', ' tokens']
    ],
    [
        ['And', ' here', ' another', ' example', ' of', ' colored', ' tokens', ' with', ' more', ' words.'], ['This', ' is', ' another', ' example', ' of', ' tokens.'], ['Again', ', ', ' another', ' example', ' of', ' colored', ' tokens']
    ],
    [
        ['Another', ' example', ' of', ' something', ' of', ' colored', ' tokens', ' with', ' more', ' words.'], ['Weee', ' is', ' another', ' example', ' of', ' tokens.'], ['Once', ' again', ' another', ' example', ' of', ' colored', ' tokens']
    ]
]  # list of samples for the layer (n_neurons (3), samples (3), tokens (varied))
activations = []
for neuron in range(len(tokens)):
    neuron_acts = []
    
    for k in range(len(tokens[0])):
        acts = (np.random.normal(size=(len(tokens[neuron][k]))) * 5).tolist()
        neuron_acts.append(acts)
    activations.append(neuron_acts)
    
# Assume we have an arbitrary selection of neurons
neuron_labels = [2, 7, 9]
# Wrap tokens and activations in an outer list to represent the single layer
topk_samples(tokens=[tokens], activations=[activations], zeroth_dimension_name="Layer", first_dimension_name="Neuron", first_dimension_labels=neuron_labels)



In [7]:
import circuitsvis.sae as sae # Or: from circuitsvis.sae import sae_vis
import numpy as np
import torch

# Sample Data
#tokens_sae = ["The", " quick", " brown", " fox", " jumps", " over", " the", " soph", "isticated", ". "]*10
tokens_sae = ["The", " quick", " brown", " fox", " jumps", " over", " the", " soph", "isticated", ". ","The", " quick", " brown", " fox", " jumps", " over", " the", " soph", "isticated", ".",  "The", " quick", " brown", " fox", " jumps", " over", " the", " soph", "isticated", ".","The", " quick", " brown", " fox", " jumpsYO","\n", " over", " the", " soph", "isticated", ". ","??","\n\n"]*10
num_tokens_sae = len(tokens_sae)
num_features_sae = 1000

# Generate random feature activations (replace with your actual SAE activations)
np.random.seed(42)
feature_activations_sae_np = np.random.rand(num_tokens_sae, num_features_sae) * 20 # Example range
# Make some activations zero or negative for variety
feature_activations_sae_np[np.random.rand(*feature_activations_sae_np.shape) > 0.2] = 0
feature_activations_sae_np[np.random.rand(*feature_activations_sae_np.shape) > 0.95] -= 0


# Generate simple feature labels (replace with meaningful labels if available)
feature_labels_sae = [f"Description {i} custom example" for i in range(num_features_sae)]

print("Tokens:", tokens_sae)
print("Activations Shape:", feature_activations_sae_np.shape)
print("Labels Count:", len(feature_labels_sae))

Tokens: ['The', ' quick', ' brown', ' fox', ' jumps', ' over', ' the', ' soph', 'isticated', '. ', 'The', ' quick', ' brown', ' fox', ' jumps', ' over', ' the', ' soph', 'isticated', '.', 'The', ' quick', ' brown', ' fox', ' jumps', ' over', ' the', ' soph', 'isticated', '.', 'The', ' quick', ' brown', ' fox', ' jumpsYO', '\n', ' over', ' the', ' soph', 'isticated', '. ', '??', '\n\n', 'The', ' quick', ' brown', ' fox', ' jumps', ' over', ' the', ' soph', 'isticated', '. ', 'The', ' quick', ' brown', ' fox', ' jumps', ' over', ' the', ' soph', 'isticated', '.', 'The', ' quick', ' brown', ' fox', ' jumps', ' over', ' the', ' soph', 'isticated', '.', 'The', ' quick', ' brown', ' fox', ' jumpsYO', '\n', ' over', ' the', ' soph', 'isticated', '. ', '??', '\n\n', 'The', ' quick', ' brown', ' fox', ' jumps', ' over', ' the', ' soph', 'isticated', '. ', 'The', ' quick', ' brown', ' fox', ' jumps', ' over', ' the', ' soph', 'isticated', '.', 'The', ' quick', ' brown', ' fox', ' jumps', ' over'

### Basic Usage

Call `sae_vis` with the list of tokens, the activation matrix (as NumPy array, PyTorch tensor, or list of lists), and feature labels.

In [8]:
# sae.sae_vis(
#     tokens=tokens_sae,
#     feature_activations=torch.tensor(feature_activations_sae_np),
#     feature_labels=feature_labels_sae,
# )

### Customization

Customize the initial view, thresholds, and colors:
- `initial_ranking_metric`: 'max' (default), 'l1' (mean abs), 'l0' (non-zero count)
- `activation_threshold`: Value below which activations are dimmed/ignored in tooltips.
- `color_map`: Colormap name (e.g., 'viridis', 'coolwarm', 'plasma', 'magma').
- `num_top_features_overall`: How many features to show in the ranked list.
- `num_top_features_per_token`: How many features to show in the token tooltip.

In [9]:
feature_activations_sae_np.shape

(430, 1000)

In [10]:
out = sae.sae_vis(
    tokens=tokens_sae,
    feature_activations=feature_activations_sae_np,
    feature_labels=feature_labels_sae,
    feature_ids=[2*i for i in list(range(num_features_sae))],
    initial_ranking_metric="l1",        # Rank by mean absolute activation initially
    num_top_features_overall=15,        # Show top 15 features in the list
    num_top_features_per_token=3,       # Show top 3 features in token tooltips
)
#print(out)
from IPython.display import display
display(out)
# You can adjust the height value as needed for better visualization


In [6]:
feature_activations_sae_np[1][26]
feature_labels_sae[26]

'Description 26 custom example'