In [1]:
# HypotheSAEs Quickstart
# This notebook demonstrates basic usage of HypotheSAEs on a sample of the Yelp review dataset

%load_ext autoreload
%autoreload 2

import os
os.environ['OPENAI_KEY_SAE'] = os.environ['OPENAI_API_KEY'] # Replace with your OpenAI API key, or with another environment variable (e.g. os.environ['OPENAI_API_KEY'])

import numpy as np
import pandas as pd

from hypothesaes.quickstart import train_sae, interpret_sae, generate_hypotheses, evaluate_hypotheses
from hypothesaes.embedding import get_openai_embeddings, get_local_embeddings

**Load data**

The dataset we're using here is a subset of 20K Yelp reviews, with 2K reviews used for validation (during SAE training). 

The target variable is the `stars` column, which is a rating between 1 and 5. We treat this as a regression task.

There are also 2K reviews used for holdout evaluation, which we'll use at the end of the notebook.

In [19]:
from sklearn.model_selection import train_test_split
current_dir = os.getcwd()
if current_dir.endswith("notebooks"):
    prefix = "../"
else:
    prefix = "./"

base_dir = os.path.join(prefix, "demo-data")
'''
train_df = pd.read_json(os.path.join(base_dir, "yelp-demo-train-20K.json"), lines=True)
val_df = pd.read_json(os.path.join(base_dir, "yelp-demo-val-2K.json"), lines=True)

texts = train_df['text'].tolist()
labels = train_df['stars'].values
val_texts = val_df['text'].tolist() # These are only used for early stopping of SAE training, so we don't need labels.
'''

data_path = os.path.join(base_dir, "cmf_data_position.txt")  # unified full data file

# Read in the full dataset (tab-separated, no header)
df = pd.read_csv(data_path, sep='\t', header=None, names=["label", "text"])
df.head()
# Remove rows with label == 0
df = df[df["label"] != "0"]

# Shuffle and split into train (90%) and validation (10%)
train_df, val_df = train_test_split(df, test_size=0.1, random_state=42, stratify=df["label"])

# Extract inputs
texts = train_df["text"].tolist()
labels = train_df["label"].values
val_texts = val_df["text"].tolist()
val_labels = val_df["label"].values  # optional, for evaluation

Unnamed: 0,label,text
0,-1,it all about the a in fape is the purpose of t...
1,1,pass the framework
2,1,dear members of the state board of education a...
3,1,i am writing to support offering algebra to 8t...
4,-1,just stop messing with our math leave math alo...


**Compute text embeddings for your dataset**

We'll compute text embeddings for a training set, and optionally a validation set. The validation embeddings are used for SAE eval and early-stopping during training.

Embeddings will be stored in the `emb_cache` directory (or `os.environ["EMB_CACHE_DIR"]` if you set it) using the `cache_name` parameter, so you only need to compute embeddings once.

You can use OpenAI or a local model.

Local models will run much faster on GPU. The default local model is `nomic-ai/modernbert-embed-base`. You can use any sentence-transformers model, but please read the model's docs; you may need to edit `get_local_embeddings`.

In [20]:
EMBEDDER = "text-embedding-3-small" # OpenAI
# EMBEDDER = "nomic-ai/modernbert-embed-base" # Huggingface model, will run locally
CACHE_NAME = f"cmf_quickstart_{EMBEDDER}"

text2embedding = get_openai_embeddings(texts + val_texts, model=EMBEDDER, cache_name=CACHE_NAME)
# text2embedding = get_local_embeddings(texts + val_texts, model=EMBEDDER, batch_size=128, cache_name=CACHE_NAME)
embeddings = np.stack([text2embedding[text] for text in texts])

train_embeddings = np.stack([text2embedding[text] for text in texts])
val_embeddings = np.stack([text2embedding[text] for text in val_texts])

Processing chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Chunk 0:   0%|          | 0/3 [00:00<?, ?it/s]



Saved 543 embeddings to /Users/joseaguilar/Documents/HypotheSAEs/emb_cache/cmf_quickstart_text-embedding-3-small/chunk_000.npy


**Train SAE(s)** 

Using different values of $M$ and $k$ will produce features at different levels of granularity. You can train multiple SAEs if you'd like to produce features at varying granularity, but this is optional.

See the README for more details about selecting $M$ and $k$.

In [21]:
checkpoint_dir = os.path.join(prefix, "checkpoints", CACHE_NAME)
#sae_256_8 = train_sae(embeddings=train_embeddings, M=256, K=8, checkpoint_dir=checkpoint_dir, val_embeddings=val_embeddings)
#sae_32_4 = train_sae(embeddings=train_embeddings, M=32, K=4, checkpoint_dir=checkpoint_dir, val_embeddings=val_embeddings)
sae_64_4 = train_sae(embeddings=train_embeddings,M=64, K=4, checkpoint_dir=checkpoint_dir, val_embeddings=val_embeddings)
sae_32_2 = train_sae(embeddings=train_embeddings,M=32, K=2, checkpoint_dir=checkpoint_dir, val_embeddings=val_embeddings)
sae_list = [sae_64_4, sae_32_2]

  0%|          | 0/100 [00:00<?, ?it/s]

Early stopping triggered after 90 epochs
Saved model to ../checkpoints/cmf_quickstart_text-embedding-3-small/SAE_M=64_K=4.pt


  0%|          | 0/100 [00:00<?, ?it/s]

Early stopping triggered after 85 epochs
Saved model to ../checkpoints/cmf_quickstart_text-embedding-3-small/SAE_M=32_K=2.pt


**Interpret neurons**  

Interpret a random subset of neurons in the SAE to sanity-check that the learned features, and their interpretations, seem reasonable. We generate and print labels for `n_random_neurons` neurons, and we also print out the top-activating texts for each neuron.

In [26]:
# This instruction will be included in the neuron interpretation prompt.
# The below instructions are specific to Yelp, but you can customize this for your task.
# If you don't pass in task-specific instructions, there is a generic instruction (see src/interpret_neurons.py);
# task-specific instructions are optional, but they help produce hypotheses at the desired level of specificity.

TASK_SPECIFIC_INSTRUCTIONS = """"All of the texts are public comments submitted on behalf of the 2023 California Mathemtatics Framework that provides instructional guidance for educators.
Features should describe a specific aspect of the public comment. For example:
- "mentions the impact it will have on students"
- "critiques the way math should be taught'\""""

# Interpret random neurons
results = interpret_sae(
    texts=texts,
    embeddings=train_embeddings,
    sae=sae_list,
    n_random_neurons=5,
    print_examples_n=3,
    task_specific_instructions=TASK_SPECIFIC_INSTRUCTIONS
)

Computing activations (batchsize=16384):   0%|          | 0/1 [00:00<?, ?it/s]

Computing activations (batchsize=16384):   0%|          | 0/1 [00:00<?, ?it/s]

Activations shape: (571, 96)


Generating 1 interpretation(s) per neuron:   0%|          | 0/5 [00:00<?, ?it/s]


Neuron 19 (from SAE M=64, K=4): mentions legal challenges or lawsuits related to educational practices

Top activating examples:
1. please do not adopt the draft framework this experiment failed in sfusd and we cant allow that experiment to further harm californias students please see attached comments  ...
2. this failed in sf and will fail our schools ...
3. dear president linda darlinghammond and state board of education members attached you will find correspondence concerning agenda item 17 ð action to set aside approval of mayacamas charter middle school petition thank you for your consideration sincerely ricardo j soto hehimhis chief advocacy officer and general counsel california charter school association it also cited extensively from jo boaler who has a history of citing researchers who say she misrepresents their work and who threatened a black uc berkeley professor with the police in response to his mild criticism of high speaking fees the framework also cites blog posts a

**Generate hypotheses**

Generate hypotheses which are predictive of the target variable.

The `selection_method` parameter defines how we compute neuron predictiveness (see `src/select_neurons.py` for more details):
- "separation_score": E[target | top-activating examples] - E[target | zero-activating examples]
- "correlation": pearson(neuron activations, target variable)
- "lasso": select N nonzero features with an L1 regularized model

This cell outputs a dataframe with the following columns:
- `neuron_idx`: The index of the neuron in the SAE (if you're using multiple SAEs, this will be a global index across all of them).
- `source_sae`: The SAE that the neuron was selected from.
- `target_{selection_method}`: The predictiveness of the neuron for the target variable, using the selected `selection_method`.
- `interpretation`: The natural language interpretation of the neuron.
- `interp_fidelity_score`: The F1 fidelity score for how well the neuron's interpretation actually corresponds to its activation pattern.

In [27]:
selection_method = "correlation"
results = generate_hypotheses(
    texts=texts,
    labels=labels,
    embeddings=embeddings,
    sae=sae_list,
    cache_name=CACHE_NAME,
    selection_method=selection_method,
    n_selected_neurons=20,
    n_candidate_interpretations=1,
    task_specific_instructions=TASK_SPECIFIC_INSTRUCTIONS
)

print("\nMost predictive features of Yelp reviews:")
pd.set_option('display.max_colwidth', None)
display(results.sort_values(by=f"target_{selection_method}", ascending=False))
pd.reset_option('display.max_colwidth')

Embeddings shape: (571, 1536)


Computing activations (batchsize=16384):   0%|          | 0/1 [00:00<?, ?it/s]

Computing activations (batchsize=16384):   0%|          | 0/1 [00:00<?, ?it/s]

Activations shape: (571, 96)

Step 1: Selecting top 20 predictive neurons

Step 2: Interpreting selected neurons


Generating 1 interpretation(s) per neuron:   0%|          | 0/20 [00:00<?, ?it/s]

API error: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-hHrkX2MdWscmD3gh0WKtNR1z on tokens per min (TPM): Limit 30000, Used 27728, Requested 4825. Please try again in 5.106s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}; retrying in 20.0s... (2/3)
API error: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-hHrkX2MdWscmD3gh0WKtNR1z on tokens per min (TPM): Limit 30000, Used 29666, Requested 4537. Please try again in 8.406s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}; retrying in 20.0s... (2/3)
API error: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-hHrkX2MdWscmD3gh0WKtNR1z on tokens per min (TPM): Limit 30000, Used 29551, Requested 4646. Please try again in 8.394s. Visit htt

Scoring neuron interpretation fidelity (20 neurons; 1 candidate interps per neuron; 100 examples to score each…

API error: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o-mini in organization org-hHrkX2MdWscmD3gh0WKtNR1z on requests per day (RPD): Limit 10000, Used 10000, Requested 1. Please try again in 8.64s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'requests', 'param': None, 'code': 'rate_limit_exceeded'}}; retrying in 10.0s... (2/3)
API error: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o-mini in organization org-hHrkX2MdWscmD3gh0WKtNR1z on requests per day (RPD): Limit 10000, Used 10000, Requested 1. Please try again in 8.64s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'requests', 'param': None, 'code': 'rate_limit_exceeded'}}; retrying in 10.0s... (2/3)
API error: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o-mini in organization org-hHrkX2MdWscmD3gh0WKtNR1z on requests per day (RPD): Limit 10000, Used 10000, Requested 1. Please try again in 8.6

IndexError: boolean index did not match indexed array along axis 0; size of axis is 6 but size of corresponding boolean axis is 100

**Evaluate held-out generalization**

Finally, we evaluate whether these are good hypotheses by testing whether their natural language interpretations can predict the target variable.  

We compute annotations for each hypothesized concept on a holdout set (not seen during SAE training & feature selection).

After annotation, we output a dataframe with the following columns:
- `hypothesis`: The natural language hypothesis (which came from interpreting a predictive neuron in the SAE)
- `separation_score`: How much the target variable differs when the concept is present vs. absent (i.e., $E[Y\mid\text{concept} = 1] - E[Y\mid\text{concept} = 0]$).
- `separation_pvalue`: The t-test p-value of the null hypothesis that the separation score is 0 (i.e., the concept is not associated with the target variable).
- `regression_coef`: The coefficient of the concept in a multivariate linear regression of the target variable on all concepts.
- `regression_pval`: The p-value of the null hypothesis that the regression coefficient is 0.
- `feature_prevalence`: The fraction of examples that contain the concept.

Additionally, we output the evaluation metrics used in the paper:
- Significant hypotheses: the number of hypotheses that are significant in the multivariate regression at a specified significance level (default $0.1$) after Bonferroni correction. You can pass in a different significance level using the `corrected_pval_threshold` parameter.
- AUC or $R^2$: how well the hypotheses collectively predict the target variable in the multivariate regression.


In [24]:
holdout_df = pd.read_json(os.path.join(base_dir, "yelp-demo-holdout-2K.json"), lines=True)
holdout_texts = holdout_df['text'].tolist()
holdout_labels = holdout_df['stars'].values

metrics, evaluation_df = evaluate_hypotheses(
    hypotheses_df=results,
    texts=holdout_texts,
    labels=holdout_labels,
    cache_name=CACHE_NAME,
)

pd.set_option('display.max_colwidth', None)
display(evaluation_df)
pd.reset_option('display.max_colwidth')

print("\nHoldout Set Metrics:")
print(f"R² Score: {metrics['r2']:.3f}")
print(f"Significant hypotheses: {metrics['Significant'][0]}/{metrics['Significant'][1]} " 
      f"(p < {metrics['Significant'][2]:.3e})")

Step 1: Annotating texts with 5 hypotheses
Found 0 cached items; annotating 10000 uncached items


Annotating:   0%|          | 0/10000 [00:00<?, ?it/s]

API error: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o-mini in organization org-hHrkX2MdWscmD3gh0WKtNR1z on tokens per min (TPM): Limit 200000, Used 199677, Requested 630. Please try again in 92ms. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}; retrying in 10.0s... (2/3)
API error: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o-mini in organization org-hHrkX2MdWscmD3gh0WKtNR1z on tokens per min (TPM): Limit 200000, Used 199647, Requested 615. Please try again in 78ms. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}; retrying in 10.0s... (2/3)
API error: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o-mini in organization org-hHrkX2MdWscmD3gh0WKtNR1z on tokens per min (TPM): Limit 200000, Used 199476, Requested 737. Please try again in 63m

KeyboardInterrupt: 