### This notebook is adapted from quickstart.ipynb of the HypotheSAEs git repository.

The original uses a Yelp review dataset; this is a SummEval dataset adaptation.

For more information, please refer to the HypotheSAEs paper and git repository below.

* https://arxiv.org/pdf/2502.04382
* https://github.com/rmovva/HypotheSAEs/tree/main
* https://github.com/rmovva/HypotheSAEs/blob/main/notebooks/quickstart.ipynb

# Import

In [None]:
import os

### MUST SET YOUR OWN OPENAI API KEY HERE
# os.environ["OPENAI_KEY_SAE"] = '...'

import numpy as np
import pandas as pd

### pip install hypothesaes
from hypothesaes.quickstart import train_sae, interpret_sae, generate_hypotheses, evaluate_hypotheses
from hypothesaes.embedding import get_openai_embeddings, get_local_embeddings, get_local_embeddings_new

### pip install sentence-transformers
from sentence_transformers import SentenceTransformer

print("DONE")

DONE


# Load Data

In [3]:
# Locate the data folder relative to the kernel's working directory: when the
# kernel runs from a directory whose name ends in "notebooks" the data sits one
# level up, otherwise it is expected right next to the working directory.
current_dir = os.getcwd()
prefix = "../" if current_dir.endswith("notebooks") else "./"

base_dir = os.path.join(prefix, "summeval-data")


def _load_split(split_name):
    """Read one processed SummEval split (JSON Lines file) from ``base_dir``."""
    split_path = os.path.join(base_dir, f"summeval_processed_{split_name}.jsonl")
    return pd.read_json(split_path, lines=True)


train_df = _load_split("train")
val_df = _load_split("val")
holdout_df = _load_split("holdout")

print(train_df.columns)


Index(['summary', 'expert_annotations', 'turker_annotations', 'references',
       'model_id', 'raw', 'mistral_relevance', 'mistral_fluency',
       'mistral_coherence', 'mistral_consistency', 'all_annotations',
       'scores_coherence_expert', 'scores_coherence_turker',
       'scores_coherence_all', 'var_coherence_expert', 'var_coherence_turker',
       'var_coherence_all', 'mean_coherence_expert', 'mean_coherence_turker',
       'mean_coherence_all', 'var_coherence_expert_disc',
       'var_coherence_turker_disc', 'var_coherence_all_disc', 'diff_coherence',
       'scores_consistency_expert', 'scores_consistency_turker',
       'scores_consistency_all', 'var_consistency_expert',
       'var_consistency_turker', 'var_consistency_all',
       'mean_consistency_expert', 'mean_consistency_turker',
       'mean_consistency_all', 'var_consistency_expert_disc',
       'var_consistency_turker_disc', 'var_consistency_all_disc',
       'diff_consistency', 'scores_fluency_expert', 'scores_flu

In [None]:
def _format_examples(df):
    """Combine each row's source article ('raw') and its 'summary' into one string.

    Returns a plain Python list with one formatted string per row of ``df``.
    """
    combined = "raw text: " + df['raw'] + "\nsummary: " + df['summary']
    return combined.tolist()


texts = _format_examples(train_df)
val_texts = _format_examples(val_df)
holdout_texts = _format_examples(holdout_df)

print(len(texts), len(val_texts), len(holdout_texts))

1120 240 240


# Text Embeddings

In [5]:
# Single source of truth for the embedding model name: it selects the
# SentenceTransformer checkpoint AND tags CACHE_NAME (used for the SAE
# checkpoint directory and the annotation cache), so swapping the embedder
# cannot leave the cache name pointing at a different model.
EMBEDDER = "all-MiniLM-L6-v2"
CACHE_NAME = f"summeval_{EMBEDDER}"

model = SentenceTransformer(EMBEDDER)

# Embed the train and validation texts; both are consumed by train_sae below.
train_embeddings = model.encode(texts, batch_size=64, show_progress_bar=True)
val_embeddings = model.encode(val_texts, batch_size=64, show_progress_bar=True)


Batches:   0%|          | 0/18 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

In [6]:
print(train_embeddings.shape)
print(val_embeddings.shape)

(1120, 384)
(240, 384)


In [None]:
### IMPORTANT VARS TO CHANGE ###

# CURR_TARGET is the label column to explain; CURR_PROPERTY selects which
# entry of task_instructions is passed to interpret_sae / generate_hypotheses.
# Both are edited by hand and must describe the same quality dimension.
CURR_TARGET = "mean_consistency_all" ### ex) diff_fluency, mean_relevance_all, var_coherence_all, etc
CURR_PROPERTY = "consistency"

# Fail fast if only one of the two was updated: otherwise the prompts would
# silently describe a different dimension than the labels being modelled.
if CURR_PROPERTY not in CURR_TARGET:
    raise ValueError(
        f"CURR_PROPERTY={CURR_PROPERTY!r} does not match CURR_TARGET={CURR_TARGET!r}; "
        "update both variables together."
    )

labels = train_df[CURR_TARGET].values
holdout_labels = holdout_df[CURR_TARGET].values

# Train SAEs

In [None]:
# Every SAE for this embedder is trained on the same train/validation
# embeddings and saved under the same checkpoint directory; only M and K
# differ between the two runs.
checkpoint_dir = os.path.join(prefix, "SAE-checkpoints", CACHE_NAME)
shared_train_kwargs = dict(
    embeddings=train_embeddings,
    val_embeddings=val_embeddings,
    checkpoint_dir=checkpoint_dir,
)

### Could also try [sae_256_8, sae_32_4], etc

sae_64_4 = train_sae(M=64, K=4, **shared_train_kwargs)
sae_32_4 = train_sae(M=32, K=4, **shared_train_kwargs)
sae_list = [sae_64_4, sae_32_4]

  0%|          | 0/100 [00:00<?, ?it/s]

Saved model to ../SAE-checkpoints/summeval_all-MiniLM-L6-v2/SAE_M=64_K=4.pt


  0%|          | 0/100 [00:00<?, ?it/s]

Saved model to ../SAE-checkpoints/summeval_all-MiniLM-L6-v2/SAE_M=32_K=4.pt


# Interpret Neurons
Three varying prompts are provided — select one where indicated below:
* Prompt 1: Difference between Human (all) and LLM (mistral) scores
* Prompt 2: Predicting the Human Score
* Prompt 3: Explain the Variance in Human Ratings

In [9]:
# Prompt templates for neuron interpretation. Each one is a str.format
# template with the placeholders {dimension} and {ex1}..{ex4}, which are
# filled in per quality dimension when task_instructions is built.

# Prompt 1: features that may explain disagreement between human and LLM scores.
instruction_template_diff = """Each text is a segment from a news article and a corresponding model-generated summary. This summary has been annotated for its {dimension} by both human annotators and an automated judge (LLM).
Features should describe specific linguistic or factual characteristics of summaries that might contribute to a difference between human and model evaluations. For example:

- {ex1}
- {ex2}
- {ex3}
- {ex4}

Focus on patterns in the text that might explain why human raters and the LLM would disagree for an item."""

# Prompt 2: features associated with higher or lower human ratings.
instruction_template_pred_human = """Each text is a segment from a news article and a corresponding model-generated summary. This summary has been rated by human annotators for its {dimension}.
Features should describe specific linguistic or factual characteristics that tend to make a summary receive higher or lower human ratings in this dimension. For example:

- {ex1}
- {ex2}
- {ex3}
- {ex4}

Focus on patterns in the text that correlate with high or low {dimension} quality according to human raters."""

# Prompt 3: features that may explain variance (disagreement) among human raters.
instruction_template_var_human = """Each text is a segment from a news article and a corresponding model-generated summary. This summary has been rated by multiple human annotators for its {dimension}.
Your goal is to identify features that might explain why human annotators disagree in their ratings. These features may reflect ambiguity, subjectivity, or inconsistency in how the summary is written. For example:

- {ex1}
- {ex2}
- {ex3}
- {ex4}

Focus on patterns that increase uncertainty or disagreement among human raters."""

In [None]:
### NOTE: Select prompt
# Assign exactly one of instruction_template_diff, instruction_template_pred_human
# or instruction_template_var_human; the chosen template is formatted into
# task_instructions further down.
# NOTE(review): the choice presumably should match the kind of CURR_TARGET
# column (diff_* / mean_* / var_*) — confirm when changing either one.

instruction_template = instruction_template_pred_human

In [11]:
# Example feature descriptions, keyed by quality dimension. Each inner dict
# provides the four strings substituted for {ex1}..{ex4} in the selected
# instruction template; the outer key is also what CURR_PROPERTY indexes
# in task_instructions.
examples = {
    "coherence": {
        "ex1": "jumps between unrelated topics without smooth transitions",
        "ex2": "uses pronouns with unclear or ambiguous referents",
        "ex3": "includes contradictions within the summary",
        "ex4": "contains disjointed or unordered sentence structure"
    },
    "consistency": {
        "ex1": "includes factual errors not supported by the source article",
        "ex2": "contradicts key facts mentioned in the reference text",
        "ex3": "asserts information that directly conflicts with the original article",
        "ex4": "omits or misrepresents critical context from the article"
    },
    "fluency": {
        "ex1": "contains awkward or unnatural phrasing",
        "ex2": "uses ungrammatical constructions or punctuation",
        "ex3": "includes repetitive or redundant sentence structure",
        "ex4": "lacks variation in word choice or sentence rhythm"
    },
    "relevance": {
        "ex1": "mentions tangential details not central to the article",
        "ex2": "emphasizes minor facts while ignoring key points",
        "ex3": "focuses on unimportant or generic statements",
        "ex4": "fails to include main ideas or conclusions from the source"
    }
}

# Render the selected template once per quality dimension: the outer key fills
# {dimension} and the inner dict's ex1..ex4 entries fill the example slots.
task_instructions = {
    dimension: instruction_template.format(dimension=dimension, **dimension_examples)
    for dimension, dimension_examples in examples.items()
}

In [12]:
# Interpret a handful of randomly chosen neurons and print their top examples
# as a sanity check on the SAEs and on the task instructions for CURR_PROPERTY.
# Stored under its own name rather than `results`, which the hypothesis
# generation cell assigns later: sharing the name would let a skipped cell
# silently pass this object on to the evaluation step.
interpretation_results = interpret_sae(
    texts=texts,
    embeddings=train_embeddings,
    sae=sae_list,
    n_random_neurons=5,
    print_examples_n=3,
    task_specific_instructions=task_instructions[CURR_PROPERTY]
)

Computing activations (batchsize=16384):   0%|          | 0/1 [00:00<?, ?it/s]

Computing activations (batchsize=16384):   0%|          | 0/1 [00:00<?, ?it/s]

Activations shape: (1120, 96)


Generating 1 interpretation(s) per neuron:   0%|          | 0/5 [00:00<?, ?it/s]


Neuron 60 (from SAE M=64, K=4): mentions the migration of the North Pacific gray whale named Varvara and its record-breaking journey

Top activating examples:
1. raw text: (CNN)A North Pacific gray whale has earned a spot in the record books after completing the longest migration of a mammal ever recorded. The whale, named Varvara, swam nearly 14,000 miles (22,500 kilometers), according to a release from Oregon State University, whose scientists helped conduct the whale-tracking study. Varvara, which is Russian for "Barbara," left her primary feeding ground off Russia's Sakhalin Island to cross the  Pacific Ocean and down the West Coast of the United States to Baja, Mexico. Varvara's journey surpassed a record listed on the Guinness Worlds Records website. It said the previous record was set by a humpback whale that swam a mere 10,190-mile round trip between the "warm breeding waters near the equator and the colder food-rich waters of the Arctic and Antarctic regions." Records are nic

# Generate Hypotheses

In [13]:
# Select the neurons most associated with `labels`, interpret them, and score
# the interpretations. `results` is the hypotheses table consumed by the
# held-out evaluation cell.
selection_method = "correlation"
results = generate_hypotheses(
    texts=texts,
    labels=labels,
    embeddings=train_embeddings,
    sae=sae_list,
    cache_name=CACHE_NAME,
    selection_method=selection_method,
    n_selected_neurons=20, ### adjust as needed
    n_candidate_interpretations=1,
    task_specific_instructions=task_instructions[CURR_PROPERTY],
    n_examples_for_interpretation=10,
    max_words_per_example=128,
    n_scoring_examples = 100, ### adjust as needed
    n_workers_interpretation=2,
    n_workers_annotation=5 ### adjust as needed
)

print("\nMost predictive features of summarization evaluations:")
# option_context shows full-width text for this display only and restores the
# previous setting afterwards, even if display() raises. The old
# set_option/reset_option pair left the option changed on error and reset any
# user-chosen width back to the pandas default.
with pd.option_context('display.max_colwidth', None):
    display(results.sort_values(by=f"target_{selection_method}", ascending=False))

Embeddings shape: (1120, 384)


Computing activations (batchsize=16384):   0%|          | 0/1 [00:00<?, ?it/s]

Computing activations (batchsize=16384):   0%|          | 0/1 [00:00<?, ?it/s]

Activations shape: (1120, 96)

Step 1: Selecting top 20 predictive neurons

Step 2: Interpreting selected neurons


Generating 1 interpretation(s) per neuron:   0%|          | 0/20 [00:00<?, ?it/s]


Step 3: Scoring Interpretations
["raw text: A teenager from Illinois is tackling her disability head-on and attempting to positively influence thousands of others by dancing. Dayna Dobias, 19, from Downers Grove was born with cerebral palsy, but she hopes to inspire others with her enthusiastic videos. 'I've gotten bullied because of it and I get people all the time, staring and so it's definitely made things difficult in my life,' Dayna told Daily Mail Online. Scroll down for video Challenging: Dayna Dobias, 18, is hoping change the way people with disabilities are represented by the television, film and the fashion industry Her hope is that the video not only entertains, but inspires others to think before judging. The teenager says her motivation for creating the video was to counteract stereotypes held by people over certain disabilities. 'People tend to think that because I have cerebral palsy I cannot do anything and that I am so different from everyone else, when in reality I a

Scoring neuron interpretation fidelity (20 neurons; 1 candidate interps per neuron; 100 examples to score each…

Example annotation key: mentions a basketball-related event involving Neymar and Dani Alves, including references to El Clasico and the Euro League contest

Most predictive features of summarization evaluations:


Unnamed: 0,neuron_idx,source_sae,target_correlation,interpretation,f1_fidelity_score
2,50,"(64, 4)",0.081209,mentions personal advertisements or requirements for a partner in the text,0.444444
5,63,"(64, 4)",0.066858,describes tactile interaction or physical touch involving animals in detail,0.412698
6,95,"(32, 4)",0.065329,mentions space lasers as a potential energy source for heating homes in the future,0.360656
7,9,"(64, 4)",0.060223,mentions Carlos Tevez's contract termination and his return to Boca Juniors in Argentina,0.360656
8,34,"(64, 4)",0.060036,"mentions a stolen car that was returned with modifications (e.g., new rims or detailing)",0.5
9,89,"(32, 4)",0.058746,"mentions a crocodile holding a dead pet dog in its jaws in a public marina in Puerto Vallarta, Mexico",0.360656
10,78,"(32, 4)",0.058598,mentions a two-year-old boy falling into a cheetah exhibit at the Cleveland Metroparks Zoo,0.412698
12,11,"(64, 4)",0.057766,mentions a child being frisked or having a toy confiscated at an airport,0.412698
13,80,"(32, 4)",0.056262,"describes a positive human-animal interaction or bond involving specific physical actions (e.g., stroking, massaging, cupping)",0.412698
14,54,"(64, 4)",0.051649,mentions speed cameras being non-operational or turned off in a specific region,0.4375


# Evaluate on Heldout Data

In [14]:
### If you want to test on a smaller set, use the top_k and top_results code below ###

# top_k = 5
# top_results = results.sort_values(by="target_correlation", ascending=False).head(top_k)

# Annotate the held-out texts with every hypothesis and measure how well the
# annotations predict the held-out labels.
metrics, evaluation_df = evaluate_hypotheses(
    hypotheses_df=results, ### change to top_results if needed
    texts=holdout_texts,
    labels=holdout_labels,
    cache_name=CACHE_NAME,
    n_workers_annotation=5 ### adjust as needed
)

# option_context widens the column display for this table only and restores
# the previous setting afterwards, even if display() raises.
with pd.option_context('display.max_colwidth', None):
    display(evaluation_df)

# metrics['Significant'] is indexed as (count, total, p-value threshold) below.
n_significant, n_hypotheses, p_threshold = (
    metrics['Significant'][0],
    metrics['Significant'][1],
    metrics['Significant'][2],
)
print("\nHoldout Set Metrics:")
print(f"R² Score: {metrics['r2']:.3f}")
print(f"Significant hypotheses: {n_significant}/{n_hypotheses} "
      f"(p < {p_threshold:.3e})")

Step 1: Annotating texts with 20 hypotheses
Found 0 cached items; annotating 4800 uncached items


Annotating:   0%|          | 0/4800 [00:00<?, ?it/s]

Step 2: Computing predictiveness of hypothesis annotations


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return f(*args, **kwargs)


Unnamed: 0,hypothesis,separation_score,separation_pval,regression_coef,regression_pval,feature_prevalence
8,"mentions a stolen car that was returned with modifications (e.g., new rims or detailing)",0.751569,0.218024,1.020422,0.097513,0.004167
2,mentions personal advertisements or requirements for a partner in the text,0.504747,0.153396,0.4846211,0.161185,0.0125
15,mentions Hillary Scott or Lady Antebellum's tour bus catching fire,0.375,0.539297,0.3596211,0.544862,0.004167
6,mentions space lasers as a potential energy source for heating homes in the future,0.252648,0.410949,0.2346211,0.43432,0.016667
12,mentions a child being frisked or having a toy confiscated at an airport,0.250525,0.562762,0.2346211,0.577682,0.008333
19,mentions Thierry Henry's criticism of Arsenal's transfer dealings and the suggestion to replace Olivier Giroud as the club's first-choice striker,0.249477,0.68305,-2.220446e-14,1.0,0.004167
17,"describes a geological phenomenon involving the sudden emergence of land from the seabed, supported by specific details such as location, height, and cause (e.g., landslide, melting ice, and snow)",0.249477,0.68305,0.2346211,0.692758,0.004167
11,"includes critical commentary or questioning of a specific player's or team's ability to succeed in their sport, often backed by named sources or direct quotes",0.171279,0.182983,0.2346211,0.105157,0.104167
16,includes allegations or accusations of criminal behavior involving sexual misconduct,0.072291,0.53779,0.08218205,0.48862,0.129167
7,mentions Carlos Tevez's contract termination and his return to Boca Juniors in Argentina,0.06145,0.887152,0.04712113,0.910934,0.008333



Holdout Set Metrics:
R² Score: 0.130
Significant hypotheses: 1/20 (p < 5.000e-03)


# EXTRA: Check Embeddings X Labels Regression
Because the results above are poor and mostly not statistically significant, we followed primary author Raj's advice and checked whether the text embeddings themselves carry any signal about the labels.\
Refer to Q3 ("Why am I not getting any statistically significant hypotheses?") in the FAQ of the repository linked below for more information.\
The poor results here suggest that the embedder may not capture the complexity of the SummEval data or of our task.

https://github.com/rmovva/HypotheSAEs/tree/main

In [15]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler

In [25]:
# Standardise using statistics fitted on the training embeddings only, then
# apply the same fitted scaler to both the training and validation embeddings.
scaler = StandardScaler().fit(train_embeddings)
train_embeddings_scaled = scaler.transform(train_embeddings)
val_embeddings_scaled = scaler.transform(val_embeddings)

# One regressor instance, refit for each target column in the next cell.
regressor = LinearRegression()

In [None]:
# Targets to probe: the mean human score and the diff_* column for each of
# the four quality dimensions.
dimensions = ["fluency", "relevance", "coherence", "consistency"]
target_columns = [f"mean_{dim}_all" for dim in dimensions] + [f"diff_{dim}" for dim in dimensions]

for target_col in target_columns:
    # Fit on the training split, score on the validation split.
    regressor.fit(train_embeddings_scaled, train_df[target_col].values)
    val_preds = regressor.predict(val_embeddings_scaled)
    val_r2 = r2_score(val_df[target_col].values, val_preds)
    print(f"R² score for {target_col}: {val_r2:.4f}")


R² score for mean_fluency_all: -0.0990
R² score for mean_relevance_all: -0.0471
R² score for mean_coherence_all: -0.4738
R² score for mean_consistency_all: -0.3004
R² score for diff_fluency: -0.0839
R² score for diff_relevance: -0.0536
R² score for diff_coherence: -0.3036
R² score for diff_consistency: -0.0003
