In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules (e.g. the local neuron_explainer package) are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os

# Read the OpenAI API key from ~/.openai_api_key and export it via the environment.
# The file is opened in a `with` block so the handle is closed deterministically,
# and surrounding whitespace is stripped so the key is correct whether or not the
# file ends with a newline (or a Windows-style "\r\n").
with open(os.path.join(os.path.expanduser("~"), ".openai_api_key"), "r") as key_file:
    os.environ["OPENAI_API_KEY"] = key_file.read().strip()

from neuron_explainer.activations.activation_records import calculate_max_activation
from neuron_explainer.activations.activations import ActivationRecordSliceParams, load_neuron
from neuron_explainer.explanations.calibrated_simulator import UncalibratedNeuronSimulator
from neuron_explainer.explanations.explainer import TokenActivationPairExplainer, SummaryExplainer, HighlightExplainer
from neuron_explainer.explanations.prompt_builder import PromptFormat
from neuron_explainer.explanations.scoring import simulate_and_score
from neuron_explainer.explanations.simulator import ExplanationNeuronSimulator

# --- Configuration ---------------------------------------------------------
# Model used to write the explanation, and model used to simulate activations
# from that explanation.
EXPLAINER_MODEL_NAME = "gpt-3.5-turbo"
SIMULATOR_MODEL_NAME = "text-davinci-003"
# Explanation strategy; the cell handles "Summary", "Highlight", "Original" and "Fixed".
MODE = "Highlight"

# Load a neuron record.
# NOTE(review): the two arguments are presumably (layer index, neuron index) -- confirm
# against load_neuron's signature.
neuron_record = load_neuron(9, 6236)

# The quantile boundary at index 2 is used as the activation cutoff handed to the
# Summary/Highlight explainers.
cutoff = neuron_record.quantile_boundaries[2]
print(MODE, cutoff)

# Grab the activation records we'll need: the train records feed the explainer,
# the valid records are what the explanation gets scored on.
slice_params = ActivationRecordSliceParams(n_examples_per_split=5)
train_activation_records, valid_activation_records = (
    neuron_record.train_activation_records(activation_record_slice_params=slice_params),
    neuron_record.valid_activation_records(activation_record_slice_params=slice_params),
)

if MODE=="Summary":
    explainer = SummaryExplainer(
        model_name=EXPLAINER_MODEL_NAME,
        prompt_format=PromptFormat.HARMONY_V4,
        max_concurrent=1,
    )

    explanations = await explainer.generate_explanations(
        all_activation_records=train_activation_records,
        cutoff=cutoff,
        num_samples=1,
    )
    assert len(explanations) == 1
    explanation = explanations[0]

if MODE=="Highlight":
    explainer = HighlightExplainer(
        model_name=EXPLAINER_MODEL_NAME,
        prompt_format=PromptFormat.HARMONY_V4,
        max_concurrent=1,
    )

    explanations = await explainer.generate_explanations(
        all_activation_records=train_activation_records,
        cutoff=cutoff,
        num_samples=1,
    )
    assert len(explanations) == 1
    explanation = explanations[0]
    
    
elif MODE=="Original":
    explainer = TokenActivationPairExplainer(
        model_name=EXPLAINER_MODEL_NAME,
        prompt_format=PromptFormat.HARMONY_V4,
        max_concurrent=1,
    )
    explanations = await explainer.generate_explanations(
        all_activation_records=train_activation_records,
        max_activation=calculate_max_activation(train_activation_records),
        num_samples=1,
    )
    assert len(explanations) == 1
    explanation = explanations[0]

elif MODE=="Fixed":
    explanation = "transition words at the beginning of sentences."
    
print(f"{explanation=}")

# Simulate and score the explanation.
simulator = UncalibratedNeuronSimulator(
    ExplanationNeuronSimulator(
        SIMULATOR_MODEL_NAME,
        explanation,
        max_concurrent=1,
        prompt_format=PromptFormat.INSTRUCTION_FOLLOWING,
    )
)
scored_simulation = await simulate_and_score(simulator, valid_activation_records)
print(f"score={scored_simulation.get_preferred_score():.2f}")


Highlight 1.0078
[{'role': <Role.SYSTEM: 'system'>, 'content': "We're studying neurons in a neural network, and trying to identify their roles.Look at the parts/tokens of the document the neuron activates for and summarize in a single sentence what the neuron is looking for. Don't list examples of words.\n\n We will show short text excerpts, and highlight the tokens(part of word):that activate highly using stars, i.e. *token*. Your task is to summarize what highly activating tokens have in common, taking the context into account."}, {'role': <Role.USER: 'user'>, 'content': "\n\nNeuron 1\nActivations:\n<start>\n0 - Text excerpt:\nturturro is fabulously funny and over the top as a 'very sneaky' butler who excels in the art of impossible* disappearing*/reapp*earing* acts\n\n<end>\n<start>\n1 - Text excerpt:\nesc*aping** the* studio , piccoli is warmly* affecting* and so is this adroitly minimalist movie .\n\n<end>\n\nExplanation of neuron 1 behavior: the main thing this neuron does is fin