In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import utils
from neuron_explainer.activations.activations import ActivationRecordSliceParams, load_neuron

Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
`low_cpu_mem_usage` was None, now default to True since model is quantized.


In [2]:
# Model identifier handed to utils.get_explanation as `explainer_model` and
# recorded in the "explainer" column of the results table.
EXPLAINER_MODEL = "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit"
# CSV of neurons to process; the loop below reads its "layer" and "neuron" columns.
INPUT_PATH = "inputs/test_neurons.csv"

# Load the neuron list and show it as the cell output.
neuron_df = pd.read_csv(filepath_or_buffer=INPUT_PATH)
neuron_df

Unnamed: 0,layer,neuron,score,explanation
0,0,286,0.663509,words related to comparison.
1,10,1838,0.676886,phrases describing positions or situations in...
2,20,193,0.455764,verbs indicating questioning or challenging be...
3,30,1685,0.537097,expressions of gratitude and agreeing to rece...
4,40,431,0.36575,"numbers related to time, dates, and measureme..."


In [3]:
modes = ["Original", "Summary", "Highlight", "HighlightSummary", "AVHS"]
new_results = {"layer":[], "neuron":[], "explainer":[]}
for mode in modes:
    new_results[mode] = []

for i, row in neuron_df.iterrows():
    layer = row["layer"]
    neuron = row["neuron"]
    new_results["layer"].append(layer)
    new_results["neuron"].append(neuron)
    new_results["explainer"].append(EXPLAINER_MODEL)
    neuron_record = load_neuron(layer, neuron)
    
    for mode in modes:
        token_results = await utils.get_explanation(mode=mode, neuron_record=neuron_record, explainer_model=EXPLAINER_MODEL, get_token_only = True)
        new_results[mode].append(token_results['total_tokens'])

new_df = pd.DataFrame(new_results)
new_df

['\n\ncomparisons.']
['\n\nsentence starters or transitional phrases indicating comparison or contrast.']
['\n\nwords/phrases that are comparative, meaning they describe or highlight differences and similarities.']
['\n\nwords that indicate making comparisons or presenting an alternative, often in the context of evaluating two or more things against each other.']
["\n\nprepositions or conjunctions that indicate comparison, such as 'compared' to, but more specifically the words are 'compared', and their variants."]
["\n\nwords or phrases containing the pattern 'something(s) itself in a place' or related to unexpected or hidden identities."]
['\n\nphrases indicating a subject being in a specific location or position, either physical or metaphorical.']
['\n\nthird-person pronouns indicating self-awareness or involvement.']
['\n\nwords related to the pronoun "self" (itself, himself) in contexts mentioning a sense of being in a place or situation.']
['\n\nphrases implying possession or pres

Unnamed: 0,layer,neuron,explainer,Original,Summary,Highlight,HighlightSummary,AVHS
0,0,286,unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit,2365,940,878,987,1145
1,10,1838,unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit,2388,961,895,1013,1168
2,20,193,unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit,2405,967,898,1030,1190
3,30,1685,unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit,2277,1001,917,1082,1601
4,40,431,unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit,2504,987,901,1040,1295


In [7]:
# Persist the token-count table as
#   test_results/<input file name up to its first dot>_tokens_<model id>.csv
output_dir = Path("test_results")
# DataFrame.to_csv does not create missing parent directories; create it up
# front so a fresh checkout does not fail after the expensive loop has run.
output_dir.mkdir(parents=True, exist_ok=True)

input_stem = INPUT_PATH.split("/")[-1].split(".")[0]
# "/" in the model id would be read as a path separator, so it is replaced.
model_tag = EXPLAINER_MODEL.replace("/", "--")

# The DataFrame index is written as the first CSV column, as before.
new_df.to_csv(output_dir / f"{input_stem}_tokens_{model_tag}.csv")

In [8]:
# Mean token count per mode across all processed neurons, printed one per line.
for mode, avg in new_df[modes].mean().items():
    print(mode, avg)

Original 2387.8
Summary 971.2
Highlight 897.8
HighlightSummary 1030.4
AVHS 1279.8
