In [1]:
import torch
import json
import numpy as np
import tiktoken

from gpt_model import GPTModel
from sparse_auto_encoder import SparseAutoencoder

In [2]:
device = "cpu"
GPT_CONFIG_124M = {
    "vocab_size": 50257,
    "context_length": 256,
    "emb_dim": 768,
    "n_heads": 12,
    "n_layers": 12,
    "drop_rate": 0.2,
    "qkv_bias": True,
    "device": "cpu",
}

In [3]:
model = GPTModel(GPT_CONFIG_124M)
checkpoint = torch.load("model_768_12_12_old_tok.pth", weights_only=True, map_location=torch.device('cpu'))

model.load_state_dict(checkpoint["model_state_dict"])
model.to(device)
model.eval();

In [4]:
tokenizer = tiktoken.get_encoding("gpt2")

In [5]:
sae_6 = SparseAutoencoder(input_dim=768, hidden_dim=3072).to(device)
sae_6.load_state_dict(torch.load("sae_model_6_3072.pth", map_location=torch.device('cpu')))
sae_6.eval();

sae_12 = SparseAutoencoder(input_dim=768, hidden_dim=3072).to(device)
sae_12.load_state_dict(torch.load("sae_model_12_3072.pth", map_location=torch.device('cpu')))
sae_12.eval();

In [6]:
def text_to_token_ids(text, tokenizer):
    encoded = tokenizer.encode(text, allowed_special={'<|endoftext|>'})
    encoded_tensor = torch.tensor(encoded).unsqueeze(0) # add batch dimension
    return encoded_tensor

def token_ids_to_text(token_ids, tokenizer):
    flat = token_ids.squeeze(0) # remove batch dimension
    return tokenizer.decode(flat.tolist())

In [7]:
def get_token_embeddings(text, model, tokenizer, layers=[6, 12]):
    """
    Extracts token embeddings from specified transformer layers.

    Args:
    - text (str): Input text.
    - model: Custom GPT model.
    - tokenizer: tiktoken encoding object.
    - layers (list): Transformer layers to extract embeddings from.

    Returns:
    - dict: Layer-wise token embeddings {layer_number: embeddings}
    """

    input_ids = text_to_token_ids(text, tokenizer).to(device)

    with torch.no_grad():
        _, hidden_states = model(input_ids, output_hidden_states=True)

    embeddings = {} 
    for layer in layers:
        if layer - 1 < len(hidden_states):
            embeddings[layer] = hidden_states[layer - 1].squeeze(0).cpu().numpy()
        else:
            print(f"⚠️ Warning: Layer {layer} is out of range (max index {len(hidden_states) - 1})")

    return embeddings

In [8]:
from collections import defaultdict

def find_top_activating_neurons(concept_to_texts, model, tokenizer, sae, get_token_embeddings, layer=6, top_k=5, device='cpu'):
    concept_top_neurons = {}

    for concept, sentences in concept_to_texts.items():
        print(f"Processing concept: {concept}")
        neuron_activation_counts = defaultdict(int)
        total_tokens = 0

        for sentence in sentences:
            embeddings_np = get_token_embeddings(sentence, model, tokenizer, layers=[layer])[layer]
            embeddings = torch.tensor(embeddings_np, dtype=torch.float32).to(device)

            decoded, encoded = sae(embeddings)  # encoded shape: (seq_len, n_features)
            top_neuron_indices = torch.argmax(encoded, dim=1).cpu().numpy()

            for idx in top_neuron_indices:
                neuron_activation_counts[idx] += 1

            total_tokens += encoded.shape[0]

        neuron_avg_activation = {k: v / total_tokens for k, v in neuron_activation_counts.items()}
        top_neurons = sorted(neuron_avg_activation.items(), key=lambda x: x[1], reverse=True)[:top_k]
        concept_top_neurons[concept] = [neuron for neuron, _ in top_neurons]

    return concept_top_neurons

In [9]:
with open("concepts_to_text.json", "r", encoding="utf-8") as f:
    concept_to_texts = json.load(f)

top_neurons = find_top_activating_neurons(
    concept_to_texts=concept_to_texts,
    model=model,
    tokenizer=tokenizer,
    sae=sae_6,
    get_token_embeddings=get_token_embeddings,
    layer=6,
    top_k=5,
    device='cpu'
)

print(top_neurons)

Processing concept: marriage_as_duty
Processing concept: romantic_love
Processing concept: social_class
Processing concept: moral_superiority
Processing concept: stigma_of_spinsterhood
Processing concept: wealth_and_inheritance
Processing concept: female_professions
Processing concept: male_professions
Processing concept: reputation_and_gossip
Processing concept: truth_and_honesty
Processing concept: vanity_and_appearance
Processing concept: matchmaking_positive
Processing concept: matchmaking_negative
Processing concept: social_hierarchy
{'marriage_as_duty': [np.int64(720), np.int64(639), np.int64(2899), np.int64(1996), np.int64(2268)], 'romantic_love': [np.int64(720), np.int64(1091), np.int64(639), np.int64(1313), np.int64(1228)], 'social_class': [np.int64(720), np.int64(1996), np.int64(1080), np.int64(639), np.int64(779)], 'moral_superiority': [np.int64(1080), np.int64(1091), np.int64(2899), np.int64(639), np.int64(443)], 'stigma_of_spinsterhood': [np.int64(1080), np.int64(639), np.

In [10]:
idx = text_to_token_ids("he is", tokenizer).to(device)
    
for _ in range(15):
    idx_cond = idx[:, -GPT_CONFIG_124M['context_length']:]

    # 1. Run forward to get hidden state at layer 6
    _, hiddens = model(idx_cond, output_hidden_states=True)
    layer6_hidden = hiddens[5].detach().clone()

    # 2. Inject neuron 720 with a high value
    layer6_hidden[:, :, 490] += 50.0  # boost neuron 720 activation

    # 3. Run from layer 6 onward using intervene_layer=6
    logits = model(
        idx_cond,
        intervene_layer=6,
        edited_hidden=layer6_hidden,
        output_hidden_states=False
    )
    idx_next = torch.argmax(logits, dim=-1, keepdim=True)  # (batch_size, 1)
    idx_next = idx_next[:, -1, :]

    # Same as before: append sampled index to the running sequence
    idx = torch.cat((idx, idx_next), dim=1)  # (batch_size, num_tokens+1)

    ouput_text = token_ids_to_text(idx, tokenizer)

In [11]:
ouput_text

'he is a great deal of a great deal of the world. I am not to'

In [12]:
idx = text_to_token_ids("Marriage is", tokenizer).to(device)
boost_value = 5.0
layer = 6
    
for _ in range(10):
    with torch.no_grad():
        _, hidden_states = model(idx, output_hidden_states=True)
        layer_hidden = hidden_states[layer - 1].squeeze(0)

        intervened_hidden = sae_12.intervene_and_decode(layer_hidden, 1090, boost=5)

#         encoded[:, 2899] += boost_value

#         intervened_hidden = sae_6.decoder(encoded)
#         intervened_hidden = intervened_hidden.unsqueeze(0)  # shape: (batch, seq_len, emb_dim)

        logits = model(idx, intervene_layer=(layer - 1), edited_hidden=intervened_hidden.unsqueeze(0))
        logits = logits[:, -1, :]

#         # Filter logits with top_k sampling
#         top_logits, _ = torch.topk(logits, 50)
#         min_val = top_logits[:, -1]
#         logits = torch.where(logits < min_val, torch.tensor(float("-inf")).to(logits.device), logits)

#         # temperature scaling
#         logits = logits / 0.3

#         # Apply softmax to get probabilities
#         probs = torch.softmax(logits, dim=-1)  # (batch_size, context_len)

#         # Sample from the distribution
#         idx_next = torch.multinomial(probs, num_samples=1)  # (batch_size, 1)

        idx_next = torch.argmax(logits, dim=-1, keepdim=True)  # (batch_size, 1)


        # Same as before: append sampled index to the running sequence
        idx = torch.cat((idx, idx_next), dim=1)  # (batch_size, num_tokens+1)

        ouput_text = token_ids_to_text(idx, tokenizer)
        
ouput_text

'Marriage is parallels\nanthafield Parkynch Hallrietriet'

In [44]:
import pandas as pd
from itertools import product
import csv
import os

def perform_causal_intervention(
    model,
    sae,
    tokenizer,
    neuron_indices,
    layer_idx,
    context_length,
    generate_steps=15,
    save_path="casual_int.csv",
    temp=0.0,
    topK=None
):
    boost_values = [0, 5, 10, 50, 100, 150, 200, 300]
    prompts = ["Marriage is", "he is", "she is", "the house"]

    headers = ["neuron", "prompt"] + [f"boost_{b}" for b in boost_values]
    results = []
    
    # Initialize file with headers if not already present
    if save_path and not os.path.exists(save_path):
        with open(save_path, "w", newline='', encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=headers)
            writer.writeheader()

    for neuron_id, prompt in product(neuron_indices, prompts):
        row = {"neuron": neuron_id, "prompt": prompt}

        for boost in boost_values:
            idx = text_to_token_ids(prompt, tokenizer).to(model.tok_emb.weight.device)

            for _ in range(generate_steps):
                idx_cond = idx[:, -context_length:]
                _, hidden_states = model(idx_cond, output_hidden_states=True)
                layer_hidden = hidden_states[layer - 1].squeeze(0)
        
                intervened_hidden = sae.intervene_and_decode(layer_hidden, neuron_id, boost=boost)
                
                logits = model(idx, intervene_layer=(layer - 1), edited_hidden=intervened_hidden.unsqueeze(0))
                logits = logits[:, -1, :]

                if temp > 0 and topK is not None:
                    # Filter logits with top_k sampling
                    top_logits, _ = torch.topk(logits, 50)
                    min_val = top_logits[:, -1]
                    logits = torch.where(logits < min_val, torch.tensor(float("-inf")).to(logits.device), logits)
            
                    # temperature scaling
                    logits = logits / 0.3
            
                    # Apply softmax to get probabilities
                    probs = torch.softmax(logits, dim=-1)  # (batch_size, context_len)
            
                    # Sample from the distribution
                    idx_next = torch.multinomial(probs, num_samples=1)  # (batch_size, 1)
                else: 
                    idx_next = torch.argmax(logits, dim=-1, keepdim=True)
                    
                idx = torch.cat((idx, idx_next), dim=1)

            output_text = token_ids_to_text(idx, tokenizer)
            row[f"boost_{boost}"] = output_text

        results.append(row)
        with open(save_path, "a", newline='', encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=headers)
            writer.writerow(row)
        
    return pd.DataFrame(results)

In [30]:
neuron_indices = [6,22,29,35,46,69,90,113,142,205,209,240,241,272,
                  317,332,345,350,451,454,464,476,491,571,612,619,
                  637,643,663,698,714,723,731,779,804,814,837,860,
                  885,888,906,915,942,945,961,987,994,1021,1029,1033,
                  1078,1080,1091,1101,1104,1108,1116,1136,1156,1190,
                  1231,1251,1264,1277,1318,1322,1332,1350,1355,1410,
                  1426,1440,1441,1450,1477,1505,1531,1545,1553,1596,
                  1670,1704,1707,1743,1768,1783,1801,1807,1812,1840,
                  1882,1885,1894,1929,1931,1962,1979,1996,2000,2002,
                  2003,2118,2159,2173,2186,2227,2238,2248,2263,2268,
                  2291,2308,2344,2347,2362,2371,2375,2378,2396,2473,
                  2514,2522,2526,2545,2546,2562,2577,2585,2604,2649,
                  2691,2693,2697,2713,2718,2735,2755,2764,2795,2813,
                  2833,2861,2870,2875,2930,2941,3017,3030,3057,3062]

perform_causal_intervention(model, sae_6, tokenizer, neuron_indices, layer_idx=6, context_length=GPT_CONFIG_124M['context_length'], generate_steps=15)

Unnamed: 0,neuron,prompt,boost_0,boost_5,boost_10,boost_50,boost_100,boost_150,boost_200,boost_300
0,6,Marriage is,Marriage is not practitionerityity? practition...,Marriage is not practitionerityity? practition...,Marriage is not practitionerityity? practition...,Marriage is not practitionerityity? secondsst ...,Marriage is not practitionerityity? secondsst ...,Marriage is not practitionerityity? secondsst ...,Marriage is not practitionerity? seconds secon...,Marriage is not practitionerityshire? secondsu...
1,6,he is,he is practitioner practitioner practitioner p...,he is practitioner practitioner practitioner p...,he is practitioner practitioner practitioner p...,he is practitioner practitioner practitioner p...,he is practitioner practitioner practitioner p...,he is practitioner practitioner practitioner p...,he is practitioner practitioner practitioner p...,he is practitionerlargeityablylargeuringstures...
2,6,she is,she is practitioner practitioner practitioner ...,she is practitioner practitioner practitioner ...,she is practitioner practitioner practitioner ...,she is practitioner practitioner practitioner ...,she is practitioner practitioner practitioner ...,she is practitioner practitioner practitioner ...,she is practitionerlargeityablylargeityably pr...,she islarge tolargeity?st secondsures tolargei...
3,6,the house,the housekeeper practitioner practitioner prac...,the housekeeper practitioner practitioner prac...,the housekeeper practitioner practitioner prac...,the housekeeper practitioner practitioner prac...,the housekeeper practitioner practitioner prac...,the housekeeperlarge practitionerlargeity? sec...,the housekeeperlarge practitionerlargeityshire...,the housekeeperlarge agreements agreements agr...
4,22,Marriage is,Marriage is not practitionerityity? practition...,Marriage is not practitionerityity? practition...,Marriage is not practitionerityity? practition...,Marriage is not practitionerity thatity that b...,Marriage is not been practitioner to practitio...,Marriage is not been toity that beenppercross ...,Marriage is not been toity that been that been...,Marriage is not been vain to that that that th...
...,...,...,...,...,...,...,...,...,...,...
595,3057,the house,the housekeeper practitioner practitioner prac...,the housekeeper practitioner practitioner prac...,the housekeeper practitioner practitioner prac...,the housekeeper practitioner practitioner prac...,the housekeeper practitioner practitioner prac...,the housekeeper practitioner practitioner prac...,the housekeeper practitioner practitionershire...,the housekeeper practitionershiresshire?shirei...
596,3062,Marriage is,Marriage is not practitionerityity? practition...,Marriage is not practitionerityity? practition...,Marriage is not practitionerityity? practition...,Marriage is not practitionerityity? practition...,Marriage is practitioner practitioner practiti...,Marriage is practitioner practitioner practiti...,Marriage is practitioner practitioner practiti...,Marriage is practitioner practitioner practiti...
597,3062,he is,he is practitioner practitioner practitioner p...,he is practitioner practitioner practitioner p...,he is practitioner practitioner practitioner p...,he is practitioner practitioner practitioner p...,he is practitioner practitioner practitioner p...,he is practitioner practitioner practitioner p...,he is practitioner practitioner practitioner p...,he is practitioner practitioner practitioner p...
598,3062,she is,she is practitioner practitioner practitioner ...,she is practitioner practitioner practitioner ...,she is practitioner practitioner practitioner ...,she is practitioner practitioner practitioner ...,she is practitioner practitioner practitioner ...,she is practitioner practitioner practitioner ...,she is practitioner practitioner practitioner ...,she is practitioner practitioner practitioner ...


In [42]:
# redo with temp and topk
neuron_indices = [22, 35, 90, 205, 240, 272, 332, 451, 612, 619, 643, 779, 814, 1078,
                  1080, 1264, 1355, 1553, 1894, 1929, 1996, 2002, 2173, 2238, 2268,
                  2362, 2473, 2546, 2562, 2755, 2870, 2941]

perform_causal_intervention(model, sae_6, tokenizer, neuron_indices, layer_idx=6,
                            context_length=GPT_CONFIG_124M['context_length'],
                            generate_steps=15, save_path="casual_int_temp_topk.csv",
                            temp=0.1, topK=50)

Unnamed: 0,neuron,prompt,boost_0,boost_5,boost_10,boost_50,boost_100,boost_150,boost_200,boost_300
0,22,Marriage is,Marriage is Artifact to landsl enormouslyity f...,Marriage is seconds tolarge that resembensions...,Marriage is driven practitioner been contin by...,Marriage is not Artifact toasuresetsfFlag been...,Marriage is to secondsity thatrupulousity? sec...,Marriage is exactlyPicity that norietent been ...,Marriage is that noriet that thateryenance tha...,Marriage is not that that that time that which...
1,22,he is,he is okay to corpus Survivor yourself that be...,he isinskyasuresInfst252ures tolargeingsityity...,he is Sports Comicchatingity?largeityfield Coa...,he is CONCCur makeshift to Shawets McClity for...,he isicio to LC been Coast?st? seconds thatity...,he is earlyity that beenynchitating to Comic t...,he is to Sports thatitating beenppercross to S...,he is nothing to banned that that been vain re...
2,22,she is,she isPic defends Savage to lesbianst Chimerae...,she isamasCur practitionereness Comicets252 th...,she islargeenesschatablylargeful schemast252st...,she is252 practitioner that makeshift togi 194...,she is not been practitionershiresom them seco...,she isimb to Instrument nothing enormously tha...,she is practitioner that nothing thatitatingit...,she is to that been vain toriet that time toit...
3,22,the house,the house aginginskyiciochatgeneratedets McCl ...,the housekeeper suspic Handbook contin Request...,the house agingommelablyMartinlargeets continc...,the house banned recognizeschat practitioner A...,the housekeeperlarge to briefing thatitating d...,the housekeeper practitioner practitioneritati...,the housekeepinglarge whichity been banned toi...,the housekeeper beenynated toity that that tim...
4,35,Marriage is,Marriage is rhably.......ity? defends to pract...,Marriage is seconds banned duringity themiasco...,Marriage isPic practitioner to briefing Debora...,Marriage is not to corpus thatshire?chat drive...,Marriage is not beenshire toity thatity that s...,Marriage is not been any what them that no goo...,Marriage is that such an hour that that that t...,Marriage is hisity that something being his be...
...,...,...,...,...,...,...,...,...,...,...
123,2870,the house,the houseCongressets Dude ArtifactBufferityshi...,the house GO Israelichat Artifact resemb agree...,the houseamas contin Poe Kidd overdose briefin...,the house defends McCl practitioner posting Fu...,the house inflammatory crises Artifact resemb ...,the housekeeper crises colleagues athleticgene...,the house Borders ArtifactFlagfANGE Princess D...,the house contin cornerstone briefing Cait inc...
124,2941,Marriage is,Marriage is defends been briefing recognizesf ...,Marriage is Luffy okayity?stasuresbrance to de...,Marriage is not Coaststf Comic to hitters sche...,Marriage isPic banned to overdoselaughter88801...,Marriage is notrising to continity thatrisingu...,Marriage is not resemb yourself to deepeningOR...,Marriage is not Artifact howfoxfather that サーテ...,Marriage is not Deborah baskets toCongress you...
125,2941,he is,he isamas practitioner Judd Artifact Juddures ...,he is252idential briefing practitionerchatabil...,he isrising Sportsmodel of secondsst uint to u...,he isrising washchat them demos timberfIGHTS t...,he is Fiorina./stshirefoxitatingity to Savage ...,he is FiorinafatherGal to secondsst Comicity?l...,he is not resemb Fiorina perpetratedlargeably ...,he is not explosives you overdose interstellar...
126,2941,she is,she is VIDE enormously to resemb seconds me? F...,she is practitioner rh Chevy defends Sports Mc...,"she is practitioner Until """""" practitionerlarg...",she isrisingriet pilgrommelity that landsl pra...,"she is252 Instrumentfamaslargeity"" practitione...",she is Judd Artifact Stealth topolice baskets ...,she is CONC explosivesDC through makeshift? no...,she is openerouls explosives you resemb themla...


In [46]:
# redo with temp and topk
neuron_indices = [491, 571, 649, 814, 984, 1410, 1440, 1596, 1768, 1929, 1979, 2649, 2691]

perform_causal_intervention(model, sae_6, tokenizer, neuron_indices, layer_idx=6,
                            context_length=GPT_CONFIG_124M['context_length'],
                            generate_steps=15, save_path="casual_in_6_special_neurons.csv",
                            temp=0.2, topK=30)

Unnamed: 0,neuron,prompt,boost_0,boost_5,boost_10,boost_50,boost_100,boost_150,boost_200,boost_300
0,491,Marriage is,Marriage isasures? banned to Demand demosshire...,Marriage is to Sports to InstrumentfRangeeness...,"Marriage isommel to ICCfammedf,... banned to h...",Marriage is nothing enormously Chevylarge to h...,Marriage is toommel to proceduresity awayst? p...,Marriage is souls! practitioner to resemb prac...,Marriage is not okay toTileity that?abilityrie...,Marriage is not been forgotten awayings away w...
1,491,he is,he isrising secondsesanity thatchatMartinst Sh...,he isfoxasuresgovernmentchatesanuresity that C...,he is practitioner Judd.....large to banned to...,he is Artifact Sports enormouslyst�� to briefi...,he is603 pilgr pilgrsomest to deepening to col...,he is MarxismBuffer?riet banned not been Sport...,he is HAL tomodel to Fiorina their?f Stuff how...,he is to Button beenasu to practitionered to L...
2,491,she is,she is practitioner tiityitating secondsably p...,she is explosives briefing Comic tofoxfieldgen...,she is procedures to briefingst beenMartin? wa...,she isrisingimbchatingity tolargefasu that bri...,she is celebrates sincerising to Savagef uintr...,she is pilgrimb to recognizesity of?chatrietas...,she is Artifact tolargestfflargeity to resemb ...,she is tolarge or themffhours to seconds to In...
3,491,the house,the houselarge practitioner Appeal Tau resembc...,the house agreements uint=> ti practitionerInf...,the housekeeping demos Wrap secondsinskychatsh...,the house agreements McCl Chimera briefing bri...,the houseophobia crisesSold briefing entities ...,the housekeepersolidamasshireidential practiti...,the housekeeperlargeful? pilgrf resemb to prac...,the housekeeperalid them to aviationbourn away...
4,571,Marriage is,Marriage is not opio to Shawfield contin toris...,Marriage isogene seconds that Chimera Cour Jam...,Marriage is notamas? Furyshire tofox Handbook ...,Marriage is not weld explosivesityityity that ...,Marriage is notommelity quite Fury to uint toe...,Marriage is notliner to recognizesf landsl to ...,Marriage is not been Courcyitatingsst been giv...,Marriage is notliner tozza to banned to303 fri...
5,571,he is,he is practitionerasures Savagef okay to demos...,he isasures to Instrumentablylargeity that Wra...,he islargef Bachelorably practitionerity?st th...,he isasures that88father Artifact Ninth schema...,he isFlagf Trendsf practitioner?st seconds tog...,he is practitioner seconds away fromchating to...,he iszza thatasures givenchatshire not resemb ...,he is undoubtedly to celebrates torising Courc...
6,571,she is,she is practitioner Judd European been schema ...,she isfox to practitionerCurBufferityf defends...,she is banned tolargeity?stchatity to Comicfie...,she isasures to banned to briefingst investmen...,she is Handbook practitioner Coastets to hitte...,she is not okay to opio to briefingstchatablya...,she isFlag to practitioner not to seconds forg...,she is not been Saskst not to demos tofox to r...
7,571,the house,the housemaids88 seconds tunnels?inskyitylessl...,the house546benshire CONCIGHTS practitionerBuf...,the houseNeed resemb aging banned to agreement...,the houseamaschat agreements practitioner=>oge...,the housekeeperCongress to demos themstshireil...,the housekeeper aviation Artifact given Courcy...,the housekeeper angels 1949 practitioner pilgr...,the housetchenance to Chevyid interview to lar...
8,649,Marriage is,Marriage is to makeshift resemb Artifact to en...,Marriage islarge been hitters practitioner def...,Marriage isrising?foxgenerated to practitioner...,Marriage isamasGaleness Artifactchating extrac...,Marriage isFlagsturesings?st thatlargeitatings...,Marriage is not Instrument to252enceityably ma...,Marriage is not okay fasthead been noity thati...,Marriage is not Instrument beenrising to secon...
9,649,he is,he isfox seconds toasures tolargeityity to cri...,he is practitionerBuffer celebrateschatgenerat...,he is banned to�� to ICCily predecessorslessly...,he is practitionerlarge tochat recognizesitati...,he is practitioner practitioner Sports Deborah...,he is pilgrinsky been agreementsimb? secondsit...,he is Advis nochatshire practitioner Coast the...,he is notammed beenamaschat been Greenpeaceita...


In [47]:
# redo with temp and topk
neuron_indices = [244, 255, 531, 800, 816, 981, 993, 1153,
                  1171, 1337, 1479, 1498, 1647, 1720, 1749,
                  1869, 1922, 1953, 2012, 2071, 2108, 2388, 
                  2495, 2623, 2646, 2753, 2787, 3004]

perform_causal_intervention(model, sae_12, tokenizer, neuron_indices, layer_idx=12,
                            context_length=GPT_CONFIG_124M['context_length'],
                            generate_steps=15, save_path="casual_int_12_special_neurons.csv",
                            temp=0.2, topK=30)

Unnamed: 0,neuron,prompt,boost_0,boost_5,boost_10,boost_50,boost_100,boost_150,boost_200,boost_300
0,244,Marriage is,Marriage is grotesque\n proactivema outletsync...,Marriage isonomic\n parallels\nprintfma,Marriage is Seg\n Daisyfield ikk statute,Marriage is mu phenomenon\nikk\nPierreynch Ha...,Marriage isFlash oriet Apache registration\n ...,Marriage is Seg thousand pounds oriet parallel...,Marriage is teleport thousand pounds o o o o o...,Marriage is Seg o thousand pounds thousand pou...
1,244,he is,he is Integrity\n itatesynch Hall\n,he issofar\n Hauntedramynch Hall,he is mu\n Situation JOHNSON\n,he is ingredientsfield Park Thema There\n proa...,he is540 oriet\n Apache proactive,he isEngineers weeks orietrietriet unsigned\n ...,he is filler weeks o o thousand pounds thousan...,he is Literary years miles thousand pounds tho...
2,244,she is,she is Mov us Participantsfield Park On Denni...,she is Kappariet Blackhawks\nonomic\n Flash p...,she is「 phenomenonynch Hallriptionma grotesqu...,she isji\nramid ikk ourse automakers,she is Daisyfield Park DE COURCY DE COURCY DE C,she is ka thousand pounds oriet thousand pound...,she is filler weeks o thousand pounds weeks o ...,she is540 years years miles thousand pounds th...
3,244,the house,the house Zo ridynchACourse QCynch Hallriet di...,the house butterfliesfieldlersmaynch Hallriet7...,the house Zo sellerwood�riet unsigned registra...,the houseanthaaREfield Parkription ramidynch H...,the house Zo\n Daisyfieldourse orietlandersa S...,the houseWarda orietrietALTH us Bubfieldourse ...,the housetery o o weeks o o weeks o o weeks o ...,the house fetish o days o o o years o o weeks ...
4,255,Marriage is,Marriage is parallels\n unsigned Authorityynch...,Marriage is CPmundjiant Seg\nthurynch Hallrie...,Marriage isapplication\n phenomenonynch Hallri...,Marriage isCashynch Hall Daisyfield Parkye us ...,Marriage is Kappa uslandersToresponsivefieldti...,Marriage isConfiguration starshipynch Hallriet...,Marriage is parallels bluntly json chefsynync...,Marriage is Vecynch HallriptionWardarseantha a...
...,...,...,...,...,...,...,...,...,...,...
107,2787,the house,the houseoscynch Hallrietjsonynchresponsive St...,the house Chimeradevelopa Dietriet discountsfi...,the houseoscRCYynch HallRare us Toby filleran...,the houseSummeryerietrietrietogenousynch inspe...,the houseription\n proactive Situationramynch ...,the housechipma She permits motherargument dau...,the housekeeper permits motherasteryramynch Ha...,the house explor uncle statute fatherijk mothe...
108,3004,Marriage is,Marriage issofarynch parallels\n phenomenon\n ...,Marriage isription\njsonynch Hall\njsonynch Ha...,Marriage is Literynch Hallriet capitalists Si...,Marriage is summed yourself announcerfield Par...,Marriage is parallelsajsonynch Hallaged circum...,Marriage isonomicynchskirts morning QCagedance...,Marriage is parallelsaalebTemperatureceansment...,Marriage is domainTemperature refugees honorar...
109,3004,he is,he is seller\n Chal\n Situation\n,he issofar\nidency\n Movrietこ\numan way rid ...,he isFlash\n parallels\n Activitiesriet summed...,he is KapparietCaptfield Parkyema There forthf...,he is announcerfield Parklesslyfield Park On t...,he is♦ us selfies way Sheenantationasedance fi...,he is phenomenonGood Godynch Hall bung us sepa...,he is Univers rightantha way Slightly intoReco...
110,3004,she is,she issofarynch Hall ourseynch HallRE ramid S...,she is parallels\nfootball Corona ji\n phe...,she isonomic\n honoraryriet Solutionsynch Hall...,she isullahfield Park Every body telescynch Ha...,she is Department democraticshire shotgunsfort...,she is Liter us Daisyfield Parkye much Caucasu...,she is CP rid little Shawn PemberleyDa Daisyfi...,she is honoraryaged way Sheen myselftoken offS...
