# Generating Outputs for Neuronpedia Upload

We use Callum McDougall's `sae_vis` library for generating JSON data to upload to Neuronpedia.


## Set Up

In [1]:
from sae_lens.toolkit.pretrained_saes import download_sae_from_hf
import os

# Identifiers for the model and the SAE processed by this notebook.
MODEL_ID = "gpt2-small"
SAE_ID = "res-jb"

# download_sae_from_hf returns three values; only the second one is kept
# (as SAE_WEIGHTS_PATH), the other two are discarded.
_, SAE_WEIGHTS_PATH, _ = download_sae_from_hf(
    "jbloom/GPT2-Small-SAEs-Reformatted",
    "blocks.0.hook_resid_pre",
)

# Directory that contains SAE_WEIGHTS_PATH; this is what gets passed to the
# runner as `sae_path`.
SAE_PATH = os.path.dirname(SAE_WEIGHTS_PATH)

## Save JSON to neuronpedia_outputs

In [2]:
from sae_lens.analysis.neuronpedia_runner import NeuronpediaRunner

print(SAE_PATH)

# Root folder under which the runner writes its JSON output.
NP_OUTPUT_FOLDER = "../../neuronpedia_outputs"

runner = NeuronpediaRunner(
    # Which SAE/model to process and where to write the results.
    sae_path=SAE_PATH,
    model_id=MODEL_ID,
    sae_id=SAE_ID,
    neuronpedia_outputs_folder=NP_OUTPUT_FOLDER,
    init_session=True,
    # Sampling configuration.
    n_batches_to_sample_from=4096,  # 2**12
    n_prompts_to_select=24576,  # 4096 * 6
    n_features_at_a_time=24,
    buffer_tokens_left=64,
    buffer_tokens_right=62,
    # Start and end at batch 1, i.e. only batch 1 is processed in this run.
    start_batch_inclusive=1,
    end_batch_inclusive=1,
)
runner.run()

/Users/johnnylin/.cache/huggingface/hub/models--jbloom--GPT2-Small-SAEs-Reformatted/snapshots/5bd69d8ccac6b19d91934c5aeed4866f8b6e50c7/blocks.0.hook_resid_pre
Loaded pretrained model gpt2-small into HookedTransformer
Moving model to device:  mps


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


==== Starting at batch: 1
==== Ending at batch: 1
Total features to run: 19321
Total skipped: 5255
Total batches: 806
Hook Point Layer: 0
Hook Point: blocks.0.hook_resid_pre
Writing files to: ../../neuronpedia_outputs/gpt2-small_res-jb_blocks.0.hook_resid_pre


 84%|████████▍ | 3435/4096 [02:41<00:31, 21.29it/s]


KeyboardInterrupt: 

## Upload to Neuronpedia
#### This currently only works if you have admin access to the Neuronpedia database via localhost.

In [None]:
# Helpers that replace NaN values (which are not valid JSON) before upload
from decimal import Decimal
from typing import Any
import math
import json
import os
import requests

# Folder containing the batch-*.json files to upload. It must be the folder the
# runner wrote to; the runner logs it as
# "Writing files to: <outputs folder>/<model id>_<SAE id>_<hook point>".
# NOTE(review): the runner may expose this path as an attribute
# (e.g. runner.neuronpedia_folder) - confirm before replacing the literal.
folder_path = "../../neuronpedia_outputs/gpt2-small_res-jb_blocks.0.hook_resid_pre"


def nanToNeg999(obj: Any) -> Any:
    """Return ``obj`` with every NaN value replaced by the sentinel -999.

    Dicts, lists and tuples are walked recursively and rebuilt; tuples come
    back as lists, which is how JSON serializes them anyway. Dict keys are
    left untouched. Any other value is returned unchanged.

    Args:
        obj: Arbitrary (typically JSON-like) Python object.

    Returns:
        The same structure with float/Decimal NaN values replaced by -999.
    """
    if isinstance(obj, dict):
        return {k: nanToNeg999(v) for k, v in obj.items()}
    if isinstance(obj, (list, tuple)):
        return [nanToNeg999(v) for v in obj]
    if isinstance(obj, Decimal):
        # Decimal.is_nan() also recognises signaling NaN; math.isnan() cannot
        # be used here because converting a signaling NaN to float raises
        # ValueError.
        return -999 if obj.is_nan() else obj
    if isinstance(obj, float) and math.isnan(obj):
        return -999
    return obj


class NanConverter(json.JSONEncoder):
    """JSON encoder that runs objects through ``nanToNeg999`` before encoding.

    The replacement is done in ``iterencode`` rather than ``encode``:
    ``json.dumps`` calls ``encode`` (which delegates to ``iterencode``) while
    ``json.dump`` calls ``iterencode`` directly, so hooking ``iterencode``
    covers both entry points with a single pass over the data.
    """

    def iterencode(self, o: Any, _one_shot: bool = False):
        """Encode ``o`` after replacing its NaN values; yields string chunks."""
        return super().iterencode(nanToNeg999(o), _one_shot)


# Server info
host = "http://localhost:3000"
# The SAE hooks into blocks.0.hook_resid_pre, i.e. layer 0.
LAYER = 0
# Source name sent to the server as "layer": "<layer>-<SAE id>", e.g. "0-res-jb".
# Uses SAE_ID / MODEL_ID from the set-up cell so this cell runs on a fresh kernel.
sourceName = str(LAYER) + "-" + SAE_ID
# Endpoint is the same for every batch file, so build it once.
url = host + "/api/local/upload-features"

# Upload alive features: one POST per batch-*.json file in folder_path.
# Sorted so the upload order (and the log below) is deterministic.
for file_name in sorted(os.listdir(folder_path)):
    if not (file_name.startswith("batch-") and file_name.endswith(".json")):
        continue
    print("Uploading file: " + file_name)
    file_path = os.path.join(folder_path, file_name)
    # Context manager closes the file even if parsing fails.
    with open(file_path, "r") as f:
        data = json.load(f)

    # Replace NaNs: round-trip through NanConverter so the payload holds the
    # -999 sentinel instead of NaN.
    data_fixed = json.dumps(data, cls=NanConverter)
    data = json.loads(data_fixed)

    resp = requests.post(
        url,
        json={
            "modelId": MODEL_ID,
            "layer": sourceName,
            "features": data,
        },
    )
    # Stop on the first failed upload (4xx/5xx) instead of silently
    # continuing with an incomplete data set.
    resp.raise_for_status()

# Upload dead features (this just creates blank features).
# We want this for completeness.
# skipped_path = os.path.join(folder_path, "skipped_indexes.json")
# f = open(skipped_path, "r")
# data = json.load(f)
# skipped_indexes = data["skipped_indexes"]
# url = host + "/api/internal/upload-dead-features"
# resp = requests.post(
#     url,
#     json={
#         "modelId": MODEL,
#         "layer": sourceName,
#         "deadIndexes": skipped_indexes,
#     },
# )

### TODO: Automatically validate the uploaded data