# Generating Outputs for Neuronpedia Upload

We use Callum McDougall's `sae_vis` library for generating JSON data to upload to Neuronpedia.


## Set Up

In [None]:
# Alternative configuration: download a published GPT2-Small SAE from the
# Hugging Face Hub instead of reading a local checkpoint.
# from huggingface_hub import hf_hub_download

# MODEL = "gpt2-small"
# LAYER = 0
# SOURCE = "res-jb"
# REPO_ID = "jbloom/GPT2-Small-SAEs"
# FILENAME = f"final_sparse_autoencoder_gpt2-small_blocks.{LAYER}.hook_resid_pre_24576.pt"
# SAE_PATH = hf_hub_download(repo_id=REPO_ID, filename=FILENAME)

# Settings to edit for the SAE being uploaded.
# SOURCE_AUTHOR_SUFFIX selects the data sub-folder; SOURCE and LAYER are
# combined into the source name sent with each upload; MODEL is sent as the
# model id.
SOURCE_AUTHOR_SUFFIX = "sm"
SOURCE = "res-sm"
MODEL = "pythia-70m-deduped"
LAYER = 0
TYPE = "resid"

# Input file locations; adjust the patterns to match how your files are named.
FEATURE_SPARSITY_PATH = (
    f"../data/{SOURCE_AUTHOR_SUFFIX}/feature_sparsity_{LAYER}_{TYPE}.pt"
)
SAE_PATH = f"../data/{SOURCE_AUTHOR_SUFFIX}/sae_{LAYER}_{TYPE}.pt"

## Save JSON to neuronpedia_outputs

In [None]:
from sae_lens.analysis.neuronpedia_runner import NeuronpediaRunner

# Parent folder handed to the runner for its Neuronpedia output.
NP_OUTPUT_FOLDER = "../neuronpedia_outputs"

runner = NeuronpediaRunner(
    # Inputs and output location.
    neuronpedia_parent_folder=NP_OUTPUT_FOLDER,
    sae_path=SAE_PATH,
    feature_sparsity_path=FEATURE_SPARSITY_PATH,
    init_session=True,
    # Sampling sizes.
    n_batches_to_sample_from=4096,
    n_prompts_to_select=6 * 4096,
    n_features_at_a_time=512,
    # Context kept around each token (64 left + 63 right).
    # NOTE(review): presumably yields a 128-token window including the
    # token itself; confirm against NeuronpediaRunner.
    buffer_tokens_left=64,
    buffer_tokens_right=63,
    # Range of feature batches to process, both ends inclusive.
    # NOTE(review): 22..23 covers only two batches; confirm this is the
    # intended range before running a full export.
    start_batch_inclusive=22,
    end_batch_inclusive=23,
)
runner.run()

## Upload to Neuronpedia
#### This currently only works if you have admin access to the Neuronpedia database via localhost.

In [None]:
# Helpers that replace NaN values (which are not valid JSON) with -999 before upload
from decimal import Decimal
from typing import Any
import math
import json
import os
import requests

# Folder scanned below for "batch-*.json" files to upload.
# NOTE(review): presumably the folder the runner wrote its output to; confirm
# against NeuronpediaRunner.neuronpedia_folder.
folder_path = runner.neuronpedia_folder


def nanToNeg999(obj: Any) -> Any:
    """Return a copy of *obj* with every NaN replaced by the sentinel -999.

    Dicts, lists and tuples are traversed recursively (dict keys are left
    untouched). Float NaN and Decimal NaN (quiet or signalling) become the
    integer -999; every other value is returned unchanged.

    Args:
        obj: Any JSON-like value (nested dicts/lists/tuples of scalars).

    Returns:
        A structure of the same shape with NaNs replaced. Tuples, including
        namedtuples, come back as plain tuples.
    """
    if isinstance(obj, dict):
        return {k: nanToNeg999(v) for k, v in obj.items()}
    if isinstance(obj, list):
        return [nanToNeg999(v) for v in obj]
    if isinstance(obj, tuple):
        # json serialises tuples as arrays, so NaNs inside them must be
        # replaced as well.
        return tuple(nanToNeg999(v) for v in obj)
    if isinstance(obj, Decimal):
        # Decimal.is_nan() also handles signalling NaN, which math.isnan()
        # cannot (the float conversion raises ValueError).
        return -999 if obj.is_nan() else obj
    if isinstance(obj, float) and math.isnan(obj):
        return -999
    return obj


class NanConverter(json.JSONEncoder):
    """JSON encoder that passes its input through ``nanToNeg999`` first.

    Only ``encode`` is overridden, so the replacement applies to calls that
    go through ``encode`` (such as ``json.dumps(..., cls=NanConverter)``).
    """

    def encode(self, o: Any, *args: Any, **kwargs: Any):
        """Encode *o* after replacing NaN values with -999."""
        sanitized = nanToNeg999(o)
        return super().encode(sanitized, *args, **kwargs)


# Server info
host = "http://localhost:3000"
sourceName = str(LAYER) + "-" + SOURCE
# The endpoint is the same for every file, so build it once.
url = host + "/api/internal/upload-features"

# Upload alive features: every "batch-*.json" file in the output folder is
# posted to the server. Sorting makes the upload order deterministic.
for file_name in sorted(os.listdir(folder_path)):
    if not (file_name.startswith("batch-") and file_name.endswith(".json")):
        continue

    print("Uploading file: " + file_name)
    file_path = os.path.join(folder_path, file_name)
    # Context manager ensures the file handle is closed after reading.
    with open(file_path, "r") as f:
        data = json.load(f)

    # Replace NaNs with the -999 sentinel. Applied directly to the loaded
    # structure, which gives the same result as re-serialising through
    # NanConverter without a second dump/load of the whole batch.
    data = nanToNeg999(data)

    resp = requests.post(
        url,
        json={
            "modelId": MODEL,
            "layer": sourceName,
            "features": data,
        },
    )
    # Report failures instead of silently ignoring them; keep going so one
    # bad batch does not stop the remaining uploads.
    if not resp.ok:
        print(
            f"Upload failed for {file_name}: "
            f"HTTP {resp.status_code} {resp.text[:200]}"
        )

# Upload dead features (this just creates blank features)
# We want this for completeness
# skipped_path = os.path.join(folder_path, "skipped_indexes.json")
# f = open(skipped_path, "r")
# data = json.load(f)
# skipped_indexes = data["skipped_indexes"]
# url = host + "/api/internal/upload-dead-features"
# resp = requests.post(
#     url,
#     json={
#         "modelId": MODEL,
#         "layer": sourceName,
#         "deadIndexes": skipped_indexes,
#     },
# )

### TODO: Automatically validate the uploaded data