Imports and environment setup

In [6]:
from dotenv import load_dotenv
from utils.models import MiniPileDataset
from utils.interp import count_non_zero_feature_activations, plot_feature_activation_histogram
import os

# Enable automatic reloading of modules when they change
%load_ext autoreload
%autoreload 2


# Load environment variables from .env file
load_dotenv()

# Access the OpenAI API key from the environment variables
openai_api_key = os.getenv("OPENAI_API_KEY")


In [10]:
# Load the model from the pickle file
import pickle 
from utils.sae import SparseAutoencoder, SparseAutoencoderConfig
import json

# Dataset: unpickle the object holding the sentences and their embeddings
file_name = "files/all_sentences_with_embeddings_20240707_132959.pkl"
with open(file_name, "rb") as f:
    mini_pile_dataset = pickle.load(f)

# Configuration: JSON file providing "dimensions" and "sparsity_alpha"
config_path = "sae/20240708_195600_config.json"
with open(config_path, "r") as config_file:
    config = json.load(config_file)

# Build the sparse autoencoder from the config values; the sparse layer is
# 8x the width of the model dimension
sae_config = SparseAutoencoderConfig(
    d_model=config["dimensions"],
    d_sparse=8 * config["dimensions"],
    sparsity_alpha=config["sparsity_alpha"],
)
model = SparseAutoencoder(sae_config)

# Weights: unpickle the saved state dict, then load it into the model
model_path = "sae/20240708_195600_sae.pkl"
with open(model_path, "rb") as f:
    model_state_dict = pickle.load(f)
model.load_state_dict(model_state_dict)

  from .autonotebook import tqdm as notebook_tqdm


Interpreting the feature activations

In [11]:
# Summarize how many feature activations are non-zero per sample
# (the output below reports the average over the first 100 samples).
count_non_zero_feature_activations(model, mini_pile_dataset)

Average Non-Zero Elements for first 100 samples: 12.8100004196167


In [None]:
# Plot a histogram of the feature activations for this model and dataset.
# NOTE(review): what exactly is binned is defined in utils.interp — confirm there.
plot_feature_activation_histogram(model, mini_pile_dataset)

Automated interpretability pipeline

In [20]:
# Display the first stored embedding of the dataset.
mini_pile_dataset.embeddings[0]

tensor([ 2.8020e-01, -1.4137e-01,  3.2113e-01,  8.3063e-02,  3.6259e-01,
        -3.7126e-02, -1.9216e-01,  2.6195e-01,  3.4063e-01, -1.7096e-01,
         2.0818e-01,  2.6181e-01,  5.7709e-02,  8.7417e-02,  1.1937e-01,
         5.5855e-02, -1.0736e-02,  1.0520e-01,  1.5058e-01,  5.4504e-02,
         1.1298e-01,  9.9395e-02,  1.7316e-01,  4.2729e-02, -1.1806e-02,
         2.5302e-01, -3.5497e-02, -1.0460e-01, -4.0585e-01, -2.2878e-02,
         1.8662e-01,  1.7364e-01,  1.5899e-02, -1.0241e-01, -2.0285e-01,
        -7.5663e-02, -2.9621e-01,  4.3233e-02, -3.6970e-02, -3.1656e-02,
        -1.7705e-01, -1.2123e-01,  3.7734e-02, -1.0784e-01, -3.6377e-01,
         3.7061e-02,  3.6988e-02,  8.4533e-03, -5.6062e-02,  6.9565e-02,
        -5.8853e-02,  1.2077e-02, -2.3677e-02,  1.5821e-01,  9.5576e-02,
         1.0761e-01,  9.2060e-03, -2.6331e-01, -2.0025e-01,  1.8597e-02,
         1.7239e-01, -3.4805e-01, -3.4177e-02, -2.2331e-01,  1.2295e-02,
        -3.1695e-01,  2.4510e-01,  1.9030e-01, -2.3

In [49]:
import numpy as np
from utils.ai import OpenAIClient
from utils.features import Feature, FeatureSample
import os
import json
from pprint import pprint
from datetime import datetime

# Automated interpretation run.
#
# For every sparse feature of the autoencoder:
#   1. collect its activation on the first N_SAMPLES dataset sentences,
#   2. ask the LLM for a label from the strongest vs. weakest samples,
#   3. score that label's attributes on both groups,
#   4. write the result to features/sae_features_<timestamp>/feature_<index>.json.
#
# NOTE(review): every feature costs three LLM calls, including features that
# never fire on these samples (all activations zero) — consider skipping those.

N_SAMPLES = 100         # number of dataset sentences pushed through the SAE
SAMPLES_PER_GROUP = 50  # target size of the high- and low-activation groups
EXPANSION_FACTOR = 8    # sparse width / model width; matches d_sparse used to build the SAE

# Each run writes into its own timestamped folder
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
folder_name = f"features/sae_features_{timestamp}"
os.makedirs(folder_name, exist_ok=True)

ai = OpenAIClient(openai_api_key)

n = N_SAMPLES
# Cap the group size at half the samples so the high and low groups can never
# share a sentence (fixed 50/50 slices would overlap whenever n < 100).
group_size = min(SAMPLES_PER_GROUP, n // 2)

# One row per sparse feature, one column per sample
feature_registry = np.zeros((config["dimensions"] * EXPANSION_FACTOR, n))

for i in range(n):
    embedding = mini_pile_dataset.embeddings[i]
    # The second element of the forward output is taken as the feature activations
    feature_activations = model.forward(embedding)[1]
    # .cpu() is a no-op on CPU tensors and keeps .numpy() working on accelerators
    feature_registry[:, i] = feature_activations.detach().cpu().numpy()

for index, feature in enumerate(feature_registry):
    # Pair every sentence with this feature's activation, strongest first
    feature_samples = [
        FeatureSample(text=mini_pile_dataset.sentences[sample_idx], act=activation)
        for sample_idx, activation in enumerate(feature)
    ]
    feature_samples.sort(key=lambda sample: sample.act, reverse=True)

    high_act_samples = feature_samples[:group_size]
    # Explicit start index: a plain [-group_size:] would return the whole list
    # when group_size is 0.
    low_act_samples = feature_samples[len(feature_samples) - group_size:]

    interpretation = ai.get_interpretation(high_act_samples, low_act_samples)
    label = interpretation["label"]
    reasoning = interpretation["reasoning"]
    attributes = interpretation["attributes"]

    # Score the proposed attributes on both groups; the gap between the two
    # scores is stored as the confidence below.
    high_act_score = ai.score_interpretation(high_act_samples, attributes)['percent']
    low_act_score = ai.score_interpretation(low_act_samples, attributes)['percent']

    labelled_feature = Feature(
        index=index,
        label=label,
        attributes=attributes,
        reasoning=reasoning,
        confidence=abs(high_act_score - low_act_score),
        # Fraction of the samples on which this feature is non-zero
        density=(np.count_nonzero(feature) / len(feature)),
        high_act_samples=high_act_samples,
        low_act_samples=low_act_samples,
    )

    # write this feature
    with open(os.path.join(folder_name, f"feature_{index}.json"), "w") as json_file:
        json.dump(labelled_feature.dict(), json_file, indent=4)

    # print processed feature
    print(f"Processed feature {index}: {label}")

Processed feature 0: Detailed expository content
Processed feature 1: Diverse and broad topics discussed
Processed feature 2: Specific references and detailed explanations
Processed feature 3: Example-driven specific narratives
Processed feature 4: Context and purpose of writing
Processed feature 5: Diverse, multi-themed content
Processed feature 6: Mentions real-world products or entities


KeyboardInterrupt: 