Hunting for Polysemantic Neurons

# Introduction & loading content

In [None]:
#This notebook is the expoloration for neurons that encode for multiple features. Based on work done by Neel Nanda, & Anthropic

#Code in this notebook was authored by Neel Nanda, and edited for my own puposes.
#Source:

 "With Artifical
 
 This is a colab notebook dedicated to expoloring either intersting neurons, and attempting to find polysemantic neurons.

In [23]:
#upgrading the version of numpy in order for visualization code
!pip install --upgrade numpy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [1]:
#mounting my local drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Imports

In [2]:
#Checking for Google Colab
#Neel Nanda code

import os
import numpy as np
from IPython.core.display import HTML
from IPython.display import HTML

try:
    import google.colab

    IN_COLAB = True
    print("Running as a Colab notebook")

except:
    IN_COLAB = False
    print("Running as a Jupyter notebook - intended for development only!")
    from IPython import get_ipython

    ipython = get_ipython()
    # Code to automatically update the HookedTransformer code as its edited without restarting the kernel
    ipython.magic("load_ext autoreload")
    ipython.magic("autoreload 2")

Running as a Colab notebook


In [3]:
#Neel Nanda code
#Sets up transformers github file
if IN_COLAB:
    os.system("pip install git+https://github.com/neelnanda-io/TransformerLens.git")
    os.system("pip install gradio")

import gradio as gr
from transformer_lens import HookedTransformer
from transformer_lens.utils import to_numpy


Setting up GPT-2

In [4]:
#Bringing in GPT2 smol version, and setting up the custom hooks
model_name = "solu-4l"
model = HookedTransformer.from_pretrained(model_name)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.27k [00:00<?, ?B/s]

Downloading model_final.pth:   0%|          | 0.00/255M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/51.0 [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.04M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/81.0 [00:00<?, ?B/s]

Loaded pretrained model solu-4l into HookedTransformer


Neuron activation function definition

In [5]:
#Neel Nanda
#Function definition to extract neuron activations
def get_neuron_acts(text, layer, neuron_index):
    # Hacky way to get out state from a single hook - we have a single element list and edit that list within the hook.
    cache = {}

    def caching_hook(act, hook):
        cache["activation"] = act[0, :, neuron_index]

    model.run_with_hooks(
        text, fwd_hooks=[(f"blocks.{layer}.mlp.hook_post", caching_hook)]
    )
    return to_numpy(cache["activation"])

In [None]:
#General
#My goal is to find polysemantic neurons, from the GPT2 smol model.
#In general, I want to  figure out what is the search space of my problem
# How can I measure whether 2 neurons are polysemantic?
# Are there different ways to performs this measurement?

#Ok so the thoery could be to pass multiple texts through the netwrok to see what gets activated.
# Do certain neurons fire together when specific ideas/topics/words who up under different circumstances?

#that neurons are "polysemantic", responding to several unrelated features.
#Instead it's this, its the idea that a single neuron accounts of differenet unrelated pieces of information.
# I mean this is extremely similar to how the brain works. Memories are not all kept in the same place.
#Thinking on this further, Ai researchers are truly on the path to create immortal entities that are able to shift to a new 
# computational piece of silicone once they notice a certain level of degradation.




Define Visualization

In [6]:
# This is some CSS (tells us what style )to give each token a thin gray border, to make it easy to see token separation
style_string = """<style> 
    span.token {
        border: 1px solid rgb(123, 123, 123)
        } 
    </style>"""


def calculate_color(val, max_val, min_val):
    # Hacky code that takes in a value val in range [min_val, max_val], normalizes it to [0, 1] and returns a color which interpolates between slightly off-white and red (0 = white, 1 = red)
    # We return a string of the form "rgb(240, 240, 240)" which is a color CSS knows
    normalized_val = (val - min_val) / max_val
    return f"rgb(240, {240*(1-normalized_val)}, {240*(1-normalized_val)})"

import numpy as np

def basic_neuron_vis(text, layer, neuron_index, max_val=None, min_val=None):
    """
    text: The text to visualize
    layer: The layer index
    neuron_index: The neuron index
    max_val: The top end of our activation range, defaults to the maximum activation
    min_val: The top end of our activation range, defaults to the minimum activation

    Returns a string of HTML that displays the text with each token colored according to its activation

    Note: It's useful to be able to input a fixed max_val and min_val, because otherwise the colors will change as you edit the text, which is annoying.
    """
    if layer is None:
        return "Please select a Layer"
    if neuron_index is None:
        return "Please select a Neuron"
    acts = get_neuron_acts(text, layer, neuron_index)
    act_max = acts.max()
    act_min = acts.min()
    # Defaults to the max and min of the activations
    if max_val is None:
        max_val = act_max
    if min_val is None:
        min_val = act_min
    # We want to make a list of HTML strings to concatenate into our final HTML string
    # We first add the style to make each token element have a nice border
    htmls = [style_string]
    # We then add some text to tell us what layer and neuron we're looking at - we're just dealing with strings and can use f-strings as normal
    # h4 means "small heading"
    htmls.append(f"<h4>Layer: <b>{layer}</b>. Neuron Index: <b>{neuron_index}</b></h4>")
    # We then add a line telling us the limits of our range
    htmls.append(
        f"<h4>Max Range: <b>{max_val:.4f}</b>. Min Range: <b>{min_val:.4f}</b></h4>"
    )
    # If we added a custom range, print a line telling us the range of our activations too.
    if act_max != max_val or act_min != min_val:
        htmls.append(
            f"<h4>Custom Range Set. Max Act: <b>{act_max:.4f}</b>. Min Act: <b>{act_min:.4f}</b></h4>"
        )
    # Convert the text to a list of tokens
    str_tokens = model.to_str_tokens(text)
    for tok, act in zip(str_tokens, acts):
        # A span is an HTML element that lets us style a part of a string (and remains on the same line by default)
        # We set the background color of the span to be the color we calculated from the activation
        # We set the contents of the span to be the token
        htmls.append(
            f"<span class='token' style='background-color:{calculate_color(act, max_val, min_val)}' >{tok}</span>"
        )

    return "".join(htmls)





Now the next phase is going to be printing out and just finding the polysemantic neurons.


what sort of text dp I need for activations

In [None]:
#Showcasing the contents of the model
model

HookedTransformer(
  (embed): Embed()
  (hook_embed): HookPoint()
  (pos_embed): PosEmbed()
  (hook_pos_embed): HookPoint()
  (blocks): ModuleList(
    (0-3): 4 x TransformerBlock(
      (ln1): LayerNormPre(
        (hook_scale): HookPoint()
        (hook_normalized): HookPoint()
      )
      (ln2): LayerNormPre(
        (hook_scale): HookPoint()
        (hook_normalized): HookPoint()
      )
      (attn): Attention(
        (hook_k): HookPoint()
        (hook_q): HookPoint()
        (hook_v): HookPoint()
        (hook_z): HookPoint()
        (hook_attn_scores): HookPoint()
        (hook_pattern): HookPoint()
        (hook_result): HookPoint()
      )
      (mlp): MLP(
        (hook_pre): HookPoint()
        (hook_post): HookPoint()
        (hook_mid): HookPoint()
        (ln): LayerNormPre(
          (hook_scale): HookPoint()
          (hook_normalized): HookPoint()
        )
      )
      (hook_q_input): HookPoint()
      (hook_k_input): HookPoint()
      (hook_v_input): HookPoint()

# Initialize input text

In [None]:
#Defining my input word features

#These are 9 examples that are meant to be quite different  in subject matter to be interpreted as different text by the neural network. 
#The 9 features are set up in such a way to generate multiple examples, in an attempt to truly measure if the SoLU netis activating on multiiple examples of the same subject.
#These texts below were generated with May 6th version of GPT4. I initially elected to only use GPT3.5, however the sentences didn't have as much complexity in each 
# exaample.

cooking_examples = [
    "Cooking is a creative process that involves preparing food by combining, mixing, and heating ingredients to create delicious and nutritious meals.",
    "Baking is a popular method of cooking that uses dry heat in an oven to prepare cakes, bread, pastries, and other desserts.",
    "Sous vide is a cooking technique where vacuum-sealed food is submerged in a temperature-controlled water bath, resulting in even and precise cooking.",
    "Grilling is a high-heat method of cooking where food is cooked over an open flame, giving it a delicious smoky flavor and caramelized crust.",
    "Sauteing is a quick cooking method where food is cooked in a small amount of fat over high heat, allowing for browning and developing flavors."
]

nba_examples = [
    "The NBA is a professional basketball league in the United States, consisting of 30 teams and widely considered the premier men's basketball league globally.",
    "LeBron James is an NBA superstar who has played for the Cleveland Cavaliers, Miami Heat, and Los Angeles Lakers, winning multiple championships.",
    "The NBA All-Star Game is an annual exhibition match where fans vote for their favorite players to compete in a friendly game showcasing their skills.",
    "The NBA Draft is an annual event where teams select eligible players to join their rosters, often selecting young talent from college basketball programs.",
    "The NBA playoffs are a series of elimination rounds that determine the champion of the league, culminating in the NBA Finals between the conference winners."
]

sailing_examples = [
    "Sailing is a recreational and competitive sport that involves navigating a boat using the wind to propel it across the water.",
    "Regattas are organized sailing events that consist of multiple races, often featuring various boat classes and attracting sailors of different skill levels.",
    "Yachting is a form of sailing that involves recreational cruising or racing on larger, more luxurious boats, often in prestigious events.",
    "The America's Cup is the oldest international sailing competition, featuring high-tech, cutting-edge boats competing in a series of races for the coveted trophy.",
    "Sailboats come in various sizes and designs, from small dinghies to large, multi-hulled catamarans, each suited to specific sailing conditions and purposes."
]

travel_examples = [
    "Travel allows people to experience new cultures, meet new people, and gain a broader perspective on the world by exploring different destinations.",
    "Eco-tourism focuses on responsible travel to natural areas, aiming to conserve the environment, promote sustainable practices, and support local communities.",
    "Adventure travel is a type of tourism that involves exploration and physical challenges, such as hiking, mountain climbing, or whitewater rafting.",
    "Cultural travel is the exploration of a destination's history, art, architecture, and traditions, immersing oneself in the local customs and heritage.",
    "Culinary travel is a popular trend where travelers seek out unique food experiences, exploring local cuisine and learning about regional culinary traditions."
]

technology_examples = [
    "Technology has revolutionized our daily lives, enabling instant communication, access to information, and increased productivity through various digital tools.",
    "Artificial intelligence is a branch of computer science that seeks to create machines capable of learning, reasoning, and problem-solving like humans.",
    "Virtual reality is an immersive technology that allows users to experience computer-generated environments and interact with digital objects in 3D space.",
    "The Internet of Things (IoT) refers to the network of interconnected devices that communicate and exchange data, enabling smarter, more efficient systems.",
    "Blockchain technology is a decentralized, digital ledger that records transactions across multiple computers, providing increased"]

python_programming_examples = [
    "Python is a high-level, interpreted programming language known for its simplicity, readability, and versatility in various application domains.",
    "The Python Package Index (PyPI) is a repository of software packages for Python, allowing users to install and manage libraries for their projects.",
    "Python's extensive ecosystem of libraries, such as NumPy for numerical computing or TensorFlow for machine learning, makes it a popular choice among developers.",
    "Django and Flask are two popular web frameworks for building web applications using Python, each offering a different approach to web development.",
    "Jupyter Notebook is an interactive computing environment that allows users to create and share live code, equations, visualizations, and narrative text using Python and other languages."
]

python_animal_examples = [
    "The python is a family of nonvenomous snakes found in Africa, Asia, and Australia, characterized by their large size and constricting behavior.",
    "Ball pythons are a popular species of python in the pet trade, known for their docile nature, manageable size, and variety of color morphs.",
    "Python's unique method of hunting involves wrapping around its prey, constricting it until it suffocates, and then swallowing it whole.",
    "Green tree pythons are known for their striking appearance, with bright green scales and a prehensile tail that helps them navigate their arboreal habitat.",
    "Reticulated pythons are among the longest snake species in the world, with some individuals reaching lengths of over 20 feet."
]

mobile_nintendo_games_examples = [
    "Nintendo has expanded its presence in the mobile gaming market with popular titles such as Mario Kart Tour, Pokémon GO, and Animal Crossing: Pocket Camp.",
    "Super Mario Run is a side-scrolling, auto-runner mobile game developed by Nintendo, bringing the iconic Mario gameplay to smartphones.",
    "Fire Emblem Heroes is a mobile strategy RPG game by Nintendo, featuring characters from the Fire Emblem series in a free-to-play format.",
    "Dragalia Lost is an action RPG mobile game by Nintendo and Cygames, with players controlling a variety of characters as they explore a fantasy world.",
    "Pokémon Masters EX is a mobile game by Nintendo and DeNA, featuring team battles with famous trainers from the Pokémon series in a real-time strategy format."
]

math_examples = [
    "The square of a number x can be represented as x^2, which is the result of multiplying the number by itself. For example, the square of 3 is 3^2 = 3*3 = 9.",
    "To find the difference between two numbers, you can subtract the smaller number from the larger one. For instance, the difference between 7 and 4 is 7 - 4 = 3.",
    "The sum of two numbers can be found by adding them together. For example, the sum of 5 and 2 is 5 + 2 = 7.",
    "Dividing a number by another gives the quotient. For example, dividing 15 by 3 results in a quotient of 5, represented as 15 / 3 = 5.",
    "To calculate the cube of a number x, you can multiply the number by itself twice, represented as x^3. For instance, the cube of 2 is 2^3 = 2*2*2 = 8."
]




# Running Solu4

In [None]:

def find_top_neurons(text_list, top_n=50):
    #changing this to be a minimum val
    layer = [0, 1,2,3]
    neuron_indices = range(2048)

    # Store activations and neuron indices as tuples
    neuron_activations = {layer_num: {neuron_index: [] for neuron_index in neuron_indices} for layer_num in layer}

    for text in text_list:
        print(" On text ",text)
        for layer_num in layer:
            #print("On layer",layer_num)
            for neuron_index in neuron_indices:
                acts = get_neuron_acts(text, layer_num, neuron_index)
                activation_value = acts.mean()
                neuron_activations[layer_num][neuron_index].append(activation_value)

    # Calculate the mean activations for each neuron across all texts
    mean_activations = []
    for layer_num in layer:
        for neuron_index in neuron_indices:
            mean_activation = sum(neuron_activations[layer_num][neuron_index]) / len(text_list)
            mean_activations.append((layer_num, neuron_index, mean_activation))

    # Sort neurons based on mean activations
    sorted_neurons = sorted(mean_activations, key=lambda x: x[2], reverse=True)

    # Return Top N neurons
    return sorted_neurons[:top_n]



The hypothesis is that if I can run multiple texts with different levels of information(features), are there specific neurons that only light up for only 2 features? It could be that the neurons found indeed only encode for those two feautres such as knowledge on cooking and programming in Python. The assumption here is that the generated input text above has enough separability  to be considred features. When reading through Anthropic's Superposition work[1], they described the features of their dataset used, however I elected to have input text that is human readable, to build an intuitive sense of the work.

In [None]:


#Runing this for each of the input texts
# Runing this for each of the input texts
top_500_neurons_cook = find_top_neurons(cooking_examples)
print("top_500_neurons_cook:", top_500_neurons_cook[:4])

top_500_neurons_nba = find_top_neurons(nba_examples)
print("top_500_neurons_nba:", top_500_neurons_nba[:4])

top_500_neurons_sail = find_top_neurons(sailing_examples)
print("top_500_neurons_sail:", top_500_neurons_sail[:4])

top_500_neurons_travel = find_top_neurons(travel_examples)
print("top_500_neurons_travel:", top_500_neurons_travel[:4])

top_500_neurons_tech = find_top_neurons(technology_examples)
print("top_500_neurons_tech:", top_500_neurons_tech[:4])

top_500_neurons_python_pr = find_top_neurons(python_programming_examples)
print("top_500_neurons_python_pr:", top_500_neurons_python_pr[:4])

top_500_neurons_python_an = find_top_neurons(python_animal_examples)
print("top_500_neurons_python_an:", top_500_neurons_python_an[:4])

top_500_neurons_nintendo = find_top_neurons(mobile_nintendo_games_examples)
print("top_500_neurons_nintendo:", top_500_neurons_nintendo[:4])

top_500_neurons_math = find_top_neurons(math_examples)
print("top_500_neurons_nintendo:", top_500_neurons_math[:4])


#setting up max values
default_max_val = 5.0
default_min_val  = 0.0
text = cooking_examples[0]


print("Top 10 Neurons:")
for layer_sort, neuron_index, activation_value in top_500_neurons_cook[:10]:

    print(f"Neuron {neuron_index} with activation: {activation_value}")
    html_string = basic_neuron_vis(text, layer_sort, neuron_index, max_val=default_max_val, min_val=default_min_val)
    display(HTML(html_string))


 On text  Cooking is a creative process that involves preparing food by combining, mixing, and heating ingredients to create delicious and nutritious meals.
 On text  Baking is a popular method of cooking that uses dry heat in an oven to prepare cakes, bread, pastries, and other desserts.
 On text  Sous vide is a cooking technique where vacuum-sealed food is submerged in a temperature-controlled water bath, resulting in even and precise cooking.
 On text  Grilling is a high-heat method of cooking where food is cooked over an open flame, giving it a delicious smoky flavor and caramelized crust.
 On text  Sauteing is a quick cooking method where food is cooked in a small amount of fat over high heat, allowing for browning and developing flavors.
top_500_neurons_cook: [(3, 1297, 2.645723909139633), (3, 1665, 1.5170585632324218), (0, 1122, 1.4978406190872193), (1, 1369, 1.4520554065704345)]
 On text  The NBA is a professional basketball league in the United States, consisting of 30 teams a

Neuron 1665 with activation: 1.5170585632324218


Neuron 1122 with activation: 1.4978406190872193


Neuron 1369 with activation: 1.4520554065704345


Neuron 1115 with activation: 1.4488437294960022


Neuron 1680 with activation: 1.396847176551819


Neuron 1929 with activation: 1.28018097281456


Neuron 1620 with activation: 1.2527865409851073


Neuron 488 with activation: 1.2129051566123963


Neuron 1476 with activation: 1.0777533292770385


# Saving & Loading output data

In [7]:
!ls
%cd drive/MyDrive/Colab Notebooks/Mechanistic Interpretability/Polysemantic-Neurons

drive  sample_data
/content/drive/MyDrive/Colab Notebooks/Mechanistic Interpretability/Polysemantic-Neurons


In [None]:
import pandas as pd

def save_to_csv(data, filename):
    df = pd.DataFrame(data)
    df.to_csv(filename, index=False)

# Run this for each of the input texts
save_to_csv(top_500_neurons_cook, "top_500_neurons_cook.csv")
save_to_csv(top_500_neurons_nba, "top_500_neurons_nba.csv")
save_to_csv(top_500_neurons_sail, "top_500_neurons_sail.csv")
save_to_csv(top_500_neurons_travel, "top_500_neurons_travel.csv")
save_to_csv(top_500_neurons_tech, "top_500_neurons_tech.csv")
save_to_csv(top_500_neurons_python_pr, "top_500_neurons_python_pr.csv")
save_to_csv(top_500_neurons_python_an, "top_500_neurons_python_an.csv")
save_to_csv(top_500_neurons_nintendo, "top_500_neurons_nintendo.csv")
save_to_csv(top_500_neurons_math, "top_500_neurons_math.csv")


In [8]:
import pandas as pd

def load_csv(filename, feature_type):
    df = pd.read_csv(filename)
    df["feature_type"] = feature_type
    return df

# Load the datasets and add a column for the feature type
cook_df = load_csv("top_500_neurons_cook.csv", "cook")
nba_df = load_csv("top_500_neurons_nba.csv", "nba")
sail_df = load_csv("top_500_neurons_sail.csv", "sail")
travel_df = load_csv("top_500_neurons_travel.csv", "travel")
tech_df = load_csv("top_500_neurons_tech.csv", "tech")
python_pr_df = load_csv("top_500_neurons_python_pr.csv", "python_pr")
python_an_df = load_csv("top_500_neurons_python_an.csv", "python_an")
nintendo_df = load_csv("top_500_neurons_nintendo.csv", "nintendo")
math_df = load_csv("top_500_neurons_math.csv", "math")

# Combine all datasets into one DataFrame
all_neurons_df = pd.concat([cook_df, nba_df, sail_df, travel_df, tech_df, python_pr_df, python_an_df, nintendo_df, math_df], ignore_index=True)



# Experiments & Results

In [9]:
all_neurons_df.rename(columns={"0": "layer", "1": "neuron_index", "2": "activation"},inplace = True)
all_neurons_df

Unnamed: 0,layer,neuron_index,activation,feature_type
0,3,1297,2.645724,cook
1,3,1665,1.517059,cook
2,0,1122,1.497841,cook
3,1,1369,1.452055,cook
4,3,1115,1.448844,cook
...,...,...,...,...
445,3,679,0.540088,math
446,3,1772,0.523010,math
447,3,272,0.518179,math
448,3,1246,0.513699,math


After running all the differet examples of text through the SolU netowrk, it is now time to run different experiments to visualize the neurons that have arisen as potentially having polysemantic meaning. Even if 

In [14]:
def find_neurons_with_n_occurrences(df, n):
    neuron_counts = df.groupby(["layer","neuron_index"])["feature_type"].nunique()
    return neuron_counts[neuron_counts == n]

# Find neurons that show up in exactly 2 feature types
neurons_in_two_feature_types = find_neurons_with_n_occurrences(all_neurons_df, 2)
#print("Neurons in exactly 2 feature types:\n", neurons_in_two_feature_types)

# Find neurons that show up in all feature types
total_feature_types = len(all_neurons_df["feature_type"].unique())
neurons_in_all_feature_types = find_neurons_with_n_occurrences(all_neurons_df, total_feature_types)

#print("Neurons in all feature types:\n", neurons_in_all_feature_types)


In [None]:
all_neurons_df.groupby()

Unnamed: 0,layer,neuron_index,activation,feature_type
0,3,1297,2.645724,cook
1,3,1665,1.517059,cook
2,0,1122,1.497841,cook
3,1,1369,1.452055,cook
4,3,1115,1.448844,cook
...,...,...,...,...
445,3,679,0.540088,math
446,3,1772,0.523010,math
447,3,272,0.518179,math
448,3,1246,0.513699,math


In [None]:
all_neurons_3l = all_neurons_df[all_neurons_df['layer'] == 3]
all_neurons_3l[all_neurons_3l['neuron_index'].isin([450])]


Unnamed: 0,layer,neuron_index,activation,feature_type
328,3,450,0.441501,python_an
426,3,450,0.790655,math


In [None]:
all_neurons_3l = all_neurons_df[all_neurons_df['layer'] == 3]
all_neurons_3l[all_neurons_3l['neuron_index'].isin([65])]

Unnamed: 0,layer,neuron_index,activation,feature_type
267,3,65,0.834518,python_pr
409,3,65,1.100401,math


In [None]:
all_neurons_3l = all_neurons_df[all_neurons_df['layer'] == 3]
all_neurons_3l[all_neurons_3l['neuron_index'].isin([577])]

#write out the staistics of the total number of neurons that have poly semantics v.s number of all neurons
#what are the activations of these neurons on text especially the one in all 9

Unnamed: 0,layer,neuron_index,activation,feature_type
225,3,577,0.51617,tech
368,3,577,0.639124,nintendo


In [10]:
#pd.DataFrame(neurons_in_two_feature_types)


In [28]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

def visualize_activations(activations, layer_shape):
  
    for layer_num in range(len(layer_shape)):
        #layer_size = layer_shape[layer_num]
        layer_size = 2048

        # Extract activations for the current layer
        layer_activations = [act for act in activations if act[0] == layer_num]

        # Initialize an array of zeros for the layer
        #normalized_activations = np.zeros(layer_size)
        normalized_activations = np.zeros(2048)

        # Update the normalized_activations array with the actual activation values
        for act in layer_activations:
            normalized_activations[act[1]] = act[2]

        # Reshape the activations to a 2D array for the heatmap
        side = int(np.sqrt(layer_size))
        heatmap_data = normalized_activations.reshape((side, side))

        # Plot the heatmap
        plt.figure(figsize=(10, 10))
        sns.heatmap(heatmap_data, cmap="coolwarm", square=True, cbar_kws={"shrink": 0.8})
        plt.title(f"Layer {layer_num} Activations")
        plt.show()


In [32]:
import pandas as pd
import numpy as np

def generate_complete_dataframe(layers, neurons_per_layer):
    data = []
    for layer in range(layers):
        for neuron in range(neurons_per_layer):
            data.append([layer, neuron, 0, None])

    df_complete = pd.DataFrame(data, columns=["layer", "neuron_index", "activation", "feature_type"])
    return df_complete



# Generate a complete dataframe
layers = 4
neurons_per_layer = 2048
df_complete = generate_complete_dataframe(layers, neurons_per_layer)

# Merge the two dataframes
df_merged = df_complete.merge(all_neurons_df, how="left", on=["layer", "neuron_index"], suffixes=("_x", "")).drop("activation_x", axis=1)

# Fill missing feature_type with "None"
df_merged["feature_type"].fillna(value="None", inplace=True)

print(df_merged)


      layer  neuron_index feature_type_x  activation feature_type
0         0             0           None         NaN         None
1         0             1           None         NaN         None
2         0             2           None         NaN         None
3         0             3           None         NaN         None
4         0             4           None         NaN         None
...     ...           ...            ...         ...          ...
8409      3          2043           None         NaN         None
8410      3          2044           None         NaN         None
8411      3          2045           None         NaN         None
8412      3          2046           None         NaN         None
8413      3          2047           None         NaN         None

[8414 rows x 5 columns]


In [31]:
all_neurons_df

Unnamed: 0,layer,neuron_index,activation,feature_type
0,3,1297,2.645724,cook
1,3,1665,1.517059,cook
2,0,1122,1.497841,cook
3,1,1369,1.452055,cook
4,3,1115,1.448844,cook
...,...,...,...,...
445,3,679,0.540088,math
446,3,1772,0.523010,math
447,3,272,0.518179,math
448,3,1246,0.513699,math


In [35]:
import pandas as pd

df_test = all_neurons_df.copy()

# create a new dataframe with all possible neuron indices for each layer
layer_list = [i for i in range(4)]
neuron_index_list = [i for i in range(2048)]

index = pd.MultiIndex.from_product([layer_list, neuron_index_list], names=['layer', 'neuron_index'])
new_df = pd.DataFrame(index=index).reset_index()

# merge the new dataframe with the original dataframe using a left join
merged_df = pd.merge(new_df, df_test, on=['layer', 'neuron_index'], how='left')

# replace missing activation values with zero
merged_df['activation'] = merged_df['activation'].fillna(0)

#merged_df


In [20]:
#making a new dataframe to keep the original and visualize the new one

visualize_df = all_neurons_df.copy()

visualize_df['neuron_data'] = visualize_df.apply(lambda row: (row['layer'], row['neuron_index'], row['activation']), axis=1)

# Drop the original columns
visualize_df = visualize_df.drop(columns=['layer', 'neuron_index', 'activation'])

print(visualize_df.head())


  feature_type                    neuron_data
0         cook   (3, 1297, 2.645723909139633)
1         cook  (3, 1665, 1.5170585632324218)
2         cook   (0, 1122, 1.497840619087219)
3         cook  (1, 1369, 1.4520554065704343)
4         cook  (3, 1115, 1.4488437294960022)


In [36]:
#moving data to a specific value to pass to visualization
#result = your_function(visualize_df['neuron_data'].tolist())
layer_shape = [(0, 2048), (1, 2048), (2, 2048), (3, 2048)]

#visual_activations_tuple = visualize_df['neuron_data'].tolist()

#visualize_activations(visual_activations_tuple, layer_shape)


In [None]:



#visualize_activations(activations, layer_shape)


In [None]:
#for the solu code I found that there was always the neuron of 211 in layer 0 being activated, so I'm adding in a new example and changing the model to add in more parameters. In order to find more diverse neurons.

In [None]:
#The END