Hunting for Polysemantic Neurons

In [None]:
#This notebook is an exploration of neurons that encode multiple features. It is based on work done by Neel Nanda & Anthropic.

#Code in this notebook was authored by Neel Nanda and edited for my own purposes.
#Source:

In [1]:
# Mount Google Drive when running on Colab. The import is guarded so that the
# notebook still runs top-to-bottom in a local Jupyter kernel, where the
# google.colab package does not exist.
try:
    from google.colab import drive
except ImportError:
    print("google.colab not available - skipping Google Drive mount")
else:
    drive.mount('/content/drive')

Mounted at /content/drive


Imports

In [None]:
#Checking for Google Colab
#Neel Nanda code

import os

try:
    # Imported only to detect the Colab runtime; the module itself is not used here.
    import google.colab  # noqa: F401
except ImportError:
    # Only a missing package means "not on Colab"; any other error should surface
    # instead of being silently treated as a local run.
    IN_COLAB = False
    print("Running as a Jupyter notebook - intended for development only!")
    from IPython import get_ipython

    ipython = get_ipython()
    # get_ipython() returns None when there is no active IPython shell,
    # in which case there are no magics to run.
    if ipython is not None:
        # Automatically pick up edits to the HookedTransformer code without restarting the kernel.
        # run_line_magic replaces the deprecated ipython.magic("...") call.
        ipython.run_line_magic("load_ext", "autoreload")
        ipython.run_line_magic("autoreload", "2")
else:
    IN_COLAB = True
    print("Running as a Colab notebook")

Running as a Colab notebook


In [None]:
#Neel Nanda code
#Installs TransformerLens (from GitHub) and gradio when running on Colab
import os
import subprocess
import sys

if IN_COLAB:
    # Run pip through the interpreter that backs this kernel, and use check_call
    # so that a failed install raises CalledProcessError here rather than
    # surfacing later as a confusing ImportError.
    for requirement in (
        "git+https://github.com/neelnanda-io/TransformerLens.git",
        "gradio",
    ):
        subprocess.check_call([sys.executable, "-m", "pip", "install", requirement])

In [None]:
import gradio as gr
from transformer_lens import HookedTransformer
from transformer_lens.utils import to_numpy
from IPython.display import HTML

Setting up the model (solu-2l)

In [None]:
# Load the pretrained checkpoint named "solu-2l" through HookedTransformer.
# NOTE(review): the original note called this "GPT2 small", but the name passed
# to from_pretrained is "solu-2l" - confirm which model is intended.
model_name = "solu-2l"
model = HookedTransformer.from_pretrained(model_name)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.27k [00:00<?, ?B/s]

Downloading model_final.pth:   0%|          | 0.00/227M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/51.0 [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.04M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/81.0 [00:00<?, ?B/s]

Loaded pretrained model solu-2l into HookedTransformer


Neuron activation function definition

In [None]:
#Neel Nanda
#Function definition to extract neuron activations
def get_neuron_acts(text, layer, neuron_index):
    """
    Run `text` through the global `model` and return one neuron's activations.

    text: The input passed to model.run_with_hooks
    layer: The block index used to build the hook name "blocks.{layer}.mlp.hook_post"
    neuron_index: The index selected on the last axis of the hooked activation

    Returns the captured slice converted with to_numpy.
    """
    hook_name = f"blocks.{layer}.mlp.hook_post"
    # A forward hook cannot return data to its caller, so the hook writes into
    # this dict from the enclosing scope and we read it back after the run.
    captured = {}

    def store_activation(act, hook):
        # Keeps element 0 of the first axis and every entry of the second axis.
        # assumes act is (batch, position, neuron) - TODO confirm
        captured["activation"] = act[0, :, neuron_index]

    model.run_with_hooks(text, fwd_hooks=[(hook_name, store_activation)])
    return to_numpy(captured["activation"])

In [None]:
#General
#My goal is to find polysemantic neurons in the small model loaded above (solu-2l).
#In general, I want to figure out what the search space of my problem is.
# How can I measure whether two neurons are polysemantic?
# Are there different ways to perform this measurement?

#OK, so the theory could be to pass multiple texts through the network to see what gets activated.
# Do certain neurons fire together when specific ideas/topics/words show up under different circumstances?

#The claim is that neurons are "polysemantic", responding to several unrelated features.
#In other words, it's the idea that a single neuron accounts for different, unrelated pieces of information.
# I mean, this is extremely similar to how the brain works. Memories are not all kept in the same place.
#Thinking on this further, AI researchers are truly on the path to creating immortal entities that are able to shift to a new
# computational piece of silicon once they notice a certain level of degradation.




Define Visualization

In [None]:
# CSS that gives every <span class="token"> a thin gray border, which makes the
# separation between tokens easy to see. basic_neuron_vis puts this string first
# in the HTML it builds.
style_string = """<style> 
    span.token {
        border: 1px solid rgb(123, 123, 123)
        } 
    </style>"""


def calculate_color(val, max_val, min_val):
    """
    Map a value onto a CSS color interpolated between off-white and red.

    val: The value to color
    max_val: The value that maps to full red
    min_val: The value that maps to off-white

    Returns a string of the form "rgb(240, 120.0, 120.0)", which is a color CSS knows.
    Values outside [min_val, max_val] are clamped to the nearest end of the range.
    """
    value_range = max_val - min_val
    if value_range == 0:
        # Degenerate range (e.g. every activation is identical): there is nothing
        # to interpolate, so fall back to the off-white end instead of dividing by zero.
        normalized_val = 0.0
    else:
        # Divide by the width of the range, not by max_val, so that min_val maps
        # to 0 and max_val maps to 1 even when min_val is non-zero.
        normalized_val = (val - min_val) / value_range
    # Clamp so out-of-range values (such as negative activations with a fixed
    # [0, 1] range) cannot produce color channels outside [0, 240].
    normalized_val = min(max(normalized_val, 0.0), 1.0)
    channel = 240 * (1 - normalized_val)
    return f"rgb(240, {channel}, {channel})"


def basic_neuron_vis(text, layer, neuron_index, max_val=None, min_val=None):
    """
    text: The text to visualize
    layer: The layer index
    neuron_index: The neuron index
    max_val: The top end of our activation range, defaults to the maximum activation
    min_val: The bottom end of our activation range, defaults to the minimum activation

    Returns a string of HTML that displays the text with each token colored according to its activation.
    If layer or neuron_index is None, returns a short prompt string asking for the missing selection instead.

    Note: It's useful to be able to input a fixed max_val and min_val, because otherwise the colors will change as you edit the text, which is annoying.
    """
    # Local import so this function does not depend on another cell's imports.
    import html

    if layer is None:
        return "Please select a Layer"
    if neuron_index is None:
        return "Please select a Neuron"
    acts = get_neuron_acts(text, layer, neuron_index)
    act_max = acts.max()
    act_min = acts.min()
    # Defaults to the max and min of the activations
    if max_val is None:
        max_val = act_max
    if min_val is None:
        min_val = act_min
    # We want to make a list of HTML strings to concatenate into our final HTML string
    # We first add the style to make each token element have a nice border
    htmls = [style_string]
    # We then add some text to tell us what layer and neuron we're looking at
    # h4 means "small heading"
    htmls.append(f"<h4>Layer: <b>{layer}</b>. Neuron Index: <b>{neuron_index}</b></h4>")
    # We then add a line telling us the limits of our range
    htmls.append(
        f"<h4>Max Range: <b>{max_val:.4f}</b>. Min Range: <b>{min_val:.4f}</b></h4>"
    )
    # If we added a custom range, print a line telling us the range of our activations too.
    if act_max != max_val or act_min != min_val:
        htmls.append(
            f"<h4>Custom Range Set. Max Act: <b>{act_max:.4f}</b>. Min Act: <b>{act_min:.4f}</b></h4>"
        )
    # Convert the text to a list of tokens
    str_tokens = model.to_str_tokens(text)
    for tok, act in zip(str_tokens, acts):
        # A span is an HTML element that lets us style a part of a string (and remains on the same line by default)
        # We set the background color of the span to be the color we calculated from the activation
        # The token text comes from arbitrary input, so escape it: a token such as
        # "<" or "&" would otherwise be parsed as markup and break the rendering.
        htmls.append(
            f"<span class='token' style='background-color:{calculate_color(act, max_val, min_val)}' >{html.escape(tok)}</span>"
        )

    return "".join(htmls)



Now the next phase is going to be printing out and just finding the polysemantic neurons.


What sort of text do I need for activations?

In [None]:
#Showcasing the contents of the model
model

HookedTransformer(
  (embed): Embed()
  (hook_embed): HookPoint()
  (pos_embed): PosEmbed()
  (hook_pos_embed): HookPoint()
  (blocks): ModuleList(
    (0-1): 2 x TransformerBlock(
      (ln1): LayerNormPre(
        (hook_scale): HookPoint()
        (hook_normalized): HookPoint()
      )
      (ln2): LayerNormPre(
        (hook_scale): HookPoint()
        (hook_normalized): HookPoint()
      )
      (attn): Attention(
        (hook_k): HookPoint()
        (hook_q): HookPoint()
        (hook_v): HookPoint()
        (hook_z): HookPoint()
        (hook_attn_scores): HookPoint()
        (hook_pattern): HookPoint()
        (hook_result): HookPoint()
      )
      (mlp): MLP(
        (hook_pre): HookPoint()
        (hook_post): HookPoint()
        (hook_mid): HookPoint()
        (ln): LayerNormPre(
          (hook_scale): HookPoint()
          (hook_normalized): HookPoint()
        )
      )
      (hook_q_input): HookPoint()
      (hook_k_input): HookPoint()
      (hook_v_input): HookPoint()

In [None]:


# Probe a single neuron on a sample sentence and print its per-token activations.
#I wonder if after this I can figure out how to measure the geometry of these neurons

text = "Python is a versatile programming language used in web development, data analysis, artificial intelligence, and more. Its simple syntax and readability make it a popular choice among developers."
layer, neuron_index = 1, 50

acts = get_neuron_acts(text, layer, neuron_index)
act_max, act_min = acts.max(), acts.min()

print("activations",acts)

# Fixed color range passed to the visualization instead of the per-text max/min.
default_max_val, default_min_val = 1.0, 0.0

# basic_neuron_vis returns a string of HTML
default_html_string = basic_neuron_vis(
    text,
    layer,
    neuron_index,
    max_val=default_max_val,
    min_val=default_min_val,
)

# Render the HTML string through IPython's rich display
print("Displayed HTML")
display(HTML(default_html_string))




activations [-0.07029674 -0.04883455 -0.09451117 -0.09369259 -0.08990312 -0.0694131
 -0.08867509 -0.0403737  -0.05920746 -0.07319549 -0.08000108  0.00927977
  0.39140078 -0.03373516 -0.00293379 -0.06236422 -0.04099499  0.0324665
  0.06825745  0.07162876  0.0296132  -0.08078578 -0.07620943 -0.03312619
 -0.02866317 -0.07535275 -0.05811774 -0.09841597 -0.11112942 -0.11778004
 -0.09367945 -0.08623971 -0.05233225 -0.0661969   0.00065289]


In [None]:
# Fixed color range passed to the visualization instead of the per-text max/min.
default_max_val, default_min_val = 1.0, 0.0

# Build the HTML string for the text / layer / neuron chosen in the previous cell.
default_html_string = basic_neuron_vis(
    text, layer, neuron_index, max_val=default_max_val, min_val=default_min_val
)

# Render the HTML string through IPython's rich display
print("Displayed HTML")
display(HTML(default_html_string))



Displayed HTML


In [None]:

#Why don't you run the code to figure out the maximum and minimum activations of all neurons in the network?



In [None]:
#The END