In [None]:
import torch
import numpy as np
from transformers import AutoModel

model = AutoModel.from_pretrained("bert-base-uncased")
model.eval()

# Load the token-id matrix explicitly. Relying on an `inputs` variable left
# over in the kernel makes this cell fail with NameError on a fresh kernel.
try:
    inputs = torch.load("symbolic_inputs.pt")
except FileNotFoundError as exc:
    raise FileNotFoundError(
        "symbolic_inputs.pt not found. "
        "Run the data generation step first."
    ) from exc

all_hidden_states = []

with torch.no_grad():
    for token_ids in inputs:
        batch = token_ids.unsqueeze(0)  # add a leading batch axis of size 1

        # Ask for the per-layer hidden states on this call.
        outputs = model(batch, output_hidden_states=True)

        # outputs.hidden_states is a tuple of tensors; stack them on a new
        # leading "layer" axis, then drop the size-1 batch axis.
        hidden = torch.stack(outputs.hidden_states, dim=0)[:, 0]

        all_hidden_states.append(hidden.cpu().numpy())

# One entry per input row, stacked on a new leading axis.
acts = np.stack(all_hidden_states)

np.save("transformer_hidden_states.npy", acts)
print("Saved transformer_hidden_states.npy")
print("Shape:", acts.shape)


In [None]:
import torch

# Create dummy symbolic inputs
# Shape: (num_inputs, T)
num_inputs = 200
T = 16
VOCAB_SIZE = 30522   # exclusive upper bound for sampled ids (valid BERT vocab range)
SEED = 0             # fixed so that re-running this cell regenerates the same file

# A dedicated, seeded generator makes the draw reproducible without touching
# the global torch RNG. Without it, re-running this cell after the hidden
# states were saved would leave symbolic_inputs.pt out of sync with them.
generator = torch.Generator().manual_seed(SEED)

inputs = torch.randint(
    low=0,
    high=VOCAB_SIZE,
    size=(num_inputs, T),
    dtype=torch.long,
    generator=generator,
)

torch.save(inputs, "symbolic_inputs.pt")

print("symbolic_inputs.pt created")
print("Shape:", inputs.shape)

In [None]:
import os
# Report whether the hidden-state dump appears among the entries of the
# current working directory.
hidden_states_present = "transformer_hidden_states.npy" in os.listdir(".")
print(hidden_states_present)

In [None]:
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Read the saved activations and remember their four axis sizes.
acts = np.load("transformer_hidden_states.npy")
num_inputs, num_layers, T, D = acts.shape

# Keep only the final entry on the layer axis.
acts = acts[:, -1]  # (num_inputs, T, D)

# Binary firing matrix: one row per hidden dimension ("neuron"), one column
# per (input, position) pair, 1.0 where |activation| exceeds the threshold.
FIRE_THRESHOLD = 1e-3
fired = np.abs(acts) > FIRE_THRESHOLD
neuron_vectors = np.ascontiguousarray(
    fired.reshape(-1, D).T, dtype=np.float32
)  # (D, num_inputs * T)

# Standardise the columns before clustering.
neuron_vectors = StandardScaler().fit_transform(neuron_vectors)

# Assign every neuron row to one of 8 KMeans clusters.
kmeans = KMeans(n_clusters=8, random_state=0, n_init=10)
clusters = kmeans.fit_predict(neuron_vectors)

# Persist the per-neuron cluster labels.
np.save("transformer_neuron_clusters.npy", clusters)
print("Saved transformer_neuron_clusters.npy")

In [None]:
# ============================================================
# Segment 1: Transformer Cluster Inspection (FINAL)
# ============================================================

import torch
import numpy as np
import os

# ============================================================
# CONFIG
# ============================================================
FIRE_THRESHOLD = 1e-3   # |activation| above this value counts as "firing"
LAYER_ID = -1           # index on the layer axis; -1 picks the last entry
CLUSTER_ID = 6          # label of the neuron cluster to inspect
TOP_K = 10              # number of highest-scoring inputs to print

# ============================================================
# SAFE TOKEN MAP (symbolic + fallback)
# ============================================================
# Token id -> printable symbol for ids 0..16: three variable names, five
# operator/relation signs, then the digits 1-9.
REVERSE_TOKEN_MAP = dict(
    enumerate(
        ["x", "y", "n", "+", "-", "*", "/", "="]
        + [str(digit) for digit in range(1, 10)]
    )
)

def decode_equation(token_tensor):
    """
    Render a sequence of token ids as a space-separated string.

    Args:
        token_tensor: object exposing ``.tolist()`` that yields token ids.

    Returns:
        str: one symbol per id, joined by single spaces. Ids present in
        REVERSE_TOKEN_MAP use their symbol; any other id is shown as
        ``<id>``.
    """
    return " ".join(
        REVERSE_TOKEN_MAP.get(token_id, f"<{token_id}>")
        for token_id in token_tensor.tolist()
    )

# ============================================================
# SAFETY CHECKS
# ============================================================
required_files = [
    "symbolic_inputs.pt",
    "transformer_hidden_states.npy",
    "transformer_neuron_clusters.npy",
]

# Stop at the first artifact (in list order) that does not exist on disk.
first_missing = next(
    (path for path in required_files if not os.path.exists(path)),
    None,
)
if first_missing is not None:
    raise FileNotFoundError(
        f"Required file '{first_missing}' not found. "
        f"Run the previous segments first."
    )

# ============================================================
# LOAD DATA
# ============================================================
# Read the three artifacts produced by the other cells.
inputs = torch.load("symbolic_inputs.pt")                # (num_inputs, T)
acts = np.load("transformer_hidden_states.npy")          # (num_inputs, num_layers, T, D)
clusters = np.load("transformer_neuron_clusters.npy")    # (D,)

num_inputs, num_layers, T, D = acts.shape
for label, loaded in (
    ("Loaded inputs shape :", inputs),
    ("Loaded acts shape   :", acts),
    ("Loaded clusters     :", clusters),
):
    print(label, loaded.shape)

# Keep a single entry of the layer axis.
acts = acts[:, LAYER_ID]  # (num_inputs, T, D)

# ============================================================
# IDENTIFY NEURONS IN CLUSTER
# ============================================================
# Indices of the hidden dimensions whose label equals CLUSTER_ID.
cluster_dims = np.flatnonzero(clusters == CLUSTER_ID)

print(f"\nCluster {CLUSTER_ID} contains {len(cluster_dims)} neurons")

if cluster_dims.size == 0:
    raise ValueError("Selected cluster has no neurons.")

# ============================================================
# COMPUTE CLUSTER ACTIVATION SCORE PER INPUT
# ============================================================
# Threshold only the hidden dimensions that belong to the cluster, for all
# inputs at once: (num_inputs, T, |cluster|) boolean mask.
cluster_firing = np.abs(acts[:, :, cluster_dims]) > FIRE_THRESHOLD

# Score of an input = number of (position, neuron) pairs that fire.
cluster_scores = cluster_firing.sum(axis=(1, 2))

# ============================================================
# SHOW TOP ACTIVATING INPUTS
# ============================================================
# Ascending argsort; the last TOP_K entries reversed give highest score first.
ranking = np.argsort(cluster_scores)
top_indices = ranking[-TOP_K:][::-1]

print("\nTop activating inputs for Transformer cluster:\n")

separator = "-" * 60
for input_idx in top_indices:
    print(f"Input {input_idx}, score = {cluster_scores[input_idx]:.2f}")
    print(decode_equation(inputs[input_idx]))
    print(separator)

In [None]:
import numpy as np

# ============================================================
# CONFIG
# ============================================================
FIRE_THRESHOLD = 1e-3   # |activation| above this value counts as "firing"

# ============================================================
# LOAD HIDDEN STATES
# ============================================================
acts = np.load("transformer_hidden_states.npy")
num_inputs, num_layers, T, D = acts.shape

print("Loaded hidden states:", acts.shape)

# Keep only the final entry on the layer axis.
acts = acts[:, -1]   # (num_inputs, T, D)

# ============================================================
# BUILD NEURON VECTORS
# ============================================================
# One row per hidden dimension ("neuron"), one column per (input, position)
# pair; an entry is 1.0 where |activation| exceeds FIRE_THRESHOLD, else 0.0.
# Shape: (D, num_inputs * T)
fired = np.abs(acts) > FIRE_THRESHOLD
neuron_vectors = np.ascontiguousarray(
    fired.reshape(-1, D).T, dtype=np.float32
)

# ============================================================
# SAVE
# ============================================================
np.save("transformer_neuron_vectors.npy", neuron_vectors)

print("Saved transformer_neuron_vectors.npy")
print("Shape:", neuron_vectors.shape)
firing_ratio = neuron_vectors.mean()   # fraction of entries equal to 1.0
print("Firing ratio:", firing_ratio)
print("Sparsity:", 1.0 - firing_ratio)

In [None]:
# ============================================================
# Segment 2: Build Transformer Activation Atlas (FIXED)
# ============================================================

import os
import numpy as np
import umap
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

# ============================================================
# CONFIG
# ============================================================
RANDOM_STATE = 42          # seed handed to umap.UMAP(random_state=...)
UMAP_NEIGHBORS = 25        # handed to umap.UMAP(n_neighbors=...)
UMAP_MIN_DIST = 0.4        # handed to umap.UMAP(min_dist=...)
POINT_SIZE = 6             # NOTE(review): not referenced by the plotting code visible in this cell

# ============================================================
# SAFETY CHECKS
# ============================================================
required_files = [
    "transformer_neuron_vectors.npy",
    "transformer_neuron_clusters.npy",
]

# Stop at the first artifact (in list order) that does not exist on disk.
first_missing = next(
    (path for path in required_files if not os.path.exists(path)),
    None,
)
if first_missing is not None:
    raise FileNotFoundError(
        f"Required file '{first_missing}' not found. "
        f"Run the neuron-vector & clustering step first."
    )

# ============================================================
# LOAD DATA
# ============================================================
# Firing matrix (one row per neuron) and the matching per-neuron labels.
neuron_vectors = np.load("transformer_neuron_vectors.npy")
clusters = np.load("transformer_neuron_clusters.npy")

print("Neuron vectors shape:", neuron_vectors.shape)
print("Clusters shape      :", clusters.shape)

# Both files must describe the same number of neurons.
assert neuron_vectors.shape[0] == clusters.shape[0], \
    "Mismatch: neurons != cluster labels"

# ============================================================
# NORMALIZATION
# ============================================================
# Standardise the columns of the firing matrix.
scaler = StandardScaler()
neuron_vectors = scaler.fit_transform(neuron_vectors)

# ============================================================
# UMAP PROJECTION
# ============================================================
# 2-D embedding of the neuron rows under cosine distance.
umap_settings = {
    "n_neighbors": UMAP_NEIGHBORS,
    "min_dist": UMAP_MIN_DIST,
    "n_components": 2,
    "metric": "cosine",
    "random_state": RANDOM_STATE,
}
reducer = umap.UMAP(**umap_settings)

coords = reducer.fit_transform(neuron_vectors)
np.save("transformer_atlas_coords.npy", coords)

print("Saved transformer_atlas_coords.npy")
# ============================================================
# Transformer Overlapping Cluster Atlas (Paper-Style)
# ============================================================

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.patches import Polygon
from scipy.spatial import ConvexHull

# ============================================================
# LOAD DATA
# ============================================================
# 2-D coordinates and cluster label for every neuron.
coords = np.load("transformer_atlas_coords.npy")      # (D, 2)
clusters = np.load("transformer_neuron_clusters.npy") # (D,)

# Distinct labels in ascending order, and how many there are.
unique_clusters = np.unique(clusters)
num_clusters = unique_clusters.shape[0]

print("Clusters:", unique_clusters)

# ============================================================
# COLORS (paper-like)
# ============================================================
# Eight hex colours, in order:
# blue, orange, green, red, purple, brown, pink, gray.
colors = (
    "#1f77b4 #ff7f0e #2ca02c #d62728 "
    "#9467bd #8c564b #e377c2 #7f7f7f"
).split()

# ============================================================
# FIGURE
# ============================================================
fig, ax = plt.subplots(figsize=(9, 9))

# Seeded generator: the jitter, and therefore the rendered figure, is the
# same on every run instead of depending on the global NumPy RNG state.
jitter_rng = np.random.default_rng(RANDOM_STATE)

# Clusters are drawn in ascending label order (the order np.unique returns),
# so later labels are painted on top of earlier ones.
for i, cid in enumerate(unique_clusters):
    color = colors[i % len(colors)]  # palette wraps if there are more clusters than colours
    pts = coords[clusters == cid]

    # ---- small Gaussian jitter so coincident points remain visible ----
    pts_j = pts + jitter_rng.normal(scale=0.015, size=pts.shape)

    # scatter
    ax.scatter(
        pts_j[:, 0],
        pts_j[:, 1],
        s=22,
        alpha=0.45,
        color=color,
        label=f"Cluster {cid}"
    )

    # ---- faint convex hull, only for clusters with more than 20 points ----
    # The hull is computed on the un-jittered coordinates.
    if pts.shape[0] > 20:
        hull = ConvexHull(pts)
        ax.add_patch(
            Polygon(
                pts[hull.vertices],
                facecolor=color,
                alpha=0.08,
                edgecolor=None
            )
        )

# ============================================================
# LABELS
# ============================================================
ax.set_title(
    "Transformer Neuron Atlas\nDense, Overlapping & Entangled Representations",
    fontsize=14
)
ax.set_xlabel("UMAP-1")
ax.set_ylabel("UMAP-2")

ax.legend(
    fontsize=9,
    frameon=False,
    markerscale=1.3,
    loc="best"
)

ax.grid(alpha=0.15)
fig.tight_layout()
plt.show()

In [None]:
# ============================================================
# Segment 3: Transformer Neuron Clustering (FIXED)
# ============================================================

import os  # needed by the existence check below; previously only available via earlier cells

import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# ============================================================
# CONFIG
# ============================================================
LAYER_ID = -1              # index on the layer axis; -1 picks the last entry
FIRE_THRESHOLD = 1e-3      # |activation| above this value counts as "firing"
N_CLUSTERS = 8             # number of KMeans clusters
RANDOM_STATE = 0           # seed passed to KMeans

# ============================================================
# LOAD ACTIVATIONS
# ============================================================
# Fail early with an actionable message if the logging step has not been run.
if not os.path.exists("transformer_hidden_states.npy"):
    raise FileNotFoundError(
        "transformer_hidden_states.npy not found. "
        "Run the hidden-state logging step first."
    )

acts = np.load("transformer_hidden_states.npy")

num_inputs, num_layers, T, D = acts.shape
print("Loaded transformer activations:", acts.shape)

# Select layer
acts = acts[:, LAYER_ID]    # (num_inputs, T, D)

# ============================================================
# BUILD NEURON FIRING VECTORS
# ============================================================
# Each hidden dimension = one neuron
# Shape: (D, num_inputs * T)

# One row per hidden dimension ("neuron"), one column per (input, position)
# pair; an entry is 1.0 where |activation| exceeds FIRE_THRESHOLD, else 0.0.
fired = np.abs(acts) > FIRE_THRESHOLD
neuron_vectors = np.ascontiguousarray(
    fired.reshape(-1, D).T, dtype=np.float32
)  # (D, num_inputs * T)

# ============================================================
# SPARSITY CHECK
# ============================================================
# Fraction of entries that fire, and its complement.
firing_ratio = neuron_vectors.mean()
sparsity = 1.0 - firing_ratio

for caption, value in (
    ("Transformer firing ratio  ", firing_ratio),
    ("Transformer sparsity      ", sparsity),
):
    print(f"{caption}: {value:.3f}")

# ============================================================
# NORMALIZE FOR CLUSTERING
# ============================================================
# Standardise the columns of the firing matrix; the raw matrix is kept
# unchanged because it is saved below.
scaler = StandardScaler()
neuron_vectors_norm = scaler.fit_transform(neuron_vectors)

# ============================================================
# CLUSTER NEURONS
# ============================================================
kmeans_settings = {
    "n_clusters": N_CLUSTERS,
    "random_state": RANDOM_STATE,
    "n_init": 10,
}
kmeans = KMeans(**kmeans_settings)

# One cluster label per neuron row.
clusters = kmeans.fit_predict(neuron_vectors_norm)

# ============================================================
# SAVE OUTPUTS (REQUIRED BY SEGMENT 1 & 2)
# ============================================================
saved_arrays = {
    "transformer_neuron_vectors.npy": neuron_vectors,   # un-normalised firing matrix
    "transformer_neuron_clusters.npy": clusters,
}
for filename, array in saved_arrays.items():
    np.save(filename, array)

for filename in saved_arrays:
    print(f"Saved {filename}")

In [None]:
# ============================================================
# Segment 4: Transformer Neuron Sparsity Analysis (FIXED)
# Path B – Interpretability
# ============================================================

import os
import numpy as np

# ============================================================
# CONFIG
# ============================================================
FIRE_THRESHOLD = 1e-3      # |activation| above this value counts as "firing"
LAYER_ID = -1              # index on the layer axis; -1 picks the last entry

# ============================================================
# LOAD TRANSFORMER ACTIVATIONS
# Expected shape: (num_inputs, num_layers, T, D)
# ============================================================
activations_present = os.path.exists("transformer_hidden_states.npy")
if not activations_present:
    raise FileNotFoundError(
        "transformer_hidden_states.npy not found. "
        "Run the transformer activation logging step first."
    )

acts = np.load("transformer_hidden_states.npy")

num_inputs, num_layers, T, D = acts.shape
print("Loaded transformer activations:", acts.shape)

# Keep a single entry of the layer axis.
acts = acts[:, LAYER_ID]    # (num_inputs, T, D)

# ============================================================
# BUILD NEURON FIRING VECTORS
# Each hidden dimension = one neuron
# Output shape: (D, num_inputs * T)
# ============================================================
# An entry is 1.0 where |activation| exceeds FIRE_THRESHOLD, else 0.0.
fired = np.abs(acts) > FIRE_THRESHOLD
neuron_vectors = np.ascontiguousarray(
    fired.reshape(-1, D).T, dtype=np.float32
)

# ============================================================
# SPARSITY / DENSITY CHECK
# ============================================================
# Fraction of entries that fire, and its complement.
firing_ratio = neuron_vectors.mean()
sparsity = 1.0 - firing_ratio

print("\n=== Transformer Neuron Sparsity ===")
print(f"Firing ratio  : {firing_ratio:.3f}")
print(f"Sparsity      : {sparsity:.3f}")

# Pick the message according to the 0.6 cut-off on the firing ratio.
verdict = (
    "✅ Dense firing confirmed (>60%) — entangled representation"
    if firing_ratio > 0.6
    else "⚠️ Firing ratio below expectation; try lowering FIRE_THRESHOLD"
)
print(verdict)

In [None]:
# ============================================================
# Segment 5: Log Transformer Hidden States (FINAL FIXED)
# Path B – Interpretability
# ============================================================

import os
import torch
import numpy as np
from transformers import AutoModel

# ============================================================
# DEVICE
# ============================================================
# Use CUDA when torch reports it as available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)

# ============================================================
# LOAD PRETRAINED TRANSFORMER (NO CUSTOM MODULE)
# ============================================================
# Hidden-state output is switched on at load time.
model = AutoModel.from_pretrained(
    "bert-base-uncased",
    output_hidden_states=True,
)
model = model.to(device)

# Put the model in evaluation mode.
model.eval()

# ============================================================
# LOAD INPUT DATA (SAFE)
# ============================================================
inputs_present = os.path.exists("symbolic_inputs.pt")
if not inputs_present:
    raise FileNotFoundError(
        "symbolic_inputs.pt not found. "
        "Run the data generation step first."
    )

# Token-id matrix written by the data generation cell.
inputs = torch.load("symbolic_inputs.pt")  # shape: (num_inputs, T)
print("Loaded inputs:", inputs.shape)

# ============================================================
# LOG TRANSFORMER HIDDEN STATES
# ============================================================
def log_transformer_hidden_states(model, inputs):
    """
    Run every input row through `model` and collect its per-layer hidden states.

    Args:
        model: callable torch module that accepts a (1, T) tensor of token
               ids plus ``output_hidden_states=True`` and returns an object
               with a ``hidden_states`` tuple of (1, T, D) tensors.
        inputs: iterable of 1-D token-id tensors (e.g. a (num_inputs, T)
                tensor, iterated row by row).

    Returns:
        activations: np.ndarray
                     shape = (num_inputs, num_layers, T, D)
                     The first entry of ``hidden_states`` is skipped, so
                     num_layers = len(hidden_states) - 1.
    """
    # Run on the device that holds the model's weights rather than on a
    # notebook-level global; fall back to the CPU for a parameter-free model.
    first_param = next(model.parameters(), None)
    target_device = (
        first_param.device if first_param is not None else torch.device("cpu")
    )

    activations = []

    with torch.no_grad():
        for token_ids in inputs:
            batch = token_ids.unsqueeze(0).to(target_device)  # (1, T)

            # Request hidden states on the call itself so the function does
            # not depend on how the model was configured at load time.
            outputs = model(batch, output_hidden_states=True)

            # outputs.hidden_states:
            # tuple of length (num_layers + 1)
            # each tensor shape: (1, T, D)
            hidden_states = outputs.hidden_states

            # Skip the first entry (embedding output) and drop the batch axis.
            layer_states = torch.stack(
                [h[0] for h in hidden_states[1:]], dim=0
            )  # (num_layers, T, D)

            activations.append(layer_states.cpu().numpy())

    return np.stack(activations)

# ============================================================
# RUN LOGGING
# ============================================================
# Collect hidden states for every input row and persist them.
acts = log_transformer_hidden_states(model, inputs)

np.save("transformer_hidden_states.npy", acts)

print("\nSaved transformer_hidden_states.npy")
print("Hidden states shape:", acts.shape)

# ============================================================
# DENSE FIRING CHECK (IMPORTANT FOR YOUR CLAIM)
# ============================================================
# Fraction of all saved activations whose magnitude exceeds 1e-3.
fired = np.abs(acts) > 1e-3
firing_ratio = fired.mean()

print("\n=== Dense Firing Check ===")
print("Firing ratio:", firing_ratio)

# Pick the message according to the 0.6 cut-off on the firing ratio.
verdict = (
    "✅ Dense firing confirmed (>60%) — Transformer neurons are entangled"
    if firing_ratio > 0.6
    else "⚠️ Firing ratio lower than expected — consider lowering threshold"
)
print(verdict)