In [None]:
import torch
from dataset import create_dataloader
from basemodel_loader import load_model
from activation_analysis import ActivationAnalyzer, ActivationVisualizer, OptimizedActivationAnalyzer
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Baseline Max Activations

## 1. Load the base model

In [None]:
print("Loading model...")
model_path = 'base_model_best.pt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model, config = load_model(model_path, device)
print("Model loaded successfully!")

## 2. Create dataloader

In [None]:
print("\nCreating dataloader...")
dataloader = create_dataloader(
    data_path='validation.bin',
    block_size=256,
    batch_size=32,
    num_workers=4
)
print("Dataloader created!")

## 3. Initialize analyzer

In [None]:
print("\nCreating analyzer...")
analyzer = OptimizedActivationAnalyzer(model, device=device)
results = analyzer.analyze_activations(dataloader, num_batches=100)
print("Initial analysis complete!")

# Optional: print some basic stats about the results
print("\nResults summary:")
for layer_name in results:
    n_neurons = len(results[layer_name]['neurons'])
    total_examples = sum(len(data['examples']) for data in results[layer_name]['neurons'].values())
    print(f"{layer_name}: analyzed {n_neurons} neurons, collected {total_examples} examples")

## Save analysis results

In [None]:
import json
import os
from datetime import datetime

# Create a directory for results if it doesn't exist
os.makedirs('activation_results', exist_ok=True)

# Create filename with timestamp
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
filename = f'activation_results/neuron_activations_{timestamp}.json'

# Convert any tensor/numpy values to Python native types and save
def convert_for_json(obj):
    if hasattr(obj, 'tolist'):  # Handle torch tensors and numpy arrays
        return obj.tolist()
    elif isinstance(obj, dict):
        return {k: convert_for_json(v) for k, v in obj.items()}
    elif isinstance(obj, list):
        return [convert_for_json(item) for item in obj]  # Changed 'list' to 'obj'
    else:
        return obj

# Convert and save results
json_results = convert_for_json(results)
with open(filename, 'w') as f:
    json.dump(json_results, f, indent=2)

print(f"Results saved to {filename}")

In [None]:
print("CUDA available:", torch.cuda.is_available())

# Load lbl model

In [None]:
from model_loader import load_model

In [None]:
print("Loading ablated model...")
model_path = 'best_model_20241016.pt'  # Using the path from config-2.yaml
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model, config = load_model(model_path, device)
print("Ablated model loaded successfully!")

In [None]:
print("\nCreating dataloader...")
dataloader = create_dataloader(
    data_path='validation.bin',  # From config-2.yaml
    block_size=256,
    batch_size=24,  # Updated to match config-2.yaml
    num_workers=4
)
print("Dataloader created!")

In [None]:
print("\nCreating analyzer...")
analyzer = OptimizedActivationAnalyzer(model, device=device)
results = analyzer.analyze_activations(dataloader, num_batches=100)
print("Initial analysis complete!")

## Print summary of results

In [None]:
print("\nResults summary:")
for layer_name in results:
    n_neurons = len(results[layer_name]['neurons'])
    total_examples = sum(len(data['examples']) for data in results[layer_name]['neurons'].values())
    print(f"{layer_name}: analyzed {n_neurons} neurons, collected {total_examples} examples")

## Save analysis results

In [None]:

import json
import os
from datetime import datetime

# Create a directory for ablation results if it doesn't exist
os.makedirs('ablation_results', exist_ok=True)

# Create filename with timestamp
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
filename = f'ablation_results/neuron_activations_ablated_{timestamp}.json'

# Convert any tensor/numpy values to Python native types and save
def convert_for_json(obj):
    if hasattr(obj, 'tolist'):  # Handle torch tensors and numpy arrays
        return obj.tolist()
    elif isinstance(obj, dict):
        return {k: convert_for_json(v) for k, v in obj.items()}
    elif isinstance(obj, list):
        return [convert_for_json(item) for item in obj]
    else:
        return obj

# Convert and save results
json_results = convert_for_json(results)
with open(filename, 'w') as f:
    json.dump(json_results, f, indent=2)

print(f"Results saved to {filename}")


# Global ablation

## Load the globally ablated model

In [None]:
from model_loader import load_model

In [None]:
print("Loading globally ablated model...")
model_path = 'best_model_20241017.pt'  # Using the path from Globalconfig.yaml
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model, config = load_model(model_path, device)
print("Globally ablated model loaded successfully!")

## 2. Create dataloader

In [None]:

print("\nCreating dataloader...")
dataloader = create_dataloader(
    data_path='validation.bin',  
    block_size=256,
    batch_size=24,  
    num_workers=4
)
print("Dataloader created!")

## 3. Initialize and run analyzer

In [None]:
print("\nCreating analyzer...")
analyzer = OptimizedActivationAnalyzer(model, device=device)
results = analyzer.analyze_activations(dataloader, num_batches=100)
print("Initial analysis complete!")

## Print summary of results

In [None]:
print("\nResults summary:")
for layer_name in results:
    n_neurons = len(results[layer_name]['neurons'])
    total_examples = sum(len(data['examples']) for data in results[layer_name]['neurons'].values())
    print(f"{layer_name}: analyzed {n_neurons} neurons, collected {total_examples} examples")

## 4. Save analysis results

In [None]:
import os
from datetime import datetime
import json

In [None]:
# Create a directory for global ablation results if it doesn't exist
os.makedirs('global_ablation_results', exist_ok=True)

# Create filename with timestamp
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
filename = f'global_ablation_results/neuron_activations_global_{timestamp}.json'

# Convert any tensor/numpy values to Python native types and save
def convert_for_json(obj):
    if hasattr(obj, 'tolist'):  # Handle torch tensors and numpy arrays
        return obj.tolist()
    elif isinstance(obj, dict):
        return {k: convert_for_json(v) for k, v in obj.items()}
    elif isinstance(obj, list):
        return [convert_for_json(item) for item in obj]
    else:
        return obj

# Convert and save results
json_results = convert_for_json(results)
with open(filename, 'w') as f:
    json.dump(json_results, f, indent=2)

print(f"Results saved to {filename}")

# Optional: Print CUDA status
print("CUDA available:", torch.cuda.is_available())

# Optional: Additional global ablation specific analysis
print("\nGlobal Ablation Specific Stats:")
print(f"Number of neurons kept per layer: {config.k_neurons}")
print(f"Number of attention components kept per layer: {config.k_attention}")

# Pruning

In [None]:
# Import needed modules
from pruning import SequencePruner, setup_logger
from model_loader import load_model
import os
from datetime import datetime
from tqdm.notebook import tqdm
import json
import torch
import time
import psutil

def get_memory_usage():
    """Get current memory usage statistics"""
    gpu_memory = f"{torch.cuda.max_memory_allocated()/1e9:.2f}GB" if torch.cuda.is_available() else "N/A"
    ram_memory = f"{psutil.Process().memory_info().rss/1e9:.2f}GB"
    return f"GPU: {gpu_memory}, RAM: {ram_memory}"

In [None]:
import torch
from model_loader import load_model

def test_model_load(model_path):
    print(f"\nTesting model load for: {model_path}")
    print("="*80)
    
    try:
        # Check CUDA
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        print(f"Using device: {device}")
        
        # Load model
        print("\nLoading model...")
        model, config = load_model(model_path, device)
        
        # Create test input using dictionary config access
        print("\nTesting forward pass...")
        test_input = torch.randint(0, config['vocab_size'], (1, 32), device=device)
        
        # Test forward pass
        with torch.no_grad():
            outputs = model(test_input)
            
        # Print output shapes based on model type
        print("\nModel outputs:")
        if hasattr(outputs, 'keys'):
            for key, value in outputs.items():
                if isinstance(value, torch.Tensor):
                    print(f"- {key}: {value.shape}")
                elif value is None:
                    print(f"- {key}: None")
        else:
            print(f"- output: {outputs.shape}")
            
        print("\n✓ Test successful!")
        return model, config
        
    except Exception as e:
        print(f"\n✗ Test failed with error: {str(e)}")
        raise

# Test with base model
model_path = "configs/base_model_best.pt"
model, config = test_model_load(model_path)

# Analyzer

In [None]:
# Cell 1: Setup
from analysis_utils import ModelAnalyzer, AnalysisLogger
from datetime import datetime
import os

# Create output directory with timestamp
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_dir = f'pruned_results_{timestamp}'

# Initialize analyzer (it will create the directory structure)
analyzer = ModelAnalyzer(output_dir=output_dir)

print(f"Analysis setup complete. Output directory: {output_dir}")

In [None]:
import os
from pathlib import Path

config_dir = Path('configs')
print("Config directory exists:", config_dir.exists())
print("YAML file exists:", (config_dir / 'base_model_best.yaml').exists())
print("Model file exists:", (config_dir / 'base_model_best.pt').exists())
print("Absolute path to YAML:", (config_dir / 'base_model_best.yaml').absolute())

In [None]:
# Cell 2: Base Model Analysis
base_results, base_stats = analyzer.load_and_analyze(
    model_path='/root/.local/configs/base_model_best.pt',
    activation_file='activation_results/neuron_activations_base.json'
)

# Graph Builder

In [None]:
# Imports for graph building
import torch
import networkx as nx
from neuron_graph_builder import NeuronAnalyzer
from model_loader import load_model
import os
from datetime import datetime
from tqdm.notebook import tqdm
import torch.cuda

print("CUDA available:", torch.cuda.is_available())

In [None]:
# Import needed libraries
import torch
import os
from datetime import datetime
import json
from tqdm.notebook import tqdm
from pathlib import Path

# Import our custom modules
from model_loader import load_model
from neuron_graph_builder import NeuronAnalyzer

# Create output directories
os.makedirs("neuron_graphs/base_model", exist_ok=True)
os.makedirs("neuron_graphs/lbl_model", exist_ok=True)
os.makedirs("neuron_graphs/global_model", exist_ok=True)

# Check CUDA availability
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")
if device == 'cuda':
    print(f"GPU Memory Available: {torch.cuda.get_device_properties(0).total_memory/1e9:.2f} GB")

## Processing  Base Model

In [None]:
# Cell 1: Base Model Analysis - Single Layer Debug Version
print("Processing Base Model (Single Layer Debug)...")
base_model_path = 'configs/base_model_best.pt'
base_activation_file = 'activation_results/neuron_activations_base.json'

# Load model
base_model, base_config = load_model(
    model_path=base_model_path,
    device='cuda' if torch.cuda.is_available() else 'cpu'
)

# Initialize analyzer
base_analyzer = NeuronAnalyzer(base_model)

# Load activation data
base_data = base_analyzer.load_activation_data(base_activation_file)

# Process single layer (layer 0 for debug)
base_graphs = {}
debug_layer = 0  # Change this to analyze different layers
print(f"\nAnalyzing layer {debug_layer}")
layer_graphs = base_analyzer.analyze_layer(
    activation_data=base_data,
    layer=debug_layer,
    save_graphs=True,
    output_dir=f"neuron_graphs/base_model/layer_{debug_layer}"
)
base_graphs[debug_layer] = layer_graphs

# Cleanup
del base_model
torch.cuda.empty_cache()

In [None]:
# Cell 1: Base Model Analysis
print("Processing Base Model...")
base_model_path = 'configs/base_model_best.pt'
base_activation_file = 'activation_results/neuron_activations_base.json'

# Load model
base_model, base_config = load_model(
    model_path=base_model_path,
    device='cuda' if torch.cuda.is_available() else 'cpu'
)

# Initialize analyzer
base_analyzer = NeuronAnalyzer(base_model)

# Load activation data
base_data = base_analyzer.load_activation_data(base_activation_file)

# Process each layer
base_graphs = {}
for layer in range(base_config.num_layers):
    print(f"\nAnalyzing layer {layer}")
    layer_graphs = base_analyzer.analyze_layer(
        activation_data=base_data,
        layer=layer,
        save_graphs=True,
        output_dir=f"neuron_graphs/base_model/layer_{layer}"
    )
    base_graphs[layer] = layer_graphs

# Cleanup
del base_model
torch.cuda.empty_cache()

In [None]:
from model_loader import load_model
from neuron_graph_builder import NeuronAnalyzer

def analyze_sequence():
    # 1. Load model
    print("Loading model...")
    model_path = 'configs/base_model_best.pt'  # Adjust path as needed
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model, config = load_model(model_path, device)
    print("Model loaded successfully")
    
    # 2. Initialize analyzer
    analyzer = NeuronAnalyzer(model)
    print("Analyzer initialized")
    
    # 3. Test sequence (the shortest one from your data)
    test_sequence = [5045, 14028]
    
    # 4. Decode and print tokens
    print("\n=== Token Analysis ===")
    tokens_to_check = [14028, 5045, 1701]
    for token_id in tokens_to_check:
        decoded = analyzer.decode_token(token_id)
        print(f"Token {token_id}: '{decoded}'")
    
    # 5. Get raw activations for the sequence
    print("\n=== Activation Analysis ===")
    # Convert sequence to tokens with BOS
    input_tensor = analyzer.to_tokens(analyzer.decode_tokens(test_sequence), prepend_bos=True)
    print(f"Input tensor shape: {input_tensor.shape}")
    
    try:
        # Get raw activations from model
        outputs, cache = analyzer._model_forward(input_tensor, return_cache=True)
        
        # Get activations for layer 2, neuron 5
        activations = cache['transformer.h.2.mlp'][0, :, 5]  # [0] for batch, : for all positions, 5 for neuron 5
        
        print("\nRaw activations for layer 2, neuron 5:")
        decoded_tokens = analyzer.to_str_tokens(analyzer.decode_tokens(test_sequence), prepend_bos=True)
        for i, (token, activation) in enumerate(zip(decoded_tokens, activations)):
            print(f"Position {i}: Token '{token}' -> Activation: {activation:.6f}")
            
    except Exception as e:
        print(f"Error getting activations: {str(e)}")
        import traceback
        traceback.print_exc()
        
    # Cleanup
    del model
    torch.cuda.empty_cache()

# Run analysis
if __name__ == "__main__":
    analyze_sequence()

In [None]:
from model_loader import load_model
from neuron_graph_builder import NeuronAnalyzer
from graph_utils import fast_prune, fast_measure_importance

def trace_activation_flow():
    # Setup with same test sequence
    print("Loading model...")
    model_path = 'configs/base_model_best.pt'
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model, config = load_model(model_path, device)
    analyzer = NeuronAnalyzer(model)
    
    # Test sequence
    test_sequence = [5045, 14028]  # "Tim wondered"
    
    print("\n=== Step 1: Initial Sequence Processing ===")
    # Convert to text and back to ensure consistent processing
    text = analyzer.decode_tokens(test_sequence)
    print(f"Text: {text}")
    
    input_tensor = analyzer.to_tokens(text, prepend_bos=True)
    print(f"Input tensor shape: {input_tensor.shape}")
    print(f"Input tokens: {input_tensor.tolist()}")
    
    print("\n=== Step 2: Getting Base Activations ===")
    outputs, cache = analyzer._model_forward(input_tensor, return_cache=True)
    activations = cache['transformer.h.2.mlp'][0, :, 5]
    print("Raw activations:")
    for i, act in enumerate(activations):
        print(f"Position {i}: {act.item():.6f}")
    
    print("\n=== Step 3: Fast Prune Entry ===")
    try:
        result, max_idx, initial_max, truncated_max = fast_prune(
            analyzer=analyzer,  # Pass analyzer instance
            layer=2,
            neuron=5,
            text_input=text,
            pivot_index=1,  # Known position of 'wondered'
            original_activation=3.134399,  # From original data
            proportion_threshold=-0.5,
            batch_size=4,
            return_maxes=True
        )
        print(f"Result text: {result}")
        print(f"Max index: {max_idx}")
        print(f"Initial max activation: {initial_max:.6f}")
        print(f"Truncated max activation: {truncated_max:.6f}")
    except Exception as e:
        print(f"Error in fast_prune: {str(e)}")
        import traceback
        traceback.print_exc()
        
    print("\n=== Step 4: Processing Single Example ===")
    try:
        processed = analyzer.process_single_example(
            text=text,
            pivot_index=1,
            original_activation=3.134399,
            layer=2,
            neuron=5
        )
        print(f"Original sequence: {processed.original_sequence}")
        print(f"Pruned sequence: {processed.pruned_sequence}")
        print(f"Activating token: {processed.activating_token}")
        print(f"Context tokens: {processed.context_tokens}")
        print(f"Activation value: {processed.activation_value}")
        print(f"Activation ratio: {processed.activation_ratio}")
    except Exception as e:
        print(f"Error in process_single_example: {str(e)}")
        import traceback
        traceback.print_exc()

if __name__ == "__main__":
    trace_activation_flow()

In [None]:
# Cell 1: Base Model Analysis - Single Neuron Debug Version
print("Processing Base Model (Single Neuron Debug)...")
base_model_path = 'configs/base_model_best.pt'
base_activation_file = 'activation_results/neuron_activations_base.json'

# Load model
base_model, base_config = load_model(
    model_path=base_model_path,
    device='cuda' if torch.cuda.is_available() else 'cpu'
)

# Initialize analyzer
base_analyzer = NeuronAnalyzer(base_model)

# Load activation data
base_data = base_analyzer.load_activation_data(base_activation_file)

# Get specific layer and neuron data
layer_name = 'transformer.h.2.mlp.c_fc'
neuron_id = '5'

# Get just 5 examples
examples = base_data[layer_name]['neurons'][neuron_id]['examples'][:5]

# Build graph for just this neuron
print(f"\nAnalyzing layer 2, neuron 0 with 5 examples")
graph = base_analyzer.build_graph(
    layer=2,
    neuron=5,
    examples=examples
)

# Save single graph
base_analyzer.save_graph(
    graph=graph,
    layer=2,
    neuron=5,
    output_dir="neuron_graphs/debug"
)

# Cleanup
del base_model
torch.cuda.empty_cache()

In [None]:
# test_neuron_analyzer.ipynb

import torch
from model_loader import load_model
from neuron_graph_builder import NeuronAnalyzer
import json
import os
from tqdm.notebook import tqdm

# Test with just one neuron from one layer
def test_single_neuron():
    print("Setting up test...")
    
    # 1. Load model
    model_path = 'configs/base_model_best.pt'
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model, config = load_model(model_path, device)
    print("Model loaded successfully")
    
    # 2. Initialize analyzer
    analyzer = NeuronAnalyzer(model)
    print("Analyzer initialized")
    
    # 3. Load a small subset of activation data
    activation_file = 'activation_results/neuron_activations_base.json'
    with open(activation_file, 'r') as f:
        data = json.load(f)
    
    # 4. Get first layer and first neuron
    layer = 0
    layer_name = f'transformer.h.{layer}.mlp.c_fc'
    first_neuron_id = list(data[layer_name]['neurons'].keys())[0]
    neuron_data = data[layer_name]['neurons'][first_neuron_id]
    
    print(f"\nTesting with:")
    print(f"Layer: {layer}")
    print(f"Neuron: {first_neuron_id}")
    print(f"Number of examples: {len(neuron_data['examples'])}")
    
    # 5. Test processing a single example
    try:
        print("\nProcessing single example...")
        example = neuron_data['examples'][0]
        processed = analyzer.process_single_example(
            sequence=example['sequence'],
            pivot_index=example['pivot_index'],
            original_activation=max(example['activations']),
            layer=layer,
            neuron=int(first_neuron_id)
        )
        print("Single example processed successfully!")
        print(f"\nResults:")
        print(f"Activating token: {processed.activating_token}")
        print(f"Number of context tokens: {len(processed.context_tokens)}")
        print(f"Activation ratio: {processed.activation_ratio:.3f}")
        
        # 6. Test building graph
        print("\nBuilding graph...")
        graph = analyzer.build_graph(
            layer=layer,
            neuron=int(first_neuron_id),
            examples=neuron_data['examples'][:5]  # Just use first 5 examples for test
        )
        print("Graph built successfully!")
        print(f"Number of nodes: {len(graph.nodes)}")
        print(f"Number of edges: {len(graph.edges)}")
        
        # 7. Test saving graph
        test_output_dir = "test_neuron_graphs"
        print(f"\nSaving graph to {test_output_dir}...")
        analyzer.save_graph(graph, layer, int(first_neuron_id), test_output_dir)
        print("Graph saved successfully!")
        
        print("\nAll tests passed!")
        return True
        
    except Exception as e:
        print(f"\nError during testing: {str(e)}")
        return False
    finally:
        # Cleanup
        del model
        torch.cuda.empty_cache()

# Run the test
test_single_neuron()

In [None]:
def test_neuron_flow():
    # Load model as before
    model_path = '/root/.local/configs/base_model_best.pt'
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model, config = load_model(model_path, device)
    
    # Initialize analyzer 
    analyzer = NeuronAnalyzer(model)
    
    # Load data
    with open('activation_results/neuron_activations_base.json', 'r') as f:
        data = json.load(f)
    
    # Get example data
    layer_name = 'transformer.h.0.mlp.c_fc'
    first_neuron_id = list(data[layer_name]['neurons'].keys())[0]
    example = data[layer_name]['neurons'][first_neuron_id]['examples'][0]
    
    # Print detailed info about the example
    print("\nExample data:")
    print(f"Sequence length: {len(example['sequence'])}")
    print(f"Pivot index: {example['pivot_index']}")
    print(f"Raw sequence: {example['sequence'][:10]}...")  # First 10 tokens
    
    # Test token decoding
    text = analyzer.decode_tokens(example['sequence'])
    print(f"\nDecoded text (first 100 chars): {text[:100]}")
    
    # Test token encoding
    tokens = analyzer.to_tokens(text)
    print(f"\nRe-encoded tokens shape: {tokens.shape}")
    
    str_tokens = analyzer.to_str_tokens(text)
    print(f"\nString tokens (first 5): {str_tokens[:5]}")
    
    return example, text, tokens, str_tokens

# Run test
test_neuron_flow()

In [None]:
def test_full_flow():
    # Setup as before
    model_path = '/root/.local/configs/base_model_best.pt'
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model, config = load_model(model_path, device)
    analyzer = NeuronAnalyzer(model)
    
    # Load data
    with open('activation_results/neuron_activations_base.json', 'r') as f:
        data = json.load(f)
    
    # Get example data
    layer_name = 'transformer.h.0.mlp.c_fc'
    first_neuron_id = list(data[layer_name]['neurons'].keys())[0]
    example = data[layer_name]['neurons'][first_neuron_id]['examples'][0]
    
    print("\nProcessing example...")
    try:
        processed = analyzer.process_single_example(
            sequence=example['sequence'],
            pivot_index=example['pivot_index'],
            original_activation=max(example['activations']),
            layer=0,
            neuron=int(first_neuron_id)
        )
        
        print("\nProcessed example results:")
        print(f"Original sequence: {processed.original_sequence}")
        print(f"Pruned sequence: {processed.pruned_sequence}")
        print(f"Activating token: {processed.activating_token}")
        print(f"Context tokens: {processed.context_tokens}")
        print(f"Activation value: {processed.activation_value}")
        print(f"Activation ratio: {processed.activation_ratio}")
        
        return processed
        
    except Exception as e:
        print(f"\nError during processing: {str(e)}")
        import traceback
        traceback.print_exc()
        return None

# Run test
test_full_flow()

In [None]:
print("Setting up test...")
    
# 1. Load model
model_path = '/root/.local/configs/base_model_best.pt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model, config = load_model(model_path, device)
print("Model loaded successfully")

# 2. Initialize analyzer
analyzer = NeuronAnalyzer(model)
print("Analyzer initialized")

# 3. Load activation data
activation_file = 'activation_results/neuron_activations_base.json'
with open(activation_file, 'r') as f:
    data = json.load(f)

# 4. Get first layer and first neuron
layer = 0
layer_name = f'transformer.h.{layer}.mlp.c_fc'
first_neuron_id = list(data[layer_name]['neurons'].keys())[0]
neuron_data = data[layer_name]['neurons'][first_neuron_id]

print(f"\nTesting with:")
print(f"Layer: {layer}")
print(f"Neuron: {first_neuron_id}")
print(f"Number of examples: {len(neuron_data['examples'])}")

# 5. Build graph using all examples
try:
    print("\nBuilding graph...")
    graph = analyzer.build_graph(
        layer=layer,
        neuron=int(first_neuron_id),
        examples=neuron_data['examples'][:6],  # Using all examples
        min_pattern_frequency=2  # Keeping original frequency threshold
    )
    print("Graph built successfully!")
    print(f"Number of nodes: {len(graph.nodes)}")
    print(f"Number of edges: {len(graph.edges)}")
    
    # 6. Print graph details
    if len(graph.nodes) > 0:
        print("\nGraph details:")
        print("\nNodes:")
        for node in graph.nodes(data=True):
            print(f"Token: {node[0]}")
            print(f"Data: {node[1]}")
            print()
            
        print("\nEdges:")
        for edge in graph.edges(data=True):
            print(f"{edge[0]} -> {edge[1]}")
            print(f"Weight: {edge[2]['weight']}")
            print()
    
    # 7. Save the graph
    test_output_dir = "test_neuron_graphs"
    print(f"\nSaving graph to {test_output_dir}...")
    analyzer.save_graph(graph, layer, int(first_neuron_id), test_output_dir)
    print("Graph saved successfully!")
    
except Exception as e:
    print(f"\nError during graph building: {str(e)}")
    import traceback
    traceback.print_exc()

## Edge Case Testing

In [None]:
from graph_utils import fast_prune
print("Setting up test...")

# 1. Load model
model_path = '/root/.local/configs/base_model_best.pt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model, config = load_model(model_path, device)
print("Model loaded successfully")

# 2. Initialize analyzer
analyzer = NeuronAnalyzer(model)
print("Analyzer initialized")

# 3. Load activation data
activation_file = 'activation_results/neuron_activations_base.json'
with open(activation_file, 'r') as f:
    data = json.load(f)

# 4. Get first layer and first neuron
layer = 0
layer_name = f'transformer.h.{layer}.mlp.c_fc'
first_neuron_id = list(data[layer_name]['neurons'].keys())[0]
neuron_data = data[layer_name]['neurons'][first_neuron_id]

print(f"\nTesting with:")
print(f"Layer: {layer}")
print(f"Neuron: {first_neuron_id}")
print(f"Number of examples: {len(neuron_data['examples'])}")

# 5. Build graph using all examples
try:
    print("\nBuilding graph...")
    graph = analyzer.build_graph(
        layer=layer,
        neuron=int(first_neuron_id),
        examples=neuron_data['examples'][:6], # Using all examples
        min_pattern_frequency=2 # Keeping original frequency threshold
    )
    print("Graph built successfully!")
    print(f"Number of nodes: {len(graph.nodes)}")
    print(f"Number of edges: {len(graph.edges)}")

    # 6. Print graph details
    if len(graph.nodes) > 0:
        print("\nGraph details:")
        print("\nNodes:")
        for node in graph.nodes(data=True):
            print(f"Token: {node[0]}")
            print(f"Data: {node[1]}")
            print()

        print("\nEdges:")
        for edge in graph.edges(data=True):
            print(f"{edge[0]} -> {edge[1]}")
            print(f"Weight: {edge[2]['weight']}")
            print()

    # 7. Save the graph
    test_output_dir = "test_neuron_graphs"
    print(f"\nSaving graph to {test_output_dir}...")
    analyzer.save_graph(graph, layer, int(first_neuron_id), test_output_dir)
    print("Graph saved successfully!")

except Exception as e:
    print(f"\nError during graph building: {str(e)}")
    import traceback
    traceback.print_exc()

# 8. Run edge case tests
print("\n=== Running Edge Case Tests ===")

def test_edge_cases(analyzer):
    """Test position tracking with challenging inputs"""
    test_cases = [
        # Case 1: Very short input
        ("A.", "Short single-token input"),
        
        # Case 2: Input with all sentence boundaries
        ("Stop. Look. Listen.", "Multiple short sentences"),
        
        # Case 3: No clear sentence boundaries
        ("this is just a bunch of words without any proper punctuation or structure", 
         "No sentence boundaries"),
        
        # Case 4: Mixed boundaries
        ("First sentence. Second sentence... Third sentence! Fourth sentence?",
         "Mixed punctuation"),
        
        # Case 5: Nested quotes
        ('He said "Stop." Then "Go!" Finally, "Wait..."',
         "Nested quotes"),
        
        # Case 6: Maximum length edge
        ("The " * 512, "Length limit test"),
        
        # Case 7: Special characters
        ("Line 1\nLine 2\nLine 3.", "Multiple newlines"),
        
        # Case 8: Empty spaces
        ("   Lots   of   spaces   ", "Multiple spaces"),
    ]
    
    print("Starting edge case tests...")
    
    for text, description in test_cases:
        print(f"\n=== Testing: {description} ===")
        print(f"Input: {text[:100]}{'...' if len(text) > 100 else ''}")
        
        try:
            result, position = fast_prune(
                analyzer=analyzer,
                layer=0,
                neuron=0,
                text_input=text,
                max_length=1024,
                proportion_threshold=-0.5
            )
            
            print(f"Success!")
            print(f"Output length: {len(result)}")
            print(f"Final position: {position}")
            
        except Exception as e:
            print(f"Failed: {str(e)}")
            import traceback
            traceback.print_exc()

    print("\nEdge case testing complete!")

# Run edge case tests with our analyzer
test_edge_cases(analyzer)

## Processing LBL Model

In [None]:
# Required imports
import torch
import os
import json
import traceback
from tqdm import tqdm
from model_loader import load_model
from neuron_graph_builder import NeuronAnalyzer

# Cell 1: Test LBL Model Loading
print("Testing LBL Model Loading...")
try:
    # Model paths
    lbl_model_path = 'configs/lbl_model_20241016.pt'
    
    # Load model
    print(f"\nAttempting to load model from: {lbl_model_path}")
    lbl_model, lbl_config = load_model(
        model_path=lbl_model_path,
        device='cuda' if torch.cuda.is_available() else 'cpu',
        config_dir='configs'
    )
    
    # Print model info
    print("\nModel Configuration:")
    print(f"Ablation mask level: {'layer-by-layer' if lbl_config.has_layer_by_layer_ablation_mask else 'overall'}")
    print(f"Number of layers: {lbl_config.num_layers}")
    print(f"Hidden size: {lbl_config.hidden_size}")
    print(f"MLP hidden size: {lbl_config.mlp_hidden_size}")
    print(f"Number of heads: {lbl_config.num_heads}")
    print(f"Device: {next(lbl_model.parameters()).device}")
    print(f"Number of parameters: {sum(p.numel() for p in lbl_model.parameters())}")

    # Print ablation-specific config
    print("\nAblation Configuration:")
    print(f"k_attention: {lbl_config.k_attention}")
    print(f"k_neurons: {lbl_config.k_neurons}")
    print(f"Temperature attention: {lbl_config.temperature_attention}")
    print(f"Temperature neurons: {lbl_config.temperature_neurons}")
    print(f"Loss coefficient base: {lbl_config.loss_coeff_base}")
    print(f"Loss coefficient ablated: {lbl_config.loss_coeff_ablated}")
    print(f"Reconstruction coefficient: {lbl_config.reconstruction_coeff}")

    # Cleanup
    del lbl_model
    torch.cuda.empty_cache()
    print("\nCleanup complete")

except FileNotFoundError as e:
    print(f"\nFile not found error: {str(e)}")
    print("Please check that both the model file and its config exist in the configs directory")
except Exception as e:
    print(f"\nError loading model: {str(e)}")
    print("Stack trace:")
    import traceback
    traceback.print_exc()

In [None]:
start_neuron = 62  # Start from the next neuron after the last successful one
batch_size = 10

print("Processing LBL Model (Single Layer Debug)...")
try:
    lbl_model_path = 'configs/lbl_model_20241016.pt'
    lbl_activation_file = 'activation_results/neuron_activations_lbl.json'
    debug_layer = 1
    
    # Calculate batch end, now using 511 as max
    end_neuron = min(start_neuron + batch_size, 511 + 1)  # +1 because range is exclusive
    print(f"\nProcessing neurons {start_neuron} to {end_neuron-1} in layer {debug_layer}")
    
    # Load model
    lbl_model, lbl_config = load_model(
        model_path=lbl_model_path,
        device='cuda' if torch.cuda.is_available() else 'cpu'
    )
    
    # Initialize analyzer
    lbl_analyzer = NeuronAnalyzer(lbl_model)
    
    # Load activation data
    lbl_data = lbl_analyzer.load_activation_data(lbl_activation_file)
    
    # Process batch of neurons
    lbl_graphs = {}
    layer_graphs = {}
    layer_name = f'transformer.h.{debug_layer}.mlp.c_fc'
    
    # Process only the specified range of neurons
    for neuron_id in range(start_neuron, end_neuron):
        print(f"\nProcessing neuron {neuron_id}")
        try:
            graph = lbl_analyzer.build_graph(
                layer=debug_layer,
                neuron=neuron_id,
                examples=lbl_data[layer_name]['neurons'][str(neuron_id)]['examples']
            )
            layer_graphs[neuron_id] = graph
            
            # Save individual graph
            lbl_analyzer.save_graph(
                graph,
                debug_layer,
                neuron_id,
                f"neuron_graphs/lbl_model/layer_{debug_layer}"
            )
            
        except Exception as e:
            print(f"Error processing neuron {neuron_id}: {str(e)}")
            continue
    
    lbl_graphs[debug_layer] = layer_graphs
    
    # Cleanup
    del lbl_model
    torch.cuda.empty_cache()
    
    print(f"\nBatch complete! Processed neurons {start_neuron} to {end_neuron-1}")
    if end_neuron > 511:
        print("Layer processing complete!")
    else:
        print(f"Next batch will start at neuron {end_neuron}")
        
except Exception as e:
    print(f"Error in analysis: {str(e)}")
    print("Stack trace:")
    traceback.print_exc()

In [None]:
# Required imports
import torch
from model_loader import load_model
from utils.compatibility import convert_model_to_hooked_transformer, get_ablation_hooks_for_tl

print("Testing Model Compatibility...")

try:
    # Model paths
    model_path = 'configs/lbl_model_20241016.pt'
    
    # Load model
    print(f"\nAttempting to load model from: {model_path}")
    our_model, config = load_model(
        model_path=model_path,
        device='cuda' if torch.cuda.is_available() else 'cpu',
        config_dir='configs'
    )
    
    print("\nModel loaded successfully!")
    print(f"Ablation mask level: {'layer-by-layer' if config.has_layer_by_layer_ablation_mask else 'overall'}")
    
    # Create and process test input
    test_input = torch.tensor([42, 123, 1001], device=next(our_model.parameters()).device, dtype=torch.long).unsqueeze(0)
    our_result = our_model(test_input)
    print("\nModel inference successful!")
    print(f"Attention ablations shape: {our_result['attention_ablations'].shape}")
    print(f"Neuron ablations shape: {our_result['neuron_ablations'].shape}")
    
    # Convert to HookedTransformer
    ht = convert_model_to_hooked_transformer(our_model)
    print("\nConverted to HookedTransformer successfully!")
    
    # Run with original model 
    ht_result = ht(test_input)
    print("\nHookedTransformer inference successful!")
    
    # Verify outputs match (accounting for unembed centering)
    softmax_diff = (our_result["logits_clean"].softmax(-1) - ht_result.softmax(-1)).abs().max()
    print(f"\nMax softmax difference: {softmax_diff}")
    
    # Test ablation hooks
    last_token_ablation_hooks = get_ablation_hooks_for_tl(our_result, -1, our_model.config)
    ht_result_ablated = ht.run_with_hooks(test_input, "logits", fwd_hooks=last_token_ablation_hooks)
    print("\nAblation hooks test successful!")
    
    # Verify ablated outputs match
    clean_diff = (our_result["logits_clean"][0,:-1].softmax(-1) - ht_result_ablated[0,:-1].softmax(-1)).abs().max()
    ablated_diff = (our_result["logits_ablated"][0,-1:].softmax(-1) - ht_result_ablated[0,-1:].softmax(-1)).abs().max()
    
    print(f"\nMax clean difference: {clean_diff}")
    print(f"Max ablated difference: {ablated_diff}")
    
    # Cleanup
    del our_model
    del ht
    torch.cuda.empty_cache()
    print("\nCleanup complete")

except FileNotFoundError as e:
    print(f"\nFile not found error: {str(e)}")
    print("Please check that both the model file and its config exist in the configs directory")
except Exception as e:
    print(f"\nError loading model: {str(e)}")
    print("Stack trace:")
    import traceback
    traceback.print_exc()

In [None]:
# Test ablated model inference
print("Testing Ablated Model Inference...")

try:
    # Model paths
    model_path = 'configs/lbl_model_20241016.pt'
    
    # Load model
    print(f"\nLoading model from: {model_path}")
    model, config = load_model(
        model_path=model_path,
        device='cuda' if torch.cuda.is_available() else 'cpu',
        config_dir='configs'
    )
    model.eval()  # Set to evaluation mode
    
    # Create a simple test input
    print("\nPreparing test input...")
    test_input = torch.tensor([[1, 2, 3, 4, 5]], device=next(model.parameters()).device)  # Simple sequence
    
    # Run inference
    print("\nRunning inference...")
    with torch.no_grad():
        outputs = model(test_input)
    
    # Check outputs
    print("\nChecking outputs...")
    print("Output keys:", list(outputs.keys()))
    
    if "logits_clean" in outputs:
        print("\nClean logits shape:", outputs["logits_clean"].shape)
        print("Clean logits sample (first position, first 5 values):")
        print(outputs["logits_clean"][0, 0, :5])
        
    if "logits_ablated" in outputs:
        print("\nAblated logits shape:", outputs["logits_ablated"].shape)
        print("Ablated logits sample (first position, first 5 values):")
        print(outputs["logits_ablated"][0, 0, :5])
        
    if "attention_ablations" in outputs:
        print("\nAttention ablations shape:", outputs["attention_ablations"].shape)
        print("Attention ablations mean:", outputs["attention_ablations"].mean().item())
        
    if "neuron_ablations" in outputs:
        print("\nNeuron ablations shape:", outputs["neuron_ablations"].shape)
        print("Neuron ablations mean:", outputs["neuron_ablations"].mean().item())
    
    # Test text generation
    print("\nTesting text generation...")
    generated = model.generate(
        input_ids=test_input,
        max_new_tokens=5,
        temperature=0.7
    )
    print("Generated token IDs:", generated)
    
    # Cleanup
    del model
    torch.cuda.empty_cache()
    print("\nTest complete!")
    
except Exception as e:
    print(f"\nError during inference test: {str(e)}")
    print("Stack trace:")
    import traceback
    traceback.print_exc()

In [None]:
# Required imports
import torch
from model_loader import load_model
from utils.compatibility import convert_model_to_hooked_transformer, get_ablation_hooks_for_tl

print("Testing Full HookedTransformer Compatibility...")

try:
    # Load model
    model_path = 'configs/lbl_model_20241016.pt'
    print(f"\nLoading model from: {model_path}")
    our_model, config = load_model(
        model_path=model_path,
        device='cuda' if torch.cuda.is_available() else 'cpu',
        config_dir='configs'
    )
    
    # Convert to HookedTransformer
    print("\nConverting to HookedTransformer...")
    ht = convert_model_to_hooked_transformer(our_model)
    
    # Create test input
    test_input = torch.tensor([42, 123, 1001], device=next(our_model.parameters()).device, dtype=torch.long).unsqueeze(0)
    
    # Get outputs from both models
    print("\nRunning inference on both models...")
    our_output = our_model(test_input)
    
    # Get ablation hooks
    ablation_hooks = get_ablation_hooks_for_tl(our_output, -1, our_model.config)
    
    # Run HookedTransformer with caching and hooks
    print("\nRunning HookedTransformer with cache and hooks...")
    with ht.hooks(fwd_hooks=ablation_hooks):
        logits, cache = ht.run_with_cache(test_input)
    
    # Print activation shapes from cache
    print("\nCache activation shapes:")
    for key in sorted(cache.keys()):
        if 'hook_' in key:
            print(f"{key}: {cache[key].shape}")
    
    # Verify key activations match
    print("\nVerifying specific activations...")
    # Check MLP activations for a specific layer
    layer_idx = 0
    mlp_act_key = f'blocks.{layer_idx}.mlp.hook_post'
    print(f"\nLayer {layer_idx} MLP activations shape: {cache[mlp_act_key].shape}")
    
    # Check attention pattern for a specific layer
    attn_key = f'blocks.{layer_idx}.attn.hook_pattern'
    if attn_key in cache:
        print(f"Layer {layer_idx} attention pattern shape: {cache[attn_key].shape}")
    
    # Verify outputs match (comparing logits)
    print("\nVerifying outputs...")
    clean_diff = (our_output["logits_clean"][0,:-1].softmax(-1) - logits[0,:-1].softmax(-1)).abs().max()
    ablated_diff = (our_output["logits_ablated"][0,-1:].softmax(-1) - logits[0,-1:].softmax(-1)).abs().max()
    
    print(f"Max clean difference: {clean_diff}")
    print(f"Max ablated difference: {ablated_diff}")
    
    # Memory cleanup
    del our_model
    del ht
    torch.cuda.empty_cache()
    print("\nTest complete!")

except Exception as e:
    print(f"\nError during testing: {str(e)}")
    print("Stack trace:")
    import traceback
    traceback.print_exc()

In [None]:
# Single Neuron Analysis Debug Cell
print("Setting up single neuron debug analysis...")

# Model setup
lbl_model_path = 'configs/lbl_model_20241016.pt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"\nLoading model from {lbl_model_path}")
lbl_model, lbl_config = load_model(model_path=lbl_model_path, device=device)

# Initialize analyzer
lbl_analyzer = NeuronAnalyzer(lbl_model)

# Load activation data
lbl_activation_file = 'activation_results/neuron_activations_lbl.json'
print(f"\nLoading activation data from {lbl_activation_file}")
with open(lbl_activation_file, 'r') as f:
    lbl_data = json.load(f)

# Debug settings
debug_layer = 0
debug_neuron = 150  # Will analyze first neuron in layer
layer_name = f'transformer.h.{debug_layer}.mlp.c_fc'

print(f"\nAnalyzing:")
print(f"Layer: {debug_layer}")
print(f"Neuron: {debug_neuron}")

try:
    # Get neuron data
    neuron_data = lbl_data[layer_name]['neurons'][str(debug_neuron)]
    print(f"Found {len(neuron_data['examples'])} examples for this neuron")
    
    # Process just a few examples first
    num_debug_examples = 3
    debug_examples = neuron_data['examples'][:num_debug_examples]
    
    print(f"\nProcessing {num_debug_examples} examples:")
    for i, example in enumerate(debug_examples):
        print(f"\nExample {i+1}:")
        print(f"Sequence length: {len(example['sequence'])}")
        print(f"Pivot index: {example['pivot_index']}")
        print(f"Max activation: {max(example['activations'])}")
        
        try:
            processed = lbl_analyzer.process_single_example(
                text=example['sequence'],
                pivot_index=example['pivot_index'],
                original_activation=max(example['activations']),
                layer=debug_layer,
                neuron=debug_neuron
            )
            
            print("\nProcessed successfully!")
            print(f"Activating token: {processed.activating_token}")
            print(f"Context tokens: {processed.context_tokens}")
            print(f"Activation ratio: {processed.activation_ratio:.3f}")
            
        except Exception as e:
            print(f"Error processing example: {str(e)}")
            import traceback
            traceback.print_exc()
    
    # If examples processed successfully, try building graph
    print("\nAttempting to build graph...")
    graph = lbl_analyzer.build_graph(
        layer=debug_layer,
        neuron=debug_neuron,
        examples=debug_examples
    )
    
    print("\nGraph statistics:")
    print(f"Nodes: {len(graph.nodes)}")
    print(f"Edges: {len(graph.edges)}")
    
    # Save debug graph
    debug_output_dir = "debug_neuron_graphs"
    os.makedirs(debug_output_dir, exist_ok=True)
    lbl_analyzer.save_graph(
        graph=graph,
        layer=debug_layer,
        neuron=debug_neuron,
        output_dir=debug_output_dir
    )
    print(f"\nDebug graph saved to {debug_output_dir}")

except Exception as e:
    print(f"\nError in analysis: {str(e)}")
    import traceback
    traceback.print_exc()

finally:
    # Cleanup
    print("\nCleaning up...")
    del lbl_model
    torch.cuda.empty_cache()

print("\nDebug analysis complete!")

In [None]:
#HERE
print("Processing LBL Model (Single Layer Debug)...")

try:
    lbl_model_path = 'configs/lbl_model_20241016.pt'
    lbl_activation_file = 'activation_results/neuron_activations_lbl.json'

    # Load model
    lbl_model, lbl_config = load_model(
        model_path=lbl_model_path,
        device='cuda' if torch.cuda.is_available() else 'cpu'
    )

    # Initialize analyzer
    lbl_analyzer = NeuronAnalyzer(lbl_model)

    # Load activation data
    lbl_data = lbl_analyzer.load_activation_data(lbl_activation_file)

    # Process single layer
    lbl_graphs = {}
    debug_layer = 1
    print(f"\nAnalyzing layer {debug_layer}")
    layer_graphs = lbl_analyzer.analyze_layer(
        activation_data=lbl_data,
        layer=debug_layer,
        save_graphs=True,
        output_dir=f"neuron_graphs/lbl_model/layer_{debug_layer}"
    )
    lbl_graphs[debug_layer] = layer_graphs

    # Cleanup
    del lbl_model
    torch.cuda.empty_cache()

except Exception as e:
    print(f"Error in analysis: {str(e)}")
    print("Stack trace:")
    traceback.print_exc()

In [None]:
# Import required dependencies 
from model_loader import load_model
from neuron_graph_builder import NeuronAnalyzer
import torch
import traceback

print("Testing Ablated Model Processing...")
try:
    # Load model and config
    model_path = 'configs/lbl_model_20241016.pt'  # Adjust path as needed
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model, config = load_model(
        model_path=model_path,
        device=device
    )
    print("Model loaded successfully")
    
    # Initialize analyzer
    analyzer = NeuronAnalyzer(model)
    print("Analyzer initialized")
    
    # Test single example processing
    test_layer = 0
    test_neuron = 0
    test_text = "The quick brown fox jumps over the lazy dog."
    print(f"\nTesting with:")
    print(f"Layer: {test_layer}")
    print(f"Neuron: {test_neuron}")
    print(f"Text: {test_text}")
    
    try:
        print("\nTesting pruning and importance measurement...")
        processed = analyzer.process_single_example(
            text=test_text,
            pivot_index=5,  # Test with a known position
            original_activation=1.0,  # Test activation value
            layer=test_layer,
            neuron=test_neuron
        )
        
        print("\nResults:")
        print(f"Pruned sequence: {processed.pruned_sequence}")
        print(f"Activating token: {processed.activating_token}")
        print(f"Number of context tokens: {len(processed.context_tokens)}")
        print(f"Context tokens: {processed.context_tokens}")
        print(f"Activation ratio: {processed.activation_ratio:.3f}")
        
        print("\nTest completed successfully!")
        
    except Exception as e:
        print(f"\nError in example processing: {str(e)}")
        print("Stack trace:")
        traceback.print_exc()
    
    # Cleanup
    del model
    torch.cuda.empty_cache()
    
except Exception as e:
    print(f"Error in setup: {str(e)}")
    print("Stack trace:")
    traceback.print_exc()

In [None]:
print("Testing Ablated Model Processing with Real Data...")
try:
    # Load model and config
    model_path = 'configs/lbl_model_20241016.pt'
    activation_file = 'activation_results/neuron_activations_lbl.json'
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    
    # Load model
    model, config = load_model(
        model_path=model_path,
        device=device
    )
    print("Model loaded successfully")
    
    # Initialize analyzer
    analyzer = NeuronAnalyzer(model)
    print("Analyzer initialized")
    
    # Load activation data
    data = analyzer.load_activation_data(activation_file)
    print("Activation data loaded")
    
    # Get example from actual data
    debug_layer = 0
    layer_name = f'transformer.h.{debug_layer}.mlp.c_fc'
    
    # Get first neuron data
    first_neuron_id = list(data[layer_name]['neurons'].keys())[0]
    neuron_data = data[layer_name]['neurons'][first_neuron_id]
    
    # Get eleventh example (long)
    example = neuron_data['examples'][10]
    # After getting the example
    #print("Raw example data:")
    #print(example)
    #print("\nAll available examples:")
    #for i, ex in enumerate(neuron_data['examples']):
    #    print(f"Example {i}: Length = {len(ex['sequence'])}, First few tokens = {ex['sequence'][:5]}")
    
    print(f"\nTesting with real data:")
    print(f"Layer: {debug_layer}")
    print(f"Neuron: {first_neuron_id}")
    print(f"Original sequence length: {len(example['sequence'])}")
    print(f"Original pivot index: {example['pivot_index']}")
    print(f"Original activation: {max(example['activations'])}")
    
    try:
        print("\nProcessing example...")
        processed = analyzer.process_single_example(
            text=example['sequence'],
            pivot_index=example['pivot_index'],
            original_activation=max(example['activations']),
            layer=debug_layer,
            neuron=int(first_neuron_id)
        )
        
        print("\nResults:")
        print(f"Pruned sequence: {processed.pruned_sequence}")
        print(f"Activating token: {processed.activating_token}")
        print(f"Number of context tokens: {len(processed.context_tokens)}")
        print(f"Context tokens: {processed.context_tokens}")
        print(f"Activation value: {processed.activation_value}")
        print(f"Activation ratio: {processed.activation_ratio:.3f}")
        
    except Exception as e:
        print(f"\nError in example processing: {str(e)}")
        print("Stack trace:")
        traceback.print_exc()
    
    # Cleanup
    del model
    torch.cuda.empty_cache()
    
except Exception as e:
    print(f"Error in setup: {str(e)}")
    print("Stack trace:")
    traceback.print_exc()

## One Last Test!

In [None]:
import torch
from dataset import create_dataloader
from model_loader import load_model
from utils.compatibility import convert_model_to_hooked_transformer, get_ablation_hooks_for_tl
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os

# Set tokenizer parallelism off
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# 1. Load model
print("Loading model...")
model_path = 'configs/lbl_model_20241016.pt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model, config = load_model(model_path, device)
model.eval()  # Ensure we're in eval mode
print("Model loaded")

# 2. Simple dataloader
print("\nCreating dataloader...")
dataloader = create_dataloader(
    data_path='validation.bin',
    block_size=256,
    batch_size=1,
    num_workers=0  # No multiprocessing for now
)
print("Dataloader created")

# 3. Collect activation statistics
print("\nCollecting activation statistics...")
attention_ablations = []
neuron_ablations = []
num_samples = 20  # Reduced sample size

with torch.no_grad():  # Add no_grad context
    for i, (input_ids, _) in enumerate(dataloader):
        if i >= num_samples:
            break
            
        print(f"Processing sample {i+1}/{num_samples}")
        input_ids = input_ids.to(device)
        
        # Get model outputs
        outputs = model(input_ids)
        
        # Collect ablation values (with detach)
        if 'attention_ablations' in outputs:
            attn_vals = outputs['attention_ablations'].detach().abs().cpu().numpy()
            attention_ablations.extend(attn_vals.flatten())
            print(f"Attention ablation shape: {outputs['attention_ablations'].shape}")
        
        if 'neuron_ablations' in outputs:
            neuron_vals = outputs['neuron_ablations'].detach().abs().cpu().numpy()
            neuron_ablations.extend(neuron_vals.flatten())
            print(f"Neuron ablation shape: {outputs['neuron_ablations'].shape}")

print("\nGenerating visualizations...")

# 4. Visualize results
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
fig.suptitle('Ablation Value Analysis', fontsize=14)

# Attention ablations
if attention_ablations:
    sns.histplot(data=attention_ablations, bins=50, ax=axes[0,0])
    axes[0,0].set_title('Attention Ablation Distribution')
    axes[0,0].set_yscale('log')
    axes[0,0].set_xlabel('Ablation Value')

# Neuron ablations
if neuron_ablations:
    sns.histplot(data=neuron_ablations, bins=50, ax=axes[0,1])
    axes[0,1].set_title('Neuron Ablation Distribution')
    axes[0,1].set_yscale('log')
    axes[0,1].set_xlabel('Ablation Value')

# Print summary statistics
summary_text = "Ablation Statistics:\n\n"

stats = {
    'Attention Ablations': np.array(attention_ablations) if attention_ablations else None,
    'Neuron Ablations': np.array(neuron_ablations) if neuron_ablations else None
}

for name, values in stats.items():
    if values is not None:
        summary_text += f"{name}:\n"
        summary_text += f"  Mean: {np.mean(values):.6f}\n"
        summary_text += f"  Median: {np.median(values):.6f}\n"
        summary_text += f"  Std: {np.std(values):.6f}\n"
        summary_text += f"  Max: {np.max(values):.6f}\n"
        summary_text += f"  95th pct: {np.percentile(values, 95):.6f}\n\n"
        
        # Print to console also
        print(f"\n{name}:")
        print(f"  Mean: {np.mean(values):.6f}")
        print(f"  Median: {np.median(values):.6f}")
        print(f"  95th percentile: {np.percentile(values, 95):.6f}")

axes[1,0].text(0.05, 0.95, summary_text,
               transform=axes[1,0].transAxes,
               verticalalignment='top',
               fontsize=10,
               family='monospace')
axes[1,0].axis('off')

# Add recommendations
recommendations = "Recommended Thresholds:\n\n"
for name, values in stats.items():
    if values is not None:
        median = np.median(values)
        p95 = np.percentile(values, 95)
        recommendations += f"{name}:\n"
        recommendations += f"  Standard: {median:.4f} (median)\n"
        recommendations += f"  Conservative: {p95:.4f} (95th)\n\n"

axes[1,1].text(0.05, 0.95, recommendations,
               transform=axes[1,1].transAxes,
               verticalalignment='top',
               fontsize=10,
               family='monospace')
axes[1,1].axis('off')

plt.tight_layout()
plt.show()

In [None]:
import torch
from dataset import create_dataloader
from model_loader import load_model
from utils.compatibility import convert_model_to_hooked_transformer, get_ablation_hooks_for_tl
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os

# Set tokenizer parallelism off
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# 1. Load both models
print("Loading models...")
model_path = 'configs/lbl_model_20241016.pt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model, config = load_model(model_path, device)
model.eval()

# Convert to HookedTransformer
ht_model = convert_model_to_hooked_transformer(model)
ht_model.eval()
print("Models loaded")

# 2. Create dataloader
print("\nCreating dataloader...")
dataloader = create_dataloader(
    data_path='validation.bin',
    block_size=256,
    batch_size=1,
    num_workers=0
)
print("Dataloader created")

# 3. Collect statistics
print("\nCollecting activation statistics...")
stats = {
    'attention_ablations': [],
    'neuron_ablations': [], 
    'hooked_attention': [],
    'hooked_neurons': []
}
num_samples = 20

with torch.no_grad():
    for i, (input_ids, _) in enumerate(dataloader):
        if i >= num_samples:
            break
            
        print(f"Processing sample {i+1}/{num_samples}")
        input_ids = input_ids.to(device)
        
        # Get original model outputs
        outputs = model(input_ids)
        
        # Collect ablation values
        if 'attention_ablations' in outputs:
            attn_vals = outputs['attention_ablations'].detach().abs().cpu().numpy()
            stats['attention_ablations'].extend(attn_vals.flatten())
            print(f"Attention ablation shape: {outputs['attention_ablations'].shape}")
        
        if 'neuron_ablations' in outputs:
            neuron_vals = outputs['neuron_ablations'].detach().abs().cpu().numpy()
            stats['neuron_ablations'].extend(neuron_vals.flatten())
            print(f"Neuron ablation shape: {outputs['neuron_ablations'].shape}")
        
        # Process with HookedTransformer
        ablation_hooks = get_ablation_hooks_for_tl(outputs, -1, model.config)
        with ht_model.hooks(fwd_hooks=ablation_hooks):
            _, cache = ht_model.run_with_cache(input_ids)
            
            # Collect attention and neuron values from each layer
            for layer in range(config.num_layers):
                # Get attention outputs (z)
                attn_out = cache[f'blocks.{layer}.attn.hook_z'][0].detach().abs().cpu().numpy()
                stats['hooked_attention'].extend(attn_out.flatten())
                
                # Get MLP outputs (after ablation)
                mlp_out = cache[f'blocks.{layer}.mlp.hook_post'][0].detach().abs().cpu().numpy()
                stats['hooked_neurons'].extend(mlp_out.flatten())
                
        print(f"HookedTransformer shapes - Attention: {attn_out.shape}, Neurons: {mlp_out.shape}")

print("\nGenerating visualizations...")

# 4. Visualize results
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
fig.suptitle('Ablation Analysis: Original vs HookedTransformer', fontsize=14)

# Plot distributions
plots = [
    ('attention_ablations', 'Attention Ablation Distribution (Original)', axes[0,0]),
    ('neuron_ablations', 'Neuron Ablation Distribution (Original)', axes[0,1]),
    ('hooked_attention', 'Attention Values (HookedTransformer)', axes[1,0]),
    ('hooked_neurons', 'Neuron Values (HookedTransformer)', axes[1,1])
]

for key, title, ax in plots:
    if stats[key]:
        sns.histplot(data=stats[key], bins=50, ax=ax)
        ax.set_title(title)
        ax.set_yscale('log')
        ax.set_xlabel('Value')

plt.tight_layout()
plt.show()

# Print statistics
print("\nStatistics Summary:")
for name, values in stats.items():
    if values:
        values = np.array(values)
        print(f"\n{name}:")
        print(f"  Mean: {np.mean(values):.6f}")
        print(f"  Median: {np.median(values):.6f}")
        print(f"  Std: {np.std(values):.6f}")
        print(f"  Max: {np.max(values):.6f}")
        print(f"  95th pct: {np.percentile(values, 95):.6f}")

# Plot additional comparison
fig, ax = plt.subplots(figsize=(10, 6))
labels = []
values = []

for name, data in stats.items():
    if data:
        labels.append(name)
        values.append(np.array(data))

plt.boxplot(values, labels=labels)
plt.xticks(rotation=45)
plt.title('Value Distribution Comparison')
plt.ylabel('Value')
plt.tight_layout()
plt.show()

In [None]:
import torch
from dataset import create_dataloader
from model_loader import load_model
from utils.compatibility import convert_model_to_hooked_transformer, get_ablation_hooks_for_tl
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os

# Set tokenizer parallelism off
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# 1. Load both models
print("Loading models...")
model_path = 'configs/lbl_model_20241016.pt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model, config = load_model(model_path, device)
model.eval()

# Convert to HookedTransformer
ht_model = convert_model_to_hooked_transformer(model)
ht_model.eval()
print("Models loaded")

# 2. Create dataloader
print("\nCreating dataloader...")
dataloader = create_dataloader(
    data_path='validation.bin',
    block_size=256,
    batch_size=1,
    num_workers=0
)
print("Dataloader created")

# 3. Collect statistics
print("\nCollecting activation statistics...")
stats = {
    'original_ablations': [],
    'hooked_ablated_values': []
}
num_samples = 20

with torch.no_grad():
    for i, (input_ids, _) in enumerate(dataloader):
        if i >= num_samples:
            break
            
        print(f"Processing sample {i+1}/{num_samples}")
        input_ids = input_ids.to(device)
        
        # Get original model outputs with ablation masks
        outputs = model(input_ids)
        
        # Get original ablation masks
        if 'neuron_ablations' in outputs:
            neuron_vals = outputs['neuron_ablations'].detach().abs().cpu().numpy()
            stats['original_ablations'].extend(neuron_vals.flatten())
            print(f"Original ablation shape: {outputs['neuron_ablations'].shape}")
        
        # Process with HookedTransformer using same ablation masks
        ablation_hooks = get_ablation_hooks_for_tl(outputs, -1, model.config)
        
        with ht_model.hooks(fwd_hooks=ablation_hooks):
            _, cache = ht_model.run_with_cache(input_ids)
            
            # Collect MLP outputs after ablation hooks
            for layer in range(config.num_layers):
                # Get post-ablation MLP values
                mlp_out = cache[f'blocks.{layer}.mlp.hook_post'][0].detach().abs().cpu().numpy()
                stats['hooked_ablated_values'].extend(mlp_out.flatten())
                print(f"Layer {layer} HookedTransformer post-ablation shape: {mlp_out.shape}")

print("\nGenerating visualizations...")

# 4. Visualize results
fig, axes = plt.subplots(1, 2, figsize=(15, 6))
fig.suptitle('Ablation Analysis: Original vs HookedTransformer (Post-Ablation)', fontsize=14)

# Plot distributions
plots = [
    ('original_ablations', 'Original Model Ablation Masks', axes[0]),
    ('hooked_ablated_values', 'HookedTransformer Post-Ablation Values', axes[1])
]

for key, title, ax in plots:
    if stats[key]:
        sns.histplot(data=stats[key], bins=50, ax=ax)
        ax.set_title(title)
        ax.set_yscale('log')
        ax.set_xlabel('Value')

plt.tight_layout()
plt.show()

# Print statistics
print("\nStatistics Summary:")
for name, values in stats.items():
    if values:
        values = np.array(values)
        print(f"\n{name}:")
        print(f"  Mean: {np.mean(values):.6f}")
        print(f"  Median: {np.median(values):.6f}")
        print(f"  Std: {np.std(values):.6f}")
        print(f"  Max: {np.max(values):.6f}")
        print(f"  95th pct: {np.percentile(values, 95):.6f}")

# Plot value comparison
fig, ax = plt.subplots(figsize=(10, 6))
boxplot_data = [np.array(stats[key]) for key in stats if stats[key]]
plt.boxplot(boxplot_data, labels=list(stats.keys()))
plt.xticks(rotation=45)
plt.title('Value Distribution Comparison')
plt.ylabel('Value')
plt.tight_layout()
plt.show()

In [None]:
import torch
from dataset import create_dataloader
from model_loader import load_model
from utils.compatibility import convert_model_to_hooked_transformer, get_ablation_hooks_for_tl
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os

def analyze_hooked_transformer(model_path: str, num_samples: int = 20):
    """Analyze activations and ablations using HookedTransformer"""
    
    print("Loading models...")
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model, config = load_model(model_path, device)
    model.eval()

    ht_model = convert_model_to_hooked_transformer(model)
    ht_model.eval()
    print("Models loaded")

    dataloader = create_dataloader(
        data_path='validation.bin',
        block_size=256,
        batch_size=1,
        num_workers=0
    )

    stats = {
        'pre_ablation_activations': [],   # Values before ablation
        'post_ablation_activations': [],  # Values after ablation
    }

    with torch.no_grad():
        for i, (input_ids, _) in enumerate(dataloader):
            if i >= num_samples:
                break
                
            print(f"Processing sample {i+1}/{num_samples}")
            input_ids = input_ids.to(device)
            
            # Get ablation hooks from original model
            output = model(input_ids)
            ablation_hooks = get_ablation_hooks_for_tl(output, -1, model.config)
            
            # Run with ablation hooks
            with ht_model.hooks(fwd_hooks=ablation_hooks):
                _, cache = ht_model.run_with_cache(
                    input_ids,
                    names_filter=lambda name: 'mlp.hook_' in name
                )
                
                # Collect pre and post ablation values
                for layer in range(config.num_layers):
                    # Pre-ablation
                    pre_acts = cache[f'blocks.{layer}.mlp.hook_pre'][0]
                    stats['pre_ablation_activations'].extend(
                        pre_acts.abs().cpu().numpy().flatten()
                    )
                    
                    # Post-ablation 
                    post_acts = cache[f'blocks.{layer}.mlp.hook_post'][0]
                    stats['post_ablation_activations'].extend(
                        post_acts.abs().cpu().numpy().flatten()
                    )

    # Visualize results
    fig, axes = plt.subplots(2, 1, figsize=(12, 10))
    fig.suptitle('HookedTransformer Ablation Analysis', fontsize=14)

    plots = [
        ('pre_ablation_activations', 'Pre-Ablation Activations (hook_pre)'),
        ('post_ablation_activations', 'Post-Ablation Activations (hook_post)')
    ]

    for (key, title), ax in zip(plots, axes):
        if stats[key]:
            sns.histplot(data=stats[key], bins=50, ax=ax)
            ax.set_title(title)
            ax.set_yscale('log')
            ax.set_xlabel('Value')
        else:
            print(f"Warning: No data collected for {key}")

    plt.tight_layout()
    plt.show()

    # Print statistics
    print("\nStatistics Summary:")
    for name, values in stats.items():
        if values:
            values = np.array(values)
            print(f"\n{name}:")
            print(f"  Mean: {np.mean(values):.6f}")
            print(f"  Median: {np.median(values):.6f}")
            print(f"  Std: {np.std(values):.6f}")
            print(f"  Max: {np.max(values):.6f}")
            print(f"  95th pct: {np.percentile(values, 95):.6f}")
            print(f"  # Values: {len(values)}")
        else:
            print(f"\n{name}: No data collected")

    # Calculate and print ablation effect
    if stats['pre_ablation_activations'] and stats['post_ablation_activations']:
        pre_vals = np.array(stats['pre_ablation_activations'])
        post_vals = np.array(stats['post_ablation_activations'])
        print("\nAblation Effect:")
        print(f"  Average activation reduction: {(1 - np.mean(post_vals)/np.mean(pre_vals))*100:.2f}%")
        print(f"  Median activation reduction: {(1 - np.median(post_vals)/np.median(pre_vals))*100:.2f}%")

    return stats

if __name__ == "__main__":
    model_path = 'configs/lbl_model_20241016.pt'
    stats = analyze_hooked_transformer(model_path)

## Processing Global Model

In [None]:
print("Processing Global Model...")
global_model_path = 'global_model_20241017.pt'
global_activation_file = 'activation_results/neuron_activations_global.json'

# Load model
global_model, global_config = load_model(
    model_path=global_model_path,
    device='cuda' if torch.cuda.is_available() else 'cpu'
)

# Initialize analyzer 
global_analyzer = NeuronAnalyzer(global_model)

# Load activation data
global_data = global_analyzer.load_activation_data(global_activation_file)

# Process each layer
global_graphs = {}
for layer in range(global_config.num_layers):
    print(f"\nAnalyzing layer {layer}")
    layer_graphs = global_analyzer.analyze_layer(
        activation_data=global_data,
        layer=layer,
        save_graphs=True,
        output_dir=f"neuron_graphs/global_model/layer_{layer}"
    )
    global_graphs[layer] = layer_graphs

# Cleanup
del global_model
torch.cuda.empty_cache()

# Graph Analysis

## Base Model

In [1]:
import os
import json
import numpy as np
from collections import defaultdict

In [4]:
base_dir = "neuron_graphs/base_model"
stats = defaultdict(lambda: defaultdict(list))

# For each layer (0-7)
for layer in range(8):
    layer_dir = os.path.join(base_dir, f"layer_{layer}")
    
    # For neurons 0-199 
    for neuron in range(200):
        graph_path = os.path.join(layer_dir, f"l{layer}_n{neuron}_graph.json")
        
        if os.path.exists(graph_path):
            with open(graph_path, 'r') as f:
                graph = json.load(f)
                
            # Collect statistics
            stats[layer]['num_nodes'].append(len(graph['nodes']))
            stats[layer]['num_edges'].append(len(graph.get('edges', {})))
            
            # Count activating nodes and get activation values
            activating_nodes = 0
            activation_values = []
            importance_values = []
            
            for node, data in graph['nodes'].items():
                if data.get('is_activating', False):
                    activating_nodes += 1
                    if 'activation' in data:
                        activation_values.append(data['activation'])
                elif 'importance' in data:
                    importance_values.append(data['importance'])
            
            stats[layer]['num_activating'].append(activating_nodes)
            if activation_values:
                stats[layer]['avg_activation'].append(np.mean(activation_values))
            if importance_values:
                stats[layer]['avg_importance'].append(np.mean(importance_values))

In [None]:
print("Layer Statistics:")
for layer in sorted(stats.keys()):
    print(f"\nLayer {layer}:")
    
    # Basic stats
    n_neurons = len(stats[layer]['num_nodes'])
    print(f"Neurons processed: {n_neurons}")
    
    # Print averages with standard deviations
    for metric in ['num_nodes', 'num_edges', 'num_activating']:
        values = stats[layer][metric]
        print(f"Average {metric}: {np.mean(values):.2f} ± {np.std(values):.2f}")
    
    # Print activation and importance stats
    if stats[layer]['avg_activation']:
        print(f"Average activation: {np.mean(stats[layer]['avg_activation']):.4f}")
    if stats[layer]['avg_importance']:
        print(f"Average importance: {np.mean(stats[layer]['avg_importance']):.4f}")

In [None]:
import matplotlib.pyplot as plt

# Set figure size
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
fig.suptitle('Layer-wise Graph Statistics', fontsize=16)

# Plot average nodes per layer
layer_nums = sorted(stats.keys())
avg_nodes = [np.mean(stats[layer]['num_nodes']) for layer in layer_nums]
std_nodes = [np.std(stats[layer]['num_nodes']) for layer in layer_nums]
axes[0,0].errorbar(layer_nums, avg_nodes, yerr=std_nodes, marker='o')
axes[0,0].set_title('Average Number of Nodes per Layer')
axes[0,0].set_xlabel('Layer')
axes[0,0].set_ylabel('Number of Nodes')
axes[0,0].grid(True)

# Plot average edges per layer
avg_edges = [np.mean(stats[layer]['num_edges']) for layer in layer_nums]
std_edges = [np.std(stats[layer]['num_edges']) for layer in layer_nums]
axes[0,1].errorbar(layer_nums, avg_edges, yerr=std_edges, marker='o')
axes[0,1].set_title('Average Number of Edges per Layer')
axes[0,1].set_xlabel('Layer')
axes[0,1].set_ylabel('Number of Edges')
axes[0,1].grid(True)

# Plot average activating nodes
avg_act = [np.mean(stats[layer]['num_activating']) for layer in layer_nums]
std_act = [np.std(stats[layer]['num_activating']) for layer in layer_nums]
axes[1,0].errorbar(layer_nums, avg_act, yerr=std_act, marker='o')
axes[1,0].set_title('Average Number of Activating Nodes')
axes[1,0].set_xlabel('Layer')
axes[1,0].set_ylabel('Number of Activating Nodes')
axes[1,0].grid(True)

# Plot average activation values
avg_actval = [np.mean(stats[layer]['avg_activation']) for layer in layer_nums]
std_actval = [np.std(stats[layer]['avg_activation']) for layer in layer_nums]
axes[1,1].errorbar(layer_nums, avg_actval, yerr=std_actval, marker='o')
axes[1,1].set_title('Average Activation Values')
axes[1,1].set_xlabel('Layer')
axes[1,1].set_ylabel('Activation Value')
axes[1,1].grid(True)

plt.tight_layout()
plt.show()

## Ablated Model

In [7]:
base_dir = "neuron_graphs/lbl"
stats = defaultdict(lambda: defaultdict(list))

# For each layer (0-7)
for layer in range(8):
    layer_dir = os.path.join(base_dir, f"layer_{layer}")
    
    # For neurons 0-199 
    for neuron in range(200):
        graph_path = os.path.join(layer_dir, f"l{layer}_n{neuron}_graph.json")
        
        if os.path.exists(graph_path):
            with open(graph_path, 'r') as f:
                graph = json.load(f)
                
            # Collect statistics
            stats[layer]['num_nodes'].append(len(graph['nodes']))
            stats[layer]['num_edges'].append(len(graph.get('edges', {})))
            
            # Count activating nodes and get activation values
            activating_nodes = 0
            activation_values = []
            importance_values = []
            
            for node, data in graph['nodes'].items():
                if data.get('is_activating', False):
                    activating_nodes += 1
                    if 'activation' in data:
                        activation_values.append(data['activation'])
                elif 'importance' in data:
                    importance_values.append(data['importance'])
            
            stats[layer]['num_activating'].append(activating_nodes)
            if activation_values:
                stats[layer]['avg_activation'].append(np.mean(activation_values))
            if importance_values:
                stats[layer]['avg_importance'].append(np.mean(importance_values))

In [None]:
print("Layer Statistics:")
for layer in sorted(stats.keys()):
    print(f"\nLayer {layer}:")
    
    # Basic stats
    n_neurons = len(stats[layer]['num_nodes'])
    print(f"Neurons processed: {n_neurons}")
    
    # Print averages with standard deviations
    for metric in ['num_nodes', 'num_edges', 'num_activating']:
        values = stats[layer][metric]
        print(f"Average {metric}: {np.mean(values):.2f} ± {np.std(values):.2f}")
    
    # Print activation and importance stats
    if stats[layer]['avg_activation']:
        print(f"Average activation: {np.mean(stats[layer]['avg_activation']):.4f}")
    if stats[layer]['avg_importance']:
        print(f"Average importance: {np.mean(stats[layer]['avg_importance']):.4f}")

In [None]:
import matplotlib.pyplot as plt

# Set figure size
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
fig.suptitle('Layer-wise Graph Statistics', fontsize=16)

# Plot average nodes per layer
layer_nums = sorted(stats.keys())
avg_nodes = [np.mean(stats[layer]['num_nodes']) for layer in layer_nums]
std_nodes = [np.std(stats[layer]['num_nodes']) for layer in layer_nums]
axes[0,0].errorbar(layer_nums, avg_nodes, yerr=std_nodes, marker='o')
axes[0,0].set_title('Average Number of Nodes per Layer')
axes[0,0].set_xlabel('Layer')
axes[0,0].set_ylabel('Number of Nodes')
axes[0,0].grid(True)

# Plot average edges per layer
avg_edges = [np.mean(stats[layer]['num_edges']) for layer in layer_nums]
std_edges = [np.std(stats[layer]['num_edges']) for layer in layer_nums]
axes[0,1].errorbar(layer_nums, avg_edges, yerr=std_edges, marker='o')
axes[0,1].set_title('Average Number of Edges per Layer')
axes[0,1].set_xlabel('Layer')
axes[0,1].set_ylabel('Number of Edges')
axes[0,1].grid(True)

# Plot average activating nodes
avg_act = [np.mean(stats[layer]['num_activating']) for layer in layer_nums]
std_act = [np.std(stats[layer]['num_activating']) for layer in layer_nums]
axes[1,0].errorbar(layer_nums, avg_act, yerr=std_act, marker='o')
axes[1,0].set_title('Average Number of Activating Nodes')
axes[1,0].set_xlabel('Layer')
axes[1,0].set_ylabel('Number of Activating Nodes')
axes[1,0].grid(True)

# Plot average activation values
avg_actval = [np.mean(stats[layer]['avg_activation']) for layer in layer_nums]
std_actval = [np.std(stats[layer]['avg_activation']) for layer in layer_nums]
axes[1,1].errorbar(layer_nums, avg_actval, yerr=std_actval, marker='o')
axes[1,1].set_title('Average Activation Values')
axes[1,1].set_xlabel('Layer')
axes[1,1].set_ylabel('Activation Value')
axes[1,1].grid(True)

plt.tight_layout()
plt.show()

## Comparison

In [14]:
def collect_model_stats(model_dir):
    stats = defaultdict(lambda: defaultdict(list))
    
    # For each layer (0-7)
    for layer in range(8):
        layer_dir = os.path.join(model_dir, f"layer_{layer}")
        
        # For neurons 0-199 
        for neuron in range(200):
            graph_path = os.path.join(layer_dir, f"l{layer}_n{neuron}_graph.json")
            
            if os.path.exists(graph_path):
                with open(graph_path, 'r') as f:
                    graph = json.load(f)
                    
                # Collect statistics
                stats[layer]['num_nodes'].append(len(graph['nodes']))
                stats[layer]['num_edges'].append(len(graph.get('edges', {})))
                
                # Count activating nodes and get activation values
                activating_nodes = 0
                activation_values = []
                importance_values = []
                
                for node, data in graph['nodes'].items():
                    if data.get('is_activating', False):
                        activating_nodes += 1
                        if 'activation' in data:
                            activation_values.append(data['activation'])
                    elif 'importance' in data:
                        importance_values.append(data['importance'])
                
                stats[layer]['num_activating'].append(activating_nodes)
                if activation_values:
                    stats[layer]['avg_activation'].append(np.mean(activation_values))
                if importance_values:
                    stats[layer]['avg_importance'].append(np.mean(importance_values))
    
    return stats

In [15]:
# Collect stats for both models
lbl_stats = collect_model_stats("neuron_graphs/lbl")
base_stats = collect_model_stats("neuron_graphs/base_model")

In [None]:
import matplotlib.pyplot as plt

# Create figure with 2x2 subplots
fig, axes = plt.subplots(2, 2, figsize=(20, 16))
fig.suptitle('Comparative Layer-wise Statistics: Base Model vs LBL Model', fontsize=16)

# Plot data
layer_nums = sorted(lbl_stats.keys())
metrics = {
    (0,0): ('num_nodes', 'Average Number of Nodes'),
    (0,1): ('num_edges', 'Average Number of Edges'),
    (1,0): ('num_activating', 'Average Number of Activating Nodes'),
    (1,1): ('avg_activation', 'Average Activation Values')
}

for (i,j), (metric, title) in metrics.items():
    # Base model
    avg_base = [np.mean(base_stats[layer][metric]) for layer in layer_nums]
    std_base = [np.std(base_stats[layer][metric]) for layer in layer_nums]
    
    # LBL model
    avg_lbl = [np.mean(lbl_stats[layer][metric]) for layer in layer_nums]
    std_lbl = [np.std(lbl_stats[layer][metric]) for layer in layer_nums]
    
    # Plot both models
    axes[i,j].errorbar(layer_nums, avg_base, yerr=std_base, marker='o', label='Base Model', 
                      color='blue', capsize=5)
    axes[i,j].errorbar(layer_nums, avg_lbl, yerr=std_lbl, marker='s', label='LBL Model', 
                      color='red', capsize=5)
    
    axes[i,j].set_title(title)
    axes[i,j].set_xlabel('Layer')
    axes[i,j].set_ylabel(title.split()[-1])
    axes[i,j].grid(True)
    axes[i,j].legend()

plt.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Create two separate figures
fig_base, axes_base = plt.subplots(2, 2, figsize=(20, 16))
fig_lbl, axes_lbl = plt.subplots(2, 2, figsize=(20, 16))

fig_base.suptitle('Base Model Distribution Analysis', fontsize=16, y=1.02)
fig_lbl.suptitle('LBL Model Distribution Analysis', fontsize=16, y=1.02)

layer_nums = range(8)  # 8 layers (0-7)
metrics = {
    (0,0): ('num_nodes', 'Average Number of Nodes'),
    (0,1): ('num_edges', 'Average Number of Edges'),
    (1,0): ('num_activating', 'Average Number of Activating Nodes'),
    (1,1): ('avg_activation', 'Average Activation Values')
}

# Helper function to create boxplot
def create_model_boxplots(axes, stats, model_name, color):
    for (i,j), (metric, title) in metrics.items():
        # Prepare data
        data = [stats[layer][metric] for layer in layer_nums]
        
        # Create boxplot
        bp = axes[i,j].boxplot(data,
                              patch_artist=True,
                              showfliers=True,
                              medianprops=dict(color="black", linewidth=2),
                              boxprops=dict(facecolor=color, alpha=0.8, linewidth=1.5),
                              whiskerprops=dict(linewidth=1.5),
                              capprops=dict(linewidth=1.5),
                              flierprops=dict(marker='o', markerfacecolor=color, markersize=6, alpha=0.6))
        
        # Use log scale for nodes and edges
        if metric in ['num_nodes', 'num_edges']:
            axes[i,j].set_yscale('log')
        
        # Customize plot
        axes[i,j].yaxis.grid(True, linestyle='--', alpha=0.7)
        axes[i,j].set_axisbelow(True)
        
        axes[i,j].set_title(f"{title} - {model_name}", pad=20)
        axes[i,j].set_xlabel('Layer', labelpad=10)
        axes[i,j].set_ylabel(title.split()[-1], labelpad=10)
        
        # Set x-ticks
        axes[i,j].set_xticks(range(1, len(layer_nums) + 1))
        axes[i,j].set_xticklabels([f'Layer {l}' for l in layer_nums])
        
        # Add summary statistics as text
        stats_text = f"Mean: {np.mean([np.mean(d) for d in data]):.2f}\n"
        stats_text += f"Median: {np.mean([np.median(d) for d in data]):.2f}\n"
        stats_text += f"Max: {np.max([np.max(d) for d in data]):.2f}"
        axes[i,j].text(0.02, 0.98, stats_text,
                      transform=axes[i,j].transAxes,
                      verticalalignment='top',
                      bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))

# Create plots for each model
create_model_boxplots(axes_base, base_stats, 'Base Model', 'lightblue')
create_model_boxplots(axes_lbl, lbl_stats, 'LBL Model', 'lightcoral')

# Adjust layout
plt.figure(fig_base.number)
plt.tight_layout(pad=2.0)

plt.figure(fig_lbl.number)
plt.tight_layout(pad=2.0)

plt.show()

# Print summary statistics for verification
for model_name, stats in [("Base Model", base_stats), ("LBL Model", lbl_stats)]:
    print(f"\n{model_name} Statistics:")
    for metric in ['num_nodes', 'num_edges', 'num_activating', 'avg_activation']:
        all_values = []
        for layer in layer_nums:
            all_values.extend(stats[layer][metric])
        print(f"\n{metric}:")
        print(f"Overall - Min: {np.min(all_values):.3f}, Max: {np.max(all_values):.3f}")
        print(f"Overall - Mean: {np.mean(all_values):.3f}, Median: {np.median(all_values):.3f}")

## Secondary Analysis

In [1]:
import seaborn

In [None]:
import json
import os
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from collections import defaultdict
from tqdm.notebook import tqdm

# Initialize data structures for each model
base_stats = defaultdict(lambda: defaultdict(list))
lbl_stats = defaultdict(lambda: defaultdict(list))

# Load and analyze graphs from each directory
for model_dir, stats in [('neuron_graphs/base_model', base_stats), 
                        ('neuron_graphs/lbl', lbl_stats)]:
    # Iterate through layer directories
    for layer_dir in sorted(os.listdir(model_dir)):
        if not layer_dir.startswith('layer'):
            continue
            
        layer = int(layer_dir.split('_')[1])  # Extract layer number
        layer_path = os.path.join(model_dir, layer_dir)
        
        # Process each graph in the layer
        for graph_file in os.listdir(layer_path):
            if not graph_file.endswith('_graph.json'):
                continue
                
            with open(os.path.join(layer_path, graph_file), 'r') as f:
                graph = json.load(f)
            
            # Calculate graph metrics
            nodes = graph['nodes']
            edges = graph['edges']
            
            # Basic counts
            stats[layer]['total_nodes'].append(len(nodes))
            stats[layer]['total_edges'].append(len(edges))
            
            # Activation metrics
            activating_nodes = [n for n, data in nodes.items() 
                              if data.get('is_activating', False)]
            stats[layer]['activating_nodes'].append(len(activating_nodes))
            
            # Get activation values for activating nodes
            activation_values = [data.get('activation', 0) for data in nodes.values() 
                               if data.get('is_activating', False)]
            avg_activation = np.mean(activation_values) if activation_values else 0
            stats[layer]['avg_activation'].append(avg_activation)
            
            # Graph density
            if len(nodes) > 1:
                density = (2 * len(edges)) / (len(nodes) * (len(nodes) - 1))
            else:
                density = 0
            stats[layer]['density'].append(density)

# Calculate mean and std for each metric
metrics = ['total_nodes', 'total_edges', 'activating_nodes', 'avg_activation', 'density']
metric_names = {
    'total_nodes': 'Total Nodes',
    'total_edges': 'Total Edges', 
    'activating_nodes': 'Activating Nodes',
    'avg_activation': 'Average Activation',
    'density': 'Graph Density'
}

# Set up the figure
sns.set_theme()
fig, axes = plt.subplots(2, 3, figsize=(20, 12))
fig.suptitle('Layer-wise Graph Analysis: Base vs LBL Model', fontsize=16)

for idx, metric in enumerate(metrics):
    ax = axes[idx // 3, idx % 3] if idx < 5 else None
    if ax is None:
        continue
        
    layers = sorted(base_stats.keys())
    
    # Calculate means and standard deviations
    base_means = [np.mean(base_stats[l][metric]) for l in layers]
    base_stds = [np.std(base_stats[l][metric]) for l in layers]
    lbl_means = [np.mean(lbl_stats[l][metric]) for l in layers]
    lbl_stds = [np.std(lbl_stats[l][metric]) for l in layers]
    
    # Plot with error bands
    ax.plot(layers, base_means, 'b-o', label='Base Model', linewidth=2)
    ax.fill_between(layers, 
                   np.array(base_means) - np.array(base_stds),
                   np.array(base_means) + np.array(base_stds),
                   alpha=0.2, color='blue')
    
    ax.plot(layers, lbl_means, 'r-o', label='LBL Model', linewidth=2)
    ax.fill_between(layers,
                   np.array(lbl_means) - np.array(lbl_stds),
                   np.array(lbl_means) + np.array(lbl_stds),
                   alpha=0.2, color='red')
    
    ax.set_xlabel('Layer')
    ax.set_ylabel(metric_names[metric])
    ax.set_title(f'{metric_names[metric]} by Layer')
    ax.grid(True, alpha=0.3)
    ax.legend()

# Remove empty subplot
fig.delaxes(axes.flatten()[-1])
plt.tight_layout()
plt.show()

# Print summary statistics
print("\nSummary Statistics:")
for model_name, stats in [("Base Model", base_stats), ("LBL Model", lbl_stats)]:
    print(f"\n{model_name}:")
    for metric in metrics:
        print(f"\n{metric_names[metric]}:")
        for layer in sorted(stats.keys()):
            values = stats[layer][metric]
            print(f"  Layer {layer}:")
            print(f"    Mean: {np.mean(values):.3f}")
            print(f"    Std:  {np.std(values):.3f}")

In [None]:
import json
import os
import matplotlib.pyplot as plt
import numpy as np
from collections import defaultdict

def collect_graph_stats(model_dir):
    """Collect node counts and their corresponding activation values by layer"""
    layer_data = defaultdict(list)
    
    # Iterate through layer directories
    for layer_dir in sorted(os.listdir(model_dir)):
        if not layer_dir.startswith('layer'):
            continue
            
        layer = int(layer_dir.split('_')[1])
        layer_path = os.path.join(model_dir, layer_dir)
        
        # Process each graph in the layer
        for graph_file in os.listdir(layer_path):
            if not graph_file.endswith('_graph.json'):
                continue
                
            with open(os.path.join(layer_path, graph_file), 'r') as f:
                graph = json.load(f)
            
            activating_nodes = []
            activation_values = []
            
            # Get activating nodes and their values
            for node_id, node_data in graph['nodes'].items():
                if node_data.get('is_activating', False):
                    activating_nodes.append(node_id)
                    activation_values.append(node_data.get('activation', 0))
            
            layer_data[layer].append({
                'total_nodes': len(graph['nodes']),
                'num_activating': len(activating_nodes),
                'max_activation': max(activation_values) if activation_values else 0,
                'mean_activation': np.mean(activation_values) if activation_values else 0
            })
    
    return layer_data

# Collect data
print("Loading model data...")
base_data = collect_graph_stats('neuron_graphs/base_model')
lbl_data = collect_graph_stats('neuron_graphs/lbl')

# Create figure
plt.figure(figsize=(12, 8))

# Prepare data for plotting
def prepare_plot_data(data):
    total_nodes = []
    num_activating = []
    max_activations = []
    
    for layer_stats in data.values():
        for graph in layer_stats:
            total_nodes.append(graph['total_nodes'])
            num_activating.append(graph['num_activating'])
            max_activations.append(graph['max_activation'])
    
    return np.array(total_nodes), np.array(num_activating), np.array(max_activations)

# Plot with size indicating max activation
for model_name, data, color in [('Base Model', base_data, 'blue'),
                               ('LBL Model', lbl_data, 'red')]:
    total_nodes, num_activating, max_activations = prepare_plot_data(data)
    
    # Normalize sizes for visualization
    sizes = (max_activations / max_activations.max() * 200) + 20
    
    plt.scatter(total_nodes, num_activating, 
               s=sizes, 
               alpha=0.5, 
               color=color, 
               label=model_name)
    
    # Add trend line if we have data
    if len(total_nodes) > 0:
        z = np.polyfit(total_nodes, num_activating, 1)
        p = np.poly1d(z)
        x_trend = np.linspace(min(total_nodes), max(total_nodes), 100)
        plt.plot(x_trend, p(x_trend), '--', color=color, alpha=0.8)

plt.xlabel('Total Nodes in Graph')
plt.ylabel('Number of Activating Nodes')
plt.title('Node Relationships\n(Point size indicates maximum activation value)')
plt.legend()
plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Print summary statistics
print("\nSummary Statistics:")
for model_name, data in [("Base Model", base_data), ("LBL Model", lbl_data)]:
    print(f"\n{model_name}:")
    total_nodes, num_activating, max_activations = prepare_plot_data(data)
    
    print(f"Average graph size: {np.mean(total_nodes):.2f} ± {np.std(total_nodes):.2f} nodes")
    print(f"Average activating nodes: {np.mean(num_activating):.2f} ± {np.std(num_activating):.2f}")
    print(f"Average max activation: {np.mean(max_activations):.2f} ± {np.std(max_activations):.2f}")
    
    # Calculate correlations only if we have data
    if len(total_nodes) > 0:
        node_act_corr = np.corrcoef(total_nodes, num_activating)[0,1]
        print(f"Correlation (nodes vs activating): {node_act_corr:.3f}")
        
        # Calculate correlation with max activation for graphs that have activating nodes
        active_mask = num_activating > 0
        if np.any(active_mask):
            max_act_corr = np.corrcoef(total_nodes[active_mask], 
                                     max_activations[active_mask])[0,1]
            print(f"Correlation (nodes vs max activation): {max_act_corr:.3f}")

In [None]:
# Import libraries
import json
import os
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from collections import defaultdict
from tqdm.notebook import tqdm

# Initialize variables
base_dir = 'neuron_graphs'
models = ['base_model', 'lbl']
num_layers = 8

# Create figure
fig = plt.figure(figsize=(20, 15))
gs = plt.GridSpec(3, 2, figure=fig)

# Initialize data storage
all_metrics = defaultdict(lambda: defaultdict(list))

# Process graphs and collect data
for model in models:
    print(f"\nProcessing {model}")
    for layer in range(num_layers):
        layer_dir = os.path.join(base_dir, model, f'layer_{layer}')
        if not os.path.exists(layer_dir):
            continue
            
        graph_files = [f for f in os.listdir(layer_dir) if f.endswith('_graph.json')]
        for graph_file in tqdm(graph_files):
            # Load graph
            with open(os.path.join(layer_dir, graph_file), 'r') as f:
                data = json.load(f)
            G = nx.DiGraph()
            for node, attrs in data['nodes'].items():
                G.add_node(node, **attrs)
            for edge_str, attrs in data['edges'].items():
                source, target = edge_str.split('->')
                G.add_edge(source, target, **attrs)
            
            # Collect metrics
            edge_weights = [d['weight'] for u, v, d in G.edges(data=True)]
            activating_nodes = [n for n, d in G.nodes(data=True) if d.get('is_activating', False)]
            
            metrics = {
                'avg_edge_weight': np.mean(edge_weights) if edge_weights else 0,
                'avg_out_degree': np.mean([G.out_degree(n) for n in G.nodes()]),
                'avg_in_degree': np.mean([G.in_degree(n) for n in G.nodes()]),
                'edges_to_activating': sum(1 for u, v in G.edges() if v in activating_nodes),
                'density': nx.density(G),
                'transitivity': nx.transitivity(G)
            }
            
            for metric_name, value in metrics.items():
                all_metrics[model][f'layer_{layer}_{metric_name}'].append(value)

# 1. Edge Weight Distribution
ax1 = fig.add_subplot(gs[0, 0])
for model in models:
    model_data = pd.DataFrame(all_metrics[model])
    for layer in range(num_layers):
        weights = [col for col in model_data.columns if f'layer_{layer}_avg_edge_weight' in col]
        if weights:
            sns.kdeplot(data=model_data[weights].mean(axis=1), 
                       label=f'{model} Layer {layer}', ax=ax1)
ax1.set_title('Edge Weight Distribution by Layer and Model')
ax1.set_xlabel('Average Edge Weight')
ax1.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

# 2. Connectivity Patterns
ax2 = fig.add_subplot(gs[0, 1])
metrics = ['avg_out_degree', 'avg_in_degree', 'edges_to_activating']
for model in models:
    model_data = pd.DataFrame(all_metrics[model])
    layer_means = []
    for layer in range(num_layers):
        means = [model_data[col].mean() for col in model_data.columns 
                if any(f'layer_{layer}_{m}' in col for m in metrics)]
        layer_means.append(means)
    layer_means = np.array(layer_means)
    ax2.plot(range(num_layers), layer_means, label=f'{model}', marker='o')
ax2.set_title('Connectivity Patterns Across Layers')
ax2.set_xlabel('Layer')
ax2.set_ylabel('Average Value')
ax2.legend()

# 3. Network Structure Metrics
data_for_plot = []
for model in models:
    for metric in ['density', 'transitivity']:
        for layer in range(num_layers):
            metric_vals = [v for k, v in all_metrics[model].items() 
                         if f'layer_{layer}_{metric}' in k]
            if metric_vals:
                mean_val = np.mean(metric_vals)
                data_for_plot.append({
                    'Model': model,
                    'Layer': str(layer),
                    'Metric': metric,
                    'Value': mean_val
                })

plot_df = pd.DataFrame(data_for_plot)

# Split into two subplots
for i, metric in enumerate(['density', 'transitivity']):
    metric_data = plot_df[plot_df['Metric'] == metric]
    ax = fig.add_subplot(gs[1, i])
    sns.barplot(data=metric_data, x='Layer', y='Value', hue='Model', ax=ax)
    ax.set_title(f'{metric.capitalize()} by Layer')
    ax.set_xlabel('Layer')
    ax.set_ylabel('Value')
    if i > 0:  # Only keep one legend
        ax.get_legend().remove()

# 4. Edge Pattern Analysis
ax4 = fig.add_subplot(gs[2, :])
edge_patterns = []

# Collect edge patterns
for model in models:
    print(f"\nCollecting edge patterns for {model}")
    for layer in range(num_layers):
        layer_dir = os.path.join(base_dir, model, f'layer_{layer}')
        if not os.path.exists(layer_dir):
            continue
            
        for graph_file in os.listdir(layer_dir):
            if not graph_file.endswith('_graph.json'):
                continue
                
            with open(os.path.join(layer_dir, graph_file), 'r') as f:
                data = json.load(f)
                
            # Collect patterns for this graph
            patterns = defaultdict(list)
            for edge_str, attrs in data['edges'].items():
                source, target = edge_str.split('->')
                source_type = 'activating' if data['nodes'][source].get('is_activating', False) else 'context'
                target_type = 'activating' if data['nodes'][target].get('is_activating', False) else 'context'
                pattern_type = f'{source_type}->{target_type}'
                patterns[pattern_type].append(attrs['weight'])
            
            # Store mean weights per pattern type
            for pattern_type, weights in patterns.items():
                edge_patterns.append({
                    'model': model,
                    'layer': layer,
                    'pattern': pattern_type,
                    'mean_weight': np.mean(weights)
                })

# Convert to DataFrame
pattern_df = pd.DataFrame(edge_patterns)
pivot_df = pattern_df.pivot_table(
    values='mean_weight',
    index=['model', 'pattern'],
    aggfunc='mean'
).reset_index()

# Create correlation matrix
corr_data = []
for m1 in models:
    for m2 in models:
        m1_patterns = pivot_df[pivot_df['model'] == m1]
        m2_patterns = pivot_df[pivot_df['model'] == m2]
        
        # Match patterns
        for p1 in m1_patterns['pattern'].unique():
            for p2 in m2_patterns['pattern'].unique():
                v1 = m1_patterns[m1_patterns['pattern'] == p1]['mean_weight'].values[0]
                v2 = m2_patterns[m2_patterns['pattern'] == p2]['mean_weight'].values[0]
                corr_data.append({
                    'model1': f"{m1}\n{p1}",
                    'model2': f"{m2}\n{p2}",
                    'correlation': v1 * v2  # Simplified correlation measure
                })

corr_df = pd.DataFrame(corr_data)
corr_matrix = corr_df.pivot(index='model1', columns='model2', values='correlation')

# Plot correlation heatmap
sns.heatmap(corr_matrix, 
            cmap='coolwarm',
            center=0,
            annot=True,
            fmt='.2f',
            ax=ax4)
ax4.set_title('Edge Pattern Correlations')
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)

plt.tight_layout()
plt.show()

In [None]:
# Graph pattern analysis: Load and pair graphs
import json
import os
import networkx as nx
import pandas as pd
import numpy as np
from collections import defaultdict
from tqdm.notebook import tqdm

def load_paired_graphs(base_dir='neuron_graphs', layer_range=range(8)):
    """Load corresponding graphs from base and ablated models"""
    paired_graphs = defaultdict(dict)
    
    for layer in layer_range:
        base_dir_path = os.path.join(base_dir, 'base_model', f'layer_{layer}')
        lbl_dir_path = os.path.join(base_dir, 'lbl', f'layer_{layer}')
        
        if not (os.path.exists(base_dir_path) and os.path.exists(lbl_dir_path)):
            continue
            
        # Get all neuron IDs from filenames
        base_neurons = {f.split('_')[1] for f in os.listdir(base_dir_path) if f.endswith('_graph.json')}
        lbl_neurons = {f.split('_')[1] for f in os.listdir(lbl_dir_path) if f.endswith('_graph.json')}
        
        # Get intersection of neurons present in both models
        common_neurons = base_neurons & lbl_neurons
        
        for neuron in common_neurons:
            base_path = os.path.join(base_dir_path, f'l{layer}_{neuron}_graph.json')
            lbl_path = os.path.join(lbl_dir_path, f'l{layer}_{neuron}_graph.json')
            
            if os.path.exists(base_path) and os.path.exists(lbl_path):
                with open(base_path, 'r') as f:
                    base_data = json.load(f)
                with open(lbl_path, 'r') as f:
                    lbl_data = json.load(f)
                    
                paired_graphs[layer][neuron] = {
                    'base': base_data,
                    'lbl': lbl_data
                }
    
    return paired_graphs

# Load paired graphs
paired_graphs = load_paired_graphs()
print(f"Loaded graphs for {len(paired_graphs)} layers")
for layer, neurons in paired_graphs.items():
    print(f"Layer {layer}: {len(neurons)} paired neurons")

In [None]:
# Testing Refined sequence analysis
def print_sequence_details(seqs, model_name):
    """Print detailed information about sequences"""
    print(f"\n{model_name} Sequences:")
    for i, seq in enumerate(seqs, 1):
        print(f"\nSequence {i}:")
        print(f"Tokens: {' -> '.join(seq['path'])}")
        print(f"Weights: {[f'{w:.3f}' for w in seq['weights']]}")
        print(f"Mean weight: {seq['mean_weight']:.3f}")
        print(f"Activating token: {seq['activating_token']}")

def analyze_activating_tokens(base_seqs, lbl_seqs):
    """Compare activating tokens between models"""
    base_activating = {seq['activating_token'] for seq in base_seqs}
    lbl_activating = {seq['activating_token'] for seq in lbl_seqs}
    
    print("\nActivating Token Analysis:")
    print(f"Base model unique activating tokens: {len(base_activating)}")
    print(f"Ablated model unique activating tokens: {len(lbl_activating)}")
    print(f"Common activating tokens: {len(base_activating & lbl_activating)}")
    
    if base_activating & lbl_activating:
        print("\nCommon activating tokens:")
        for token in base_activating & lbl_activating:
            print(f"- {token}")

def find_similar_sequences(base_seqs, lbl_seqs, similarity_threshold=0.5):
    """Find sequences that share similar patterns even if not identical"""
    similar_pairs = []
    
    for base_seq in base_seqs:
        for lbl_seq in lbl_seqs:
            # Check if they share the same activating token
            if base_seq['activating_token'] == lbl_seq['activating_token']:
                # Convert paths to sets for partial matching
                base_set = set(base_seq['path'])
                lbl_set = set(lbl_seq['path'])
                
                # Calculate Jaccard similarity
                similarity = len(base_set & lbl_set) / len(base_set | lbl_set)
                
                if similarity >= similarity_threshold:
                    similar_pairs.append({
                        'base_seq': base_seq,
                        'lbl_seq': lbl_seq,
                        'similarity': similarity,
                        'common_tokens': base_set & lbl_set
                    })
    
    return similar_pairs

# Analyze with lower threshold and more detail
for layer in list(paired_graphs.keys())[:2]:
    print(f"\n{'='*50}")
    print(f"Analyzing Layer {layer}")
    print('='*50)
    
    for neuron in list(paired_graphs[layer].keys())[:3]:
        print(f"\nNeuron {neuron}:")
        
        # Extract sequences with lower threshold
        base_seqs = extract_token_sequences(
            paired_graphs[layer][neuron]['base'],
            min_weight=0.05,  # Lower threshold
            max_length=3      # Shorter sequences
        )
        lbl_seqs = extract_token_sequences(
            paired_graphs[layer][neuron]['lbl'],
            min_weight=0.05,
            max_length=3
        )
        
        # Print sequence details
        print_sequence_details(base_seqs, "Base Model")
        print_sequence_details(lbl_seqs, "Ablated Model")
        
        # Analyze activating tokens
        analyze_activating_tokens(base_seqs, lbl_seqs)
        
        # Find similar sequences
        similar_pairs = find_similar_sequences(base_seqs, lbl_seqs)
        if similar_pairs:
            print("\nSimilar Sequences Found:")
            for pair in similar_pairs:
                print(f"\nSimilarity: {pair['similarity']:.3f}")
                print(f"Base sequence: {' -> '.join(pair['base_seq']['path'])}")
                print(f"Ablated sequence: {' -> '.join(pair['lbl_seq']['path'])}")
                print(f"Common tokens: {pair['common_tokens']}")
        else:
            print("\nNo similar sequences found")

In [None]:
import json
import os
import networkx as nx
import pandas as pd
import numpy as np
from collections import defaultdict
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import spacy

# Load spaCy for token analysis
nlp = spacy.load('en_core_web_sm')

def load_paired_graphs(base_dir='neuron_graphs', layer_range=range(8)):
    """Load corresponding graphs from base and ablated models"""
    paired_graphs = defaultdict(dict)
    
    for layer in layer_range:
        base_dir_path = os.path.join(base_dir, 'base_model', f'layer_{layer}')
        lbl_dir_path = os.path.join(base_dir, 'lbl', f'layer_{layer}')
        
        if not (os.path.exists(base_dir_path) and os.path.exists(lbl_dir_path)):
            continue
            
        # Get all neuron IDs from filenames
        base_neurons = {f.split('_')[1] for f in os.listdir(base_dir_path) if f.endswith('_graph.json')}
        lbl_neurons = {f.split('_')[1] for f in os.listdir(lbl_dir_path) if f.endswith('_graph.json')}
        
        # Get intersection of neurons present in both models
        common_neurons = base_neurons & lbl_neurons
        
        for neuron in common_neurons:
            base_path = os.path.join(base_dir_path, f'l{layer}_{neuron}_graph.json')
            lbl_path = os.path.join(lbl_dir_path, f'l{layer}_{neuron}_graph.json')
            
            if os.path.exists(base_path) and os.path.exists(lbl_path):
                with open(base_path, 'r') as f:
                    base_data = json.load(f)
                with open(lbl_path, 'r') as f:
                    lbl_data = json.load(f)
                    
                paired_graphs[layer][neuron] = {
                    'base': base_data,
                    'lbl': lbl_data
                }
    
    return paired_graphs

class SequenceAnalyzer:
    def __init__(self, paired_graphs):
        self.paired_graphs = paired_graphs
        self.patterns_data = []
        self.token_stats = defaultdict(lambda: defaultdict(int))
        self.sequence_stats = defaultdict(lambda: defaultdict(list))
        
    def get_token_type(self, token: str) -> str:
        """Classify token type using spaCy"""
        doc = nlp(token.strip())
        if len(doc) == 0:
            return 'UNKNOWN'
        
        token = doc[0]
        if token.is_punct:
            return 'PUNCT'
        elif token.is_stop:
            return 'STOP'
        elif token.pos_ in ['VERB', 'NOUN', 'ADJ', 'ADV']:
            return token.pos_
        return 'OTHER'
    
    def extract_sequences(self, graph_data: Dict, min_weight: float = 0.05, max_length: int = 4) -> List[Dict]:
        """Extract sequences with enhanced metadata"""
        G = nx.DiGraph()
        for node, attrs in graph_data['nodes'].items():
            G.add_node(node, **attrs)
        for edge_str, attrs in graph_data['edges'].items():
            source, target = edge_str.split('->')
            if attrs['weight'] >= min_weight:
                G.add_edge(source, target, **attrs)
        
        sequences = []
        activating_nodes = [n for n, d in G.nodes(data=True) if d.get('is_activating', False)]
        
        for act_node in activating_nodes:
            for node in G.nodes():
                if node != act_node:
                    for path in nx.all_simple_paths(G, node, act_node, cutoff=max_length):
                        weights = []
                        token_types = []
                        for i in range(len(path)-1):
                            weights.append(G.edges[path[i], path[i+1]]['weight'])
                            token_types.append(self.get_token_type(path[i]))
                        token_types.append(self.get_token_type(act_node))
                        
                        sequences.append({
                            'path': path,
                            'weights': weights,
                            'token_types': token_types,
                            'min_weight': min(weights),
                            'mean_weight': np.mean(weights),
                            'activating_token': act_node,
                            'activating_type': token_types[-1],
                            'length': len(path)
                        })
        return sequences
    
    def analyze_layer(self, layer: int, base_data: Dict, lbl_data: Dict) -> Dict:
        """Analyze sequence patterns for a layer"""
        base_seqs = self.extract_sequences(base_data)
        lbl_seqs = self.extract_sequences(lbl_data)
        
        # Collect stats
        stats = {
            'base_seq_count': len(base_seqs),
            'lbl_seq_count': len(lbl_seqs),
            'base_avg_length': np.mean([s['length'] for s in base_seqs]) if base_seqs else 0,
            'lbl_avg_length': np.mean([s['length'] for s in lbl_seqs]) if lbl_seqs else 0,
            'base_avg_weight': np.mean([s['mean_weight'] for s in base_seqs]) if base_seqs else 0,
            'lbl_avg_weight': np.mean([s['mean_weight'] for s in lbl_seqs]) if lbl_seqs else 0
        }
        
        # Analyze token types
        for seq in base_seqs:
            for t_type in seq['token_types']:
                self.token_stats['base'][t_type] += 1
                
        for seq in lbl_seqs:
            for t_type in seq['token_types']:
                self.token_stats['lbl'][t_type] += 1
        
        # Collect sequence patterns
        for seq in base_seqs:
            pattern = '->'.join(seq['token_types'])
            self.sequence_stats['base'][pattern].append(seq['mean_weight'])
            
        for seq in lbl_seqs:
            pattern = '->'.join(seq['token_types'])
            self.sequence_stats['lbl'][pattern].append(seq['mean_weight'])
        
        return stats
    
    def analyze_all_layers(self):
        """Analyze all layers and collect statistics"""
        layer_stats = []
        
        for layer in tqdm(list(self.paired_graphs.keys())):
            for neuron, data in self.paired_graphs[layer].items():
                stats = self.analyze_layer(layer, data['base'], data['lbl'])
                stats.update({'layer': layer, 'neuron': neuron})
                layer_stats.append(stats)
        
        self.layer_stats = pd.DataFrame(layer_stats)
        
    def plot_analysis(self):
        """Create comprehensive visualizations"""
        fig = plt.figure(figsize=(20, 15))
        gs = plt.GridSpec(3, 2)
        
        # 1. Sequence counts by layer
        ax1 = fig.add_subplot(gs[0, 0])
        self.layer_stats.groupby('layer')[['base_seq_count', 'lbl_seq_count']].mean().plot(
            kind='bar', ax=ax1)
        ax1.set_title('Average Sequence Count by Layer')
        ax1.set_ylabel('Count')
        
        # 2. Sequence lengths
        ax2 = fig.add_subplot(gs[0, 1])
        self.layer_stats.groupby('layer')[['base_avg_length', 'lbl_avg_length']].mean().plot(
            kind='bar', ax=ax2)
        ax2.set_title('Average Sequence Length by Layer')
        ax2.set_ylabel('Length')
        
        # 3. Token type distribution
        ax3 = fig.add_subplot(gs[1, :])
        token_df = pd.DataFrame(self.token_stats)
        token_df.plot(kind='bar', ax=ax3)
        ax3.set_title('Token Type Distribution')
        ax3.set_ylabel('Count')
        
        # 4. Top sequence patterns
        patterns_data = []
        for model in ['base', 'lbl']:
            for pattern, weights in self.sequence_stats[model].items():
                patterns_data.append({
                    'model': model,
                    'pattern': pattern,
                    'count': len(weights),
                    'avg_weight': np.mean(weights)
                })
        
        patterns_df = pd.DataFrame(patterns_data)
        ax4 = fig.add_subplot(gs[2, :])
        top_patterns = patterns_df.nlargest(10, 'count')
        sns.barplot(data=top_patterns, x='pattern', y='count', hue='model', ax=ax4)
        ax4.set_title('Top Sequence Patterns')
        ax4.set_xticklabels(ax4.get_xticklabels(), rotation=45, ha='right')
        
        plt.tight_layout()
        plt.show()
        
        return patterns_df

# Load graphs and run analysis
print("Loading paired graphs...")
graphs = load_paired_graphs()
print(f"Loaded {len(graphs)} layers of graphs")

print("\nRunning sequence analysis...")
analyzer = SequenceAnalyzer(graphs)
analyzer.analyze_all_layers()
patterns_df = analyzer.plot_analysis()

# Print summary statistics
print("\nSummary Statistics:")
print("\nAverage Sequence Counts by Layer:")
print(analyzer.layer_stats.groupby('layer')[['base_seq_count', 'lbl_seq_count']].mean())

print("\nMost Common Pattern Types:")
print(patterns_df.sort_values('count', ascending=False).head(10))

# Token Substituion Analysis

In [None]:
import os
import json
from pathlib import Path
from model_loader import load_model

def examine_graph(graph_path):
    """Examine and display the structure of a neuron graph"""
    print(f"\nExamining graph at: {graph_path}")
    
    with open(graph_path) as f:
        graph_data = json.load(f)
    
    print("\nGraph Structure:")
    print("=" * 50)
    
    print("\nNodes:")
    for node, data in graph_data['nodes'].items():
        print(f"\nToken: {node}")
        for key, value in data.items():
            print(f"  {key}: {value}")
            
    print("\nEdges:")
    for edge, data in graph_data['edges'].items():
        print(f"\n{edge}:")
        for key, value in data.items():
            print(f"  {key}: {value}")
            
    # Find activating nodes
    activating_nodes = []
    for node, data in graph_data['nodes'].items():
        if data.get('is_activating', False):
            activating_nodes.append({
                'token': node,
                'activation': data.get('activation', 0),
                'count': data.get('count', 0)
            })
    
    print("\nActivating Nodes Summary:")
    print("=" * 50)
    for node in activating_nodes:
        print(f"\nToken: {node['token']}")
        print(f"Activation: {node['activation']:.3f}")
        print(f"Count: {node['count']}")
        
        # Find connected nodes
        connected = []
        for edge in graph_data['edges'].keys():
            source, target = edge.split('->')
            if target == node['token']:
                connected.append(source)
            elif source == node['token']:
                connected.append(target)
                
        if connected:
            print("Connected tokens:")
            for token in connected:
                print(f"  - {token}")

# Load model and examine a specific neuron's graph
print("Loading model...")
model_path = 'configs/base_model_best.pt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model, config = load_model(model_path, device)
model.eval()
print("Model loaded successfully")

# Let's look at layer 0, neuron 5 as before
layer = 0
neuron = 40
graph_dir = Path('neuron_graphs/base_model')
graph_path = graph_dir / f'layer_{layer}' / f'l{layer}_n{neuron}_graph.json'

examine_graph(graph_path)

In [None]:
import torch
from pathlib import Path
from model_loader import load_model
from transformers import BertTokenizer, BertForMaskedLM
import tiktoken
import numpy as np
import networkx as nx
import json
import string

class TokenSubstitutionAnalyzer:
    """
    Analyzes neuron activation patterns and finds token substitutions that maintain activation.
    Uses BERT for suggesting replacement tokens and tests them in the original model.
    """
    def __init__(self, model, device='cuda'):
        """Initialize with both GPT model for testing and BERT for suggestions"""
        self.model = model
        self.device = device
        
        # Initialize GPT-2 tokenizer for our base model
        self.gpt2_tokenizer = tiktoken.get_encoding("gpt2")
        
        # Initialize BERT for masked token prediction
        print("Loading BERT model for token suggestions...")
        self.bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.bert_model = BertForMaskedLM.from_pretrained('bert-base-uncased').to(device)
        self.bert_model.eval()
        
        # Create set of punctuation tokens in GPT-2's vocabulary
        # This helps us handle punctuation substitutions appropriately
        self.punct_tokens = set(self.gpt2_tokenizer.encode(p)[0] 
                              for p in string.punctuation 
                              if len(self.gpt2_tokenizer.encode(p)) == 1)
    
    def load_neuron_graph(self, graph_path):
        """Load and parse the neuron's activation graph from JSON"""
        print(f"Loading graph from {graph_path}")
        with open(graph_path) as f:
            graph_data = json.load(f)
        
        # Create a directed graph to represent token relationships
        graph = nx.DiGraph()
        
        # Add nodes (tokens) with their properties
        for node, data in graph_data['nodes'].items():
            graph.add_node(node, **data)
            
        # Add edges (token transitions) with weights
        for edge_str, data in graph_data['edges'].items():
            source, target = edge_str.split('->')
            graph.add_edge(source, target, **data)
        
        print(f"Loaded graph with {len(graph.nodes)} nodes and {len(graph.edges)} edges")
        return graph
    
    def get_activation_patterns(self, graph):
        """Extract patterns of tokens that activate the neuron strongly"""
        patterns = []
        # Look through all nodes to find activating tokens
        for node, data in graph.nodes(data=True):
            if data.get('is_activating', False):
                # Get the context before and after the activating token
                pre_context = list(graph.predecessors(node))
                post_context = list(graph.successors(node))
                
                patterns.append({
                    'token': node,
                    'activation': data['activation'],
                    'pre_context': pre_context,
                    'post_context': post_context
                })
                
        print(f"Found {len(patterns)} activation patterns")
        return patterns
    
    def get_bert_predictions(self, pre_context, post_context, original_token, k=10):
        """Use BERT to predict likely tokens that could replace the original"""
        # Convert context tokens to text, handling empty contexts gracefully
        pre_text = ' '.join([t.replace('Ġ', ' ').strip() for t in pre_context]) if pre_context else ''
        post_text = ' '.join([t.replace('Ġ', ' ').strip() for t in post_context]) if post_context else ''
        
        # Create the masked input text based on available context
        if pre_text and post_text:
            masked_text = f"{pre_text} [MASK] {post_text}"
        elif pre_text:
            masked_text = f"{pre_text} [MASK]"
        elif post_text:
            masked_text = f"[MASK] {post_text}"
        else:
            masked_text = "[MASK]"
            
        print(f"Getting BERT predictions for: {masked_text}")
        
        # Get BERT's token predictions
        inputs = self.bert_tokenizer(masked_text, return_tensors='pt').to(self.device)
        mask_idx = torch.where(inputs['input_ids'] == self.bert_tokenizer.mask_token_id)[1]
        
        with torch.no_grad():
            outputs = self.bert_model(**inputs)
            logits = outputs.logits[0, mask_idx]
            probs = torch.nn.functional.softmax(logits, dim=-1)
            
            # Get extra predictions initially so we can filter them
            top_k_mult = torch.topk(probs, k * 2)
            
            suggestions = []
            for prob, token_id in zip(top_k_mult.values[0], top_k_mult.indices[0]):
                token = self.bert_tokenizer.decode([token_id])
                # Convert BERT's token to GPT-2 tokens
                gpt2_tokens = self.gpt2_tokenizer.encode(token)
                
                # Only consider single-token predictions
                if len(gpt2_tokens) == 1:
                    is_punct = gpt2_tokens[0] in self.punct_tokens
                    suggestions.append({
                        'token': token,
                        'gpt2_id': gpt2_tokens[0],
                        'probability': prob.item(),
                        'is_punct': is_punct
                    })
            
            # Filter suggestions to match original token type (punctuation or not)
            is_original_punct = original_token.strip() in string.punctuation
            filtered_suggestions = [
                s for s in suggestions 
                if s['is_punct'] == is_original_punct
            ][:k]
            
            print(f"Found {len(filtered_suggestions)} suitable predictions")
            for s in filtered_suggestions:
                print(f"  {s['token']} (prob: {s['probability']:.3f})")
            
        return filtered_suggestions
    
    def test_activation(self, pre_context, token_id, post_context, layer, neuron):
        """Test how strongly a token activates the neuron in its context"""
        # Handle pre-context tokens, converting strings if needed
        if not pre_context:
            pre_ids = []
        elif isinstance(pre_context[0], str):
            pre_ids = [self.gpt2_tokenizer.encode(t)[0] for t in pre_context]
        else:
            pre_ids = pre_context
            
        # Handle post-context tokens, converting strings if needed
        if not post_context:
            post_ids = []
        elif isinstance(post_context[0], str):
            post_ids = [self.gpt2_tokenizer.encode(t)[0] for t in post_context]
        else:
            post_ids = post_context
        
        # Create the complete sequence
        sequence = pre_ids + [token_id] + post_ids
        input_ids = torch.tensor([sequence], device=self.device)
        
        # Get neuron activations
        with torch.no_grad():
            outputs, cache = self.model(input_ids, return_cache=True)
            
        # Get activation at the target position (after pre_context)
        target_pos = len(pre_ids)
        activation = cache[f'transformer.h.{layer}.mlp'][0, target_pos, neuron].item()
        
        return activation

    def analyze_substitutions(self, graph_path, layer, neuron, 
                            activation_threshold=0.5, 
                            relative_threshold=0.5):
        """Find and test token substitutions that maintain neuron activation"""
        # Load and analyze the neuron's graph
        graph = self.load_neuron_graph(graph_path)
        patterns = self.get_activation_patterns(graph)
        successful_substitutions = []
        
        print(f"\nAnalyzing {len(patterns)} activation patterns...")
        
        # Process each activation pattern
        for pattern in patterns:
            print(f"\nAnalyzing pattern for token: {pattern['token']}")
            print(f"Original activation: {pattern['activation']:.3f}")
            
            # Log context information
            pre_context_str = ' '.join(pattern['pre_context']) if pattern['pre_context'] else '(no pre-context)'
            post_context_str = ' '.join(pattern['post_context']) if pattern['post_context'] else '(no post-context)'
            print(f"Pre-context: {pre_context_str}")
            print(f"Post-context: {post_context_str}")
            
            # Get BERT's suggestions for this context
            predictions = self.get_bert_predictions(
                pattern['pre_context'],
                pattern['post_context'],
                pattern['token']
            )
            
            print(f"Testing {len(predictions)} BERT suggestions...")
            
            # Test each predicted token
            for pred in predictions:
                activation = self.test_activation(
                    pattern['pre_context'],
                    pred['gpt2_id'],
                    pattern['post_context'],
                    layer,
                    neuron
                )
                
                print(f"Testing '{pred['token']}': activation = {activation:.3f}")
                
                # Check if activation meets our thresholds
                if (activation >= pattern['activation'] * relative_threshold and 
                    activation >= activation_threshold):
                    successful_substitutions.append({
                        'original_token': pattern['token'],
                        'substitute_token': pred['token'],
                        'bert_probability': pred['probability'],
                        'original_activation': pattern['activation'],
                        'new_activation': activation,
                        'pre_context': pre_context_str,
                        'post_context': post_context_str
                    })
                    print(f"Found successful substitution: {pred['token']}")
                    print(f"New activation: {activation:.3f}")
        
        return successful_substitutions

thresholds = [
    (0.3, 0.3),  # Very permissive
    (0.4, 0.4),  # Moderately permissive
    (0.5, 0.5)   # Original threshold
]

print("Loading GPT model...")
model_path = 'configs/base_model_best.pt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model, config = load_model(model_path, device)
model.eval()

analyzer = TokenSubstitutionAnalyzer(model, device)
layer = 0
neuron = 40
graph_path = Path('neuron_graphs/base_model') / f'layer_{layer}' / f'l{layer}_n{neuron}_graph.json'

print(f"\nAnalyzing Layer {layer} Neuron {neuron} with multiple thresholds")
print("=" * 50)

for activation_threshold, relative_threshold in thresholds:
    print(f"\nTesting with thresholds:")
    print(f"Activation threshold: {activation_threshold:.1f}")
    print(f"Relative threshold: {relative_threshold:.1f}")
    print("-" * 30)
    
    substitutions = analyzer.analyze_substitutions(
        graph_path, 
        layer, 
        neuron,
        activation_threshold=activation_threshold,
        relative_threshold=relative_threshold
    )
    
    if substitutions:
        print("\nSuccessful substitutions:")
        for sub in substitutions:
            print(f"\nOriginal: '{sub['original_token']}'")
            print(f"Substitute: '{sub['substitute_token']}'")
            print(f"Context: {sub['pre_context']}")
            print(f"BERT probability: {sub['bert_probability']:.3f}")
            print(f"Original activation: {sub['original_activation']:.3f}")
            print(f"New activation: {sub['new_activation']:.3f}")
            print(f"Activation ratio: {sub['new_activation']/sub['original_activation']:.3f}")
    else:
        print("\nNo successful substitutions found at these thresholds")

In [None]:
# Base Model Parallel Ver
import torch
from pathlib import Path
from model_loader import load_model
from transformers import BertTokenizer, BertForMaskedLM
import tiktoken
import numpy as np
import networkx as nx
import json
import string
from tqdm import tqdm
from collections import defaultdict

class BatchedTokenSubstitutionAnalyzer:
    """
    Analyzes neuron activation patterns and finds meaningful token substitutions.
    Uses BERT for suggesting replacements and tests them in the original model,
    while considering grammatical context and token types.
    """
    def __init__(self, model, device='cuda'):
        """Initialize analyzer with models and token patterns"""
        self.model = model
        self.device = device
        self.gpt2_tokenizer = tiktoken.get_encoding("gpt2")
        
        # Initialize BERT for contextual predictions
        print("Loading BERT model for token suggestions...")
        self.bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.bert_model = BertForMaskedLM.from_pretrained('bert-base-uncased').to(device)
        self.bert_model.eval()
        
        # Define grammatical patterns for different token types
        self.grammatical_patterns = {
            'punctuation': [
                "In the sentence '{pre} [MASK] {post}', the punctuation",
                "The text reads '{pre} [MASK] {post}' with proper punctuation",
                "Complete with punctuation: {pre} [MASK] {post}"
            ],
            'article': [
                "Consider '{pre} [MASK] {post}' as a phrase",
                "The phrase '{pre} [MASK] {post}' uses an article",
                "Fill in the article: {pre} [MASK] {post}"
            ],
            'preposition': [
                "The words '{pre} [MASK] {post}' form a prepositional phrase",
                "'{pre} [MASK] {post}' shows spatial relationship",
                "Complete with a preposition: {pre} [MASK] {post}"
            ],
            'conjunction': [
                "The sentence '{pre} [MASK] {post}' uses a conjunction",
                "Join the phrases: {pre} [MASK] {post}",
                "Connect with a conjunction: {pre} [MASK] {post}"
            ],
            'verb': [
                "The action in '{pre} [MASK] {post}' is",
                "Fill in the verb: {pre} [MASK] {post}",
                "What happens in '{pre} [MASK] {post}'?"
            ],
            'general': [
                "Complete the phrase: {pre} [MASK] {post}",
                "Fill in: {pre} [MASK] {post}",
                "What word fits here: {pre} [MASK] {post}"
            ]
        }
        
        # Create sets of tokens by type for classification
        self.punct_tokens = set(self.gpt2_tokenizer.encode(p)[0] 
                              for p in string.punctuation 
                              if len(self.gpt2_tokenizer.encode(p)) == 1)
        
        # Common word sets for token classification
        self.articles = {'the', 'a', 'an'}
        self.prepositions = {'in', 'on', 'at', 'to', 'from', 'by', 'with', 'above', 'below'}
        self.conjunctions = {'and', 'but', 'or', 'nor', 'for', 'yet', 'so'}

    def get_token_type(self, token):
        """
        Determine the grammatical type of a token for appropriate prompting.
        Uses both form and context to identify token types.
        """
        # Clean the token for analysis
        cleaned_token = token.strip().lower()
        
        # Check various token types
        if cleaned_token in string.punctuation:
            return 'punctuation'
        if cleaned_token in self.articles:
            return 'article'
        if cleaned_token in self.prepositions:
            return 'preposition'
        if cleaned_token in self.conjunctions:
            return 'conjunction'
        
        # Could add more sophisticated token type detection here
        return 'general'

    def load_neuron_graph(self, graph_path):
        """Load and parse the neuron's activation graph from JSON"""
        with open(graph_path) as f:
            graph_data = json.load(f)
        
        # Create a directed graph to represent token relationships
        graph = nx.DiGraph()
        
        # Add nodes with their properties
        for node, data in graph_data['nodes'].items():
            graph.add_node(node, **data)
            
        # Add edges with weights    
        for edge_str, data in graph_data['edges'].items():
            source, target = edge_str.split('->')
            graph.add_edge(source, target, **data)
            
        return graph

    def get_activation_patterns(self, graph):
        """Extract activating token patterns from the graph"""
        patterns = []
        for node, data in graph.nodes(data=True):
            if data.get('is_activating', False):
                # Get predecessor and successor nodes for context
                pre_context = list(graph.predecessors(node))
                post_context = list(graph.successors(node))
                
                patterns.append({
                    'token': node,
                    'activation': data['activation'],
                    'pre_context': pre_context,
                    'post_context': post_context
                })
        return patterns

    def get_bert_predictions_with_grammar(self, pre_context, post_context, original_token, k=10):
        """
        Get BERT predictions using grammatical prompting patterns.
        Combines predictions from multiple grammatical contexts.
        """
        # Determine token type and get appropriate patterns
        token_type = self.get_token_type(original_token)
        patterns = self.grammatical_patterns[token_type]
        
        # Process context text
        pre_text = ' '.join([t.replace('Ġ', ' ').strip() for t in pre_context]) if pre_context else ''
        post_text = ' '.join([t.replace('Ġ', ' ').strip() for t in post_context]) if post_context else ''
        
        combined_predictions = []
        
        # Try each grammatical pattern
        for pattern in patterns:
            masked_text = pattern.format(pre=pre_text, post=post_text).strip()
            inputs = self.bert_tokenizer(masked_text, return_tensors='pt').to(self.device)
            mask_idx = torch.where(inputs['input_ids'] == self.bert_tokenizer.mask_token_id)[1]
            
            with torch.no_grad():
                outputs = self.bert_model(**inputs)
                logits = outputs.logits[0, mask_idx]
                probs = torch.nn.functional.softmax(logits, dim=-1)
                
                top_k = torch.topk(probs, k * 2)
                
                for prob, token_id in zip(top_k.values[0], top_k.indices[0]):
                    token = self.bert_tokenizer.decode([token_id])
                    gpt2_tokens = self.gpt2_tokenizer.encode(token)
                    
                    # Only keep single-token predictions
                    if len(gpt2_tokens) == 1:
                        combined_predictions.append({
                            'token': token,
                            'gpt2_id': gpt2_tokens[0],
                            'probability': prob.item(),
                            'pattern': pattern
                        })
        
        # Sort by probability and take top k unique predictions
        unique_predictions = {}
        for pred in sorted(combined_predictions, key=lambda x: x['probability'], reverse=True):
            if pred['token'] not in unique_predictions:
                unique_predictions[pred['token']] = pred
                if len(unique_predictions) == k:
                    break
                    
        return list(unique_predictions.values())

    def test_activation(self, pre_context, token_id, post_context, layer, neuron):
        """Test how strongly a token activates the neuron in context"""
        # Handle pre-context tokens
        if not pre_context:
            pre_ids = []
        elif isinstance(pre_context[0], str):
            pre_ids = [self.gpt2_tokenizer.encode(t)[0] for t in pre_context]
        else:
            pre_ids = pre_context
            
        # Handle post-context tokens
        if not post_context:
            post_ids = []
        elif isinstance(post_context[0], str):
            post_ids = [self.gpt2_tokenizer.encode(t)[0] for t in post_context]
        else:
            post_ids = post_context
        
        # Create sequence and get activation
        sequence = pre_ids + [token_id] + post_ids
        input_ids = torch.tensor([sequence], device=self.device)
        
        with torch.no_grad():
            outputs, cache = self.model(input_ids, return_cache=True)
            activation = cache[f'transformer.h.{layer}.mlp'][0, len(pre_ids), neuron].item()
        
        return activation

    def analyze_single_neuron(self, layer, neuron, graph_path, 
                            activation_threshold=0.3, relative_threshold=0.3):
        """Analyze substitutions for a single neuron"""
        try:
            graph = self.load_neuron_graph(graph_path)
            patterns = self.get_activation_patterns(graph)
            successful_substitutions = []
            
            for pattern in patterns:
                predictions = self.get_bert_predictions_with_grammar(
                    pattern['pre_context'],
                    pattern['post_context'],
                    pattern['token']
                )
                
                for pred in predictions:
                    activation = self.test_activation(
                        pattern['pre_context'],
                        pred['gpt2_id'],
                        pattern['post_context'],
                        layer,
                        neuron
                    )
                    
                    # Check if activation meets thresholds
                    if (activation >= pattern['activation'] * relative_threshold and 
                        activation >= activation_threshold):
                        successful_substitutions.append({
                            'original_token': pattern['token'],
                            'substitute_token': pred['token'],
                            'bert_probability': pred['probability'],
                            'original_activation': pattern['activation'],
                            'new_activation': activation,
                            'pre_context': pattern['pre_context'],
                            'post_context': pattern['post_context'],
                            'pattern_used': pred['pattern']
                        })
            
            return successful_substitutions
            
        except Exception as e:
            print(f"Error analyzing neuron {layer}_{neuron}: {str(e)}")
            return None

    def analyze_neuron_batch(self, layer_range, neuron_range, batch_size=10):
        """Analyze neurons in batches for efficient processing"""
        results = {}
        neurons_to_analyze = []
        
        # Collect valid neurons
        print("Collecting valid neurons...")
        for layer in layer_range:
            for neuron in neuron_range:
                graph_path = Path('neuron_graphs/base_model') / f'layer_{layer}' / f'l{layer}_n{neuron}_graph.json'
                if graph_path.exists():
                    neurons_to_analyze.append((layer, neuron, graph_path))

        # Process in batches
        total_batches = (len(neurons_to_analyze) + batch_size - 1) // batch_size
        print(f"\nProcessing {len(neurons_to_analyze)} neurons in {total_batches} batches")
        
        for batch_idx in tqdm(range(0, len(neurons_to_analyze), batch_size), desc="Processing batches"):
            batch = neurons_to_analyze[batch_idx:batch_idx + batch_size]
            batch_results = {}
            
            for layer, neuron, graph_path in batch:
                try:
                    substitutions = self.analyze_single_neuron(layer, neuron, graph_path)
                    if substitutions:
                        batch_results[f"l{layer}_n{neuron}"] = substitutions
                except Exception as e:
                    print(f"\nError analyzing layer {layer} neuron {neuron}: {str(e)}")
            
            # Update and save results
            results.update(batch_results)
            if (batch_idx + batch_size) % 50 == 0:
                self._save_intermediate_results(results, batch_idx + batch_size)

        return results

    def _save_intermediate_results(self, results, neurons_processed):
        """Save checkpoint of results during processing"""
        output_path = Path(f'substitution_results_checkpoint_{neurons_processed}.json')
        with open(output_path, 'w') as f:
            json.dump(results, f, indent=2)
        print(f"\nSaved checkpoint after processing {neurons_processed} neurons")

    def get_substitution_statistics(self, results):
        """Calculate comprehensive statistics about substitution patterns"""
        stats = {
            'total_neurons_analyzed': 0,
            'neurons_with_substitutions': 0,
            'substitutions_by_type': defaultdict(int),
            'average_activation_ratio': [],
            'substitutions_by_layer': defaultdict(int),
            'bert_probability_distribution': [],
            'pattern_effectiveness': defaultdict(list)
        }
        
        for neuron_id, substitutions in results.items():
            layer = int(neuron_id.split('_')[0][1:])
            stats['total_neurons_analyzed'] += 1
            
            if substitutions:
                stats['neurons_with_substitutions'] += 1
                stats['substitutions_by_layer'][layer] += len(substitutions)
                
                for sub in substitutions:
                    token_type = self.get_token_type(sub['original_token'])
                    stats['substitutions_by_type'][token_type] += 1
                    
                    activation_ratio = sub['new_activation'] / sub['original_activation']
                    stats['average_activation_ratio'].append(activation_ratio)
                    stats['bert_probability_distribution'].append(sub['bert_probability'])
                    stats['pattern_effectiveness'][sub['pattern_used']].append(activation_ratio)
        
        # Calculate averages
        if stats['average_activation_ratio']:
            stats['average_activation_ratio'] = np.mean(stats['average_activation_ratio'])
            stats['bert_probability_mean'] = np.mean(stats['bert_probability_distribution'])
            
            # Calculate effectiveness of each pattern
            stats['pattern_effectiveness'] = {
                pattern: np.mean(ratios)
                for pattern, ratios in stats['pattern_effectiveness'].items()
            }
        
        return stats

def run_batch_analysis():
    """Run complete analysis for neurons 0-199 in each layer"""
    print("Loading GPT model...")
    model_path = 'configs/base_model_best.pt'
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model, config = load_model(model_path, device)
    model.eval()

    analyzer = BatchedTokenSubstitutionAnalyzer(model, device)
    
    # Analyze neurons 0-199 for each layer
    layer_range = range(8)
    neuron_range = range(200)
    
    print("Starting batched analysis...")
    results = analyzer.analyze_neuron_batch(layer_range, neuron_range)
    
    # Calculate and display statistics
    stats = analyzer.get_substitution_statistics(results)
    
    print("\nAnalysis Results:")
    print(f"Total neurons analyzed: {stats['total_neurons_analyzed']}")
    print(f"Neurons with substitutions: {stats['neurons_with_substitutions']}")
    print(f"Success rate: {(stats['neurons_with_substitutions']/stats['total_neurons_analyzed'])*100:.2f}%")
    
    print("\nSubstitutions by layer:")
    for layer, count in sorted(stats['substitutions_by_layer'].items()):
        print(f"Layer {layer}: {count} substitutions")
    
    print("\nSubstitutions by token type:")
    for token_type, count in stats['substitutions_by_type'].items():
        print(f"{token_type}: {count} substitutions")
    
    print(f"\nAverage activation ratio: {stats['average_activation_ratio']:.3f}")
    
    # Save final results
    output_path = Path('substitution_analysis_results.json')
    with open(output_path, 'w') as f:
        json.dump({
            'results': results,
            'statistics': stats
        }, f, indent=2)
    
    print(f"\nFull results saved to {output_path}")

if __name__ == "__main__":
    run_batch_analysis()

## Ablated

In [1]:
# Import needed modules
from neuron_graph_builder import NeuronAnalyzer
from transformers import BertTokenizer, BertForMaskedLM
import torch
import tiktoken
import numpy as np
import networkx as nx
import json
import string
import datetime
from tqdm import tqdm
from collections import defaultdict
from pathlib import Path
from typing import Dict, List, Set, Tuple, Optional
from graph_utils import fast_measure_importance_ablated

class AblatedBatchedTokenSubstitutionAnalyzer(NeuronAnalyzer):
    """
    Analyzes neuron activation patterns for ablated models, suggesting token substitutions
    that maintain activation patterns. Inherits from NeuronAnalyzer to reuse graph and
    activation analysis code.
    """
    def __init__(self, model, device='cuda'):
        """Initialize analyzer with both GPT2 and BERT tokenization"""
        super().__init__(model, device)
        
        # Initialize BERT for token suggestions
        print("Loading BERT model for token suggestions...")
        self.bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.bert_model = BertForMaskedLM.from_pretrained('bert-base-uncased').to(device)
        self.bert_model.eval()
        
        # Rename base class tokenizer for clarity
        self.gpt2_tokenizer = self.tokenizer
        
        # Define grammatical patterns for prompting
        self.grammatical_patterns = {
            'punctuation': [
                "In the sentence '{pre} [MASK] {post}', the punctuation",
                "The text reads '{pre} [MASK] {post}' with proper punctuation",
            ],
            'article': [
                "Consider '{pre} [MASK] {post}' as a phrase",
                "The phrase '{pre} [MASK] {post}' uses an article",
            ],
            'preposition': [
                "The words '{pre} [MASK] {post}' form a prepositional phrase",
                "'{pre} [MASK] {post}' shows spatial relationship",
            ],
            'conjunction': [
                "The sentence '{pre} [MASK] {post}' uses a conjunction",
                "Join the phrases: {pre} [MASK] {post}",
            ],
            'verb': [
                "The action in '{pre} [MASK] {post}' is",
                "Fill in the verb: {pre} [MASK] {post}",
            ],
            'general': [
                "Complete the phrase: {pre} [MASK] {post}",
                "Fill in: {pre} [MASK] {post}",
            ]
        }
        
        # Define token type sets
        self.articles = {'the', 'a', 'an'}
        self.prepositions = {'in', 'on', 'at', 'to', 'from', 'by', 'with'}
        self.conjunctions = {'and', 'but', 'or', 'nor', 'for', 'yet', 'so'}

    def get_token_type(self, token: str) -> str:
        """Determine the grammatical type of a token"""
        cleaned = token.strip().lower()
        if cleaned in string.punctuation:
            return 'punctuation'
        if cleaned in self.articles:
            return 'article'
        if cleaned in self.prepositions:
            return 'preposition'
        if cleaned in self.conjunctions:
            return 'conjunction'
        return 'general'

    def test_activation(self, pre_context, token_id, post_context, layer, neuron, 
                           min_activation=1e-4, relative_threshold=0.5):
        """
        Test neuron activation using graph_utils infrastructure with better activation handling
        
        Args:
            min_activation: Minimum absolute activation value to consider valid
            relative_threshold: Minimum ratio relative to max activation to consider valid
        """
        try:
            # Process context tokens
            if not pre_context:
                pre_ids = []
            elif isinstance(pre_context[0], str):
                pre_ids = [self.gpt2_tokenizer.encode(t)[0] for t in pre_context]
            else:
                pre_ids = pre_context
                
            if not post_context:
                post_ids = []
            elif isinstance(post_context[0], str):
                post_ids = [self.gpt2_tokenizer.encode(t)[0] for t in post_context]
            else:
                post_ids = post_context
            
            # Convert to full sequence
            sequence = pre_ids + [token_id] + post_ids
            text_input = self.gpt2_tokenizer.decode(sequence)
            position = len(pre_ids)
            
            # Use our existing activation testing
            _, initial_max, _, token_activations, actual_max_pos = fast_measure_importance_ablated(
                self, layer, neuron, text_input, 
                initial_argmax=position
            )
            
            # Get activation at our target position
            position_activation = None
            max_activation = None
            for token, act in token_activations:
                if position_activation is None and token == self.gpt2_tokenizer.decode([token_id]):
                    position_activation = act
                max_activation = max(max_activation or act, act)
            
            # Validation criteria:
            # 1. Must have some meaningful activation
            # 2. Must be reasonably close to max activation found
            if position_activation is not None:
                meets_min = abs(position_activation) >= min_activation
                ratio_to_max = abs(position_activation) / abs(max_activation) if abs(max_activation) > min_activation else 0
                meets_relative = ratio_to_max >= relative_threshold
                
                if meets_min or meets_relative:
                    return {
                        'activation': position_activation,
                        'token_activations': token_activations,
                        'position': position,
                        'sequence': text_input,
                        'meets_min': meets_min,
                        'meets_relative': meets_relative,
                        'ratio_to_max': ratio_to_max
                    }
            
            return None
            
        except Exception as e:
            print(f"Error in test_activation: {str(e)}")
            return None

    def get_bert_predictions_with_grammar(self, pre_context, post_context, original_token, k=10):
        """Get BERT predictions using grammatical prompting patterns"""
        token_type = self.get_token_type(original_token)
        patterns = self.grammatical_patterns[token_type]
        
        # Prepare context
        pre_text = ' '.join([t.replace('Ġ', ' ').strip() for t in pre_context]) if pre_context else ''
        post_text = ' '.join([t.replace('Ġ', ' ').strip() for t in post_context]) if post_context else ''
        
        predictions = []
        for pattern in patterns:
            masked_text = pattern.format(pre=pre_text, post=post_text).strip()
            inputs = self.bert_tokenizer(masked_text, return_tensors='pt').to(self.device)
            mask_idx = torch.where(inputs['input_ids'] == self.bert_tokenizer.mask_token_id)[1]
            
            with torch.no_grad():
                outputs = self.bert_model(**inputs)
                logits = outputs.logits[0, mask_idx]
                probs = torch.nn.functional.softmax(logits, dim=-1)
                
                top_k = torch.topk(probs, k * 2)
                
                for prob, token_id in zip(top_k.values[0], top_k.indices[0]):
                    token = self.bert_tokenizer.decode([token_id])
                    gpt2_tokens = self.gpt2_tokenizer.encode(token)
                    
                    if len(gpt2_tokens) == 1:
                        predictions.append({
                            'token': token,
                            'gpt2_id': gpt2_tokens[0],
                            'probability': prob.item(),
                            'pattern': pattern
                        })
        
        # Get unique predictions
        unique_predictions = {}
        for pred in sorted(predictions, key=lambda x: x['probability'], reverse=True):
            if pred['token'] not in unique_predictions:
                unique_predictions[pred['token']] = pred
                if len(unique_predictions) == k:
                    break
        
        return list(unique_predictions.values())

    def analyze_neuron_substitutions(self, layer: int, neuron: int, graph_path: Path, 
                                   debug_mode: bool = True) -> Optional[List[Dict]]:
        """Analyze possible token substitutions for a neuron"""
        print(f"\nAnalyzing L{layer}N{neuron}")
        
        try:
            # Load graph
            with open(graph_path) as f:
                graph_data = json.load(f)
            
            # Find activating nodes
            activating_nodes = [n for n, data in graph_data['nodes'].items() 
                              if data.get('is_activating', False)]
            
            if not activating_nodes:
                print("No activating nodes found")
                return None
            
            print(f"Found {len(activating_nodes)} activating patterns")
            results = []
            
            # Test each activating pattern
            for node in activating_nodes:
                if debug_mode:
                    print(f"\nTesting pattern: {node}")
                
                # Get context
                pre_edges = [e.split('->')[0] for e in graph_data['edges'] 
                           if e.split('->')[1] == node]
                
                # Get predictions
                predictions = self.get_bert_predictions_with_grammar(
                    pre_edges, [], node, k=5
                )
                
                # Test each prediction
                for pred in predictions:
                    try:
                        result = self.test_activation(
                            pre_context=pre_edges,
                            token_id=pred['gpt2_id'],
                            post_context=[],
                            layer=layer,
                            neuron=neuron
                        )
                        
                        if result:  # Only add successful tests
                            substitution = {
                                'original_token': node,
                                'substitute_token': pred['token'],
                                'activation': result['activation'],
                                'token_activations': result['token_activations'],
                                'bert_probability': pred['probability'],
                                'pre_context': pre_edges,
                                'activation_quality': {
                                    'meets_min_threshold': result['meets_min'],
                                    'meets_relative_threshold': result['meets_relative'],
                                    'ratio_to_max': result['ratio_to_max']
                                }
                            }
                            
                            if debug_mode:
                                print(f"\nSubstitution: {node} -> {pred['token']}")
                                print(f"Activation: {result['activation']:.4f}")
                                print(f"Ratio to max: {result['ratio_to_max']:.4f}")
                                if result['meets_min']:
                                    print("Meets minimum threshold")
                                if result['meets_relative']:
                                    print("Meets relative threshold")
                            
                            results.append(substitution)
                            
                    except Exception as e:
                        print(f"Error testing substitution: {str(e)}")
                        continue
            
            return results if results else None
            
        except Exception as e:
            print(f"Error analyzing neuron: {str(e)}")
            return None

    def analyze_layer_range(self, start_layer: int, end_layer: int, neurons_per_layer: int = 5):
        """Analyze a range of layers for substitution patterns"""
        results = {}
        
        for layer in range(start_layer, end_layer + 1):
            layer_results = []
            print(f"\nAnalyzing Layer {layer}")
            
            for neuron in range(neurons_per_layer):
                graph_path = Path('neuron_graphs/lbl') / f'layer_{layer}' / f'l{layer}_n{neuron}_graph.json'
                
                if graph_path.exists():
                    substitutions = self.analyze_neuron_substitutions(
                        layer=layer,
                        neuron=neuron,
                        graph_path=graph_path
                    )
                    if substitutions:
                        layer_results.extend(substitutions)
            
            if layer_results:
                results[layer] = layer_results
                print(f"Found {len(layer_results)} substitutions in layer {layer}")
        
        return results

In [2]:
# Analysis runner for ablated models
def run_ablated_batch_analysis():
    """Run complete analysis for neurons 0-199 in each layer of the ablated model"""
    print("Loading ablated GPT model...")
    model_path = 'configs/lbl_model_20241016.pt'  # Using the layer-by-layer ablated model
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model, config = load_model(model_path, device)
    model.eval()

    analyzer = AblatedBatchedTokenSubstitutionAnalyzer(model, device)
    
    # Analyze neurons 0-199 for each layer
    layer_range = range(8)
    neuron_range = range(200)
    
    print("Starting batched analysis...")
    results = analyzer.analyze_neuron_batch(layer_range, neuron_range)
    
    # Calculate and display statistics
    stats = analyzer.get_substitution_statistics(results)
    
    # Print analysis results
    print("\nAnalysis Results:")
    print(f"Total neurons analyzed: {stats['total_neurons_analyzed']}")
    print(f"Neurons with substitutions: {stats['neurons_with_substitutions']}")
    print(f"Success rate: {(stats['neurons_with_substitutions']/stats['total_neurons_analyzed'])*100:.2f}%")
    
    print("\nSubstitutions by layer:")
    for layer, count in sorted(stats['substitutions_by_layer'].items()):
        print(f"Layer {layer}: {count} substitutions")
    
    print("\nSubstitutions by token type:")
    for token_type, count in stats['substitutions_by_type'].items():
        print(f"{token_type}: {count} substitutions")
    
    print(f"\nAverage activation ratio: {stats['average_activation_ratio']:.3f}")
    
    # Save final results with timestamp
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    output_path = Path(f'ablated_substitution_analysis_{timestamp}.json')
    with open(output_path, 'w') as f:
        json.dump({
            'results': results,
            'statistics': stats,
            'model_info': {
                'path': model_path,
                'type': 'ablated',
                'config': config.__dict__
            }
        }, f, indent=2)
    
    print(f"\nFull results saved to {output_path}")

In [None]:
# Third cell - Execution
if __name__ == "__main__":
    run_ablated_batch_analysis()

In [None]:
# Get the existing analyzer instance and results
model_path = 'configs/lbl_model_20241016.pt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model, config = load_model(model_path, device)
analyzer = AblatedBatchedTokenSubstitutionAnalyzer(model, device)

# Load your most recent results file
results_path = "substitution/Ablated/ablated_substitution_analysis_20250116_235410.json"  # Update with your latest results file
with open(results_path, 'r') as f:
    data = json.load(f)
    results = data['results']

# Run the specialization analysis
analyzer.analyze_neuron_specialization(results)

In [None]:
# Load model and create analyzer
model_path = 'configs/lbl_model_20241016.pt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model, config = load_model(model_path, device)
analyzer = AblatedBatchedTokenSubstitutionAnalyzer(model, device)

# Define ranges for all layers
layer_range = range(8)  # All 8 layers
neuron_range = range(200)  # 200 neurons per layer

# Run analysis
print("\nStarting batch analysis with layer statistics...")
results = analyzer.analyze_neuron_batch(layer_range, neuron_range)

# Load most recent results for comparison
latest_results_path = "ablated_substitution_analysis_20250117_091755.json"
with open(latest_results_path, 'r') as f:
    previous_data = json.load(f)
    previous_results = previous_data['results']

print("\nComparison of Results:")
print("Previous analysis found neurons in layers:", 
      set(int(k.split('_')[0][1:]) for k in previous_results.keys()))
print("Current analysis found neurons in layers:", 
      set(int(k.split('_')[0][1:]) for k in results.keys()))

# Run specialization analysis on new results
print("\nSpecialization Analysis of New Results:")
analyzer.analyze_neuron_specialization(results)

In [5]:
import random

In [13]:
import random
import numpy as np
from collections import defaultdict
from pathlib import Path
import torch

# Load most recent results and model
model_path = 'configs/lbl_model_20241016.pt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model, config = load_model(model_path, device)
analyzer = AblatedBatchedTokenSubstitutionAnalyzer(model, device)

# Sample a few neurons from each layer to analyze activation patterns
sample_size = 10
activation_stats = defaultdict(list)

print("\nAnalyzing activation patterns across layers...")
for layer in range(8):  # All 8 layers
    print(f"\nLayer {layer}:")
    neurons = random.sample(range(200), sample_size)
    total_patterns = 0
    successful_neurons = 0
    
    for neuron in neurons:
        graph_path = Path('neuron_graphs/lbl') / f'layer_{layer}' / f'l{layer}_n{neuron}_graph.json'
        try:
            if graph_path.exists():
                graph = analyzer.load_neuron_graph(graph_path)
                patterns = analyzer.get_activation_patterns(graph)
                
                if patterns:
                    total_patterns += len(patterns)
                    successful_neurons += 1
                    activations = [p['activation'] for p in patterns]
                    stats = {
                        'mean': float(np.mean(activations)),
                        'max': float(np.max(activations)),
                        'std': float(np.std(activations)),
                        'pattern_count': len(patterns)
                    }
                    activation_stats[layer].append(stats)
                    print(f"  Neuron {neuron}: {len(patterns)} patterns, "
                          f"max activation: {stats['max']:.4f}")
                    
        except Exception as e:
            print(f"Error processing neuron {neuron} in layer {layer}: {str(e)}")
    
    print(f"Layer {layer} Summary:")
    print(f"Successful neurons: {successful_neurons}/{sample_size}")
    print(f"Total patterns found: {total_patterns}")

# Print comprehensive statistics for each layer
print("\nLayer-wise Activation Statistics:")
for layer in range(8):
    if activation_stats[layer]:
        layer_means = [s['mean'] for s in activation_stats[layer]]
        layer_maxes = [s['max'] for s in activation_stats[layer]]
        layer_patterns = [s['pattern_count'] for s in activation_stats[layer]]
        
        print(f"\nLayer {layer}:")
        print(f"Average activation: {np.mean(layer_means):.4f} (±{np.std(layer_means):.4f})")
        print(f"Average max activation: {np.mean(layer_maxes):.4f} (±{np.std(layer_maxes):.4f})")
        print(f"Average pattern count: {np.mean(layer_patterns):.1f}")
        print(f"Number of neurons with patterns: {len(activation_stats[layer])}/{sample_size}")

Model loaded successfully from configs/lbl_model_20241016.pt
Model type: ablated (layer-by-layer)
Number of parameters: 8709504
Initializing NeuronAnalyzer...
Detected ablated model - initializing HookedTransformer
HookedTransformer initialized successfully
NeuronAnalyzer initialized (Model type: ablated)
Loading BERT model for token suggestions...


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).



Analyzing activation patterns across layers...

Layer 0:
Error processing neuron 161 in layer 0: 'AblatedBatchedTokenSubstitutionAnalyzer' object has no attribute 'load_neuron_graph'
Error processing neuron 87 in layer 0: 'AblatedBatchedTokenSubstitutionAnalyzer' object has no attribute 'load_neuron_graph'
Error processing neuron 159 in layer 0: 'AblatedBatchedTokenSubstitutionAnalyzer' object has no attribute 'load_neuron_graph'
Error processing neuron 163 in layer 0: 'AblatedBatchedTokenSubstitutionAnalyzer' object has no attribute 'load_neuron_graph'
Error processing neuron 89 in layer 0: 'AblatedBatchedTokenSubstitutionAnalyzer' object has no attribute 'load_neuron_graph'
Error processing neuron 112 in layer 0: 'AblatedBatchedTokenSubstitutionAnalyzer' object has no attribute 'load_neuron_graph'
Error processing neuron 135 in layer 0: 'AblatedBatchedTokenSubstitutionAnalyzer' object has no attribute 'load_neuron_graph'
Error processing neuron 107 in layer 0: 'AblatedBatchedTokenSu

In [None]:
# Let's examine the actual graph files for deeper layers
print("\nAnalyzing graph structure across layers...")
graph_stats = defaultdict(lambda: {'total_graphs': 0, 'graphs_with_activations': 0, 'total_nodes': 0})

for layer in range(8):
    print(f"\nLayer {layer}:")
    for neuron in range(200):  # Check all neurons
        graph_path = Path('neuron_graphs/lbl') / f'layer_{layer}' / f'l{layer}_n{neuron}_graph.json'
        if graph_path.exists():
            graph_stats[layer]['total_graphs'] += 1
            try:
                with open(graph_path, 'r') as f:
                    graph_data = json.load(f)
                    # Count nodes with is_activating=True
                    activating_nodes = sum(1 for node, data in graph_data['nodes'].items() 
                                         if data.get('is_activating', False))
                    if activating_nodes > 0:
                        graph_stats[layer]['graphs_with_activations'] += 1
                        graph_stats[layer]['total_nodes'] += activating_nodes
                        
            except Exception as e:
                print(f"Error reading graph {neuron}: {str(e)}")
    
    stats = graph_stats[layer]
    if stats['total_graphs'] > 0:
        print(f"Total graphs: {stats['total_graphs']}")
        print(f"Graphs with activations: {stats['graphs_with_activations']}")
        print(f"Average activating nodes per graph: {stats['total_nodes']/stats['total_graphs']:.2f}")

In [None]:
print("\nAnalyzing activation strengths in graphs with activations...")
activation_strengths = defaultdict(list)

for layer in range(8):
    print(f"\nLayer {layer}:")
    for neuron in range(200):
        graph_path = Path('neuron_graphs/lbl') / f'layer_{layer}' / f'l{layer}_n{neuron}_graph.json'
        if graph_path.exists():
            with open(graph_path, 'r') as f:
                graph_data = json.load(f)
                # Get activation values for activating nodes
                activations = [data.get('activation', 0) 
                             for data in graph_data['nodes'].values() 
                             if data.get('is_activating', False)]
                if activations:
                    activation_strengths[layer].extend(activations)
    
    if activation_strengths[layer]:
        print(f"Number of activating patterns: {len(activation_strengths[layer])}")
        print(f"Mean activation: {np.mean(activation_strengths[layer]):.4f}")
        print(f"Max activation: {np.max(activation_strengths[layer]):.4f}")

In [None]:
# Initialize model and analyzer
import random
import numpy as np
from collections import defaultdict
from pathlib import Path
import torch
import json

model_path = 'configs/lbl_model_20241016.pt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model, config = load_model(model_path, device)
analyzer = AblatedBatchedTokenSubstitutionAnalyzer(model, device)

# Run analysis with updated method
print("Starting analysis with layer-adaptive thresholds...")
layer_range = range(8)
neuron_range = range(200)
results = analyzer.analyze_neuron_batch(layer_range, neuron_range)

# Print summary
print("\nResults by layer:")
layer_counts = defaultdict(int)
for neuron_id in results:
    layer = int(neuron_id.split('_')[0][1:])
    layer_counts[layer] += len(results[neuron_id])

for layer in sorted(layer_counts.keys()):
    print(f"Layer {layer}: {layer_counts[layer]} substitutions")

# Run specialization analysis
print("\nNeuron specialization analysis:")
analyzer.analyze_neuron_specialization(results)

In [None]:
# Initialize model and analyzer
import random
import numpy as np
from collections import defaultdict
from pathlib import Path
import torch
import json

model_path = 'configs/lbl_model_20241016.pt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model, config = load_model(model_path, device)
analyzer = AblatedBatchedTokenSubstitutionAnalyzer(model, device)

# Run focused analysis on a few neurons from each layer
print("\nDetailed Layer Analysis:")
for layer in range(8):
    print(f"\n=== Layer {layer} ===")
    # Take first 5 neurons that have activation patterns
    neurons_checked = 0
    neurons_with_patterns = 0
    
    for neuron in range(200):
        if neurons_with_patterns >= 5 and neurons_checked >= 20:  # Stop after finding 5 or checking 20
            break
            
        graph_path = Path('neuron_graphs/lbl') / f'layer_{layer}' / f'l{layer}_n{neuron}_graph.json'
        if graph_path.exists():
            neurons_checked += 1
            
            # Load and analyze graph
            graph = analyzer.load_neuron_graph(graph_path)
            patterns = analyzer.get_activation_patterns(graph)
            
            if patterns:
                neurons_with_patterns += 1
                print(f"\nNeuron {neuron}:")
                print(f"Number of patterns: {len(patterns)}")
                
                # Show activation values
                activations = [p['activation'] for p in patterns]
                print(f"Activation stats:")
                print(f"- Mean: {np.mean(activations):.4f}")
                print(f"- Max: {np.max(activations):.4f}")
                print(f"- Min: {np.min(activations):.4f}")
                
                # Show pattern details for first pattern
                if patterns:
                    pattern = patterns[0]
                    print(f"\nFirst pattern details:")
                    print(f"Token: {pattern['token']}")
                    print(f"Activation: {pattern['activation']:.4f}")
                    print(f"Pre-context length: {len(pattern['pre_context'])}")
                    print(f"Post-context length: {len(pattern['post_context'])}")
                
                # Try to get predictions and test activations
                if patterns:
                    pattern = patterns[0]
                    predictions = analyzer.get_bert_predictions_with_grammar(
                        pattern['pre_context'],
                        pattern['post_context'],
                        pattern['token']
                    )
                    
                    print(f"\nTesting first 3 predictions:")
                    for pred in predictions[:3]:
                        activation = analyzer.test_activation(
                            pattern['pre_context'],
                            pred['gpt2_id'],
                            pattern['post_context'],
                            layer,
                            neuron
                        )
                        print(f"'{pred['token']}': {activation:.4f}")
    
    print(f"\nLayer {layer} Summary:")
    print(f"Neurons checked: {neurons_checked}")
    print(f"Neurons with patterns: {neurons_with_patterns}")

In [None]:
# Test cell for specific neuron activation diagnostics
import torch
import json
from pathlib import Path
import random
import numpy as np
from collections import defaultdict
from pathlib import Path
import torch
import json

model_path = 'configs/lbl_model_20241016.pt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model, config = load_model(model_path, device)
analyzer = AblatedBatchedTokenSubstitutionAnalyzer(model, device)

# Let's test some of the neurons from your examples
test_cases = [
   (7, 87),  # Layer 7 neuron with 'said' pattern
   (6, 98),  # Layer 6 neuron with 'Tim' pattern
   (5, 114), # Layer 5 neuron with high activation (10.3284)
   (4, 87)   # Layer 4 neuron with low activation
]

print("Testing specific neurons with detailed activation tracking...")
for layer, neuron in test_cases:
   graph_path = Path('neuron_graphs/lbl') / f'layer_{layer}' / f'l{layer}_n{neuron}_graph.json'
   if graph_path.exists():
       analyzer.test_specific_neuron(layer, neuron, graph_path)
       print("\n" + "="*50 + "\n")

In [3]:
# Import required libraries
import os
import torch
from pathlib import Path
from model_loader import load_model
from tqdm import tqdm
import json
from datetime import datetime


# Set up device and paths
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")

# Load model
print("\nLoading model...")
model_path = 'configs/lbl_model_20241016.pt'
model, config = load_model(model_path, device)
model.eval()
print("Model loaded successfully!")

# Initialize analyzer
print("\nInitializing analyzer...")
analyzer = AblatedBatchedTokenSubstitutionAnalyzer(model, device)
print("Analyzer initialized!")

# Create output directory for results
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_dir = Path(f'substitution_results_{timestamp}')
output_dir.mkdir(exist_ok=True)
print(f"\nResults will be saved to: {output_dir}")

# Test parameters
test_config = {
    'start_layer': 3,
    'end_layer': 7,
    'neurons_per_layer': 3,
    'debug_mode': True
}

print("\nTest configuration:")
for key, value in test_config.items():
    print(f"{key}: {value}")

# Run analysis with detailed progress tracking
print("\nStarting analysis...")
results = {}
try:
    # Iterate through layers with progress bar
    layer_pbar = tqdm(range(test_config['start_layer'], test_config['end_layer'] + 1), 
                     desc="Processing layers")
    
    for layer in layer_pbar:
        layer_results = []
        layer_pbar.set_description(f"Processing layer {layer}")
        
        # Iterate through neurons with progress bar
        neuron_pbar = tqdm(range(test_config['neurons_per_layer']), 
                          desc=f"Layer {layer} neurons",
                          leave=False)
        
        for neuron in neuron_pbar:
            try:
                neuron_pbar.set_description(f"Layer {layer} Neuron {neuron}")
                
                # Check if graph exists
                graph_path = Path('neuron_graphs/lbl') / f'layer_{layer}' / f'l{layer}_n{neuron}_graph.json'
                if not graph_path.exists():
                    print(f"\nWarning: No graph file found at {graph_path}")
                    continue
                    
                # Analyze neuron
                print(f"\nAnalyzing L{layer}N{neuron}")
                substitutions = analyzer.analyze_neuron_substitutions(
                    layer=layer,
                    neuron=neuron,
                    graph_path=graph_path,
                    debug_mode=test_config['debug_mode']
                )
                
                # Process results
                if substitutions:
                    print(f"Found {len(substitutions)} substitutions")
                    layer_results.extend(substitutions)
                else:
                    print("No substitutions found")
                
            except Exception as e:
                print(f"\nError processing neuron {neuron} in layer {layer}: {str(e)}")
                import traceback
                traceback.print_exc()
                continue
        
        # Store layer results if any found
        if layer_results:
            results[layer] = layer_results
            print(f"\nLayer {layer} complete: {len(layer_results)} total substitutions")
        else:
            print(f"\nLayer {layer} complete: No substitutions found")
    
    # Save results
    if results:
        print("\nSaving results...")
        summary_data = {
            'timestamp': timestamp,
            'config': test_config,
            'results': results
        }
        
        output_file = output_dir / 'substitution_analysis.json'
        with open(output_file, 'w') as f:
            json.dump(summary_data, f, indent=2)
            
        print(f"Results saved to: {output_file}")
        
        # Print summary statistics
        print("\nSummary Statistics:")
        total_layers = len(range(test_config['start_layer'], test_config['end_layer'] + 1))
        layers_with_results = len(results)
        total_substitutions = sum(len(layer_results) for layer_results in results.values())
        avg_substitutions = total_substitutions / layers_with_results if layers_with_results else 0
        
        print(f"Total layers analyzed: {total_layers}")
        print(f"Layers with results: {layers_with_results}")
        print(f"Total substitutions found: {total_substitutions}")
        print(f"Average substitutions per layer with results: {avg_substitutions:.2f}")
        
        # Print example substitutions
        print("\nExample substitutions from first layer with results:")
        if results:
            first_layer = min(results.keys())
            if results[first_layer]:
                for i, sub in enumerate(results[first_layer][:3]):  # Show first 3 substitutions
                    print(f"\nSubstitution {i+1}:")
                    print(f"Original token: {sub['original_token']}")
                    print(f"Substitute token: {sub['substitute_token']}")
                    print(f"Activation: {sub['activation']:.4f}")
                    print(f"BERT probability: {sub['bert_probability']:.4f}")
    else:
        print("\nNo results found in any layer")
        
except Exception as e:
    print(f"\nError during analysis: {str(e)}")
    import traceback
    traceback.print_exc()

finally:
    # Cleanup
    print("\nCleaning up...")
    del model
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    print("Done!")

Using device: cuda

Loading model...
Model loaded successfully from configs/lbl_model_20241016.pt
Model type: ablated (layer-by-layer)
Number of parameters: 8709504
Model loaded successfully!

Initializing analyzer...
Initializing NeuronAnalyzer...
Detected ablated model - initializing HookedTransformer
HookedTransformer initialized successfully
NeuronAnalyzer initialized (Model type: ablated)
Loading BERT model for token suggestions...


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Analyzer initialized!

Results will be saved to: substitution_results_20250117_150955

Test configuration:
start_layer: 3
end_layer: 7
neurons_per_layer: 3
debug_mode: True

Starting analysis...


Processing layer 4:   0%|          | 0/5 [00:00<?, ?it/s]


Analyzing L3N0

Analyzing L3N0
No activating nodes found
No substitutions found

Analyzing L3N1

Analyzing L3N1
No activating nodes found
No substitutions found

Analyzing L3N2

Analyzing L3N2
No activating nodes found
No substitutions found

Layer 3 complete: No substitutions found


Processing layer 5:   0%|          | 0/5 [00:00<?, ?it/s]


Analyzing L4N0

Analyzing L4N0
No activating nodes found
No substitutions found

Analyzing L4N1

Analyzing L4N1
No activating nodes found
No substitutions found

Analyzing L4N2

Analyzing L4N2
No activating nodes found
No substitutions found

Layer 4 complete: No substitutions found


Processing layer 6:   0%|          | 0/5 [00:00<?, ?it/s]


Analyzing L5N0

Analyzing L5N0
No activating nodes found
No substitutions found

Analyzing L5N1

Analyzing L5N1
No activating nodes found
No substitutions found

Analyzing L5N2

Analyzing L5N2
No activating nodes found
No substitutions found

Layer 5 complete: No substitutions found


Processing layer 7:   0%|          | 0/5 [00:00<?, ?it/s]


Analyzing L6N0

Analyzing L6N0
No activating nodes found
No substitutions found

Analyzing L6N1

Analyzing L6N1
No activating nodes found
No substitutions found

Analyzing L6N2

Analyzing L6N2
No activating nodes found
No substitutions found

Layer 6 complete: No substitutions found


Processing layer 7: 100%|██████████| 5/5 [00:00<00:00, 88.13it/s]


Analyzing L7N0

Analyzing L7N0
No activating nodes found
No substitutions found

Analyzing L7N1

Analyzing L7N1
No activating nodes found
No substitutions found

Analyzing L7N2

Analyzing L7N2
No activating nodes found
No substitutions found

Layer 7 complete: No substitutions found

No results found in any layer

Cleaning up...
Done!





In [6]:
from pathlib import Path
import json

def check_graph_files(start_layer=0, end_layer=7, neurons_per_layer=200):
    base_path = Path('neuron_graphs/lbl')
    
    for layer in range(start_layer, end_layer + 1):
        print(f"\nLayer {layer} Summary:")
        found_files = 0
        files_with_nodes = 0
        total_activating_nodes = 0
        
        # Track which neurons have graphs and activating nodes
        neuron_status = []
        
        # Check each neuron
        for neuron in range(neurons_per_layer):
            graph_path = base_path / f'layer_{layer}' / f'l{layer}_n{neuron}_graph.json'
            
            if graph_path.exists():
                found_files += 1
                try:
                    with open(graph_path) as f:
                        graph_data = json.load(f)
                        
                    # Count activating nodes
                    activating_nodes = [n for n, data in graph_data['nodes'].items() 
                                      if data.get('is_activating', False)]
                    
                    if activating_nodes:
                        files_with_nodes += 1
                        total_activating_nodes += len(activating_nodes)
                        status = f"Found {len(activating_nodes)} activating nodes"
                    else:
                        status = "No activating nodes"
                except json.JSONDecodeError:
                    status = "Invalid JSON"
                except Exception as e:
                    status = f"Error: {str(e)}"
            else:
                status = "File not found"
            
            neuron_status.append(f"Neuron {neuron}: {status}")
        
        # Print summary for this layer
        print(f"Files found: {found_files}")
        print(f"Files with activating nodes: {files_with_nodes}")
        print(f"Total activating nodes: {total_activating_nodes}")
        print("\nDetailed status:")
        for status in neuron_status:
            print(status)
            
# Run the check
check_graph_files(start_layer=3, end_layer=7, neurons_per_layer=200)


Layer 3 Summary:
Files found: 200
Files with activating nodes: 18
Total activating nodes: 20

Detailed status:
Neuron 0: No activating nodes
Neuron 1: No activating nodes
Neuron 2: No activating nodes
Neuron 3: No activating nodes
Neuron 4: No activating nodes
Neuron 5: No activating nodes
Neuron 6: No activating nodes
Neuron 7: No activating nodes
Neuron 8: No activating nodes
Neuron 9: No activating nodes
Neuron 10: No activating nodes
Neuron 11: No activating nodes
Neuron 12: No activating nodes
Neuron 13: No activating nodes
Neuron 14: No activating nodes
Neuron 15: No activating nodes
Neuron 16: No activating nodes
Neuron 17: No activating nodes
Neuron 18: Found 1 activating nodes
Neuron 19: No activating nodes
Neuron 20: No activating nodes
Neuron 21: Found 1 activating nodes
Neuron 22: No activating nodes
Neuron 23: No activating nodes
Neuron 24: No activating nodes
Neuron 25: No activating nodes
Neuron 26: No activating nodes
Neuron 27: No activating nodes
Neuron 28: No activa

In [8]:
# Import required libraries
import os
import torch
from pathlib import Path
from model_loader import load_model
from tqdm import tqdm
import json
from datetime import datetime

def find_neurons_with_activations(layer, base_path='neuron_graphs/lbl'):
    """Find all neurons in a layer that have activating patterns"""
    neurons = []
    layer_path = Path(base_path) / f'layer_{layer}'
    
    if not layer_path.exists():
        print(f"Warning: Layer path {layer_path} not found")
        return []
        
    for graph_file in layer_path.glob(f'l{layer}_n*_graph.json'):
        try:
            # Extract neuron number from filename like 'l3_n144_graph'
            neuron = int(graph_file.stem.split('_n')[1].split('_')[0])
            with open(graph_file) as f:
                graph_data = json.load(f)
                
            # Check for activating nodes
            activating_nodes = [n for n, data in graph_data['nodes'].items() 
                              if data.get('is_activating', False)]
                              
            if activating_nodes:
                neurons.append((neuron, len(activating_nodes)))
        except Exception as e:
            print(f"Error processing {graph_file}: {str(e)}")
            continue
            
    return sorted(neurons)  # Sort by neuron index

# Set up device and paths
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")

# Load model
print("\nLoading model...")
model_path = 'configs/lbl_model_20241016.pt'
model, config = load_model(model_path, device)
model.eval()
print("Model loaded successfully!")

# Initialize analyzer
print("\nInitializing analyzer...")
analyzer = AblatedBatchedTokenSubstitutionAnalyzer(model, device)
print("Analyzer initialized!")

# Create output directory for results
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_dir = Path(f'substitution_results_{timestamp}')
output_dir.mkdir(exist_ok=True)
print(f"\nResults will be saved to: {output_dir}")

# Test parameters
test_config = {
    'start_layer': 3,
    'end_layer': 7,
    'debug_mode': True
}

print("\nTest configuration:")
for key, value in test_config.items():
    print(f"{key}: {value}")

# Run analysis with targeted neurons
print("\nStarting analysis...")
results = {}
try:
    # Iterate through layers
    layer_pbar = tqdm(range(test_config['start_layer'], test_config['end_layer'] + 1), 
                     desc="Processing layers")
    
    for layer in layer_pbar:
        # Find neurons with activating patterns
        neurons = find_neurons_with_activations(layer)
        if not neurons:
            print(f"\nNo neurons with activating patterns found in layer {layer}")
            continue
            
        print(f"\nFound {len(neurons)} neurons with activating patterns in layer {layer}:")
        for neuron, num_patterns in neurons:
            print(f"Neuron {neuron}: {num_patterns} patterns")
            
        layer_results = []
        for neuron, _ in neurons:
            try:
                # Check graph file
                graph_path = Path('neuron_graphs/lbl') / f'layer_{layer}' / f'l{layer}_n{neuron}_graph.json'
                if not graph_path.exists():
                    print(f"\nWarning: No graph file found at {graph_path}")
                    continue
                    
                # Analyze neuron
                print(f"\nAnalyzing L{layer}N{neuron}")
                substitutions = analyzer.analyze_neuron_substitutions(
                    layer=layer,
                    neuron=neuron,
                    graph_path=graph_path,
                    debug_mode=test_config['debug_mode']
                )
                
                # Process results
                if substitutions:
                    print(f"Found {len(substitutions)} substitutions")
                    layer_results.extend(substitutions)
                else:
                    print("No substitutions found")
                
            except Exception as e:
                print(f"\nError processing neuron {neuron} in layer {layer}: {str(e)}")
                import traceback
                traceback.print_exc()
                continue
        
        # Store layer results if any found
        if layer_results:
            results[layer] = layer_results
            print(f"\nLayer {layer} complete: {len(layer_results)} total substitutions")
        else:
            print(f"\nLayer {layer} complete: No substitutions found")
    
    # Save results
    if results:
        print("\nSaving results...")
        summary_data = {
            'timestamp': timestamp,
            'config': test_config,
            'results': results
        }
        
        output_file = output_dir / 'substitution_analysis.json'
        with open(output_file, 'w') as f:
            json.dump(summary_data, f, indent=2)
            
        print(f"Results saved to: {output_file}")
        
        # Print summary statistics
        print("\nSummary Statistics:")
        total_layers = len(range(test_config['start_layer'], test_config['end_layer'] + 1))
        layers_with_results = len(results)
        total_substitutions = sum(len(layer_results) for layer_results in results.values())
        avg_substitutions = total_substitutions / layers_with_results if layers_with_results else 0
        
        print(f"Total layers analyzed: {total_layers}")
        print(f"Layers with results: {layers_with_results}")
        print(f"Total substitutions found: {total_substitutions}")
        print(f"Average substitutions per layer with results: {avg_substitutions:.2f}")
        
except Exception as e:
    print(f"\nError during analysis: {str(e)}")
    import traceback
    traceback.print_exc()

finally:
    # Cleanup
    print("\nCleaning up...")
    del model
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    print("Done!")

Using device: cuda

Loading model...
Model loaded successfully from configs/lbl_model_20241016.pt
Model type: ablated (layer-by-layer)
Number of parameters: 8709504
Model loaded successfully!

Initializing analyzer...
Initializing NeuronAnalyzer...
Detected ablated model - initializing HookedTransformer
HookedTransformer initialized successfully
NeuronAnalyzer initialized (Model type: ablated)
Loading BERT model for token suggestions...


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Analyzer initialized!

Results will be saved to: substitution_results_20250117_151849

Test configuration:
start_layer: 3
end_layer: 7
debug_mode: True

Starting analysis...


Processing layers:   0%|          | 0/5 [00:00<?, ?it/s]


Found 18 neurons with activating patterns in layer 3:
Neuron 18: 1 patterns
Neuron 21: 1 patterns
Neuron 39: 1 patterns
Neuron 53: 1 patterns
Neuron 60: 1 patterns
Neuron 80: 1 patterns
Neuron 101: 1 patterns
Neuron 113: 1 patterns
Neuron 116: 1 patterns
Neuron 119: 1 patterns
Neuron 120: 1 patterns
Neuron 126: 1 patterns
Neuron 145: 1 patterns
Neuron 148: 1 patterns
Neuron 153: 1 patterns
Neuron 163: 1 patterns
Neuron 164: 2 patterns
Neuron 188: 2 patterns

Analyzing L3N18

Analyzing L3N18
Found 1 activating patterns

Testing pattern:  box

Found 1 important tokens using threshold 0.5
Important tokens found:
 -  the

Substitution:  box -> .
Activation: -0.0003
Ratio to max: 0.0000
Meets minimum threshold

Found 1 important tokens using threshold 0.5
Important tokens found:
 -  the

Substitution:  box -> ;
Activation: -0.0044
Ratio to max: 0.0000
Meets minimum threshold

Found 1 important tokens using threshold 0.5
Important tokens found:
 -  the

Found 1 important tokens using thresh

Processing layers:  20%|██        | 1/5 [00:45<03:01, 45.37s/it]


Found 0 important tokens using threshold 0.5
No tokens above threshold - neuron may be suppressed by ablation

Substitution: 
 -> up
Activation: 0.0044
Ratio to max: 1.0000
Meets minimum threshold
Meets relative threshold
Found 10 substitutions

Layer 3 complete: 63 total substitutions

Found 11 neurons with activating patterns in layer 4:
Neuron 6: 1 patterns
Neuron 19: 1 patterns
Neuron 49: 1 patterns
Neuron 62: 1 patterns
Neuron 87: 2 patterns
Neuron 104: 1 patterns
Neuron 111: 1 patterns
Neuron 123: 1 patterns
Neuron 124: 2 patterns
Neuron 134: 1 patterns
Neuron 135: 1 patterns

Analyzing L4N6

Analyzing L4N6
Found 1 activating patterns

Testing pattern:  are

Found 0 important tokens using threshold 0.5
No tokens above threshold - neuron may be suppressed by ablation

Substitution:  are -> .
Activation: -0.0991
Ratio to max: 1.0000
Meets minimum threshold
Meets relative threshold

Found 0 important tokens using threshold 0.5
No tokens above threshold - neuron may be suppressed by

Processing layers:  40%|████      | 2/5 [01:18<01:54, 38.24s/it]


Found 0 important tokens using threshold 0.5
No tokens above threshold - neuron may be suppressed by ablation

Substitution:  candy -> |
Activation: -0.0559
Ratio to max: 0.0000
Meets minimum threshold
Found 5 substitutions

Layer 4 complete: 26 total substitutions

Found 10 neurons with activating patterns in layer 5:
Neuron 18: 1 patterns
Neuron 80: 1 patterns
Neuron 101: 1 patterns
Neuron 111: 1 patterns
Neuron 114: 1 patterns
Neuron 122: 1 patterns
Neuron 143: 1 patterns
Neuron 146: 1 patterns
Neuron 156: 2 patterns
Neuron 160: 2 patterns

Analyzing L5N18

Analyzing L5N18
Found 1 activating patterns

Testing pattern: "

Found 0 important tokens using threshold 0.5
No tokens above threshold - neuron may be suppressed by ablation

Found 0 important tokens using threshold 0.5
No tokens above threshold - neuron may be suppressed by ablation

Substitution: " -> a
Activation: -0.0024
Ratio to max: 0.0503
Meets minimum threshold

Found 0 important tokens using threshold 0.5
No tokens abo

Processing layers:  60%|██████    | 3/5 [01:48<01:08, 34.28s/it]


Found 0 important tokens using threshold 0.5
No tokens above threshold - neuron may be suppressed by ablation
Found 5 substitutions

Layer 5 complete: 35 total substitutions

Found 7 neurons with activating patterns in layer 6:
Neuron 83: 1 patterns
Neuron 98: 1 patterns
Neuron 109: 1 patterns
Neuron 116: 1 patterns
Neuron 159: 2 patterns
Neuron 179: 1 patterns
Neuron 198: 1 patterns

Analyzing L6N83

Analyzing L6N83
Found 1 activating patterns

Testing pattern:  friends

Found 0 important tokens using threshold 0.5
No tokens above threshold - neuron may be suppressed by ablation

Found 0 important tokens using threshold 0.5
No tokens above threshold - neuron may be suppressed by ablation

Found 0 important tokens using threshold 0.5
No tokens above threshold - neuron may be suppressed by ablation

Found 0 important tokens using threshold 0.5
No tokens above threshold - neuron may be suppressed by ablation

Found 0 important tokens using threshold 0.5
No tokens above threshold - neuro

Processing layers:  80%|████████  | 4/5 [02:07<00:28, 28.49s/it]


Found 0 important tokens using threshold 0.5
No tokens above threshold - neuron may be suppressed by ablation

Substitution: " -> 1
Activation: -0.0034
Ratio to max: 0.0393
Meets minimum threshold
Found 5 substitutions

Layer 6 complete: 20 total substitutions

Found 10 neurons with activating patterns in layer 7:
Neuron 9: 1 patterns
Neuron 14: 1 patterns
Neuron 58: 2 patterns
Neuron 70: 1 patterns
Neuron 87: 1 patterns
Neuron 102: 1 patterns
Neuron 105: 1 patterns
Neuron 127: 2 patterns
Neuron 138: 1 patterns
Neuron 147: 2 patterns

Analyzing L7N9

Analyzing L7N9
Found 1 activating patterns

Testing pattern:  started

Found 0 important tokens using threshold 0.5
No tokens above threshold - neuron may be suppressed by ablation

Substitution:  started -> .
Activation: 3.0420
Ratio to max: 1.0000
Meets minimum threshold
Meets relative threshold

Found 0 important tokens using threshold 0.5
No tokens above threshold - neuron may be suppressed by ablation

Substitution:  started -> ;
Act

Processing layers: 100%|██████████| 5/5 [02:40<00:00, 32.06s/it]


Found 0 important tokens using threshold 0.5
No tokens above threshold - neuron may be suppressed by ablation
Found 4 substitutions

Layer 7 complete: 41 total substitutions

Saving results...
Results saved to: substitution_results_20250117_151849/substitution_analysis.json

Summary Statistics:
Total layers analyzed: 5
Layers with results: 5
Total substitutions found: 185
Average substitutions per layer with results: 37.00

Cleaning up...
Done!



