# Building Concept Vectors

**Experiment:** Extract and build concept vector libraries from model activations

**Date:** 2025-11-05

**Research Question:** Can we extract interpretable concept representations from model activations?

**Goals:**
- Extract activation patterns for specific concepts (emotions, topics, etc.)
- Build concept libraries
- Store concepts for reuse across projects
- Visualize concept space geometry

## Setup

In [None]:
# 1. Make the sibling ../code directory importable. The entry is a relative
#    path, so it is interpreted relative to the current working directory
#    (presumably the notebook's own folder -- confirm before running elsewhere).
import sys
sys.path.append('../code')

# 2. Import numerical and visualization libraries
#    (PCA is used for the 2D concept plot; TSNE is imported but not used in
#    the cells visible here).
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

# 3. Import introspection modules
#    (build_emotion_library is imported but not called in the cells visible
#    here; the emotion library is assembled by hand further down).
from theory_of_mind.introspection import (
    ActivationSteerer,     # Capture and steer activations
    ConceptLibrary,        # Store concept vectors
    build_emotion_library, # Build pre-defined emotion concepts
)

# 4. Import harness experiment tracking
from harness import ExperimentConfig, get_tracker

# 5. Confirm setup: reaching this line means every import above resolved
print("Imports successful")

## 1. Load Model

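Load a local MLX model for activation extraction. If MLX is not installed, the notebook only prints a notice and every activation-dependent cell below is skipped (no Ollama model is actually loaded).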

In [None]:
# Probe for the MLX backend. Later cells branch on `use_mlx`, which ends up
# True only when the import, the model load and the status prints all succeed.
try:
    from mlx_lm import load

    model_name = "mlx-community/Llama-3.2-3B-Instruct-4bit"
    model, tokenizer = load(model_name)

    print(f"Loaded model: {model_name}")
    layer_count = len(model.model.layers)
    print(f"Model layers: {layer_count}")
except ImportError:
    # Same two notices as before, emitted on separate lines.
    print(
        "MLX not available. Using Ollama instead.",
        "Note: Activation steering requires local model access (MLX).",
        sep="\n",
    )
    use_mlx = False
else:
    use_mlx = True

## 2. Create Activation Steerer

Initialize steerer for capturing and manipulating activations

In [None]:
if not use_mlx:
    print("Skipping (MLX required for activation steering)")
else:
    # Wrap the loaded model so activations can be captured from it.
    steerer = ActivationSteerer(model, tokenizer)

    # Smoke test: capture one prompt at layer 15 and summarize the values.
    test_prompt = "I am feeling happy today."
    activations = steerer.capture_activations(test_prompt, layer_idx=15)

    print(f"Captured activations shape: {activations.shape}")
    print("Activation statistics:")
    summary_stats = (
        ("Mean", np.mean),
        ("Std", np.std),
        ("Min", np.min),
        ("Max", np.max),
    )
    for stat_label, reducer in summary_stats:
        print(f"  {stat_label}: {reducer(activations):.4f}")

## 3. Build Emotion Concept Library

Extract concept vectors for basic emotions

In [None]:
if use_mlx:
    # Three paraphrased prompts per emotion; each emotion's concept vector is
    # the average activation over its prompts.
    emotion_prompts = {
        "happiness": [
            "I am feeling extremely happy and joyful!",
            "This is the best day ever, I'm so excited!",
            "I'm filled with joy and happiness.",
        ],
        "sadness": [
            "I feel deeply sad and melancholic.",
            "Everything feels heavy and sorrowful today.",
            "I'm overwhelmed with sadness and grief.",
        ],
        "anger": [
            "I am furious and enraged!",
            "This makes me so angry and frustrated!",
            "I'm filled with intense anger.",
        ],
        "fear": [
            "I'm terrified and scared.",
            "This is frightening and makes me anxious.",
            "I feel deep fear and worry.",
        ],
        "surprise": [
            "Wow, I'm completely surprised!",
            "This is so unexpected and shocking!",
            "I'm amazed and surprised by this.",
        ],
    }

    # Extract concept vectors
    library = ConceptLibrary()
    target_layer = 15  # Middle layer typically captures semantic info

    for emotion, prompts in emotion_prompts.items():
        print(f"Extracting concept vector for: {emotion}")

        # Pool every prompt down to a single vector BEFORE combining prompts.
        # The prompts tokenize to different lengths, so stacking the raw
        # per-token activations (as np.mean over the list would) yields a
        # ragged array and fails or silently produces an object array.
        pooled_per_prompt = []
        for prompt in prompts:
            acts = np.asarray(
                steerer.capture_activations(prompt, layer_idx=target_layer)
            )
            # Collapse all leading axes (sequence, and a batch axis if there
            # is one) and average them away.
            # NOTE(review): assumes the last axis is the hidden dimension --
            # confirm against ActivationSteerer.capture_activations.
            token_matrix = acts.reshape(-1, acts.shape[-1])
            pooled_per_prompt.append(token_matrix.mean(axis=0))

        # Average the per-prompt vectors; every prompt carries equal weight.
        concept_vector = np.mean(pooled_per_prompt, axis=0)

        library.add_concept(emotion, concept_vector, layer=target_layer)
        print(f"  Vector shape: {concept_vector.shape}")

    print(f"\nBuilt library with {len(library.concepts)} concepts")
    print(f"Concepts: {list(library.concepts.keys())}")
else:
    print("Skipping (MLX required)")

## 4. Visualize Concept Space

Use dimensionality reduction to visualize concept relationships

In [None]:
if not use_mlx:
    print("Skipping (MLX required)")
else:
    # Stack the stored concept vectors into one matrix, one row per concept.
    concept_names = list(library.concepts.keys())
    concept_vectors = list(map(library.get_concept, concept_names))
    concept_matrix = np.array(concept_vectors)

    # Project onto the first two principal components.
    pca = PCA(n_components=2)
    coords_2d = pca.fit_transform(concept_matrix)
    pc1, pc2 = coords_2d[:, 0], coords_2d[:, 1]

    # Scatter plot with each point labelled by its concept name.
    plt.figure(figsize=(10, 8))
    plt.scatter(pc1, pc2, s=100)

    for label, x_pos, y_pos in zip(concept_names, pc1, pc2):
        plt.annotate(
            label,
            (x_pos, y_pos),
            fontsize=12,
            xytext=(5, 5),
            textcoords='offset points'
        )

    plt.xlabel('PC1')
    plt.ylabel('PC2')
    plt.title('Emotion Concept Space (PCA)')
    plt.grid(True, alpha=0.3)
    plt.show()

    print(f"Explained variance: {pca.explained_variance_ratio_}")

## 5. Compute Concept Similarities

Measure cosine similarity between concepts

In [None]:
if use_mlx:
    from itertools import combinations

    from sklearn.metrics.pairwise import cosine_similarity

    # Pairwise cosine similarity between the rows of concept_matrix (built in
    # the visualization cell; row order matches concept_names).
    similarity_matrix = cosine_similarity(concept_matrix)

    # Plot heatmap; the color scale is pinned to the full cosine range [-1, 1]
    plt.figure(figsize=(8, 6))
    plt.imshow(similarity_matrix, cmap='coolwarm', vmin=-1, vmax=1)
    plt.colorbar(label='Cosine Similarity')
    plt.xticks(range(len(concept_names)), concept_names, rotation=45)
    plt.yticks(range(len(concept_names)), concept_names)
    plt.title('Concept Similarity Matrix')
    plt.tight_layout()
    plt.show()

    # Rank every unordered pair so the listing really runs from most to least
    # similar; previously the pairs were printed in index order under a
    # "most similar" heading. sorted() is stable, so ties keep index order.
    ranked_pairs = sorted(
        combinations(range(len(concept_names)), 2),
        key=lambda pair: similarity_matrix[pair],
        reverse=True,
    )

    print("\nConcept pairs, most to least similar:")
    for i, j in ranked_pairs:
        sim = similarity_matrix[i, j]
        print(f"  {concept_names[i]} <-> {concept_names[j]}: {sim:.3f}")
else:
    print("Skipping (MLX required)")

## 6. Save Concept Library

Store concept vectors for use in other projects

In [None]:
if use_mlx:
    import os

    # Shared concepts directory, so other projects can pick the library up.
    concepts_dir = "/home/user/hidden-layer/shared/concepts"

    # Build the file names from target_layer so the label cannot drift from
    # the layer actually extracted (for layer 15 the paths are unchanged).
    # Kept as plain strings because save_path_pkl is also logged as metadata.
    save_path_pkl = f"{concepts_dir}/emotions_layer{target_layer}.pkl"
    save_path_json = f"{concepts_dir}/emotions_layer{target_layer}.json"

    # Create the directory up front; this is a no-op when it already exists.
    # NOTE(review): library.save may create it too -- not visible from here.
    os.makedirs(concepts_dir, exist_ok=True)

    library.save(save_path_pkl)
    library.export_json(save_path_json)

    print(f"Saved library to:")
    print(f"  {save_path_pkl}")
    print(f"  {save_path_json}")

    # Round-trip check: reload the file that was just written.
    loaded_library = ConceptLibrary.load(save_path_pkl)
    print(f"\nSuccessfully loaded library with {len(loaded_library.concepts)} concepts")
else:
    print("Skipping (MLX required)")

## 7. Track Experiment

In [None]:
# Provider and model are recorded as "none" when the MLX backend was not
# available, so the run is still logged even though nothing was extracted.
provider_label = "mlx" if use_mlx else "none"
model_label = model_name if use_mlx else "none"

config = ExperimentConfig(
    experiment_name="concept_vectors_emotions",
    task_type="concept_extraction",
    strategy="activation_capture",
    provider=provider_label,
    model=model_label,
)

tracker = get_tracker()
run_dir = tracker.start_experiment(config)

if use_mlx:
    from harness import ExperimentResult

    # Summarize what the earlier cells produced: the concept names, the layer
    # they were taken from, the vector size and where the library was saved.
    output_note = f"Created library with {len(library.concepts)} concepts"
    run_metadata = {
        "concepts": list(library.concepts.keys()),
        "target_layer": target_layer,
        "vector_dim": concept_vector.shape[0],
        "saved_to": save_path_pkl,
    }

    result = ExperimentResult(
        config=config,
        task_input="Extract emotion concept vectors",
        output=output_note,
        eval_scores={},
        eval_metadata=run_metadata,
        success=True,
    )
    tracker.log_result(result)

# Close the run whether or not a result was logged.
summary = tracker.finish_experiment()
print(f"Experiment logged in: {run_dir}")

## Next Steps

1. **Activation Steering** (02_activation_steering.ipynb): Apply concept vectors to steer model behavior
2. **Introspection Tasks** (03_introspection_eval.ipynb): Test if models can report their internal states
3. **API Introspection** (04_api_introspection.ipynb): Test frontier models

## Key Insights

- Concept vectors capture semantic patterns in activation space
- Different layers may encode different levels of abstraction
- Concept similarity reflects semantic relationships
- Shared concept libraries enable cross-project research