# Activation Steering Experiments

**Experiment:** Apply concept vectors to steer model behavior

**Date:** 2025-11-05

**Research Question:** Can we controllably modify model behavior by injecting concept vectors into activations?

**Goals:**
- Load concept library from previous notebook
- Apply steering vectors during generation
- Measure behavioral changes
- Test steering strength and layer selection

In [None]:
# 1. Make the project's ../code directory importable. The entry is a relative
#    path, so it is resolved against the current working directory; it must be
#    appended before the project imports in steps 3 and 4 run.
import sys
sys.path.append('../code')

# 2. Import numerical and visualization libraries
import numpy as np
import matplotlib.pyplot as plt

# 3. Import introspection modules
from theory_of_mind.introspection import (
    ActivationSteerer,  # Wraps model + tokenizer; provides generate() / steer_generate()
    ConceptLibrary,     # Loads saved concept vectors via ConceptLibrary.load()
)

# 4. Import harness experiment tracking
from harness import ExperimentConfig, ExperimentResult, get_tracker

# 5. Confirm setup
print("Imports successful")

## 1. Load Model and Concept Library

In [None]:
# Bring up the local MLX model and wrap it in an ActivationSteerer.
# `use_mlx` is the flag every later cell checks before doing any work.
try:
    from mlx_lm import load

    model_name = "mlx-community/Llama-3.2-3B-Instruct-4bit"
    model, tokenizer = load(model_name)
    steerer = ActivationSteerer(model, tokenizer)
except ImportError:
    # No MLX in this environment: tell the user and disable the later cells.
    print("MLX not available. This notebook requires local model access.")
    use_mlx = False
else:
    print(f"Loaded model: {model_name}")
    use_mlx = True

# Load the concept vectors saved by the previous notebook. A missing file
# also clears `use_mlx`, so the steering cells below are skipped.
# NOTE(review): the .pkl extension suggests pickle deserialization; only load
# library files from a trusted source.
if use_mlx:
    library_path = "/home/user/hidden-layer/shared/concepts/emotions_layer15.pkl"
    try:
        library = ConceptLibrary.load(library_path)
        print(f"Loaded concept library with {len(library.concepts)} concepts")
        print(f"Available concepts: {list(library.concepts.keys())}")
    except FileNotFoundError:
        print(f"Concept library not found at {library_path}")
        print("Run 01_concept_vectors.ipynb first to create the library")
        use_mlx = False

## 2. Test Basic Steering

Apply a single concept vector and observe the resulting change in behavior

In [None]:
# Basic steering demo: one unsteered completion, then the same prompt steered
# with two opposing emotion vectors (layer 15, strength 1.0).
if not use_mlx:
    print("Skipping (MLX required)")
else:
    prompt = "Today's weather is"  # neutral prompt
    divider = "\n" + "=" * 60 + "\n"

    # Reference completion without any steering applied.
    print("Baseline (no steering):")
    baseline_output = steerer.generate(prompt, max_tokens=50)
    print(baseline_output)

    # Same prompt once per emotion; a divider separates consecutive outputs.
    for emotion in ("happiness", "sadness"):
        print(divider)
        print(f"With '{emotion}' steering:")
        steered_output = steerer.steer_generate(
            prompt,
            steering_vector=library.get_concept(emotion),
            layer_idx=15,
            strength=1.0,
            max_tokens=50,
        )
        print(steered_output)

## 3. Test Steering Strength

Vary the strength parameter to control steering intensity

In [None]:
# Strength sweep: steer one prompt with the same concept vector while only
# the `strength` argument changes (0.0 is the first setting in the sweep).
if not use_mlx:
    print("Skipping (MLX required)")
else:
    prompt = "The future looks"
    concept_name = "happiness"
    concept_vector = library.get_concept(concept_name)
    strengths = [0.0, 0.5, 1.0, 2.0]

    print(f"Testing steering strengths for '{concept_name}':\n")

    # Arguments that stay fixed for every run of the sweep.
    fixed_args = {
        "steering_vector": concept_vector,
        "layer_idx": 15,
        "max_tokens": 30,
    }
    for strength in strengths:
        print(f"Strength = {strength}:")
        output = steerer.steer_generate(prompt, strength=strength, **fixed_args)
        print(output)
        print()

## 4. Test Layer Selection

Compare steering effects across different layers

In [None]:
# Layer sweep: inject the same concept vector at an early, a middle and a late
# layer and print each completion for comparison.
# NOTE(review): the library file name ("emotions_layer15") suggests the vectors
# were extracted at layer 15 - confirm they are meaningful at other layers.
if not use_mlx:
    print("Skipping (MLX required)")
else:
    prompt = "The story begins with"
    concept_name = "fear"
    concept_vector = library.get_concept(concept_name)

    # Pick the three injection points relative to the model's depth.
    num_layers = len(model.model.layers)
    early, middle, late = 5, num_layers // 2, num_layers - 5
    test_layers = [early, middle, late]

    print(f"Testing layer selection for '{concept_name}' steering:\n")

    for layer_idx in test_layers:
        print(f"Layer {layer_idx}:")
        output = steerer.steer_generate(
            prompt,
            steering_vector=concept_vector,
            layer_idx=layer_idx,
            strength=1.0,
            max_tokens=30,
        )
        print(output, end="\n\n")

## 5. Concept Blending

Combine multiple concept vectors for complex steering

In [None]:
# Concept blending: steer with a weighted sum of two concept vectors and
# compare the result against an unsteered completion of the same prompt.
if not use_mlx:
    print("Skipping (MLX required)")
else:
    prompt = "The experience was"

    # 70% happiness + 30% surprise, combined before injection.
    happiness_vec = library.get_concept("happiness")
    surprise_vec = library.get_concept("surprise")
    blended_vec = 0.7 * happiness_vec + 0.3 * surprise_vec

    # Unsteered reference.
    print("Baseline:")
    baseline = steerer.generate(prompt, max_tokens=30)
    print(baseline, end="\n\n")

    # Steered with the blended vector at layer 15, strength 1.0.
    print("With blended steering (70% happiness + 30% surprise):")
    blend_args = {"layer_idx": 15, "strength": 1.0, "max_tokens": 30}
    blended_output = steerer.steer_generate(
        prompt, steering_vector=blended_vec, **blend_args
    )
    print(blended_output)

## 6. Systematic Evaluation

Evaluate steering effectiveness across multiple prompts and concepts

In [None]:
# Systematic evaluation: for every test prompt, generate one baseline and one
# steered completion per concept in the library, then tabulate the results.
# The resulting `df` is what the tracking cell logs.
if not use_mlx:
    print("Skipping (MLX required)")
else:
    import pandas as pd

    test_prompts = [
        "The day started",
        "She felt",
        "The news made everyone",
        "Looking at the results, I am",
    ]

    results = []
    for prompt in test_prompts:
        # One unsteered completion per prompt, shared by all of its rows.
        baseline = steerer.generate(prompt, max_tokens=20)

        for concept_name in library.concepts.keys():
            # Layer 15 / strength 1.0 are the settings recorded by the
            # tracking cell; keep the two in sync.
            steered = steerer.steer_generate(
                prompt,
                steering_vector=library.get_concept(concept_name),
                layer_idx=15,
                strength=1.0,
                max_tokens=20,
            )
            record = {
                "prompt": prompt,
                "concept": concept_name,
                "baseline": baseline,
                "steered": steered,
            }
            results.append(record)

    df = pd.DataFrame(results)
    print(df.to_string())

## 7. Track Experiment

In [None]:
# Describe this run for the experiment tracker. Provider and model fall back
# to "none" when the model or the concept library could not be loaded.
config = ExperimentConfig(
    experiment_name="activation_steering",
    task_type="steering",
    strategy="concept_injection",
    provider="mlx" if use_mlx else "none",
    model=model_name if use_mlx else "none",
)

tracker = get_tracker()
run_dir = tracker.start_experiment(config)

# Once an experiment has been started, always finish it, even if logging a
# row raises (e.g. `df` is undefined because the evaluation cell was skipped).
# Otherwise the tracker is left with a started-but-never-finished run.
# NOTE(review): assumes finish_experiment() is safe to call after a partial
# run - confirm against the harness implementation.
try:
    if use_mlx:
        # One logged result per (prompt, concept) row of the evaluation table.
        for _, row in df.iterrows():
            result = ExperimentResult(
                config=config,
                task_input=row['prompt'],
                output=row['steered'],
                eval_metadata={
                    "concept": row['concept'],
                    "baseline": row['baseline'],
                    # Must mirror the layer_idx / strength used when `df`
                    # was generated in the systematic evaluation cell.
                    "layer": 15,
                    "strength": 1.0,
                },
                success=True,
            )
            tracker.log_result(result)
finally:
    summary = tracker.finish_experiment()

# Only reached when every row was logged successfully.
print(f"\nExperiment logged in: {run_dir}")

## Key Findings

**Expected Observations:**
1. Steering strength controls the intensity of behavioral change
2. Middle layers (10-20) typically provide the best semantic steering
3. Early layers affect surface form; late layers affect high-level concepts
4. Concept blending enables fine-grained control

**Research Questions:**
- How reliable is steering across different prompts?
- Can we predict the optimal layer for each concept?
- Does steering preserve model capabilities?
- Can we detect if steering has been applied?

## Next Steps

1. **Introspection Evaluation** (03_introspection_eval.ipynb): Test if models can report steered states
2. **API Introspection** (04_api_introspection.ipynb): Test frontier models with prompt-based steering