# Feature Analysis: Token-level activations for target features

This notebook analyzes token-level activations for specific SAE features on prompts, using a two-stage approach:
1. Screen all prompts for target feature activation
2. Extract detailed token activations only for prompts where target features fire

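For each target feature, results are saved to `active.jsonl` and `inactive.jsonl` inside a feature-specific output directory. Position-specific results are saved alongside them as `active_{token_type}.jsonl` and `inactive_{token_type}.jsonl`.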

In [23]:
import csv
import json
import torch
import os
import numpy as np
import pandas as pd
from pathlib import Path
from typing import List, Dict, Tuple, Optional
from transformers import AutoTokenizer, AutoModelForCausalLM
from huggingface_hub import hf_hub_download
from dictionary_learning.utils import load_dictionary
from tqdm.auto import tqdm
from sae_lens import SAE
from datasets import load_dataset

# Pick the compute device: CUDA when it is available, otherwise CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda


## Configuration

In [None]:
from dataclasses import dataclass
from typing import Dict, Tuple, Optional

@dataclass
class ModelConfig:
    """Model-specific settings: model ids, chat-template markers and SAE lookup templates."""
    base_model_name: str  # Identifier of the base (non-chat) model
    chat_model_name: str  # Identifier of the chat/instruct model
    hf_release: str  # Reference only - actual loading uses saelens_release/sae_id
    assistant_header: str  # Text that opens the assistant turn in the chat template
    token_offsets: Dict[str, int]  # Named positions, as offsets from the end of the assistant header
    sae_base_path: str  # Local directory under which SAE files are stored
    saelens_release: str  # Template for sae_lens release parameter
    sae_id_template: str  # Template for sae_lens sae_id parameter
    base_url: str  # Base URL for neuronpedia

    def get_sae_params(self, sae_layer: int, sae_trainer: str) -> Tuple[str, str]:
        """
        Generate SAE lens release and sae_id parameters.

        The release template decides how ``sae_trainer`` is interpreted:

        * ``llama_scope_lxr_{trainer}``: the trainer string is substituted
          verbatim into both the release and the sae_id template.
        * ``gemma-scope-9b-pt-res``: the trainer string is split on ``-`` and
          must look like ``<width>-l0-<l0>`` (e.g. ``131k-l0-34``); width and
          l0 are substituted into the sae_id template.
        * ``gemma-scope-9b-pt-res-canonical``: only the leading ``<width>``
          part of the trainer string is used.

        Args:
            sae_layer: Layer number for the SAE
            sae_trainer: Trainer identifier for the SAE

        Returns:
            Tuple of (release, sae_id) for sae_lens.SAE.from_pretrained()

        Raises:
            ValueError: If the release template is unknown, or if the trainer
                string lacks the ``<l0>`` component the release requires.
        """
        release_template = self.saelens_release

        if release_template == "llama_scope_lxr_{trainer}":
            release = release_template.format(trainer=sae_trainer)
            sae_id = self.sae_id_template.format(layer=sae_layer, trainer=sae_trainer)
            return release, sae_id

        if release_template == "gemma-scope-9b-pt-res":
            # Parse SAE_TRAINER "131k-l0-34" into components for Gemma
            parts = sae_trainer.split("-")
            if len(parts) < 3:
                # Fail with a readable message instead of a bare IndexError on parts[2]
                raise ValueError(
                    f"SAE trainer {sae_trainer!r} must look like '<width>-l0-<l0>' "
                    f"(e.g. '131k-l0-34') for release {release_template!r}"
                )
            width = parts[0]  # "131k"
            l0_value = parts[2]  # "34"
            sae_id = self.sae_id_template.format(layer=sae_layer, width=width, l0=l0_value)
            return release_template, sae_id

        if release_template == "gemma-scope-9b-pt-res-canonical":
            # Canonical SAEs are addressed by width only; the l0 part is ignored
            width = sae_trainer.split("-")[0]  # "131k"
            sae_id = self.sae_id_template.format(layer=sae_layer, width=width)
            return release_template, sae_id

        raise ValueError(f"Unknown SAE lens release template: {self.saelens_release}")

# Registry of supported models, keyed by the short name that MODEL_TYPE selects
MODEL_CONFIGS = dict(
    llama=ModelConfig(
        saelens_release="llama_scope_lxr_{trainer}",
        sae_id_template="l{layer}r_{trainer}",
        hf_release="fnlp/Llama3_1-8B-Base-LXR-32x",
        sae_base_path="/workspace/sae/llama-3.1-8b/saes",
        base_model_name="meta-llama/Llama-3.1-8B",
        chat_model_name="meta-llama/Llama-3.1-8B-Instruct",
        assistant_header="<|start_header_id|>assistant<|end_header_id|>",
        token_offsets=dict(asst=-2, endheader=-1, newline=0),
        base_url="https://www.neuronpedia.org/llama-3.1-8b/{layer}-llamascope-res-131k",
    ),
    gemma=ModelConfig(
        saelens_release="gemma-scope-9b-pt-res-canonical",
        sae_id_template="layer_{layer}/width_{width}/canonical",
        hf_release="google/gemma-scope-9b-pt-res/layer_{layer}/width_{width}/average_l0_{l0}",
        sae_base_path="/workspace/sae/gemma-2-9b/saes",
        base_model_name="google/gemma-2-9b",
        chat_model_name="google/gemma-2-9b-it",
        assistant_header="<start_of_turn>model",
        token_offsets=dict(model=-1, newline=0),
        base_url="https://www.neuronpedia.org/gemma-2-9b/{layer}-gemmascope-res-131k",
    ),
)

# =============================================================================
# MODEL SELECTION - Change this to switch between models
# =============================================================================
MODEL_TYPE = "gemma"  # Options: "gemma" or "llama"
# "chat" loads the chat model, "base" loads the base model (any other value is rejected below)
MODEL_VER = "chat"
# Layer the SAE belongs to; also used as the index of the hooked model layer
SAE_LAYER = 20
# SAE variant identifier; for Gemma it is split on "-" into "<width>-l0-<l0>"
SAE_TRAINER = "131k-l0-114"
# Number of prompts sampled from the prompt dataset
N_PROMPTS = 1000

# =============================================================================
# TARGET FEATURES - Specify which features to analyze
# =============================================================================
TARGET_FEATURES = [91547, 65116, 85422, 80134, 74855, 71187, 102414, 10392, 128628, 8524, 57516, 21953, 26196, 90900, 11383, 111921, 74079]  # List of feature IDs to analyze
# A prompt counts as "active" for a feature when its maximum activation is
# strictly greater than this value
ACTIVATION_THRESHOLD = 0.0  # Minimum activation to consider "active"

# =============================================================================
# DEDUPLICATION AND CONFIGURATION SETUP
# =============================================================================
# Resolve the settings of the selected model, rejecting unknown model types
try:
    config = MODEL_CONFIGS[MODEL_TYPE]
except KeyError:
    raise ValueError(f"Unknown MODEL_TYPE: {MODEL_TYPE}. Available: {list(MODEL_CONFIGS.keys())}") from None

# Drop repeated feature IDs, keeping each one at the position of its first occurrence
original_count = len(TARGET_FEATURES)
TARGET_FEATURES = sorted(set(TARGET_FEATURES), key=TARGET_FEATURES.index)
duplicates_removed = original_count - len(TARGET_FEATURES)

if duplicates_removed:
    print(f"🔄 Removed {duplicates_removed} duplicate feature(s) from TARGET_FEATURES")

# Choose which checkpoint to load for the requested model version
if MODEL_VER not in ("chat", "base"):
    raise ValueError(f"Unknown MODEL_VER: {MODEL_VER}. Use 'chat' or 'base'")
MODEL_NAME = config.chat_model_name if MODEL_VER == "chat" else config.base_model_name

# Always use chat model for tokenizer (has chat template)
CHAT_MODEL_NAME = config.chat_model_name

# Module-level shortcuts for the remaining per-model settings
SAE_BASE_PATH = config.sae_base_path
TOKEN_OFFSETS = config.token_offsets
ASSISTANT_HEADER = config.assistant_header

# =============================================================================
# OUTPUT FILE CONFIGURATION
# =============================================================================
# Root of the results tree; every feature gets its own sub-directory below it
BASE_OUTPUT_DIR = f"./results/6_active_prompts/{MODEL_TYPE}_trainer{SAE_TRAINER}_layer{SAE_LAYER}/{N_PROMPTS}_prompts"

# =============================================================================
# CHECK FOR EXISTING DIRECTORIES AND FILTER TARGET FEATURES
# =============================================================================
# Split the requested features into those that already have an output
# directory (skipped, with a warning) and those that still need processing
original_target_features = list(TARGET_FEATURES)
existing_features = []
filtered_target_features = []

for feature_id in original_target_features:
    feature_dir = f"{BASE_OUTPUT_DIR}/{feature_id}"
    if not os.path.exists(feature_dir):
        filtered_target_features.append(feature_id)
        continue
    existing_features.append(feature_id)
    print(f"⚠️  WARNING: Directory already exists for feature {feature_id}, skipping: {feature_dir}")

# From here on only the not-yet-processed features are considered
TARGET_FEATURES = filtered_target_features

if existing_features:
    print(f"\n🔄 Skipped {len(existing_features)} existing features: {existing_features}")

if TARGET_FEATURES:
    print(f"\n✅ Will process {len(TARGET_FEATURES)} new features: {TARGET_FEATURES}")
else:
    print(f"\n❌ No new features to process - all {len(original_target_features)} features already have existing directories!")
    print("To reprocess existing features, delete their directories first.")

# =============================================================================
# DERIVED CONFIGURATIONS
# =============================================================================
# Local SAE directory for the chosen layer/trainer; the hooked layer is the SAE layer
LAYER_INDEX = SAE_LAYER
SAE_PATH = f"{SAE_BASE_PATH}/resid_post_layer_{SAE_LAYER}/trainer_{SAE_TRAINER}"

# Prompt dataset, sampling seed and the local cache file for the sampled prompts
SEED = 42
PROMPTS_HF = "lmsys/lmsys-chat-1m"
PROMPTS_PATH = f"/workspace/data/{PROMPTS_HF.split('/')[-1]}/chat_{N_PROMPTS}.jsonl"
# Make sure the cache directory exists before anything tries to write into it
Path(PROMPTS_PATH).parent.mkdir(parents=True, exist_ok=True)

# Batch size and maximum tokenized prompt length used during processing
MAX_LENGTH = 512
BATCH_SIZE = 32

# =============================================================================
# SUMMARY
# =============================================================================
print(
    f"\nConfiguration Summary:",
    f"  Model Type: {MODEL_TYPE}",
    f"  Model to load: {MODEL_NAME}",
    f"  SAE Layer: {SAE_LAYER}, Trainer: {SAE_TRAINER}",
    f"  Target Features: {TARGET_FEATURES}",
    f"  Activation Threshold: {ACTIVATION_THRESHOLD}",
    f"  Base Output Directory: {BASE_OUTPUT_DIR}",
    sep="\n",
)

## Load Data

In [30]:
def load_lmsys_prompts(prompts_path: str, prompts_hf: str, n_prompts: int, seed: int) -> pd.DataFrame:
    """Return the prompt table, reading a local cache or sampling it from a HF dataset.

    If ``prompts_path`` exists it is read as JSON lines and returned as-is
    (``prompts_hf``, ``n_prompts`` and ``seed`` are then ignored). Otherwise
    the ``train`` split of ``prompts_hf`` is shuffled with ``seed``, the first
    ``n_prompts`` rows are kept, and the result is written to ``prompts_path``.

    Args:
        prompts_path: Location of the JSON-lines cache file.
        prompts_hf: Dataset identifier passed to ``load_dataset``.
        n_prompts: Number of rows to keep after shuffling.
        seed: Shuffle seed.

    Returns:
        DataFrame of prompts. A freshly sampled table has the columns
        ``conversation_id``, ``prompt``, ``redacted`` and ``language``.
    """
    # Fast path: a previously sampled file is reused unchanged
    if os.path.exists(prompts_path):
        print(f"Prompts already exist at {prompts_path}")
        return pd.read_json(prompts_path, lines=True)

    print(f"Prompts do not exist at {prompts_path}. Loading from {prompts_hf}...")
    dataset = load_dataset(prompts_hf)
    dataset = dataset['train'].shuffle(seed=seed).select(range(n_prompts))
    df = dataset.to_pandas()

    # The prompt is the content of the first message of each conversation
    df['prompt'] = df['conversation'].apply(lambda x: x[0]['content'])

    # Only keep some columns
    df = df[['conversation_id', 'prompt', 'redacted', 'language']]

    # Create the target directory here so the write cannot fail after the
    # dataset has been loaded just because the caller did not create it
    parent_dir = os.path.dirname(prompts_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Save to .jsonl file
    df.to_json(prompts_path, orient='records', lines=True)
    return df

# Fetch the prompt table (cached file if present, otherwise sampled from the dataset)
prompts_df = load_lmsys_prompts(
    prompts_path=PROMPTS_PATH,
    prompts_hf=PROMPTS_HF,
    n_prompts=N_PROMPTS,
    seed=SEED,
)
print(f"Loaded {prompts_df.shape[0]} prompts")

Prompts already exist at /workspace/data/lmsys-chat-1m/chat_1000.jsonl
Loaded 1000 prompts


## Load Model and SAE

In [31]:
# Load tokenizer.
# It is always taken from the chat model so that a chat template is available
# even when MODEL_VER == "base"; prompts are formatted with apply_chat_template.
tokenizer = AutoTokenizer.from_pretrained(CHAT_MODEL_NAME)
# Batched tokenization needs a pad token; fall back to EOS when none is defined
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
# Right padding keeps real tokens at the same positions as in the unpadded prompt
tokenizer.padding_side = "right"

print(f"Tokenizer loaded: {tokenizer.__class__.__name__}")

Tokenizer loaded: GemmaTokenizerFast


In [32]:
# Load the language model in bfloat16, place all of it on device 0 and
# switch it to inference mode
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map={"": 0},
    torch_dtype=torch.bfloat16,
).eval()

print(f"Model loaded: {model.__class__.__name__}")
print(f"Model device: {next(model.parameters()).device}")

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Model loaded: Gemma2ForCausalLM
Model device: cuda:0


In [36]:
def load_sae(config: ModelConfig, sae_path: str, sae_layer: int, sae_trainer: str) -> SAE:
    """
    Unified SAE loading function that handles both Llama and Gemma models.

    A local copy is used when ``sae_path`` contains ``sae_weights.safetensors``.
    Otherwise the SAE is fetched through sae_lens using the release/sae_id pair
    produced by ``config.get_sae_params`` and then saved to ``sae_path`` so
    that later runs can load it from disk.

    Args:
        config: ModelConfig object containing model-specific settings
        sae_path: Local path to store/load SAE files
        sae_layer: Layer number for the SAE
        sae_trainer: Trainer identifier for the SAE

    Returns:
        SAE: Loaded SAE model
    """
    # Check if SAE file exists locally
    print(f"Loading SAE from {sae_path}")
    ae_file_path = os.path.join(sae_path, "sae_weights.safetensors")

    if os.path.exists(ae_file_path):
        print(f"✓ Found SAE files at: {os.path.dirname(ae_file_path)}")
        return SAE.load_from_disk(sae_path)

    print(f"SAE not found locally, downloading from HF via sae_lens...")
    os.makedirs(os.path.dirname(sae_path), exist_ok=True)

    # Get SAE parameters from config
    release, sae_id = config.get_sae_params(sae_layer, sae_trainer)
    print(f"Loading SAE with release='{release}', sae_id='{sae_id}'")

    # sae_lens wants the device as a string; fall back to CPU when CUDA is
    # unavailable instead of failing on a hard-coded "cuda"
    load_device = "cuda" if torch.cuda.is_available() else "cpu"

    # Load the SAE using sae_lens
    # NOTE(review): the 3-tuple unpacking assumes the installed sae_lens version
    # returns (sae, cfg_dict, sparsity) from from_pretrained - confirm on upgrade
    sae, _, _ = SAE.from_pretrained(
        release=release,
        sae_id=sae_id,
        device=load_device
    )

    # Save the SAE locally for future use
    sae.save_model(sae_path)
    return sae

# Load the SAE through the unified loader and place it on the active device
sae = load_sae(
    config=config,
    sae_path=SAE_PATH,
    sae_layer=SAE_LAYER,
    sae_trainer=SAE_TRAINER,
).to(device)

print(f"SAE loaded with {sae.cfg.d_sae} features")
print(f"SAE device: {next(sae.parameters()).device}")

Loading SAE from /workspace/sae/gemma-2-9b/saes/resid_post_layer_20/trainer_131k-l0-114
✓ Found SAE files at: /workspace/sae/gemma-2-9b/saes/resid_post_layer_20/trainer_131k-l0-114
SAE loaded with 131072 features
SAE device: cuda:0


## Two-Stage Processing

In [None]:
class StopForward(Exception):
    """Raised from a forward hook to abort the forward pass once the target layer has run."""

@torch.no_grad()
def process_prompts_for_features(prompts: List[str], target_features: List[int], 
                                layer_idx: int, activation_threshold: float = 0.0) -> Dict[int, Tuple[List[Dict], List[Dict]]]:
    """Two-stage processing: screen for target features, then get detailed tokens for active prompts.

    Each prompt is wrapped as a single user message with the chat template
    (generation prompt included), run through the model up to layer
    ``layer_idx``, and the layer output is encoded by the SAE. Only real
    (non-padding) tokens, as given by the attention mask, are considered.

    Relies on the module-level ``model``, ``tokenizer``, ``sae``, ``device``,
    ``BATCH_SIZE`` and ``MAX_LENGTH``.

    Args:
        prompts: Raw user prompts.
        target_features: SAE feature indices to analyze.
        layer_idx: Index into ``model.model.layers`` of the layer to hook.
        activation_threshold: A prompt is active for a feature when the
            feature's maximum activation over the prompt's tokens is strictly
            greater than this value.

    Returns:
        Dict mapping feature_id to (active_prompts, inactive_prompts) for that feature.
        Every record holds ``prompt_id``, ``prompt_text``, ``tokenized_prompt``
        and ``max_feature_activation``; active records additionally hold
        ``tokens``, one entry per token with a strictly positive activation.
        Token ``position`` is the index within the prompt's non-padding tokens,
        i.e. an index into ``tokenized_prompt``.

    Raises:
        IndexError: If a target feature index is outside the SAE's feature range.
    """

    # Initialize results dictionary for each feature: (active_prompts, inactive_prompts)
    results = {feature_id: ([], []) for feature_id in target_features}
    if not target_features:
        # Nothing to screen for, so skip the forward passes entirely
        return results

    # Get target layer
    target_layer = model.model.layers[layer_idx]

    # Column selector so only the requested features are kept from the SAE output
    feature_index = torch.tensor(list(target_features), dtype=torch.long)
    lowest_feature, highest_feature = min(target_features), max(target_features)
    chunk_size = BATCH_SIZE * 8

    # Process in batches
    for i in tqdm(range(0, len(prompts), BATCH_SIZE), desc="Processing prompts"):
        batch_prompts = prompts[i:i+BATCH_SIZE]

        # Format prompts as chat messages
        formatted_prompts = [
            tokenizer.apply_chat_template(
                [{"role": "user", "content": prompt}],
                tokenize=False, add_generation_prompt=True
            )
            for prompt in batch_prompts
        ]

        # Tokenize batch
        batch_inputs = tokenizer(
            formatted_prompts,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=MAX_LENGTH
        )

        # Move to device
        batch_inputs = {k: v.to(device) for k, v in batch_inputs.items()}

        # Stage 1: capture the target layer's output, then abort the forward pass
        activations = None

        def hook_fn(module, input, output):
            nonlocal activations
            activations = output[0] if isinstance(output, tuple) else output
            raise StopForward()

        handle = target_layer.register_forward_hook(hook_fn)

        try:
            _ = model(**batch_inputs)
        except StopForward:
            pass
        finally:
            # Always detach the hook so a failure cannot leave it installed
            handle.remove()

        # Apply SAE to get feature activations
        batch_size, seq_len, hidden_dim = activations.shape
        # reshape (not view) so a non-contiguous layer output is handled too
        flat_activations = activations.reshape(-1, hidden_dim)

        # Process SAE in chunks to avoid memory issues. Only the target columns
        # are kept before moving to CPU, so the full feature matrix is never
        # copied off-device.
        sae_features = []
        for chunk_start in range(0, flat_activations.shape[0], chunk_size):
            chunk_features = sae.encode(flat_activations[chunk_start:chunk_start + chunk_size])
            n_features = chunk_features.shape[-1]
            if highest_feature >= n_features or lowest_feature < -n_features:
                # Checked on the host so a bad index does not become a device-side assert
                raise IndexError(
                    f"target feature index out of range for SAE with {n_features} features"
                )
            selected = chunk_features[:, feature_index.to(chunk_features.device)]
            sae_features.append(selected.cpu())

        # [batch, seq_len, len(target_features)]
        sae_features = torch.cat(sae_features, dim=0).view(batch_size, seq_len, -1)

        # The attention mask (not the pad token id) identifies real tokens: the
        # pad token may be the EOS token, which can also occur inside a prompt
        attention_mask = batch_inputs['attention_mask'].cpu().bool()
        input_ids_cpu = batch_inputs['input_ids'].cpu()

        # Stage 2: Process each prompt for each target feature separately
        for batch_idx, prompt in enumerate(batch_prompts):
            prompt_idx = i + batch_idx
            valid = attention_mask[batch_idx]
            token_ids = input_ids_cpu[batch_idx][valid].tolist()
            prompt_features = sae_features[batch_idx][valid]  # [n_tokens, len(target_features)]

            # Token strings for the real tokens, in order
            tokenized_prompt = [tokenizer.decode([token_id]) for token_id in token_ids]

            # Process each target feature separately
            for column, feature_id in enumerate(target_features):
                feature_activations = prompt_features[:, column]  # [n_tokens]
                max_activation = float(feature_activations.max())

                if max_activation > activation_threshold:
                    # Detailed token analysis: only non-zero activations are stored
                    tokens = [
                        {
                            'position': pos,
                            'token_id': token_ids[pos],
                            'text': tokenized_prompt[pos],
                            'feature_activation': activation_val
                        }
                        for pos, activation_val in enumerate(feature_activations.tolist())
                        if activation_val > 0
                    ]

                    results[feature_id][0].append({
                        'prompt_id': prompt_idx,
                        'prompt_text': prompt,
                        'tokenized_prompt': tokenized_prompt,
                        'max_feature_activation': max_activation,
                        'tokens': tokens
                    })
                else:
                    # Inactive prompt - just basic info
                    results[feature_id][1].append({
                        'prompt_id': prompt_idx,
                        'prompt_text': prompt,
                        'tokenized_prompt': tokenized_prompt,
                        'max_feature_activation': max_activation
                    })

    return results

print("Updated two-stage processing function defined")

## Process All Prompts

In [None]:
# Announce the run, then screen every prompt against every target feature
print(f"Processing {len(prompts_df)} prompts for {len(TARGET_FEATURES)} target features...")
print(f"Target features: {TARGET_FEATURES}")
print(f"Activation threshold: {ACTIVATION_THRESHOLD}")

feature_results = process_prompts_for_features(
    prompts=prompts_df['prompt'].tolist(),
    target_features=TARGET_FEATURES,
    layer_idx=LAYER_INDEX,
    activation_threshold=ACTIVATION_THRESHOLD,
)

# Per-feature summary: prompt counts and the number of stored token activations
print(f"\nResults by feature:")
for feature_id in TARGET_FEATURES:
    active_prompts, inactive_prompts = feature_results[feature_id]
    total_tokens = sum(map(len, (record['tokens'] for record in active_prompts)))
    print(f"  Feature {feature_id}: {len(active_prompts)} active, {len(inactive_prompts)} inactive, {total_tokens} active tokens")

## Save Results

In [None]:
# Write one directory per feature containing active.jsonl and inactive.jsonl
for feature_id in TARGET_FEATURES:
    active_prompts, inactive_prompts = feature_results[feature_id]

    # Feature-specific output directory and the two files inside it
    feature_dir = f"{BASE_OUTPUT_DIR}/{feature_id}"
    os.makedirs(feature_dir, exist_ok=True)
    active_file = f"{feature_dir}/active.jsonl"
    inactive_file = f"{feature_dir}/inactive.jsonl"

    # One JSON object per line: active prompts first, then inactive ones
    print(f"Saving {len(active_prompts)} active prompts for feature {feature_id} to {active_file}")
    with open(active_file, 'w') as f:
        f.writelines(json.dumps(record) + '\n' for record in active_prompts)

    print(f"Saving {len(inactive_prompts)} inactive prompts for feature {feature_id} to {inactive_file}")
    with open(inactive_file, 'w') as f:
        f.writelines(json.dumps(record) + '\n' for record in inactive_prompts)

    # Nothing to preview when the feature never fired
    if not active_prompts:
        continue

    # Preview the first active prompt and its strongest token
    sample = active_prompts[0]
    print(f"  Sample active prompt for feature {feature_id}:")
    print(f"    Prompt: {sample['prompt_text'][:100]}...")
    print(f"    Max activation: {sample['max_feature_activation']}")
    print(f"    Active tokens: {len(sample['tokens'])}")
    if sample['tokens']:
        top_token = max(sample['tokens'], key=lambda entry: entry['feature_activation'])
        print(f"    Top token: '{top_token['text']}' (position {top_token['position']})")
        print(f"    Token activation: {top_token['feature_activation']}")
    print()

print(f"✓ Results saved successfully for all {len(TARGET_FEATURES)} features!")

In [None]:
def find_assistant_position(input_ids: torch.Tensor, attention_mask: torch.Tensor, 
                          assistant_header: str, token_offset: int, tokenizer, device) -> int:
    """Find the position of the assistant token based on the given offset.

    The header is encoded without special tokens and searched for in
    ``input_ids``. The LAST occurrence is used: callers append the assistant
    header as the generation prompt at the end, so an earlier match can only
    come from header text inside the user prompt itself.

    Args:
        input_ids: 1-D tensor of token ids for one prompt.
        attention_mask: 1-D mask for the same prompt; its sum is taken as the
            number of real tokens.
        assistant_header: Header text that opens the assistant turn.
        token_offset: Offset added to the index just past the header.
        tokenizer: Object providing ``encode(text, add_special_tokens=False)``.
        device: Unused; kept so existing callers keep working.

    Returns:
        The index just past the matched header plus ``token_offset``, clamped
        to ``[0, last real token]``. When the header is empty or not found,
        the index of the last real token.
    """
    header_ids = tokenizer.encode(assistant_header, add_special_tokens=False)
    n_header = len(header_ids)

    # Index of the last non-padding token; used both as fallback and as upper bound
    last_valid = int(attention_mask.sum().item()) - 1

    position = None
    if n_header > 0:  # an empty header would otherwise "match" at every index
        # Built once, outside the search loop
        header = torch.tensor(header_ids, device=input_ids.device, dtype=input_ids.dtype)
        # Scan from the end so the generation-prompt header wins
        for start in range(len(input_ids) - n_header, -1, -1):
            if torch.equal(input_ids[start:start + n_header], header):
                position = start + n_header + token_offset
                break

    if position is None:
        # Fallback to last non-padding token
        position = last_valid

    # Ensure position is within bounds
    return int(max(min(position, last_valid), 0))

@torch.no_grad()
def filter_by_token_position(active_prompts: List[Dict], inactive_prompts: List[Dict], 
                           token_type: str, token_offset: int, feature_id: int,
                           activation_threshold: float = 0.0) -> Tuple[List[Dict], List[Dict]]:
    """Filter prompts based on whether target feature is active at specific token positions.

    Every prompt record (active and inactive alike) is re-tokenized with the
    chat template to locate the target position relative to the assistant
    header. The record counts as active at that position when its ``tokens``
    list contains an entry for the position whose activation is strictly
    greater than ``activation_threshold``.

    Relies on the module-level ``tokenizer``, ``MAX_LENGTH``,
    ``ASSISTANT_HEADER`` and ``device``.

    Args:
        active_prompts: Records that carry a per-token ``tokens`` list.
        inactive_prompts: Records without token details.
        token_type: Label stored in each output record.
        token_offset: Offset from the end of the assistant header.
        feature_id: Not used here; activations are read from the records'
            existing ``tokens`` entries.
        activation_threshold: Minimum (exclusive) activation at the position.

    Returns:
        (token_active, token_inactive) lists of records. Each record holds
        ``prompt_id``, ``prompt_text``, ``tokenized_prompt``, ``token_type``,
        ``token_position``, ``max_feature_activation`` and
        ``max_activation_at_position``; active records also hold
        ``position_tokens``.
    """

    token_active = []
    token_inactive = []

    # Process all prompts (both active and inactive from main analysis)
    for prompt_data in active_prompts + inactive_prompts:
        prompt_text = prompt_data['prompt_text']

        # Tokenize once, formatted as a chat message to match the main processing
        messages = [{"role": "user", "content": prompt_text}]
        formatted_prompt = tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        inputs = tokenizer(
            formatted_prompt,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=MAX_LENGTH
        )
        input_ids = inputs['input_ids'].squeeze(0)
        attention_mask = inputs['attention_mask'].squeeze(0)

        # Use existing tokenized_prompt if available, otherwise decode the real
        # tokens. The attention mask decides what is real: comparing against
        # the pad token id would also drop genuine tokens when pad == EOS.
        if 'tokenized_prompt' in prompt_data:
            tokenized_prompt = prompt_data['tokenized_prompt']
        else:
            tokenized_prompt = [
                tokenizer.decode([token_id])
                for token_id, keep in zip(input_ids.tolist(), attention_mask.tolist())
                if keep
            ]

        # Find the specific token position
        target_position = find_assistant_position(
            input_ids, attention_mask, ASSISTANT_HEADER, token_offset, tokenizer, device
        )

        # Token details recorded at the target position (absent for inactive prompts)
        position_tokens = [
            token_data for token_data in prompt_data.get('tokens', [])
            if token_data['position'] == target_position
        ]
        # Activations at that position that clear the threshold
        above_threshold = [
            token_data['feature_activation'] for token_data in position_tokens
            if token_data['feature_activation'] > activation_threshold
        ]
        max_activation_at_position = max([0.0, *above_threshold])

        # Create token-specific record
        token_record = {
            'prompt_id': prompt_data['prompt_id'],
            'prompt_text': prompt_data['prompt_text'],
            'tokenized_prompt': tokenized_prompt,
            'token_type': token_type,
            'token_position': target_position,
            'max_feature_activation': prompt_data['max_feature_activation'],
            'max_activation_at_position': max_activation_at_position
        }

        if above_threshold:
            # Include token details for this specific position
            token_record['position_tokens'] = position_tokens
            token_active.append(token_record)
        else:
            token_inactive.append(token_record)

    return token_active, token_inactive

print("Updated token-specific filtering functions defined")

In [None]:
# Position-specific pass: for every feature and every named token position,
# split the prompts by activation at that position and write both groups
print(f"Processing token-specific analysis for {len(TARGET_FEATURES)} features and {len(TOKEN_OFFSETS)} token types...")

for feature_id in TARGET_FEATURES:
    feature_dir = f"{BASE_OUTPUT_DIR}/{feature_id}"
    active_prompts, inactive_prompts = feature_results[feature_id]

    print(f"\nProcessing feature {feature_id}:")

    for token_type, token_offset in TOKEN_OFFSETS.items():
        print(f"  Processing token type: {token_type} (offset: {token_offset})")

        # Split prompts by whether the feature fires at this token position
        token_active, token_inactive = filter_by_token_position(
            active_prompts,
            inactive_prompts,
            token_type,
            token_offset,
            feature_id,
            ACTIVATION_THRESHOLD,
        )

        # Output files for this token type
        active_file = f"{feature_dir}/active_{token_type}.jsonl"
        inactive_file = f"{feature_dir}/inactive_{token_type}.jsonl"

        print(f"    Active prompts at {token_type} position: {len(token_active)}")
        print(f"    Inactive prompts at {token_type} position: {len(token_inactive)}")

        # One JSON object per line; the active file is written before the inactive one
        for out_path, records in ((active_file, token_active), (inactive_file, token_inactive)):
            with open(out_path, 'w') as f:
                f.writelines(json.dumps(record) + '\n' for record in records)

        print(f"    Saved: {active_file}")
        print(f"    Saved: {inactive_file}")

        # Nothing to preview when no prompt is active at this position
        if not token_active:
            continue

        sample = token_active[0]
        print(f"    Sample active prompt at {token_type} position:")
        print(f"      Position: {sample['token_position']}")
        print(f"      Max activation: {sample['max_activation_at_position']}")
        position_tokens = sample.get('position_tokens')
        if position_tokens:
            token_data = position_tokens[0]
            print(f"      Token text: '{token_data['text']}'")
            print(f"      Feature activation: {token_data['feature_activation']}")

# Closing summary of what was written and where
print(f"\n✓ Token-specific analysis complete for all features!")
print(f"Results saved in feature-specific directories under: {BASE_OUTPUT_DIR}")
print(f"Each feature directory contains:")
print(f"  - active.jsonl / inactive.jsonl (general)")
for token_type in TOKEN_OFFSETS:
    print(f"  - active_{token_type}.jsonl / inactive_{token_type}.jsonl (position-specific)")