# Config-Behavioral Integration

**Goal**: Integrate config-based similarity with behavioral/capability data (LMArena) to analyze genotype-phenotype relationships.

**Key Questions**:
1. Does config drift predict behavioral drift?
2. Which architectural features predict capability (Elo)?
3. Do architecture clusters align with behavioral clusters?
4. Are ecosystem-successful models also behaviorally successful?
5. Are some families more behaviorally coherent than others?

**Contents**:
- LMArena data loading and model name mapping
- Behavioral drift analysis (config change → capability change)
- Architecture-capability regression (predicting Elo from config)
- Architecture vs behavioral cluster comparison
- Ecosystem fitness vs behavioral fitness analysis

**Dependencies**: Uses Gower distance function. Requires LMArena data (CSV/JSON) for full functionality.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from scipy.spatial.distance import pdist, squareform
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
import warnings
warnings.filterwarnings('ignore')

# Figure defaults applied to every plot below (same look as the main repo)
plt.rcParams.update({
    'figure.figsize': (12, 6),
    'font.size': 10,
})

In [None]:
# Read the expanded config table; each row is reported below as one model
config_csv_path = 'data/model_configs_expanded.csv'
df = pd.read_csv(config_csv_path, low_memory=False)

n_models, n_columns = df.shape
print(f"Loaded {n_models:,} models with config.json")
print(f"Total columns: {n_columns}")

## 1. Setup: Feature Preparation and Gower Distance

**Note**: This section redefines utilities for self-contained execution.

In [None]:
# Candidate config columns, grouped by the aspect of the model they describe
architecture_features = [
    'config_model_type',
    'config_hidden_size',
    'config_num_hidden_layers',
    'config_num_attention_heads',
    'config_intermediate_size',
]
capacity_features = [
    'config_vocab_size',
    'config_max_position_embeddings',
    'config_num_key_value_heads',
]
precision_features = [
    'config_torch_dtype',
    'config_rope_theta',
    'config_rope_scaling_type',
]
boolean_features = ['uses_moe', 'uses_gqa', 'uses_rope', 'uses_quantization']

all_features = [
    *architecture_features,
    *capacity_features,
    *precision_features,
    *boolean_features,
]
# Keep only the candidates that exist in the loaded table
available_features = [name for name in all_features if name in df.columns]

# Buckets filled by the type-sniffing loop below
numeric_features = []
categorical_features = []
boolean_feature_list = []

# String spellings accepted as evidence that a column is boolean
_boolean_spellings = {'true', 'false', '1', '0', 'yes', 'no', 'nan'}

for name in available_features:
    if name not in df.columns:
        continue

    # The type is inferred from the first 100 non-null values only
    probe = df[name].dropna().head(100)
    if probe.empty:
        continue  # nothing to inspect, so the feature stays unclassified

    try:
        pd.to_numeric(probe, errors='raise')
    except (ValueError, TypeError):
        # Not numeric: boolean if at most two distinct boolean-looking values
        distinct = probe.unique()
        spellings = {str(value).lower() for value in distinct}
        if len(distinct) <= 2 and spellings <= _boolean_spellings:
            boolean_feature_list.append(name)
        else:
            categorical_features.append(name)
    else:
        numeric_features.append(name)

print(f"Features prepared: {len(available_features)} total")

In [None]:
# Gower distance function (needed for config drift computation)

# String spellings that mean "false". bool() alone treats every non-empty
# string, including 'false', as True.
_FALSE_STRINGS = frozenset({'false', '0', 'no', ''})


def _as_bool(value):
    """Coerce a config value to bool, honouring string spellings like 'false'."""
    if isinstance(value, str):
        return value.strip().lower() not in _FALSE_STRINGS
    return bool(value)


def _shared_values(x, y, col):
    """Return (x[col], y[col]) if both vectors hold a non-missing value, else None."""
    if col not in x.index or col not in y.index:
        return None
    x_val, y_val = x[col], y[col]
    if pd.isna(x_val) or pd.isna(y_val):
        return None
    return x_val, y_val


def gower_distance(x, y, numeric_cols, categorical_cols, boolean_cols):
    """Compute a Gower-style distance between two config vectors.

    Each feature that is present and non-missing in both vectors contributes
    a dissimilarity in [0, 1]; the result is the mean over those features.

    Args:
        x, y: Label-indexed vectors (e.g. pandas Series / DataFrame rows);
            only ``.index`` membership and ``[col]`` lookup are used.
        numeric_cols: Features compared as ``|a - b| / max(|a|, |b|)``
            (0 when both values are zero). Values that cannot be converted
            to float are skipped and not counted.
        categorical_cols: Features compared by string equality (0 or 1).
        boolean_cols: Features compared after boolean coercion (0 or 1).
            String values such as 'false', '0' and 'no' count as False.

    Returns:
        Mean per-feature dissimilarity as a float, or 1.0 when the vectors
        share no comparable feature.
    """
    distance = 0.0
    count = 0

    for col in numeric_cols:
        pair = _shared_values(x, y, col)
        if pair is None:
            continue
        try:
            x_num, y_num = float(pair[0]), float(pair[1])
        except (ValueError, TypeError):
            continue
        scale = max(abs(x_num), abs(y_num))
        if scale > 0:
            distance += abs(x_num - y_num) / scale
        count += 1

    for col in categorical_cols:
        pair = _shared_values(x, y, col)
        if pair is None:
            continue
        if str(pair[0]) != str(pair[1]):
            distance += 1.0
        count += 1

    for col in boolean_cols:
        pair = _shared_values(x, y, col)
        if pair is None:
            continue
        if _as_bool(pair[0]) != _as_bool(pair[1]):
            distance += 1.0
        count += 1

    return distance / count if count > 0 else 1.0


print("✓ Gower distance function defined")

## 2. Load Config Drift Data (from notebook 09)

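**Note**: If you've run notebook 09, the pre-computed drift pairs (`config_drift_pairs.csv`) are loaded here. Otherwise an empty drift table is used, and config drift must first be computed in notebook 09 before the drift-based analyses below can produce results.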

In [None]:
# Load the pre-computed drift pairs when the CSV exists; otherwise fall back
# to an empty frame and tell the user where drift is computed.
import os

drift_csv_path = 'config_drift_pairs.csv'
if not os.path.exists(drift_csv_path):
    print("⚠ config_drift_pairs.csv not found")
    print("  This notebook can still run behavioral analyses, but config drift")
    print("  will need to be computed (see notebook 09 for drift computation)")
    df_drift = pd.DataFrame()
else:
    df_drift = pd.read_csv(drift_csv_path)
    print(f"✓ Loaded pre-computed drift data: {len(df_drift):,} pairs")

## 3. Behavioral Data: Analysis Outline and LMArena Name Mapping

In [None]:
# Placeholder for behavioral drift analysis.
# The real analysis requires joining config data with behavioral evaluation
# datasets; this cell only prints what is needed and what will be computed.
# The "\n" escapes below are real newlines (blank separator lines), not the
# literal backslash-n text.

print("Behavioral drift analysis (placeholder)")
print("\nThis analysis requires:")
print("  1. LMArena Elo scores or Arena-Hard-Auto scores")
print("  2. P2L (Prompt-to-Label) performance vectors")
print("  3. Joining behavioral data with config drift data")
print("\nWhen behavioral data is available, this section will:")
print("  - Compute Δ_behavior = Elo(child) - Elo(parent)")
print("  - Regress capability change on config drift")
print("  - Identify which config changes correlate with capability changes")
print("  - Analyze prompt-specific behavioral drift (P2L clusters)")

# Example structure for when data is available:
# behavioral_data = {
#     'modelId': [...],
#     'elo_score': [...],
#     'arena_hard_score': [...],
#     'p2l_vector': [...]
# }
#
# df_behavioral = pd.DataFrame(behavioral_data)
# df_drift_with_behavior = df_drift.merge(df_behavioral, left_on='child_id', right_on='modelId', how='inner')
#
# # Compute behavioral change
# df_drift_with_behavior = df_drift_with_behavior.merge(
#     df_behavioral, left_on='parent_id', right_on='modelId',
#     suffixes=('_child', '_parent'), how='inner'
# )
# df_drift_with_behavior['delta_elo'] = df_drift_with_behavior['elo_score_child'] - df_drift_with_behavior['elo_score_parent']
#
# # Regression: capability change ~ config drift
# from sklearn.linear_model import LinearRegression
# X = df_drift_with_behavior[['drift']].values
# y = df_drift_with_behavior['delta_elo'].values
# model = LinearRegression().fit(X, y)
#
# print(f"Regression: Δ_Elo ~ Config_Drift")
# print(f"  Coefficient: {model.coef_[0]:.3f}")
# print(f"  R²: {model.score(X, y):.3f}")

In [None]:
# Map LMArena model names to HuggingFace model IDs.
# This is a critical step - Arena uses display names, HF uses repo paths.
#
# Reads:  df['modelId'] and, when it exists, the global `lmarena_data`
#         (needs columns 'model' and 'elo_rating'; 'elo_uncertainty' optional).
# Writes: df['elo_rating'], df['elo_uncertainty'] always; when Arena data is
#         present also df['normalized_id'], df['arena_model'] and
#         lmarena_data['normalized_name'].


def normalize_model_name(name):
    """Normalize a model name for matching.

    Lower-cases and strips the name, then removes the chat / instruct / base
    markers in both their " (x)" and "x-" spellings. Returns None for a
    missing value.
    """
    if pd.isna(name):
        return None
    name = str(name).lower().strip()
    for marker in (' (chat)', ' (instruct)', ' (base)', 'chat-', 'instruct-', 'base-'):
        name = name.replace(marker, '')
    return name


# No cell in this notebook is guaranteed to have defined `lmarena_data`;
# a missing variable is treated as "no data" instead of raising NameError.
lmarena_data = globals().get('lmarena_data')

_arena_ready = (
    lmarena_data is not None
    and {'model', 'elo_rating'}.issubset(lmarena_data.columns)
)

if _arena_ready:
    print("Mapping LMArena model names to HuggingFace modelIds...")

    # Normalize Arena display names and the repo-name part of each HF modelId
    lmarena_data['normalized_name'] = lmarena_data['model'].apply(normalize_model_name)
    df['normalized_id'] = df['modelId'].apply(
        lambda repo_id: normalize_model_name(
            repo_id if pd.isna(repo_id) else str(repo_id).split('/')[-1]
        )
    )

    arena = lmarena_data.rename(columns={'model': 'arena_model'})
    if 'elo_uncertainty' not in arena.columns:
        arena['elo_uncertainty'] = np.nan

    # One Arena row per key (first occurrence wins) so every model gets at
    # most one match and the lookups stay aligned with df's own index.
    by_name = arena.dropna(subset=['normalized_name']).drop_duplicates(subset='normalized_name')
    by_repo = arena.dropna(subset=['arena_model']).drop_duplicates(subset='arena_model')

    for column in ('elo_rating', 'elo_uncertainty', 'arena_model'):
        # Normalized-name match first, exact modelId == Arena name as fallback
        via_name = df['normalized_id'].map(dict(zip(by_name['normalized_name'], by_name[column])))
        via_repo = df['modelId'].map(dict(zip(by_repo['arena_model'], by_repo[column])))
        df[column] = via_name.fillna(via_repo)

    models_with_elo = df['elo_rating'].notna().sum()
    print(f"✓ Mapped {models_with_elo:,} models with Elo scores")
    print(f"  Coverage: {models_with_elo/len(df)*100:.1f}% of config dataset")

    if models_with_elo > 0:
        print("\nElo score statistics:")
        print(f"  Mean: {df['elo_rating'].mean():.1f}")
        print(f"  Median: {df['elo_rating'].median():.1f}")
        print(f"  Range: [{df['elo_rating'].min():.1f}, {df['elo_rating'].max():.1f}]")
else:
    print("Skipping mapping - no LMArena data available")
    # NaN keeps the columns numeric; downstream cells only test .notna()
    df['elo_rating'] = np.nan
    df['elo_uncertainty'] = np.nan
## 16. Behavioral Drift: Config Change → Capability Change

**Goal**: Analyze how config changes correlate with behavioral/capability changes (Elo scores from LMArena).

**Key Questions**:
- Does config drift predict behavioral drift?
- Which architectural changes correlate with capability improvements?
- Are some families more behaviorally coherent than others?

In [None]:
# Compute behavioral drift (ΔElo) for parent-child pairs with Elo data.
#
# Reads:  df (modelId, elo_rating, optionally elo_uncertainty) and df_drift
#         (child_id, parent_id, drift; parent_family is optional).
# Writes: df_drift_with_behavior (all drift pairs with child/parent Elo),
#         df_behavioral_drift (pairs where both ends have an Elo score),
#         figures/config_vs_behavioral_drift.png, config_behavioral_drift.csv.
import os

from scipy.stats import pearsonr

# The empty fallback drift frame has none of these columns, so check for them
# before merging instead of failing with a KeyError.
_drift_columns = {'child_id', 'parent_id', 'drift'}
_has_elo = 'elo_rating' in df.columns and df['elo_rating'].notna().sum() > 0
_has_drift = _drift_columns.issubset(df_drift.columns)

if _has_elo and _has_drift:
    print("Computing behavioral drift (ΔElo) for parent-child pairs...")

    # One Elo row per model so the joins below cannot multiply drift pairs
    _elo_columns = [c for c in ('modelId', 'elo_rating', 'elo_uncertainty') if c in df.columns]
    _elo_lookup = df[_elo_columns].drop_duplicates(subset='modelId')

    def _elo_as(role):
        """Return the Elo lookup with its columns renamed for 'child' or 'parent'."""
        return _elo_lookup.rename(columns={
            'modelId': f'{role}_id',
            'elo_rating': f'{role}_elo',
            'elo_uncertainty': f'{role}_elo_uncertainty',
        })

    # Merge Elo scores into drift dataframe
    df_drift_with_behavior = (
        df_drift
        .merge(_elo_as('child'), on='child_id', how='left')
        .merge(_elo_as('parent'), on='parent_id', how='left')
    )

    # Compute behavioral drift (ΔElo)
    df_drift_with_behavior['delta_elo'] = (
        df_drift_with_behavior['child_elo'] - df_drift_with_behavior['parent_elo']
    )

    # Filter to pairs where both have Elo scores
    _both_rated = (
        df_drift_with_behavior['child_elo'].notna()
        & df_drift_with_behavior['parent_elo'].notna()
    )
    df_behavioral_drift = df_drift_with_behavior[_both_rated].copy()

    drift = df_behavioral_drift['drift']
    delta = df_behavioral_drift['delta_elo']
    improving = delta > 0
    regressing = delta < 0

    print(f"\n✓ Found {len(df_behavioral_drift):,} parent-child pairs with behavioral data")
    print(f"  Mean ΔElo: {delta.mean():.2f}")
    print(f"  Median ΔElo: {delta.median():.2f}")
    print(f"  Improving pairs: {improving.sum():,} ({improving.mean()*100:.1f}%)")
    print(f"  Regressing pairs: {regressing.sum():,} ({regressing.mean()*100:.1f}%)")

    # Visualize config drift vs behavioral drift
    if len(df_behavioral_drift) > 10:
        fig, axes = plt.subplots(2, 2, figsize=(16, 12))
        median_config_drift = drift.median()
        median_behavioral_drift = delta.median()

        # 1. Scatter: Config drift vs Behavioral drift (THE CORE PLOT)
        ax = axes[0, 0]
        ax.scatter(drift, delta, alpha=0.5, s=30, c='steelblue',
                   edgecolors='black', linewidth=0.5)
        ax.axhline(0, color='red', linestyle='--', alpha=0.5, linewidth=1)
        ax.axvline(median_config_drift, color='green', linestyle='--', alpha=0.5, linewidth=1)
        ax.set_xlabel('Config Drift (Gower Distance)', fontsize=11)
        ax.set_ylabel('Behavioral Drift (ΔElo)', fontsize=11)
        ax.set_title('Config Drift vs Behavioral Drift (Genotype → Phenotype)', fontsize=13)
        ax.grid(True, alpha=0.3)

        # Add correlation coefficient (computed on pairs with a known drift)
        _finite = drift.notna() & delta.notna()
        if _finite.sum() >= 2:
            corr, p_val = pearsonr(drift[_finite], delta[_finite])
            ax.text(0.05, 0.95, f'r={corr:.3f}, p={p_val:.3f}',
                    transform=ax.transAxes, fontsize=10, verticalalignment='top',
                    bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))

        # 2. Distribution of behavioral drift
        ax = axes[0, 1]
        ax.hist(delta, bins=50, color='coral', alpha=0.7, edgecolor='white')
        ax.axvline(0, color='red', linestyle='--', linewidth=2, label='No change')
        ax.axvline(median_behavioral_drift, color='blue', linestyle='--', linewidth=2,
                   label=f'Median: {median_behavioral_drift:.1f}')
        ax.set_xlabel('Behavioral Drift (ΔElo)', fontsize=11)
        ax.set_ylabel('Count', fontsize=11)
        ax.set_title('Distribution of Behavioral Drift', fontsize=13)
        ax.legend()
        ax.grid(True, alpha=0.3, axis='y')

        # 3. Behavioral drift by family (ten families with the most pairs)
        ax = axes[1, 0]
        if 'parent_family' in df_behavioral_drift.columns:
            family_behavioral_drift = (
                df_behavioral_drift.groupby('parent_family')['delta_elo']
                .agg(['mean', 'median', 'count'])
                .sort_values('count', ascending=False)
                .head(10)
            )

            x_pos = np.arange(len(family_behavioral_drift))
            width = 0.35
            ax.bar(x_pos - width/2, family_behavioral_drift['mean'], width,
                   label='Mean', color='steelblue', alpha=0.7)
            ax.bar(x_pos + width/2, family_behavioral_drift['median'], width,
                   label='Median', color='coral', alpha=0.7)
            ax.axhline(0, color='red', linestyle='--', alpha=0.5)
            ax.set_xticks(x_pos)
            ax.set_xticklabels(family_behavioral_drift.index, rotation=45, ha='right')
            ax.set_ylabel('Behavioral Drift (ΔElo)', fontsize=11)
            ax.set_title('Behavioral Drift by Family (Top 10)', fontsize=13)
            ax.legend()
            ax.grid(True, alpha=0.3, axis='y')
        else:
            # No family column: hide the panel rather than show empty axes
            ax.set_visible(False)

        # 4. Quadrant plot: pairs split at the median of each axis
        ax = axes[1, 1]
        high_config = drift > median_config_drift
        high_behavior = delta > median_behavioral_drift
        quadrants = {
            'High Config, High Behavior': high_config & high_behavior,
            'High Config, Low Behavior': high_config & ~high_behavior,
            'Low Config, High Behavior': ~high_config & high_behavior,
            'Low Config, Low Behavior': ~high_config & ~high_behavior,
        }
        colors = {
            'High Config, High Behavior': 'green',
            'High Config, Low Behavior': 'orange',
            'Low Config, High Behavior': 'blue',
            'Low Config, Low Behavior': 'red',
        }

        for quad_name, mask in quadrants.items():
            if mask.sum() > 0:
                ax.scatter(drift[mask], delta[mask], alpha=0.5, s=30, label=quad_name,
                           c=colors[quad_name], edgecolors='black', linewidth=0.5)

        ax.axhline(median_behavioral_drift, color='gray', linestyle='--', alpha=0.5)
        ax.axvline(median_config_drift, color='gray', linestyle='--', alpha=0.5)
        ax.set_xlabel('Config Drift', fontsize=11)
        ax.set_ylabel('Behavioral Drift (ΔElo)', fontsize=11)
        ax.set_title('Config vs Behavioral Drift Quadrants', fontsize=13)
        ax.legend(fontsize=8)
        ax.grid(True, alpha=0.3)

        plt.tight_layout()
        os.makedirs('figures', exist_ok=True)  # savefig does not create directories
        plt.savefig('figures/config_vs_behavioral_drift.png', dpi=300, bbox_inches='tight')
        plt.show()

        # Save behavioral drift data
        df_behavioral_drift.to_csv('config_behavioral_drift.csv', index=False)
        print("\n✓ Behavioral drift data saved to config_behavioral_drift.csv")
    else:
        print("Not enough behavioral data for visualization")
elif _has_elo:
    print("Config drift pairs not available - skipping behavioral drift analysis")
    df_behavioral_drift = pd.DataFrame()
else:
    print("No behavioral data available - skipping behavioral drift analysis")
    df_behavioral_drift = pd.DataFrame()

## 17. Architecture-Capability Regression

**Goal**: Predict behavioral capability (Elo) from architectural features (config.json parameters).

**Key Questions**:
- Which architectural features are most predictive of capability?
- Are there nonlinear effects or diminishing returns?
- Do architecture clusters correspond to capability clusters?

In [None]:
# Predict Elo from config features.
#
# Reads:  df (elo_rating plus config columns), numeric_features,
#         boolean_feature_list.
# Writes: results (per-model metrics and predictions), feature_importance,
#         figures/architecture_capability_regression.png,
#         config_feature_importance_elo.csv.
import os

# Columns reach boolean_feature_list only when they hold strings such as
# 'true'/'false', so they must be mapped to numbers before modelling.
_BOOL_SPELLINGS = {'true': 1.0, 'yes': 1.0, '1': 1.0,
                   'false': 0.0, 'no': 0.0, '0': 0.0}


def _bool_to_float(value):
    """Map a boolean-like value to 1.0 / 0.0 (NaN when missing or unrecognised)."""
    if pd.isna(value):
        return np.nan
    if isinstance(value, str):
        return _BOOL_SPELLINGS.get(value.strip().lower(), np.nan)
    return float(bool(value))


def _numeric_config_matrix(frame, numeric_cols, bool_cols):
    """Build an all-numeric feature frame; unparseable or missing entries become 0."""
    matrix = pd.DataFrame(index=frame.index)
    for col in numeric_cols:
        matrix[col] = pd.to_numeric(frame[col], errors='coerce')
    for col in bool_cols:
        matrix[col] = frame[col].map(_bool_to_float)
    return matrix.fillna(0)


if 'elo_rating' in df.columns and df['elo_rating'].notna().sum() > 50:
    print("Training models to predict Elo from config features...")

    from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
    from sklearn.linear_model import LinearRegression
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import r2_score, mean_squared_error

    # Prepare data
    df_elo = df[df['elo_rating'].notna()].copy()

    # Prepare features (numeric and boolean only; categoricals are not used)
    X = _numeric_config_matrix(
        df_elo,
        [col for col in numeric_features if col in df_elo.columns],
        [col for col in boolean_feature_list if col in df_elo.columns],
    )
    y = pd.to_numeric(df_elo['elo_rating']).to_numpy(dtype=float)

    # Remove columns with no variance
    X = X.loc[:, X.std() > 0]

    if X.shape[1] == 0:
        print("No config features with variance - skipping regression analysis")
    else:
        print(f"\nTraining on {len(X):,} models with {len(X.columns)} features")

        # Split data
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )

        # Train models
        models = {
            'Random Forest': RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42, n_jobs=-1),
            'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, max_depth=5, random_state=42),
            'Linear Regression': LinearRegression(),
        }

        results = {}
        for name, model in models.items():
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)

            r2 = r2_score(y_test, y_pred)
            rmse = np.sqrt(mean_squared_error(y_test, y_pred))

            results[name] = {
                'model': model,
                'r2': r2,
                'rmse': rmse,
                'y_pred': y_pred,
                'y_test': y_test,
            }

            print(f"\n{name}:")
            print(f"  R²: {r2:.3f}")
            print(f"  RMSE: {rmse:.2f}")

        # Feature importance (from Random Forest)
        rf_result = results['Random Forest']
        rf_model = rf_result['model']
        feature_importance = pd.DataFrame({
            'feature': X.columns,
            'importance': rf_model.feature_importances_,
        }).sort_values('importance', ascending=False)

        print("\nTop 10 most predictive architectural features:")
        print(feature_importance.head(10).to_string(index=False))

        fig, axes = plt.subplots(2, 2, figsize=(16, 12))

        # 1. Feature importance
        ax = axes[0, 0]
        top_features = feature_importance.head(15)
        ax.barh(range(len(top_features)), top_features['importance'], color='steelblue', alpha=0.7)
        ax.set_yticks(range(len(top_features)))
        ax.set_yticklabels(top_features['feature'], fontsize=9)
        ax.invert_yaxis()
        ax.set_xlabel('Feature Importance', fontsize=11)
        ax.set_title('Top 15 Most Predictive Config Features for Elo', fontsize=13)
        ax.grid(True, alpha=0.3, axis='x')

        # 2. Predicted vs Actual (Random Forest)
        ax = axes[0, 1]
        rf_actual, rf_pred = rf_result['y_test'], rf_result['y_pred']
        ax.scatter(rf_actual, rf_pred, alpha=0.5, s=20, color='steelblue',
                   edgecolors='black', linewidth=0.3)
        min_val = min(rf_actual.min(), rf_pred.min())
        max_val = max(rf_actual.max(), rf_pred.max())
        ax.plot([min_val, max_val], [min_val, max_val], 'r--', linewidth=2, label='Perfect prediction')
        ax.set_xlabel('Actual Elo', fontsize=11)
        ax.set_ylabel('Predicted Elo', fontsize=11)
        ax.set_title(f"Predicted vs Actual Elo (R²={rf_result['r2']:.3f})", fontsize=13)
        ax.legend()
        ax.grid(True, alpha=0.3)

        # 3. Model comparison
        ax = axes[1, 0]
        model_names = list(results.keys())
        r2_scores = [results[m]['r2'] for m in model_names]
        ax.bar(range(len(model_names)), r2_scores, color=['steelblue', 'coral', 'seagreen'], alpha=0.7)
        ax.set_xticks(range(len(model_names)))
        ax.set_xticklabels(model_names)
        ax.set_ylabel('R² Score', fontsize=11)
        ax.set_title('Model Performance Comparison', fontsize=13)
        ax.grid(True, alpha=0.3, axis='y')
        for i, score in enumerate(r2_scores):
            ax.text(i, score + 0.01, f'{score:.3f}', ha='center', fontsize=10)

        # 4. Residuals plot
        ax = axes[1, 1]
        residuals = rf_actual - rf_pred
        ax.scatter(rf_pred, residuals, alpha=0.5, s=20, color='coral',
                   edgecolors='black', linewidth=0.3)
        ax.axhline(0, color='red', linestyle='--', linewidth=2)
        ax.set_xlabel('Predicted Elo', fontsize=11)
        ax.set_ylabel('Residuals', fontsize=11)
        ax.set_title('Residuals Plot', fontsize=13)
        ax.grid(True, alpha=0.3)

        plt.tight_layout()
        os.makedirs('figures', exist_ok=True)  # savefig does not create directories
        plt.savefig('figures/architecture_capability_regression.png', dpi=300, bbox_inches='tight')
        plt.show()

        # Save feature importance
        feature_importance.to_csv('config_feature_importance_elo.csv', index=False)
        print("\n✓ Feature importance saved to config_feature_importance_elo.csv")
else:
    print("Not enough Elo data for regression analysis (need >50 models)")

## 18. Architecture vs Behavioral Clusters

**Goal**: Compare clustering based on architecture (config) vs clustering based on behavior (Elo/capability).

**Key Questions**:
- Do architecturally similar models behave similarly?
- Are families behaviorally monomorphic or polymorphic?
- Do architecture clusters align with behavioral clusters?

In [None]:
# Compare architecture-based vs behavior-based clustering.
#
# Reads:  df (elo_rating plus config columns), numeric_features,
#         boolean_feature_list.
# Writes: arch_clusters / behavior_clusters (one label per sampled model, in
#         the same row order), ari, nmi,
#         figures/architecture_vs_behavior_clusters.png.
import os

# Columns reach boolean_feature_list only when they hold strings such as
# 'true'/'false', so they must be mapped to numbers before scaling.
_BOOL_SPELLINGS = {'true': 1.0, 'yes': 1.0, '1': 1.0,
                   'false': 0.0, 'no': 0.0, '0': 0.0}


def _bool_to_float(value):
    """Map a boolean-like value to 1.0 / 0.0 (NaN when missing or unrecognised)."""
    if pd.isna(value):
        return np.nan
    if isinstance(value, str):
        return _BOOL_SPELLINGS.get(value.strip().lower(), np.nan)
    return float(bool(value))


def _numeric_config_matrix(frame, numeric_cols, bool_cols):
    """Build an all-numeric feature frame; unparseable or missing entries become 0."""
    matrix = pd.DataFrame(index=frame.index)
    for col in numeric_cols:
        matrix[col] = pd.to_numeric(frame[col], errors='coerce')
    for col in bool_cols:
        matrix[col] = frame[col].map(_bool_to_float)
    return matrix.fillna(0)


if 'elo_rating' in df.columns and df['elo_rating'].notna().sum() > 50:
    print("Comparing architecture-based vs behavior-based clustering...")

    from sklearn.cluster import KMeans
    from sklearn.manifold import TSNE
    from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score

    # Prepare data with both config and Elo
    df_cluster = df[df['elo_rating'].notna()].copy()

    if len(df_cluster) > 100:
        # Sample for computational efficiency (at most 1000 models)
        df_cluster = df_cluster.sample(n=min(1000, len(df_cluster)), random_state=42)

    # Architecture features (config-based)
    arch_features = [col for col in numeric_features + boolean_feature_list if col in df_cluster.columns]
    X_arch = _numeric_config_matrix(
        df_cluster,
        [col for col in numeric_features if col in df_cluster.columns],
        [col for col in boolean_feature_list if col in df_cluster.columns],
    )
    X_arch = X_arch.loc[:, X_arch.std() > 0]  # Remove zero-variance columns

    if X_arch.shape[1] == 0:
        print("No config features with variance - skipping cluster comparison")
    else:
        # Standardize
        scaler_arch = StandardScaler()
        X_arch_scaled = scaler_arch.fit_transform(X_arch)

        # Behavioral features (Elo-based, can extend to multi-dimensional)
        elo_values = pd.to_numeric(df_cluster['elo_rating']).to_numpy(dtype=float)
        X_behavior = elo_values.reshape(-1, 1)

        # Cluster architectures
        n_clusters = min(10, len(df_cluster) // 20)
        kmeans_arch = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
        arch_clusters = kmeans_arch.fit_predict(X_arch_scaled)

        # Cluster behaviors (equal-width Elo bins). Binning the plain array
        # keeps the labels positional, in the same row order as arch_clusters.
        behavior_clusters = pd.cut(elo_values, bins=n_clusters, labels=False)

        # Compute cluster alignment
        ari = adjusted_rand_score(arch_clusters, behavior_clusters)
        nmi = normalized_mutual_info_score(arch_clusters, behavior_clusters)

        print("\nCluster Alignment Metrics:")
        print(f"  Adjusted Rand Index: {ari:.3f} (1.0 = perfect alignment, 0.0 = random)")
        print(f"  Normalized Mutual Information: {nmi:.3f} (1.0 = perfect alignment, 0.0 = independent)")

        # Visualize clusters
        fig, axes = plt.subplots(2, 2, figsize=(16, 12))

        # Use t-SNE for 2D embedding
        print("\nComputing t-SNE embeddings (this may take a minute)...")
        tsne_arch = TSNE(n_components=2, random_state=42, perplexity=min(30, len(df_cluster) - 1))
        embedding_arch = tsne_arch.fit_transform(X_arch_scaled)

        # 1. Architecture clusters colored by cluster
        ax = axes[0, 0]
        scatter1 = ax.scatter(embedding_arch[:, 0], embedding_arch[:, 1],
                              c=arch_clusters, cmap='tab10', alpha=0.6, s=30,
                              edgecolors='black', linewidth=0.3)
        ax.set_xlabel('t-SNE Dimension 1', fontsize=11)
        ax.set_ylabel('t-SNE Dimension 2', fontsize=11)
        ax.set_title('Architecture-Based Clusters (Config Features)', fontsize=13)
        plt.colorbar(scatter1, ax=ax)

        # 2. Architecture embedding colored by Elo
        ax = axes[0, 1]
        scatter2 = ax.scatter(embedding_arch[:, 0], embedding_arch[:, 1],
                              c=elo_values, cmap='viridis', alpha=0.6, s=30,
                              edgecolors='black', linewidth=0.3)
        ax.set_xlabel('t-SNE Dimension 1', fontsize=11)
        ax.set_ylabel('t-SNE Dimension 2', fontsize=11)
        ax.set_title('Architecture Space Colored by Elo (Behavior)', fontsize=13)
        plt.colorbar(scatter2, ax=ax, label='Elo Rating')

        # 3. Elo distribution by architecture cluster
        ax = axes[1, 0]
        cluster_elo = pd.DataFrame({'cluster': arch_clusters, 'elo': elo_values})
        cluster_elo_stats = cluster_elo.groupby('cluster')['elo'].agg(['mean', 'std', 'count'])
        # std is NaN for single-member clusters; draw those with no error bar
        cluster_std = cluster_elo_stats['std'].fillna(0)

        x_pos = np.arange(len(cluster_elo_stats))
        ax.bar(x_pos, cluster_elo_stats['mean'], yerr=cluster_std,
               color='steelblue', alpha=0.7, capsize=5)
        ax.set_xticks(x_pos)
        ax.set_xticklabels([f'Cluster {i}' for i in cluster_elo_stats.index])
        ax.set_ylabel('Mean Elo Rating', fontsize=11)
        ax.set_title('Behavioral Capability by Architecture Cluster', fontsize=13)
        ax.grid(True, alpha=0.3, axis='y')

        # Add count labels just above each error bar
        for i, (idx, row) in enumerate(cluster_elo_stats.iterrows()):
            ax.text(i, row['mean'] + cluster_std.loc[idx] + 5, f"n={int(row['count'])}",
                    ha='center', fontsize=8)

        # 4. Cluster alignment confusion matrix (both inputs share a RangeIndex)
        ax = axes[1, 1]
        confusion = pd.crosstab(pd.Series(arch_clusters, name='Architecture Cluster'),
                                pd.Series(behavior_clusters, name='Behavior Cluster'))
        im = ax.imshow(confusion.values, cmap='YlOrRd', aspect='auto')
        ax.set_xticks(range(len(confusion.columns)))
        ax.set_xticklabels([f'B{i}' for i in confusion.columns])
        ax.set_yticks(range(len(confusion.index)))
        ax.set_yticklabels([f'A{i}' for i in confusion.index])
        ax.set_xlabel('Behavior Cluster', fontsize=11)
        ax.set_ylabel('Architecture Cluster', fontsize=11)
        ax.set_title(f'Architecture vs Behavior Cluster Alignment\n(ARI={ari:.3f}, NMI={nmi:.3f})', fontsize=13)
        plt.colorbar(im, ax=ax, label='Count')

        # Add text annotations
        for i in range(len(confusion.index)):
            for j in range(len(confusion.columns)):
                ax.text(j, i, str(confusion.iloc[i, j]), ha='center', va='center', fontsize=8)

        plt.tight_layout()
        os.makedirs('figures', exist_ok=True)  # savefig does not create directories
        plt.savefig('figures/architecture_vs_behavior_clusters.png', dpi=300, bbox_inches='tight')
        plt.show()

        print("\n✓ Cluster analysis complete")
else:
    print("Not enough behavioral data for cluster comparison")

## 19. Ecosystem Fitness vs Behavioral Fitness

**Goal**: Compare ecosystem success metrics (downloads, descendants, likes) with behavioral capability (Elo).

**Key Questions**:
- Are the most downloaded models the most capable?
- Do high-performing models have more descendants?
- What is the relationship between ecosystem adoption and behavioral performance?

In [None]:
# Compare ecosystem fitness vs behavioral fitness.
# This part collects the ecosystem metrics (downloads, likes, descendants)
# into `ecosystem_metrics`, keyed by metric name.
print("Analyzing ecosystem fitness vs behavioral fitness...")

# Check for ecosystem metrics in dataframe
ecosystem_metrics = {
    metric: df[metric]
    for metric in ['downloads', 'likes', 'num_descendants', 'num_children']
    if metric in df.columns
}

# Also try loading from graph if available. No cell in this notebook is
# guaranteed to have defined `G_family`, so a missing variable is treated as
# "no graph" instead of raising NameError.
G_family = globals().get('G_family')

if G_family is not None and 'elo_rating' in df.columns:
    # Count descendants for models with Elo
    df_with_descendants = df[df['elo_rating'].notna()].copy()

    def count_descendants_simple(model_id):
        """Return how many nodes are reachable from model_id (0 if it is not in the graph)."""
        if model_id not in G_family:
            return 0
        return len(nx.descendants(G_family, model_id))

    # Sample for performance
    if len(df_with_descendants) > 500:
        df_with_descendants = df_with_descendants.sample(n=500, random_state=42)

    df_with_descendants['num_descendants'] = df_with_descendants['modelId'].apply(count_descendants_simple)
    # Replaces any df-level 'num_descendants' with the graph-derived counts,
    # indexed by modelId
    ecosystem_metrics['num_descendants'] = df_with_descendants.set_index('modelId')['num_descendants']

# Ecosystem fitness vs behavioral fitness: attach the collected ecosystem
# metrics to every model with an Elo rating, draw a 2x2 summary figure and
# export the joined table.


def _plot_metric_vs_elo(ax, metric_values, elo, *, color, xlabel, title):
    """Scatter ``log10(metric + 1)`` against Elo on ``ax`` and annotate Pearson r.

    Args:
        ax: Matplotlib axes to draw on.
        metric_values: Series of raw metric values (may contain NaN).
        elo: Series of Elo ratings sharing ``metric_values``' index.
        color: Marker face colour.
        xlabel: X-axis label.
        title: Panel title.
    """
    # Imported here so the cell does not rely on an earlier cell having done so
    from scipy.stats import pearsonr

    log_metric = np.log10(metric_values + 1)
    # Drop incomplete pairs: NaN would otherwise turn the correlation into NaN
    complete = log_metric.notna() & elo.notna()
    x, y = log_metric[complete], elo[complete]

    ax.scatter(x, y, alpha=0.5, s=30, color=color, edgecolors='black', linewidth=0.3)
    ax.set_xlabel(xlabel, fontsize=11)
    ax.set_ylabel('Elo Rating', fontsize=11)
    ax.set_title(title, fontsize=13)
    ax.grid(True, alpha=0.3)

    # A correlation needs at least two complete observations
    if len(x) >= 2:
        corr, p_val = pearsonr(x, y)
        ax.text(0.05, 0.95, f'r={corr:.3f}, p={p_val:.3f}',
                transform=ax.transAxes, fontsize=10,
                verticalalignment='top', bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))


if ecosystem_metrics and 'elo_rating' in df.columns:
    # Start from every model that has an Elo rating
    df_fitness = df[df['elo_rating'].notna()].copy()

    for metric_name, metric_series in ecosystem_metrics.items():
        if not isinstance(metric_series, pd.Series):
            # Plain mapping of modelId -> value
            df_fitness[metric_name] = df_fitness['modelId'].map(metric_series).fillna(0)
        elif metric_series.index.name == 'modelId':
            # Keyed by model id (e.g. graph-derived counts): look values up by id.
            # Duplicated ids are collapsed so the lookup index is unique.
            lookup = metric_series[~metric_series.index.duplicated(keep='first')]
            df_fitness[metric_name] = df_fitness['modelId'].map(lookup)
        elif metric_name not in df_fitness.columns:
            # Row-aligned with `df`: align on the row index
            df_fitness[metric_name] = metric_series.reindex(df_fitness.index)
        # Otherwise the metric is a column of `df` and was carried over by the
        # copy above; merging it again would only create _x/_y duplicates.

    metric_cols = list(ecosystem_metrics)

    # Filter to models with both Elo and at least one ecosystem metric
    has_metric = df_fitness[metric_cols].notna().any(axis=1)
    df_fitness = df_fitness[has_metric].copy()

    if len(df_fitness) > 10:
        print(f"\n✓ Analyzing {len(df_fitness):,} models with both Elo and ecosystem metrics")

        fig, axes = plt.subplots(2, 2, figsize=(16, 12))
        elo = df_fitness['elo_rating']

        # 1. Scatter: Downloads vs Elo
        if 'downloads' in df_fitness.columns:
            _plot_metric_vs_elo(
                axes[0, 0], df_fitness['downloads'], elo,
                color='steelblue',
                xlabel='Log10(Downloads + 1)',
                title='Ecosystem Adoption (Downloads) vs Behavioral Capability',
            )

        # 2. Quadrant plot: Ecosystem Fitness vs Behavioral Fitness
        if 'downloads' in df_fitness.columns:
            downloads = df_fitness['downloads']
            median_downloads = downloads.median()
            median_elo = elo.median()

            # Rows with missing downloads fail every comparison and therefore
            # fall into no quadrant.
            quadrants = {
                'High Adoption, High Capability': (downloads > median_downloads) & (elo > median_elo),
                'High Adoption, Low Capability': (downloads > median_downloads) & (elo <= median_elo),
                'Low Adoption, High Capability': (downloads <= median_downloads) & (elo > median_elo),
                'Low Adoption, Low Capability': (downloads <= median_downloads) & (elo <= median_elo),
            }
            colors = {
                'High Adoption, High Capability': 'green',
                'High Adoption, Low Capability': 'orange',
                'Low Adoption, High Capability': 'blue',
                'Low Adoption, Low Capability': 'red',
            }

            for quad_name, mask in quadrants.items():
                if mask.sum() > 0:
                    axes[0, 1].scatter(np.log10(downloads[mask] + 1), elo[mask],
                                       alpha=0.5, s=30, label=quad_name, c=colors[quad_name],
                                       edgecolors='black', linewidth=0.3)

            axes[0, 1].axhline(median_elo, color='gray', linestyle='--', alpha=0.5)
            axes[0, 1].axvline(np.log10(median_downloads + 1), color='gray', linestyle='--', alpha=0.5)
            axes[0, 1].set_xlabel('Log10(Downloads + 1)', fontsize=11)
            axes[0, 1].set_ylabel('Elo Rating', fontsize=11)
            axes[0, 1].set_title('Ecosystem Fitness vs Behavioral Fitness Quadrants', fontsize=13)
            axes[0, 1].legend(fontsize=8)
            axes[0, 1].grid(True, alpha=0.3)

        # 3. Descendants vs Elo
        if 'num_descendants' in df_fitness.columns:
            _plot_metric_vs_elo(
                axes[1, 0], df_fitness['num_descendants'], elo,
                color='coral',
                xlabel='Log10(Number of Descendants + 1)',
                title='Reproductive Success vs Behavioral Capability',
            )

        # 4. Correlation matrix
        fitness_cols = ['elo_rating'] + [m for m in metric_cols if m in df_fitness.columns]
        fitness_data = df_fitness[fitness_cols].copy()

        # Log transform wide-ranging ecosystem metrics (anything exceeding 100)
        for col in fitness_data.columns:
            if col != 'elo_rating' and fitness_data[col].max() > 100:
                fitness_data[col] = np.log10(fitness_data[col] + 1)

        corr_matrix = fitness_data.corr()
        im = axes[1, 1].imshow(corr_matrix.values, cmap='coolwarm', vmin=-1, vmax=1, aspect='auto')
        axes[1, 1].set_xticks(range(len(corr_matrix.columns)))
        axes[1, 1].set_yticks(range(len(corr_matrix.index)))
        axes[1, 1].set_xticklabels(corr_matrix.columns, rotation=45, ha='right', fontsize=9)
        axes[1, 1].set_yticklabels(corr_matrix.index, fontsize=9)
        axes[1, 1].set_title('Correlation: Ecosystem Metrics vs Behavioral Capability', fontsize=13)
        plt.colorbar(im, ax=axes[1, 1], label='Correlation')

        # Add correlation values; white text on strongly coloured cells
        for (i, j), value in np.ndenumerate(corr_matrix.values):
            axes[1, 1].text(j, i, f'{value:.2f}',
                            ha='center', va='center', fontsize=8,
                            color='white' if abs(value) > 0.5 else 'black')

        plt.tight_layout()
        plt.savefig('figures/ecosystem_vs_behavioral_fitness.png', dpi=300, bbox_inches='tight')
        plt.show()

        # Save fitness data
        df_fitness[['modelId', 'elo_rating'] + metric_cols].to_csv('ecosystem_behavioral_fitness.csv', index=False)
        print("\n✓ Fitness data saved to ecosystem_behavioral_fitness.csv")
    else:
        print("Not enough data for fitness analysis")
else:
    print("Ecosystem metrics or Elo data not available - skipping fitness analysis")