# Config Similarity Graph Analysis

**Goal**: Replicate AI Ecosystem paper analyses using config-based similarity and architectural drift, parallel to the trait-based analyses in the original paper.

**Key Questions**:
1. Which models are architecturally similar?
2. How does architecture drift along parent-child edges?
3. Which config fields mutate most frequently?
4. How does config similarity relate to family trees?
5. What is the mutational landscape of config changes?
6. How does config drift correlate with behavioral/capability changes?

**Methodology**:
- **Gower distance** for mixed numeric + categorical similarity (primary metric)
- **L2, L1, cosine similarity** for comparison
- **Config drift analysis** along parent-child edges
- **Mutational landscape** analysis (which fields change most)
- **Subgraph analysis** (architectural similarity in descendant clusters)
- **Drift by depth** (cumulative drift along lineage paths)
- **Behavioral drift** (config change → capability change, when data available)

**This notebook implements the full "Master Plan" for config-based similarity analysis**, replicating the AI Ecosystem paper methodology but grounded in architectural similarity rather than metadata traits.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from scipy.spatial.distance import pdist, squareform
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
import warnings
warnings.filterwarnings('ignore')

# Notebook-wide matplotlib defaults (same look as the main repo's figures)
plt.rcParams.update({
    'figure.figsize': (12, 6),
    'font.size': 10,
})

In [None]:
# Read the expanded per-model config table; low_memory=False makes pandas
# infer each column's dtype from the whole file instead of chunk by chunk.
df = pd.read_csv('data/model_configs_expanded.csv', low_memory=False)
print(f"Loaded {df.shape[0]:,} models with config.json")
print(f"Total columns: {df.shape[1]}")

## 1. Prepare Config Features for Similarity Computation

In [None]:
# Candidate config columns for the similarity computation, grouped by role.
# NOTE: the high/medium/low "weight" labels describe intent only; nothing in
# this cell applies a weight to any feature.

# Core architecture features (high weight)
architecture_features = [
    'config_model_type',
    'config_hidden_size', 'config_num_hidden_layers',
    'config_num_attention_heads', 'config_intermediate_size',
]

# Capacity features (medium weight)
capacity_features = [
    'config_vocab_size', 'config_max_position_embeddings',
    'config_num_key_value_heads',
]

# Precision/compute features (lower weight)
precision_features = [
    'config_torch_dtype', 'config_rope_theta', 'config_rope_scaling_type',
]

# Boolean flags (low weight)
boolean_features = ['uses_moe', 'uses_gqa', 'uses_rope', 'uses_quantization']

# Full candidate list, in group order
all_features = [
    *architecture_features,
    *capacity_features,
    *precision_features,
    *boolean_features,
]

# Keep only the candidates that are actual columns of the loaded dataframe
available_features = [name for name in all_features if name in df.columns]
print(f"Available features for similarity: {len(available_features)}")
print(f"Features: {available_features[:10]}...")

## 2. Implement Gower Distance (Recommended for Mixed Data Types)

In [None]:
# Text spellings recognised as boolean flags (compared case-insensitively)
_TRUE_TOKENS = frozenset({'true', 't', 'yes', 'y', '1'})
_FALSE_TOKENS = frozenset({'false', 'f', 'no', 'n', '0'})


def _coerce_flag(value):
    """Interpret *value* as a boolean flag.

    Strings are parsed by content ('True', 'no', '0', ...) rather than by
    truthiness, because ``bool('False')`` is ``True``. Numeric-looking
    strings are true when non-zero. Everything else uses ``bool(value)``.
    """
    if isinstance(value, str):
        token = value.strip().lower()
        if token in _TRUE_TOKENS:
            return True
        if token in _FALSE_TOKENS:
            return False
        try:
            return float(token) != 0.0
        except ValueError:
            return bool(token)
    return bool(value)


def gower_distance(x, y, numeric_cols, categorical_cols, boolean_cols):
    """
    Compute a Gower-style distance between two config vectors.

    Parameters:
        x, y: pandas Series holding one model's config values each,
            indexed by feature name.
        numeric_cols: feature names compared as numbers.
        categorical_cols: feature names compared as strings.
        boolean_cols: feature names compared as boolean flags.

    Per-feature contribution (each in [0, 1]):
        numeric:     |x - y| / max(|x|, |y|), capped at 1. This is a
                     pairwise normalisation, not the dataset-range
                     normalisation of textbook Gower distance.
        categorical: 0 if the string forms match, 1 otherwise.
        boolean:     0 if both flags agree, 1 otherwise; text flags such
                     as 'True'/'False' are parsed by content.

    A feature is skipped (not counted) when it is absent from either
    vector, missing (NaN/None) in either vector, or - for numeric
    features - not convertible to a number.

    Returns:
        The mean contribution over the features actually compared
        (0 = identical, 1 = completely different), or 1.0 when no
        feature could be compared.
    """
    def shared_values(columns):
        # Yield (x_val, y_val) for every column present and non-missing in both
        for col in columns:
            if col in x.index and col in y.index:
                x_val, y_val = x[col], y[col]
                if pd.isna(x_val) or pd.isna(y_val):
                    continue
                yield x_val, y_val

    distance = 0.0
    count = 0

    # Numeric features: normalized absolute difference
    for x_val, y_val in shared_values(numeric_cols):
        try:
            x_num = float(x_val)
            y_num = float(y_val)
        except (ValueError, TypeError):
            continue
        # Strings such as 'nan' pass pd.isna but are still missing values
        if np.isnan(x_num) or np.isnan(y_num):
            continue
        max_val = max(abs(x_num), abs(y_num))
        if x_num != y_num and max_val > 0:
            # Opposite signs would give a ratio up to 2; cap it so every
            # feature contributes at most 1
            distance += min(1.0, abs(x_num - y_num) / max_val)
        count += 1

    # Categorical features: 0 if same, 1 if different
    for x_val, y_val in shared_values(categorical_cols):
        if str(x_val) != str(y_val):
            distance += 1.0
        count += 1

    # Boolean features: 0 if same, 1 if different
    for x_val, y_val in shared_values(boolean_cols):
        if _coerce_flag(x_val) != _coerce_flag(y_val):
            distance += 1.0
        count += 1

    # Average over compared features; no common features = maximum distance
    if count > 0:
        return distance / count
    return 1.0

print("Gower distance function defined")

## 3. Categorize Features by Type

In [None]:
# Categorize features by data type (numeric / categorical / boolean), based on
# the first 100 non-missing values of each column.
numeric_features = []
categorical_features = []
boolean_feature_list = []

# Word spellings that identify a flag column on their own
_FLAG_WORDS = {'true', 'false', 'yes', 'no'}
# Spellings accepted as flags once a column has failed numeric conversion
_FLAG_TOKENS = _FLAG_WORDS | {'1', '0', 'nan'}

for feat in available_features:
    if feat not in df.columns:
        continue

    sample_values = df[feat].dropna().head(100)
    if len(sample_values) == 0:
        continue

    unique_vals = sample_values.unique()
    unique_tokens = {str(v).lower() for v in unique_vals}

    # Check for flags BEFORE trying numeric conversion: pd.to_numeric accepts
    # boolean data, so testing numeric first would label every True/False
    # column as numeric and leave the boolean list empty for them.
    if pd.api.types.is_bool_dtype(sample_values) or unique_tokens <= _FLAG_WORDS:
        boolean_feature_list.append(feat)
        continue

    try:
        pd.to_numeric(sample_values, errors='raise')
    except (ValueError, TypeError):
        # Non-numeric: boolean-like if at most two distinct flag spellings
        if len(unique_vals) <= 2 and unique_tokens <= _FLAG_TOKENS:
            boolean_feature_list.append(feat)
        else:
            categorical_features.append(feat)
    else:
        numeric_features.append(feat)

print(f"Numeric features: {len(numeric_features)}")
print(f"  {numeric_features[:5]}...")
print(f"\nCategorical features: {len(categorical_features)}")
print(f"  {categorical_features[:5]}...")
print(f"\nBoolean features: {len(boolean_feature_list)}")
print(f"  {boolean_feature_list}")

## 4. Compute Similarity Matrix (Sample for MVP)

In [None]:
# For MVP, sample a subset of models to make computation feasible
# Focus on models with complete config data

# Keep models that have at least one of the key features. Only key columns
# that exist in the dataframe are used, so a missing column does not raise
# KeyError; if none exist, no filtering is applied.
key_features = ['config_hidden_size', 'config_num_hidden_layers', 'config_model_type']
present_key_features = [col for col in key_features if col in df.columns]
if present_key_features:
    df_complete = df[df[present_key_features].notna().any(axis=1)].copy()
else:
    df_complete = df.copy()

# Sample for MVP (can increase later)
SAMPLE_SIZE = 1000  # Start with 1000 models for MVP
if len(df_complete) > SAMPLE_SIZE:
    # Stratified sample by family if available
    if 'family' in df_complete.columns:
        # Equal quota per family; the +1 over-draws slightly and .head() trims
        # the result back to SAMPLE_SIZE.
        per_family_quota = SAMPLE_SIZE // len(df_complete['family'].unique()) + 1
        # random_state=42 keeps this branch reproducible, like the
        # non-stratified branch below.
        df_sample = df_complete.groupby('family', group_keys=False).apply(
            lambda grp: grp.sample(min(len(grp), per_family_quota), random_state=42)
        ).head(SAMPLE_SIZE)
    else:
        df_sample = df_complete.sample(n=SAMPLE_SIZE, random_state=42)
else:
    df_sample = df_complete.copy()

print(f"Sampling {len(df_sample):,} models for similarity computation")
print(f"This will compute {len(df_sample) * (len(df_sample) - 1) // 2:,} pairwise distances")

## 11. Architecture Phylogeny: Config Drift Along Parent-Child Edges

**Goal**: Measure how architecture drifts along family tree edges, replicating the trait drift analysis from the AI Ecosystem paper.

**Key Questions**:
- Do fine-tunes preserve architecture?
- Which families mutate architecture the most?
- What is the distribution of config drift within vs between families?

In [None]:
# Load family graph to analyze parent-child relationships
import ast
import os
import pickle
import random


def _parse_parent_ids(raw):
    """Return the parent model ids encoded in one dataframe cell.

    Accepts an actual list/tuple/set, a string holding a Python literal
    (e.g. "['org/model']"), or a bare model id string. Anything else,
    including NaN, yields an empty list. Literals are parsed with
    ast.literal_eval, so cell contents are never executed as code.
    """
    if isinstance(raw, (list, tuple, set)):
        return [p for p in raw if isinstance(p, str)]
    if not isinstance(raw, str):
        return []
    text = raw.strip()
    if not text:
        return []
    try:
        parsed = ast.literal_eval(text)
    except (ValueError, SyntaxError, TypeError):
        # Not a Python literal: treat the cell as a single bare model id
        return [text]
    if isinstance(parsed, str):
        return [parsed]
    if isinstance(parsed, (list, tuple, set)):
        return [p for p in parsed if isinstance(p, str)]
    return []


# Try each candidate graph file in order of preference; a file that fails to
# load does not stop the remaining candidates from being tried.
G_family = None
graph_paths = [
    'data/ai_ecosystem_graph_finetune_fulljson.pkl',
    'data/ai_ecosystem_graph_nomerges.pkl',
    'data/ai_ecosystem_graph.pkl'
]
load_failed = False
for path in graph_paths:
    if not os.path.exists(path):
        continue
    try:
        # NOTE(security): unpickling can execute arbitrary code; only load
        # graph files from a trusted source.
        with open(path, 'rb') as f:
            candidate_graph = pickle.load(f)
        n_nodes = len(candidate_graph.nodes)
        n_edges = len(candidate_graph.edges)
    except Exception as e:
        load_failed = True
        print(f"Could not load graph: {e}")
        continue
    G_family = candidate_graph
    print(f"Loaded family graph from {path}")
    print(f"  Nodes: {n_nodes:,}")
    print(f"  Edges: {n_edges:,}")
    break
if G_family is None and load_failed:
    print("Will compute drift from parent_model columns in dataframe")

# Build the id set once so each membership test is O(1)
known_model_ids = set(df['modelId'])

# Compute config drift for parent-child pairs
if G_family is not None:
    # Extract parent-child pairs from graph (both endpoints need config data)
    parent_child_pairs = [
        (parent, child)
        for parent, child in G_family.edges()
        if parent in known_model_ids and child in known_model_ids
    ]

    print(f"Found {len(parent_child_pairs):,} parent-child pairs in graph")
else:
    # Fallback: use parent_model columns from dataframe
    print("Using parent_model columns from dataframe")
    parent_child_pairs = []

    # Check for parent columns
    parent_cols = ['parent_model', 'finetune_parent', 'quantized_parent', 'adapter_parent', 'merge_parent']
    available_parent_cols = [col for col in parent_cols if col in df.columns]

    if len(available_parent_cols) > 0:
        for _, row in df.iterrows():
            model_id = row['modelId']
            for col in available_parent_cols:
                for parent in _parse_parent_ids(row[col]):
                    if parent in known_model_ids:
                        parent_child_pairs.append((parent, model_id))

        print(f"Found {len(parent_child_pairs):,} parent-child pairs from dataframe columns")

# Sample pairs for analysis (if too many); seeded so reruns use the same pairs
MAX_PAIRS = 5000
if len(parent_child_pairs) > MAX_PAIRS:
    parent_child_pairs = random.Random(42).sample(parent_child_pairs, MAX_PAIRS)
    print(f"Sampled {len(parent_child_pairs):,} pairs for analysis")

print(f"\nTotal parent-child pairs to analyze: {len(parent_child_pairs):,}")

In [None]:
# Compute config drift (Gower distance) for each parent-child pair
print("Computing config drift for parent-child pairs...")

# One row per modelId (first occurrence wins), indexed by id so each lookup is
# a hash probe instead of a full dataframe scan.
configs_by_id = df.drop_duplicates(subset='modelId', keep='first').set_index('modelId')

# Declared up front so df_drift has these columns even when no pair is found
drift_columns = [
    'parent_id', 'child_id', 'drift', 'similarity',
    'parent_family', 'child_family', 'same_family',
]

drift_data = []
for i, (parent_id, child_id) in enumerate(parent_child_pairs):
    if i % 500 == 0 and i > 0:
        print(f"  Processed {i}/{len(parent_child_pairs)} pairs...")

    # Skip pairs where either model has no config row
    if parent_id not in configs_by_id.index or child_id not in configs_by_id.index:
        continue

    parent_record = configs_by_id.loc[parent_id]
    child_record = configs_by_id.loc[child_id]

    # Compute Gower distance (drift) over the selected config features
    drift = gower_distance(
        parent_record[available_features],
        child_record[available_features],
        numeric_features,
        categorical_features,
        boolean_feature_list
    )

    # Family info if available; 'Unknown' never counts as a shared family
    parent_family = parent_record.get('family', 'Unknown')
    child_family = child_record.get('family', 'Unknown')
    same_family = parent_family == child_family and parent_family != 'Unknown'

    drift_data.append({
        'parent_id': parent_id,
        'child_id': child_id,
        'drift': drift,
        'similarity': 1 - drift,
        'parent_family': parent_family,
        'child_family': child_family,
        'same_family': same_family
    })

df_drift = pd.DataFrame(drift_data, columns=drift_columns)
print(f"\n✓ Computed drift for {len(df_drift):,} parent-child pairs")
if len(df_drift) > 0:
    print(f"  Mean drift: {df_drift['drift'].mean():.3f}")
    print(f"  Median drift: {df_drift['drift'].median():.3f}")

In [None]:
# Visualize config drift distributions (2x2 panel) and save the pair table
import os

if len(df_drift) > 0:
    fig, axes = plt.subplots(2, 2, figsize=(16, 12))
    median_drift = df_drift['drift'].median()

    # 1. Overall drift distribution
    axes[0, 0].hist(df_drift['drift'], bins=50, color='steelblue', alpha=0.7, edgecolor='white')
    axes[0, 0].axvline(median_drift, color='red', linestyle='--', linewidth=2,
                       label=f'Median: {median_drift:.3f}')
    axes[0, 0].set_xlabel('Config Drift (Gower Distance)', fontsize=11)
    axes[0, 0].set_ylabel('Count', fontsize=11)
    axes[0, 0].set_title('Distribution of Config Drift Along Parent-Child Edges', fontsize=13)
    axes[0, 0].legend()
    axes[0, 0].grid(True, alpha=0.3, axis='y')

    # 2. Within-family vs between-family drift (panel stays empty when no
    #    pair shares a family)
    same_family_mask = df_drift['same_family'].astype(bool)
    if same_family_mask.sum() > 0:
        within_family = df_drift.loc[same_family_mask, 'drift']
        between_family = df_drift.loc[~same_family_mask, 'drift']

        axes[0, 1].hist([within_family, between_family], bins=30,
                        label=['Within Family', 'Between Families'],
                        alpha=0.7, color=['seagreen', 'coral'], edgecolor='white')
        axes[0, 1].set_xlabel('Config Drift', fontsize=11)
        axes[0, 1].set_ylabel('Count', fontsize=11)
        axes[0, 1].set_title('Config Drift: Within vs Between Families', fontsize=13)
        axes[0, 1].legend()
        axes[0, 1].grid(True, alpha=0.3, axis='y')

        print(f"\nWithin-family drift: mean={within_family.mean():.3f}, median={within_family.median():.3f}")
        print(f"Between-family drift: mean={between_family.mean():.3f}, median={between_family.median():.3f}")

    # 3. Drift by family (the 10 parent families with the most pairs)
    if 'parent_family' in df_drift.columns:
        family_drift = df_drift.groupby('parent_family')['drift'].agg(['mean', 'median', 'count']).sort_values('count', ascending=False)
        top_families = family_drift.head(10)

        x_pos = np.arange(len(top_families))
        width = 0.35
        axes[1, 0].bar(x_pos - width/2, top_families['mean'], width, label='Mean', color='steelblue', alpha=0.7)
        axes[1, 0].bar(x_pos + width/2, top_families['median'], width, label='Median', color='coral', alpha=0.7)
        axes[1, 0].set_xticks(x_pos)
        axes[1, 0].set_xticklabels(top_families.index, rotation=45, ha='right')
        axes[1, 0].set_ylabel('Config Drift', fontsize=11)
        axes[1, 0].set_title('Config Drift by Family (Top 10)', fontsize=13)
        axes[1, 0].legend()
        axes[1, 0].grid(True, alpha=0.3, axis='y')

    # 4. Empirical cumulative distribution of drift
    sorted_drift = np.sort(df_drift['drift'])
    cumulative = np.arange(1, len(sorted_drift) + 1) / len(sorted_drift)
    axes[1, 1].plot(sorted_drift, cumulative, linewidth=2, color='purple')
    axes[1, 1].axvline(median_drift, color='red', linestyle='--', linewidth=2, alpha=0.7)
    axes[1, 1].set_xlabel('Config Drift', fontsize=11)
    axes[1, 1].set_ylabel('Cumulative Fraction', fontsize=11)
    axes[1, 1].set_title('Cumulative Distribution of Config Drift', fontsize=13)
    axes[1, 1].grid(True, alpha=0.3)

    plt.tight_layout()
    # savefig does not create missing directories
    os.makedirs('figures', exist_ok=True)
    plt.savefig('figures/config_drift_analysis.png', dpi=300, bbox_inches='tight')
    plt.show()

    # Save drift data
    df_drift.to_csv('config_drift_pairs.csv', index=False)
    print("\n✓ Drift data saved to config_drift_pairs.csv")
else:
    print("No drift data to visualize")

## 12. Mutational Landscape: Which Config Fields Drift Most?

**Goal**: Identify which config fields mutate most frequently along parent-child edges, parallel to trait mutation analysis in the original paper.

In [None]:
# Analyze which config fields change most frequently along parent-child edges
print("Analyzing mutational landscape...")

mutation_counts = {feat: {'changed': 0, 'unchanged': 0, 'missing': 0} for feat in available_features}

# One row per modelId (first occurrence wins), indexed by id for fast lookups
configs_by_id = df.drop_duplicates(subset='modelId', keep='first').set_index('modelId')
numeric_feature_set = set(numeric_features)


def _config_value_changed(parent_val, child_val, is_numeric):
    """Return True when a config value differs between parent and child.

    Numeric features are compared as floats with an absolute tolerance of
    1e-6; values that cannot be converted, and all non-numeric features,
    are compared by their string form.
    """
    if is_numeric:
        try:
            return abs(float(parent_val) - float(child_val)) > 1e-6
        except (ValueError, TypeError):
            pass  # fall through to the string comparison
    return str(parent_val) != str(child_val)


# At most the first 5000 pairs are inspected
for parent_id, child_id in parent_child_pairs[:5000]:
    if parent_id not in configs_by_id.index or child_id not in configs_by_id.index:
        continue

    parent_vec = configs_by_id.loc[parent_id]
    child_vec = configs_by_id.loc[child_id]

    for feat in available_features:
        parent_val = parent_vec.get(feat)
        child_val = child_vec.get(feat)
        counts = mutation_counts[feat]

        # A value missing on either side is counted as 'missing', not as a change
        if pd.isna(parent_val) or pd.isna(child_val):
            counts['missing'] += 1
        elif _config_value_changed(parent_val, child_val, feat in numeric_feature_set):
            counts['changed'] += 1
        else:
            counts['unchanged'] += 1

# Compute mutation rates over pairs where both values were present
mutation_rates = []
for feat, counts in mutation_counts.items():
    total = counts['changed'] + counts['unchanged']
    if total > 0:
        mutation_rates.append({
            'feature': feat,
            'mutation_rate': counts['changed'] / total,
            'n_changed': counts['changed'],
            'n_unchanged': counts['unchanged'],
            'n_missing': counts['missing'],
            'total_pairs': total
        })

# Explicit columns keep sort_values and the column selection below valid even
# when no feature had a comparable pair.
mutation_columns = ['feature', 'mutation_rate', 'n_changed', 'n_unchanged', 'n_missing', 'total_pairs']
df_mutations = pd.DataFrame(mutation_rates, columns=mutation_columns).sort_values('mutation_rate', ascending=False)
print(f"\n✓ Analyzed mutations for {len(df_mutations)} features")
print(f"\nTop 10 most frequently mutated features:")
print(df_mutations.head(10)[['feature', 'mutation_rate', 'n_changed', 'total_pairs']].to_string(index=False))

In [None]:
# Visualize mutational landscape: per-feature rates and rates by feature type
import os

if len(df_mutations) > 0:
    fig, axes = plt.subplots(2, 1, figsize=(14, 10))

    # Top: mutation rates for the 20 most-mutated features; bars above 50%
    # are highlighted
    top_mutations = df_mutations.head(20)
    colors = ['coral' if rate > 0.5 else 'steelblue' for rate in top_mutations['mutation_rate']]
    bar_positions = range(len(top_mutations))

    bars = axes[0].barh(bar_positions, top_mutations['mutation_rate'], color=colors, alpha=0.7)
    axes[0].set_yticks(bar_positions)
    axes[0].set_yticklabels(top_mutations['feature'], fontsize=9)
    axes[0].invert_yaxis()
    axes[0].set_xlabel('Mutation Rate', fontsize=11)
    axes[0].set_title('Top 20 Most Frequently Mutated Config Features', fontsize=13)
    axes[0].axvline(0.5, color='red', linestyle='--', alpha=0.5, label='50% threshold')
    axes[0].legend()
    axes[0].grid(True, alpha=0.3, axis='x')

    # Add value labels just right of each bar
    for i, rate in enumerate(top_mutations['mutation_rate']):
        axes[0].text(rate + 0.01, i, f"{rate:.2%}", va='center', fontsize=8)

    # Bottom: mutation rates by feature type. This adds a 'feature_type'
    # column to df_mutations, which is also written to the CSV below.
    df_mutations['feature_type'] = df_mutations['feature'].apply(
        lambda x: 'Numeric' if x in numeric_features else ('Categorical' if x in categorical_features else 'Boolean')
    )

    type_summary = df_mutations.groupby('feature_type').agg({
        'mutation_rate': 'mean',
        'n_changed': 'sum',
        'total_pairs': 'sum'
    }).reset_index()

    x_pos = np.arange(len(type_summary))
    width = 0.35
    # 'Mean' averages the per-feature rates; 'Overall' pools all comparisons
    axes[1].bar(x_pos - width/2, type_summary['mutation_rate'], width, label='Mean Mutation Rate', color='steelblue', alpha=0.7)
    axes[1].bar(x_pos + width/2, type_summary['n_changed'] / type_summary['total_pairs'], width,
                label='Overall Mutation Rate', color='coral', alpha=0.7)
    axes[1].set_xticks(x_pos)
    axes[1].set_xticklabels(type_summary['feature_type'])
    axes[1].set_ylabel('Mutation Rate', fontsize=11)
    axes[1].set_title('Mutation Rates by Feature Type', fontsize=13)
    axes[1].legend()
    axes[1].grid(True, alpha=0.3, axis='y')

    plt.tight_layout()
    # savefig does not create missing directories
    os.makedirs('figures', exist_ok=True)
    plt.savefig('figures/mutational_landscape.png', dpi=300, bbox_inches='tight')
    plt.show()

    # Save mutation data
    df_mutations.to_csv('config_mutation_rates.csv', index=False)
    print("\n✓ Mutation data saved to config_mutation_rates.csv")

## 13. Subgraph Analysis: Architectural Similarity in Descendant Clusters

**Goal**: Analyze config similarity within subgraphs (descendant clusters), replicating the subgraph trait correlation analysis from the AI Ecosystem paper.

In [None]:
# Analyze config similarity within subgraphs (descendant clusters)
import os
from itertools import combinations

if G_family is not None:
    print("Analyzing config similarity within descendant subgraphs...")

    # Build the id set once so each membership test is O(1)
    known_model_ids = set(df['modelId'])
    has_family_column = 'family' in df.columns

    # Find root nodes (nodes with no incoming edges)
    root_nodes = [n for n in G_family.nodes() if G_family.in_degree(n) == 0]
    print(f"Found {len(root_nodes):,} root nodes")

    # NOTE: only the first 20 roots in graph iteration order are examined, so
    # "top roots" means the largest among those 20, not among all roots.
    # Descendant sets are kept so they are not recomputed below.
    descendants_by_root = {
        root: set(nx.descendants(G_family, root)) for root in root_nodes[:20]
    }
    root_descendant_counts = {root: len(desc) for root, desc in descendants_by_root.items()}

    # Keep the 10 roots with the most descendants
    top_roots = sorted(root_descendant_counts.items(), key=lambda x: x[1], reverse=True)[:10]

    subgraph_stats = []

    for root_id, n_descendants in top_roots:
        if n_descendants < 5:  # Skip small subgraphs
            continue

        # Subgraph = root plus all descendants, restricted to models with config
        members = descendants_by_root[root_id] | {root_id}
        subgraph_nodes = [n for n in members if n in known_model_ids]

        if len(subgraph_nodes) < 5:
            continue

        # Get config vectors for subgraph
        subgraph_df = df[df['modelId'].isin(subgraph_nodes)][available_features].copy()

        if len(subgraph_df) < 2:
            continue

        # Materialise each row once, then compare every unordered pair.
        # NOTE: this is quadratic in subgraph size and has no cap.
        config_rows = [row for _, row in subgraph_df.iterrows()]
        subgraph_similarities = [
            1 - gower_distance(
                vec_i, vec_j,
                numeric_features,
                categorical_features,
                boolean_feature_list
            )
            for vec_i, vec_j in combinations(config_rows, 2)
        ]

        if len(subgraph_similarities) > 0:
            mean_sim = np.mean(subgraph_similarities)
            median_sim = np.median(subgraph_similarities)

            # Family of the root model; 'Unknown' when the root has no row or
            # the dataframe has no 'family' column
            root_rows = df[df['modelId'] == root_id]
            if has_family_column and len(root_rows) > 0:
                root_family = root_rows['family'].iloc[0]
            else:
                root_family = 'Unknown'

            subgraph_stats.append({
                'root_id': root_id,
                'n_nodes': len(subgraph_nodes),
                'n_descendants': n_descendants,
                'mean_similarity': mean_sim,
                'median_similarity': median_sim,
                'family': root_family
            })

    df_subgraph_stats = pd.DataFrame(subgraph_stats)

    if len(df_subgraph_stats) > 0:
        print(f"\n✓ Analyzed {len(df_subgraph_stats)} subgraphs")
        print(f"\nSubgraph similarity statistics:")
        print(df_subgraph_stats[['root_id', 'n_nodes', 'mean_similarity', 'family']].head(10).to_string(index=False))

        # Visualize
        fig, axes = plt.subplots(1, 2, figsize=(16, 6))

        # Scatter: subgraph size vs similarity
        axes[0].scatter(df_subgraph_stats['n_nodes'], df_subgraph_stats['mean_similarity'],
                        alpha=0.6, s=100, c='steelblue')
        axes[0].set_xlabel('Subgraph Size (Number of Nodes)', fontsize=11)
        axes[0].set_ylabel('Mean Config Similarity', fontsize=11)
        axes[0].set_title('Subgraph Size vs Config Similarity', fontsize=13)
        axes[0].grid(True, alpha=0.3)

        # Similarity by family (drawn only when more than one family is present)
        if 'family' in df_subgraph_stats.columns and df_subgraph_stats['family'].nunique() > 1:
            family_similarity = df_subgraph_stats.groupby('family')['mean_similarity'].agg(['mean', 'count']).sort_values('count', ascending=False).head(10)

            x_pos = np.arange(len(family_similarity))
            axes[1].bar(x_pos, family_similarity['mean'], color='coral', alpha=0.7)
            axes[1].set_xticks(x_pos)
            axes[1].set_xticklabels(family_similarity.index, rotation=45, ha='right')
            axes[1].set_ylabel('Mean Config Similarity', fontsize=11)
            axes[1].set_title('Mean Subgraph Similarity by Family', fontsize=13)
            axes[1].grid(True, alpha=0.3, axis='y')

            # Add count labels above each bar
            for i, (idx, row) in enumerate(family_similarity.iterrows()):
                axes[1].text(i, row['mean'] + 0.01, f"n={int(row['count'])}", ha='center', fontsize=8)

        plt.tight_layout()
        # savefig does not create missing directories
        os.makedirs('figures', exist_ok=True)
        plt.savefig('figures/subgraph_similarity_analysis.png', dpi=300, bbox_inches='tight')
        plt.show()

        # Save subgraph stats
        df_subgraph_stats.to_csv('subgraph_similarity_stats.csv', index=False)
        print("\n✓ Subgraph stats saved to subgraph_similarity_stats.csv")
    else:
        print("No subgraph statistics computed")
else:
    print("Family graph not available - skipping subgraph analysis")

## 14. Drift by Depth: Config Drift Curves

**Goal**: Measure cumulative config drift along lineage paths (root → ... → leaf), parallel to trait drift curves in the original paper.

In [None]:
# Analyze config drift by depth in lineage trees
import os

if G_family is not None:
    print("Analyzing config drift by depth...")

    def get_lineage_paths(root, max_depth=10, max_paths=100):
        """Get shortest paths from root to reachable leaves.

        Leaves are nodes with no outgoing edges. Only the first `max_paths`
        reachable leaves (in graph iteration order) are considered, and a
        path is kept only if it has at most `max_depth` nodes.
        """
        # One traversal from the root replaces a has_path search per leaf
        reachable = nx.descendants(G_family, root) | {root}
        leaves = [
            n for n in G_family.nodes()
            if n in reachable and G_family.out_degree(n) == 0
        ]

        paths = []
        for leaf in leaves[:max_paths]:
            try:
                path = nx.shortest_path(G_family, root, leaf)
            except (nx.NetworkXNoPath, nx.NodeNotFound):
                continue
            if len(path) <= max_depth:
                paths.append(path)

        return paths

    # One row per modelId (first occurrence wins), indexed by id for fast lookups
    configs_by_id = df.drop_duplicates(subset='modelId', keep='first').set_index('modelId')

    # Analyze drift along paths for top roots
    depth_drift_data = []

    for root_id, _ in top_roots[:5]:  # Top 5 roots
        if root_id not in configs_by_id.index:
            continue

        paths = get_lineage_paths(root_id, max_depth=8, max_paths=50)

        for path in paths:
            # Keep each node's true position in the lineage path, so a node
            # whose ancestor lacks config data still gets its real depth.
            nodes_with_config = [
                (depth, node_id) for depth, node_id in enumerate(path)
                if node_id in configs_by_id.index
            ]

            if len(nodes_with_config) < 2:
                continue

            # path[0] is the root, which was checked above to have config data
            root_config = configs_by_id.loc[nodes_with_config[0][1]][available_features]

            # Drift of every later node measured against the root config
            for depth, node_id in nodes_with_config[1:]:
                node_config = configs_by_id.loc[node_id][available_features]

                drift = gower_distance(
                    root_config, node_config,
                    numeric_features,
                    categorical_features,
                    boolean_feature_list
                )

                depth_drift_data.append({
                    'root_id': root_id,
                    'node_id': node_id,
                    'depth': depth,
                    'cumulative_drift': drift,
                    # Number of nodes on the full root-to-leaf path
                    'path_length': len(path)
                })

    df_depth_drift = pd.DataFrame(depth_drift_data)

    if len(df_depth_drift) > 0:
        print(f"\n✓ Analyzed drift along {df_depth_drift['root_id'].nunique()} lineage trees")

        # Visualize drift by depth
        fig, axes = plt.subplots(1, 2, figsize=(16, 6))

        # Average drift by depth, with a +/- one standard deviation band
        depth_stats = df_depth_drift.groupby('depth')['cumulative_drift'].agg(['mean', 'median', 'std', 'count']).reset_index()

        axes[0].plot(depth_stats['depth'], depth_stats['mean'], marker='o', linewidth=2, label='Mean', color='steelblue')
        axes[0].fill_between(depth_stats['depth'],
                             depth_stats['mean'] - depth_stats['std'],
                             depth_stats['mean'] + depth_stats['std'],
                             alpha=0.2, color='steelblue')
        axes[0].plot(depth_stats['depth'], depth_stats['median'], marker='s', linewidth=2, label='Median', color='coral', linestyle='--')
        axes[0].set_xlabel('Depth in Lineage Tree', fontsize=11)
        axes[0].set_ylabel('Cumulative Config Drift', fontsize=11)
        axes[0].set_title('Config Drift vs Depth in Lineage Trees', fontsize=13)
        axes[0].legend()
        axes[0].grid(True, alpha=0.3)

        # Distribution of drift at selected depths (those present in the data)
        depth_samples = [1, 3, 5, 7]
        available_depths = [d for d in depth_samples if d in df_depth_drift['depth'].values]

        if len(available_depths) > 0:
            drift_by_depth = [df_depth_drift[df_depth_drift['depth'] == d]['cumulative_drift'].values for d in available_depths]
            axes[1].boxplot(drift_by_depth, labels=[f'Depth {d}' for d in available_depths])
            axes[1].set_xlabel('Depth', fontsize=11)
            axes[1].set_ylabel('Cumulative Config Drift', fontsize=11)
            axes[1].set_title('Distribution of Config Drift at Different Depths', fontsize=13)
            axes[1].grid(True, alpha=0.3, axis='y')

        plt.tight_layout()
        # savefig does not create missing directories
        os.makedirs('figures', exist_ok=True)
        plt.savefig('figures/drift_by_depth.png', dpi=300, bbox_inches='tight')
        plt.show()

        # Save depth drift data
        df_depth_drift.to_csv('config_drift_by_depth.csv', index=False)
        print("\n✓ Depth drift data saved to config_drift_by_depth.csv")
    else:
        print("No depth drift data computed")
else:
    print("Family graph not available - skipping depth analysis")

## 15. Behavioral Drift: Config Change → Capability Change

**Goal**: Analyze how config changes correlate with behavioral/capability changes (when behavioral data is available).

**Note**: This section requires joining with behavioral datasets (LMArena, Arena-Hard-Auto, P2L). Placeholder code is provided for when such data becomes available.

In [None]:
# Placeholder for behavioral drift analysis
# This requires joining config data with behavioral evaluation datasets.
# The cell only prints the plan; the commented sketch below is not executed.

print("Behavioral drift analysis (placeholder)")
print("\nThis analysis requires:")
print("  1. LMArena Elo scores or Arena-Hard-Auto scores")
print("  2. P2L (Prompt-to-Leaderboard) performance vectors")
print("  3. Joining behavioral data with config drift data")
print("\nWhen behavioral data is available, this section will:")
print("  - Compute Δ_behavior = Elo(child) - Elo(parent)")
print("  - Regress capability change on config drift")
print("  - Identify which config changes correlate with capability changes")
print("  - Analyze prompt-specific behavioral drift (P2L clusters)")

# Example structure for when data is available:
# behavioral_data = {
#     'modelId': [...],
#     'elo_score': [...],
#     'arena_hard_score': [...],
#     'p2l_vector': [...]
# }
#
# df_behavioral = pd.DataFrame(behavioral_data)
# df_drift_with_behavior = df_drift.merge(df_behavioral, left_on='child_id', right_on='modelId', how='inner')
#
# # Compute behavioral change
# df_drift_with_behavior = df_drift_with_behavior.merge(
#     df_behavioral, left_on='parent_id', right_on='modelId',
#     suffixes=('_child', '_parent'), how='inner'
# )
# df_drift_with_behavior['delta_elo'] = df_drift_with_behavior['elo_score_child'] - df_drift_with_behavior['elo_score_parent']
#
# # Regression: capability change ~ config drift
# from sklearn.linear_model import LinearRegression
# X = df_drift_with_behavior[['drift']].values
# y = df_drift_with_behavior['delta_elo'].values
# model = LinearRegression().fit(X, y)
#
# print(f"Regression: Δ_Elo ~ Config_Drift")
# print(f"  Coefficient: {model.coef_[0]:.3f}")
# print(f"  R²: {model.score(X, y):.3f}")

In [None]:
# Map LMArena model names to HuggingFace model IDs.
# This is a critical step - Arena uses display names, HF uses repo paths, so
# both sides are normalized before matching. Results are written to
# df['elo_rating'], df['elo_uncertainty'] and df['arena_model'].

if lmarena_data is not None:
    print("Mapping LMArena model names to HuggingFace modelIds...")

    def normalize_model_name(name):
        """Return a lower-cased, stripped name without chat/instruct/base markers.

        Missing values (None/NaN) yield None so they can never match anything.
        """
        if pd.isna(name):
            return None
        name = str(name).lower().strip()
        # Remove common prefixes/suffixes
        name = name.replace(' (chat)', '').replace(' (instruct)', '').replace(' (base)', '')
        name = name.replace('chat-', '').replace('instruct-', '').replace('base-', '')
        return name

    def normalize_hf_id(model_id):
        """Normalize the repo part of an HF id (the text after the last '/')."""
        if pd.isna(model_id):
            # Previously NaN ids were turned into the literal string 'nan'
            return None
        return normalize_model_name(str(model_id).split('/')[-1])

    # Normalize HF modelIds
    df['normalized_id'] = df['modelId'].apply(normalize_hf_id)

    # Target column in df -> source column in lmarena_data
    arena_columns = {
        'elo_rating': 'elo_rating',
        'elo_uncertainty': 'elo_uncertainty',
        'arena_model': 'model',
    }

    if 'model' in lmarena_data.columns:
        # Normalize Arena names
        lmarena_data['normalized_name'] = lmarena_data['model'].apply(normalize_model_name)

        # Lookups are used instead of merges: a left merge duplicates df rows
        # when several Arena entries share a key and resets the index, so the
        # merged columns no longer line up with df. The first Arena entry per
        # key wins.
        arena = lmarena_data.dropna(subset=['model'])
        by_normalized = arena.drop_duplicates('normalized_name').set_index('normalized_name')
        by_model = arena.drop_duplicates('model').set_index('model', drop=False)

        for target_col, source_col in arena_columns.items():
            if source_col not in arena.columns:
                df[target_col] = np.nan
                continue
            # Normalized-name matches take priority; exact modelId matches fill the gaps
            normalized_match = df['normalized_id'].map(by_normalized[source_col])
            direct_match = df['modelId'].map(by_model[source_col])
            df[target_col] = normalized_match.fillna(direct_match)
    else:
        # Without a 'model' column there is nothing to match on
        lmarena_data['normalized_name'] = None
        for target_col in arena_columns:
            df[target_col] = np.nan

    models_with_elo = df['elo_rating'].notna().sum()
    coverage = models_with_elo / len(df) * 100 if len(df) else 0.0
    print(f"✓ Mapped {models_with_elo:,} models with Elo scores")
    print(f"  Coverage: {coverage:.1f}% of config dataset")

    if models_with_elo > 0:
        print(f"\nElo score statistics:")
        print(f"  Mean: {df['elo_rating'].mean():.1f}")
        print(f"  Median: {df['elo_rating'].median():.1f}")
        print(f"  Range: [{df['elo_rating'].min():.1f}, {df['elo_rating'].max():.1f}]")
else:
    print("Skipping mapping - no LMArena data available")
    df['elo_rating'] = None
    df['elo_uncertainty'] = None

## 16. Behavioral Drift: Config Change → Capability Change

**Goal**: Analyze how config changes correlate with behavioral/capability changes (Elo scores from LMArena).

**Key Questions**:
- Does config drift predict behavioral drift?
- Which architectural changes correlate with capability improvements?
- Are some families more behaviorally coherent than others?

In [None]:
# Compute behavioral drift (ΔElo) for parent-child pairs with Elo data.
# Reads df (modelId, elo_rating, elo_uncertainty) and df_drift (child_id,
# parent_id, drift); leaves the pairs that have Elo on both sides in
# df_behavioral_drift (an empty DataFrame when no Elo data exists).
if 'elo_rating' in df.columns and df['elo_rating'].notna().sum() > 0:
    from scipy.stats import pearsonr

    print("Computing behavioral drift (ΔElo) for parent-child pairs...")

    elo_columns = df[['modelId', 'elo_rating', 'elo_uncertainty']]

    # Merge Elo scores into drift dataframe (child side, then parent side)
    df_drift_with_behavior = df_drift.merge(
        elo_columns.rename(columns={
            'modelId': 'child_id',
            'elo_rating': 'child_elo',
            'elo_uncertainty': 'child_elo_uncertainty'
        }),
        on='child_id',
        how='left'
    ).merge(
        elo_columns.rename(columns={
            'modelId': 'parent_id',
            'elo_rating': 'parent_elo',
            'elo_uncertainty': 'parent_elo_uncertainty'
        }),
        on='parent_id',
        how='left'
    )

    # Compute behavioral drift (ΔElo)
    df_drift_with_behavior['delta_elo'] = (
        df_drift_with_behavior['child_elo'] - df_drift_with_behavior['parent_elo']
    )

    # Filter to pairs where both have Elo scores
    df_behavioral_drift = df_drift_with_behavior[
        df_drift_with_behavior['child_elo'].notna() &
        df_drift_with_behavior['parent_elo'].notna()
    ].copy()

    delta_elo = df_behavioral_drift['delta_elo']
    improving = delta_elo > 0
    regressing = delta_elo < 0

    print(f"\n✓ Found {len(df_behavioral_drift):,} parent-child pairs with behavioral data")
    print(f"  Mean ΔElo: {delta_elo.mean():.2f}")
    print(f"  Median ΔElo: {delta_elo.median():.2f}")
    print(f"  Improving pairs: {improving.sum():,} ({improving.mean()*100:.1f}%)")
    print(f"  Regressing pairs: {regressing.sum():,} ({regressing.mean()*100:.1f}%)")

    # Visualize config drift vs behavioral drift
    if len(df_behavioral_drift) > 10:
        median_config_drift = df_behavioral_drift['drift'].median()
        median_behavioral_drift = delta_elo.median()

        fig, axes = plt.subplots(2, 2, figsize=(16, 12))

        # 1. Scatter: Config drift vs Behavioral drift (THE CORE PLOT)
        axes[0,0].scatter(df_behavioral_drift['drift'], delta_elo,
                         alpha=0.5, s=30, c='steelblue', edgecolors='black', linewidth=0.5)
        axes[0,0].axhline(0, color='red', linestyle='--', alpha=0.5, linewidth=1)
        axes[0,0].axvline(median_config_drift, color='green', linestyle='--', alpha=0.5, linewidth=1)
        axes[0,0].set_xlabel('Config Drift (Gower Distance)', fontsize=11)
        axes[0,0].set_ylabel('Behavioral Drift (ΔElo)', fontsize=11)
        axes[0,0].set_title('Config Drift vs Behavioral Drift (Genotype → Phenotype)', fontsize=13)
        axes[0,0].grid(True, alpha=0.3)

        # Add correlation coefficient; rows with a missing drift value are
        # excluded so they cannot turn the statistic into NaN
        corr_data = df_behavioral_drift[['drift', 'delta_elo']].dropna()
        if len(corr_data) > 2:
            corr, p_val = pearsonr(corr_data['drift'], corr_data['delta_elo'])
            axes[0,0].text(0.05, 0.95, f'r={corr:.3f}, p={p_val:.3f}',
                          transform=axes[0,0].transAxes, fontsize=10,
                          verticalalignment='top', bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))

        # 2. Distribution of behavioral drift
        axes[0,1].hist(delta_elo, bins=50, color='coral', alpha=0.7, edgecolor='white')
        axes[0,1].axvline(0, color='red', linestyle='--', linewidth=2, label='No change')
        axes[0,1].axvline(median_behavioral_drift, color='blue', linestyle='--', linewidth=2,
                         label=f'Median: {median_behavioral_drift:.1f}')
        axes[0,1].set_xlabel('Behavioral Drift (ΔElo)', fontsize=11)
        axes[0,1].set_ylabel('Count', fontsize=11)
        axes[0,1].set_title('Distribution of Behavioral Drift', fontsize=13)
        axes[0,1].legend()
        axes[0,1].grid(True, alpha=0.3, axis='y')

        # 3. Behavioral drift by family (ten families with the most pairs)
        if 'parent_family' in df_behavioral_drift.columns:
            family_behavioral_drift = (
                df_behavioral_drift.groupby('parent_family')['delta_elo']
                .agg(['mean', 'median', 'count'])
                .sort_values('count', ascending=False)
                .head(10)
            )

            x_pos = np.arange(len(family_behavioral_drift))
            width = 0.35
            axes[1,0].bar(x_pos - width/2, family_behavioral_drift['mean'], width, label='Mean', color='steelblue', alpha=0.7)
            axes[1,0].bar(x_pos + width/2, family_behavioral_drift['median'], width, label='Median', color='coral', alpha=0.7)
            axes[1,0].axhline(0, color='red', linestyle='--', alpha=0.5)
            axes[1,0].set_xticks(x_pos)
            axes[1,0].set_xticklabels(family_behavioral_drift.index, rotation=45, ha='right')
            axes[1,0].set_ylabel('Behavioral Drift (ΔElo)', fontsize=11)
            axes[1,0].set_title('Behavioral Drift by Family (Top 10)', fontsize=13)
            axes[1,0].legend()
            axes[1,0].grid(True, alpha=0.3, axis='y')
        else:
            # No family labels: hide the panel instead of leaving empty axes
            axes[1,0].set_visible(False)

        # 4. Quadrant plot: Config drift vs Behavioral drift (split at the medians)
        high_config = df_behavioral_drift['drift'] > median_config_drift
        low_config = df_behavioral_drift['drift'] <= median_config_drift
        high_behavior = delta_elo > median_behavioral_drift
        low_behavior = delta_elo <= median_behavioral_drift

        quadrants = {
            'High Config, High Behavior': high_config & high_behavior,
            'High Config, Low Behavior': high_config & low_behavior,
            'Low Config, High Behavior': low_config & high_behavior,
            'Low Config, Low Behavior': low_config & low_behavior
        }

        colors = {'High Config, High Behavior': 'green', 'High Config, Low Behavior': 'orange',
                 'Low Config, High Behavior': 'blue', 'Low Config, Low Behavior': 'red'}

        for quad_name, mask in quadrants.items():
            if mask.sum() > 0:
                axes[1,1].scatter(df_behavioral_drift[mask]['drift'], df_behavioral_drift[mask]['delta_elo'],
                                 alpha=0.5, s=30, label=quad_name, c=colors[quad_name], edgecolors='black', linewidth=0.5)

        axes[1,1].axhline(median_behavioral_drift, color='gray', linestyle='--', alpha=0.5)
        axes[1,1].axvline(median_config_drift, color='gray', linestyle='--', alpha=0.5)
        axes[1,1].set_xlabel('Config Drift', fontsize=11)
        axes[1,1].set_ylabel('Behavioral Drift (ΔElo)', fontsize=11)
        axes[1,1].set_title('Config vs Behavioral Drift Quadrants', fontsize=13)
        axes[1,1].legend(fontsize=8)
        axes[1,1].grid(True, alpha=0.3)

        plt.tight_layout()
        plt.savefig('figures/config_vs_behavioral_drift.png', dpi=300, bbox_inches='tight')
        plt.show()

        # Save behavioral drift data
        df_behavioral_drift.to_csv('config_behavioral_drift.csv', index=False)
        print("\n✓ Behavioral drift data saved to config_behavioral_drift.csv")
    else:
        print("Not enough behavioral data for visualization")
else:
    print("No behavioral data available - skipping behavioral drift analysis")
    df_behavioral_drift = pd.DataFrame()

## 17. Architecture-Capability Regression

**Goal**: Predict behavioral capability (Elo) from architectural features (config.json parameters).

**Key Questions**:
- Which architectural features are most predictive of capability?
- Are there nonlinear effects or diminishing returns?
- Do architecture clusters correspond to capability clusters?

In [None]:
# Predict Elo from config features.
# Fits three regressors on the numeric + boolean config columns of the models
# that have an Elo rating, reports held-out R²/RMSE, and plots/saves the
# Random Forest feature importances.
if 'elo_rating' in df.columns and df['elo_rating'].notna().sum() > 50:
    print("Training models to predict Elo from config features...")

    from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
    from sklearn.linear_model import LinearRegression
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import r2_score, mean_squared_error

    # Prepare data
    df_elo = df[df['elo_rating'].notna()].copy()

    # Prepare features (numeric only for regression)
    feature_cols = numeric_features + boolean_feature_list
    X_cols = [col for col in feature_cols if col in df_elo.columns]

    # Fill missing values
    X = df_elo[X_cols].fillna(0)
    y = df_elo['elo_rating'].values

    # Remove columns with no variance
    X = X.loc[:, X.std() > 0]

    print(f"\nTraining on {len(X):,} models with {len(X.columns)} features")

    # Split data (fixed seed so the reported scores are reproducible)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train models
    models = {
        'Random Forest': RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42, n_jobs=-1),
        'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, max_depth=5, random_state=42),
        'Linear Regression': LinearRegression()
    }

    results = {}

    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        r2 = r2_score(y_test, y_pred)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))

        results[name] = {
            'model': model,
            'r2': r2,
            'rmse': rmse,
            'y_pred': y_pred,
            'y_test': y_test
        }

        print(f"\n{name}:")
        print(f"  R²: {r2:.3f}")
        print(f"  RMSE: {rmse:.2f}")

    # Feature importance (from Random Forest). The Random Forest is always
    # present in `results`, so no availability check is needed.
    rf_result = results['Random Forest']
    rf_actual = rf_result['y_test']
    rf_predicted = rf_result['y_pred']

    feature_importance = pd.DataFrame({
        'feature': X.columns,
        'importance': rf_result['model'].feature_importances_
    }).sort_values('importance', ascending=False)

    print(f"\nTop 10 most predictive architectural features:")
    print(feature_importance.head(10).to_string(index=False))

    # Visualize feature importance
    fig, axes = plt.subplots(2, 2, figsize=(16, 12))

    # 1. Feature importance
    top_features = feature_importance.head(15)
    axes[0,0].barh(range(len(top_features)), top_features['importance'], color='steelblue', alpha=0.7)
    axes[0,0].set_yticks(range(len(top_features)))
    axes[0,0].set_yticklabels(top_features['feature'], fontsize=9)
    axes[0,0].invert_yaxis()
    axes[0,0].set_xlabel('Feature Importance', fontsize=11)
    axes[0,0].set_title('Top 15 Most Predictive Config Features for Elo', fontsize=13)
    axes[0,0].grid(True, alpha=0.3, axis='x')

    # 2. Predicted vs Actual (Random Forest)
    axes[0,1].scatter(rf_actual, rf_predicted,
                     alpha=0.5, s=20, color='steelblue', edgecolors='black', linewidth=0.3)
    min_val = min(rf_actual.min(), rf_predicted.min())
    max_val = max(rf_actual.max(), rf_predicted.max())
    axes[0,1].plot([min_val, max_val], [min_val, max_val], 'r--', linewidth=2, label='Perfect prediction')
    axes[0,1].set_xlabel('Actual Elo', fontsize=11)
    axes[0,1].set_ylabel('Predicted Elo', fontsize=11)
    axes[0,1].set_title(f"Predicted vs Actual Elo (R²={rf_result['r2']:.3f})", fontsize=13)
    axes[0,1].legend()
    axes[0,1].grid(True, alpha=0.3)

    # 3. Model comparison
    model_names = list(results.keys())
    r2_scores = [results[m]['r2'] for m in model_names]
    axes[1,0].bar(range(len(model_names)), r2_scores, color=['steelblue', 'coral', 'seagreen'], alpha=0.7)
    axes[1,0].set_xticks(range(len(model_names)))
    axes[1,0].set_xticklabels(model_names)
    axes[1,0].set_ylabel('R² Score', fontsize=11)
    axes[1,0].set_title('Model Performance Comparison', fontsize=13)
    axes[1,0].grid(True, alpha=0.3, axis='y')
    for i, score in enumerate(r2_scores):
        axes[1,0].text(i, score + 0.01, f'{score:.3f}', ha='center', fontsize=10)

    # 4. Residuals plot
    residuals = rf_actual - rf_predicted
    axes[1,1].scatter(rf_predicted, residuals, alpha=0.5, s=20, color='coral', edgecolors='black', linewidth=0.3)
    axes[1,1].axhline(0, color='red', linestyle='--', linewidth=2)
    axes[1,1].set_xlabel('Predicted Elo', fontsize=11)
    axes[1,1].set_ylabel('Residuals', fontsize=11)
    axes[1,1].set_title('Residuals Plot', fontsize=13)
    axes[1,1].grid(True, alpha=0.3)

    plt.tight_layout()
    plt.savefig('figures/architecture_capability_regression.png', dpi=300, bbox_inches='tight')
    plt.show()

    # Save feature importance
    feature_importance.to_csv('config_feature_importance_elo.csv', index=False)
    print("\n✓ Feature importance saved to config_feature_importance_elo.csv")
else:
    print("Not enough Elo data for regression analysis (need >50 models)")

## 18. Architecture vs Behavioral Clusters

**Goal**: Compare clustering based on architecture (config) vs clustering based on behavior (Elo/capability).

**Key Questions**:
- Do architecturally similar models behave similarly?
- Are families behaviorally monomorphic or polymorphic?
- Do architecture clusters align with behavioral clusters?

In [None]:
# Compare architecture-based vs behavior-based clustering.
# Architecture clusters come from KMeans on standardized config features;
# behavior "clusters" are equal-width Elo bins. Agreement between the two
# labelings is reported with ARI and NMI and shown as a contingency table.
if 'elo_rating' in df.columns and df['elo_rating'].notna().sum() > 50:
    print("Comparing architecture-based vs behavior-based clustering...")

    from sklearn.cluster import KMeans
    from sklearn.manifold import TSNE
    from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score

    # Prepare data with both config and Elo
    df_cluster = df[df['elo_rating'].notna()].copy()

    if len(df_cluster) > 100:
        # Sample for computational efficiency
        df_cluster = df_cluster.sample(n=min(1000, len(df_cluster)), random_state=42)

    elo_values = df_cluster['elo_rating'].values

    # Architecture features (config-based)
    arch_features = [col for col in numeric_features + boolean_feature_list if col in df_cluster.columns]
    X_arch = df_cluster[arch_features].fillna(0)
    X_arch = X_arch.loc[:, X_arch.std() > 0]  # Remove zero-variance columns

    # Standardize
    scaler_arch = StandardScaler()
    X_arch_scaled = scaler_arch.fit_transform(X_arch)

    # Behavioral features (Elo-based, can extend to multi-dimensional)
    X_behavior = df_cluster[['elo_rating']].values

    # Cluster architectures
    n_clusters = min(10, len(df_cluster) // 20)
    kmeans_arch = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    arch_clusters = kmeans_arch.fit_predict(X_arch_scaled)

    # Cluster behaviors (by Elo bins). Converted to a plain array so it pairs
    # with arch_clusters by position rather than by df_cluster's index labels.
    behavior_clusters = np.asarray(pd.cut(df_cluster['elo_rating'], bins=n_clusters, labels=False))

    # Compute cluster alignment
    ari = adjusted_rand_score(arch_clusters, behavior_clusters)
    nmi = normalized_mutual_info_score(arch_clusters, behavior_clusters)

    print(f"\nCluster Alignment Metrics:")
    print(f"  Adjusted Rand Index: {ari:.3f} (1.0 = perfect alignment, 0.0 = random)")
    print(f"  Normalized Mutual Information: {nmi:.3f} (1.0 = perfect alignment, 0.0 = independent)")

    # Visualize clusters
    fig, axes = plt.subplots(2, 2, figsize=(16, 12))

    # Use t-SNE for 2D embedding (perplexity must stay below the sample count)
    print("\nComputing t-SNE embeddings (this may take a minute)...")
    tsne_arch = TSNE(n_components=2, random_state=42, perplexity=min(30, len(df_cluster)-1))
    embedding_arch = tsne_arch.fit_transform(X_arch_scaled)

    # 1. Architecture clusters colored by cluster
    scatter1 = axes[0,0].scatter(embedding_arch[:, 0], embedding_arch[:, 1],
                                c=arch_clusters, cmap='tab10', alpha=0.6, s=30, edgecolors='black', linewidth=0.3)
    axes[0,0].set_xlabel('t-SNE Dimension 1', fontsize=11)
    axes[0,0].set_ylabel('t-SNE Dimension 2', fontsize=11)
    axes[0,0].set_title('Architecture-Based Clusters (Config Features)', fontsize=13)
    plt.colorbar(scatter1, ax=axes[0,0])

    # 2. Architecture embedding colored by Elo
    scatter2 = axes[0,1].scatter(embedding_arch[:, 0], embedding_arch[:, 1],
                                c=elo_values, cmap='viridis', alpha=0.6, s=30, edgecolors='black', linewidth=0.3)
    axes[0,1].set_xlabel('t-SNE Dimension 1', fontsize=11)
    axes[0,1].set_ylabel('t-SNE Dimension 2', fontsize=11)
    axes[0,1].set_title('Architecture Space Colored by Elo (Behavior)', fontsize=13)
    plt.colorbar(scatter2, ax=axes[0,1], label='Elo Rating')

    # 3. Elo distribution by architecture cluster
    cluster_elo = pd.DataFrame({
        'cluster': arch_clusters,
        'elo': elo_values
    })
    cluster_elo_stats = cluster_elo.groupby('cluster')['elo'].agg(['mean', 'std', 'count'])
    # A single-member cluster has an undefined (NaN) std; draw it without an error bar
    cluster_elo_stats['std'] = cluster_elo_stats['std'].fillna(0)

    x_pos = np.arange(len(cluster_elo_stats))
    axes[1,0].bar(x_pos, cluster_elo_stats['mean'], yerr=cluster_elo_stats['std'],
                  color='steelblue', alpha=0.7, capsize=5)
    axes[1,0].set_xticks(x_pos)
    axes[1,0].set_xticklabels([f'Cluster {i}' for i in cluster_elo_stats.index])
    axes[1,0].set_ylabel('Mean Elo Rating', fontsize=11)
    axes[1,0].set_title('Behavioral Capability by Architecture Cluster', fontsize=13)
    axes[1,0].grid(True, alpha=0.3, axis='y')

    # Add count labels just above each error bar
    for i, (idx, row) in enumerate(cluster_elo_stats.iterrows()):
        axes[1,0].text(i, row['mean'] + row['std'] + 5, f"n={int(row['count'])}", ha='center', fontsize=8)

    # 4. Cluster alignment confusion matrix. Both label vectors are positional
    # arrays, so crosstab pairs them row by row.
    confusion = pd.crosstab(arch_clusters, behavior_clusters,
                           rownames=['Architecture Cluster'], colnames=['Behavior Cluster'])
    im = axes[1,1].imshow(confusion.values, cmap='YlOrRd', aspect='auto')
    axes[1,1].set_xticks(range(len(confusion.columns)))
    axes[1,1].set_xticklabels([f'B{i}' for i in confusion.columns])
    axes[1,1].set_yticks(range(len(confusion.index)))
    axes[1,1].set_yticklabels([f'A{i}' for i in confusion.index])
    axes[1,1].set_xlabel('Behavior Cluster', fontsize=11)
    axes[1,1].set_ylabel('Architecture Cluster', fontsize=11)
    axes[1,1].set_title(f'Architecture vs Behavior Cluster Alignment\n(ARI={ari:.3f}, NMI={nmi:.3f})', fontsize=13)
    plt.colorbar(im, ax=axes[1,1], label='Count')

    # Add text annotations
    for i in range(len(confusion.index)):
        for j in range(len(confusion.columns)):
            axes[1,1].text(j, i, str(confusion.iloc[i, j]), ha='center', va='center', fontsize=8)

    plt.tight_layout()
    plt.savefig('figures/architecture_vs_behavior_clusters.png', dpi=300, bbox_inches='tight')
    plt.show()

    print("\n✓ Cluster analysis complete")
else:
    print("Not enough behavioral data for cluster comparison")

## 19. Ecosystem Fitness vs Behavioral Fitness

**Goal**: Compare ecosystem success metrics (downloads, descendants, likes) with behavioral capability (Elo).

**Key Questions**:
- Are the most downloaded models the most capable?
- Do high-performing models have more descendants?
- What is the relationship between ecosystem adoption and behavioral performance?

In [None]:
# Compare ecosystem fitness vs behavioral fitness.
# Ecosystem metrics (downloads, likes, descendants, children) are taken from
# df columns when present; descendant counts are additionally derived from
# the family graph. They are compared with Elo ratings in four panels.
from scipy.stats import pearsonr  # imported here: earlier cells only import it conditionally

print("Analyzing ecosystem fitness vs behavioral fitness...")

has_elo = 'elo_rating' in df.columns

# Check for ecosystem metrics in dataframe
ecosystem_metrics = {}
for metric in ['downloads', 'likes', 'num_descendants', 'num_children']:
    if metric in df.columns:
        ecosystem_metrics[metric] = df[metric]

# Also try loading from graph if available
graph_descendants = None
if G_family is not None and has_elo:
    # Count descendants for models with Elo
    df_with_descendants = df[df['elo_rating'].notna()].copy()

    def count_descendants_simple(model_id):
        """Return how many descendants model_id has in G_family (0 if it is not a node)."""
        if model_id not in G_family:
            return 0
        return len(nx.descendants(G_family, model_id))

    # Sample for performance
    if len(df_with_descendants) > 500:
        df_with_descendants = df_with_descendants.sample(n=500, random_state=42)

    df_with_descendants['num_descendants'] = df_with_descendants['modelId'].apply(count_descendants_simple)
    # Keyed by modelId; duplicates are dropped so the lookup index is unique
    graph_descendants = (
        df_with_descendants.drop_duplicates('modelId').set_index('modelId')['num_descendants']
    )
    ecosystem_metrics['num_descendants'] = graph_descendants

if len(ecosystem_metrics) > 0 and has_elo:
    # Metrics that are df columns are already present in this copy, so only the
    # graph-derived descendant counts need to be attached (by modelId lookup).
    df_fitness = df[df['elo_rating'].notna()].copy()
    if graph_descendants is not None:
        df_fitness['num_descendants'] = df_fitness['modelId'].map(graph_descendants)

    metric_cols = list(ecosystem_metrics.keys())

    # Filter to models with both Elo and at least one ecosystem metric
    has_metric = df_fitness[metric_cols].notna().any(axis=1)
    df_fitness = df_fitness[has_metric].copy()

    def annotate_correlation(ax, x_values, y_values):
        """Write Pearson r and p in the top-left corner of ax, ignoring rows with missing values."""
        pairs = pd.DataFrame({
            'x': np.asarray(x_values, dtype=float),
            'y': np.asarray(y_values, dtype=float),
        }).dropna()
        if len(pairs) < 3:
            return
        corr, p_val = pearsonr(pairs['x'], pairs['y'])
        ax.text(0.05, 0.95, f'r={corr:.3f}, p={p_val:.3f}',
                transform=ax.transAxes, fontsize=10,
                verticalalignment='top', bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))

    if len(df_fitness) > 10:
        print(f"\n✓ Analyzing {len(df_fitness):,} models with both Elo and ecosystem metrics")

        # Visualize relationships
        fig, axes = plt.subplots(2, 2, figsize=(16, 12))

        if 'downloads' in df_fitness.columns:
            # 1. Scatter: Downloads vs Elo (log scale for downloads)
            log_downloads = np.log10(df_fitness['downloads'] + 1)
            axes[0,0].scatter(log_downloads, df_fitness['elo_rating'], alpha=0.5, s=30,
                            color='steelblue', edgecolors='black', linewidth=0.3)
            axes[0,0].set_xlabel('Log10(Downloads + 1)', fontsize=11)
            axes[0,0].set_ylabel('Elo Rating', fontsize=11)
            axes[0,0].set_title('Ecosystem Adoption (Downloads) vs Behavioral Capability', fontsize=13)
            axes[0,0].grid(True, alpha=0.3)

            # Add correlation
            annotate_correlation(axes[0,0], log_downloads, df_fitness['elo_rating'])

            # 2. Quadrant plot: Ecosystem Fitness vs Behavioral Fitness (split at the medians)
            median_downloads = df_fitness['downloads'].median()
            median_elo = df_fitness['elo_rating'].median()

            high_adoption = df_fitness['downloads'] > median_downloads
            low_adoption = df_fitness['downloads'] <= median_downloads
            high_capability = df_fitness['elo_rating'] > median_elo
            low_capability = df_fitness['elo_rating'] <= median_elo

            quadrants = {
                'High Adoption, High Capability': high_adoption & high_capability,
                'High Adoption, Low Capability': high_adoption & low_capability,
                'Low Adoption, High Capability': low_adoption & high_capability,
                'Low Adoption, Low Capability': low_adoption & low_capability
            }

            colors = {'High Adoption, High Capability': 'green', 'High Adoption, Low Capability': 'orange',
                     'Low Adoption, High Capability': 'blue', 'Low Adoption, Low Capability': 'red'}

            for quad_name, mask in quadrants.items():
                if mask.sum() > 0:
                    axes[0,1].scatter(np.log10(df_fitness[mask]['downloads'] + 1),
                                     df_fitness[mask]['elo_rating'],
                                     alpha=0.5, s=30, label=quad_name, c=colors[quad_name],
                                     edgecolors='black', linewidth=0.3)

            axes[0,1].axhline(median_elo, color='gray', linestyle='--', alpha=0.5)
            axes[0,1].axvline(np.log10(median_downloads + 1), color='gray', linestyle='--', alpha=0.5)
            axes[0,1].set_xlabel('Log10(Downloads + 1)', fontsize=11)
            axes[0,1].set_ylabel('Elo Rating', fontsize=11)
            axes[0,1].set_title('Ecosystem Fitness vs Behavioral Fitness Quadrants', fontsize=13)
            axes[0,1].legend(fontsize=8)
            axes[0,1].grid(True, alpha=0.3)
        else:
            # No download counts: hide both download panels instead of leaving empty axes
            axes[0,0].set_visible(False)
            axes[0,1].set_visible(False)

        # 3. Descendants vs Elo
        if 'num_descendants' in df_fitness.columns:
            log_descendants = np.log10(df_fitness['num_descendants'] + 1)
            axes[1,0].scatter(log_descendants, df_fitness['elo_rating'], alpha=0.5, s=30,
                            color='coral', edgecolors='black', linewidth=0.3)
            axes[1,0].set_xlabel('Log10(Number of Descendants + 1)', fontsize=11)
            axes[1,0].set_ylabel('Elo Rating', fontsize=11)
            axes[1,0].set_title('Reproductive Success vs Behavioral Capability', fontsize=13)
            axes[1,0].grid(True, alpha=0.3)

            annotate_correlation(axes[1,0], log_descendants, df_fitness['elo_rating'])
        else:
            axes[1,0].set_visible(False)

        # 4. Correlation matrix
        fitness_cols = ['elo_rating'] + [m for m in metric_cols if m in df_fitness.columns]
        fitness_data = df_fitness[fitness_cols].copy()

        # Log transform ecosystem metrics that span large ranges
        for col in fitness_data.columns:
            if col != 'elo_rating' and fitness_data[col].max() > 100:
                fitness_data[col] = np.log10(fitness_data[col] + 1)

        corr_matrix = fitness_data.corr()
        im = axes[1,1].imshow(corr_matrix.values, cmap='coolwarm', vmin=-1, vmax=1, aspect='auto')
        axes[1,1].set_xticks(range(len(corr_matrix.columns)))
        axes[1,1].set_yticks(range(len(corr_matrix.index)))
        axes[1,1].set_xticklabels(corr_matrix.columns, rotation=45, ha='right', fontsize=9)
        axes[1,1].set_yticklabels(corr_matrix.index, fontsize=9)
        axes[1,1].set_title('Correlation: Ecosystem Metrics vs Behavioral Capability', fontsize=13)
        plt.colorbar(im, ax=axes[1,1], label='Correlation')

        # Add correlation values (white text on strongly colored cells)
        for i in range(len(corr_matrix.index)):
            for j in range(len(corr_matrix.columns)):
                axes[1,1].text(j, i, f'{corr_matrix.iloc[i, j]:.2f}',
                              ha='center', va='center', fontsize=8,
                              color='white' if abs(corr_matrix.iloc[i, j]) > 0.5 else 'black')

        plt.tight_layout()
        plt.savefig('figures/ecosystem_vs_behavioral_fitness.png', dpi=300, bbox_inches='tight')
        plt.show()

        # Save fitness data
        df_fitness[['modelId', 'elo_rating'] + metric_cols].to_csv('ecosystem_behavioral_fitness.csv', index=False)
        print("\n✓ Fitness data saved to ecosystem_behavioral_fitness.csv")
    else:
        print("Not enough data for fitness analysis")
else:
    print("Ecosystem metrics or Elo data not available - skipping fitness analysis")

## 16. Final Summary: Config Similarity Analysis

This notebook implements a comprehensive config-based similarity analysis, replicating the AI Ecosystem paper methodology but grounded in architectural similarity rather than metadata traits.

In [None]:
# Final roll-up of every analysis stage in the notebook. Sections whose input
# table is empty (or, for the optional ones, was never created) are skipped.
banner = "=" * 80
print(banner)
print("CONFIG SIMILARITY ANALYSIS - COMPREHENSIVE SUMMARY")
print(banner)

# Section headers start with a real newline ("\n"); a double-escaped "\\n"
# would print a literal backslash followed by "n".
print("\n1. SIMILARITY GRAPH:")
print(f"   - Nodes: {len(G_similarity.nodes):,}")
print(f"   - Edges: {len(G_similarity.edges):,}")
print(f"   - Similarity threshold: {SIMILARITY_THRESHOLD}")

if len(df_drift) > 0:
    print("\n2. CONFIG DRIFT ANALYSIS:")
    print(f"   - Parent-child pairs analyzed: {len(df_drift):,}")
    print(f"   - Mean drift: {df_drift['drift'].mean():.3f}")
    print(f"   - Median drift: {df_drift['drift'].median():.3f}")
    if 'same_family' in df_drift.columns:
        # Explicit comparisons: rows whose flag equals neither True nor False
        # end up in neither group.
        is_within = df_drift['same_family'] == True  # noqa: E712
        is_between = df_drift['same_family'] == False  # noqa: E712
        within = df_drift.loc[is_within, 'drift'].mean() if df_drift['same_family'].sum() > 0 else None
        between = df_drift.loc[is_between, 'drift'].mean() if is_between.sum() > 0 else None
        if within is not None:
            print(f"   - Within-family drift: {within:.3f}")
        if between is not None:
            print(f"   - Between-family drift: {between:.3f}")

if len(df_mutations) > 0:
    print("\n3. MUTATIONAL LANDSCAPE:")
    print(f"   - Features analyzed: {len(df_mutations)}")
    # The first row is reported as the "most mutated" feature.
    # NOTE(review): this presumes df_mutations is sorted by mutation_rate
    # in descending order - confirm where it is built.
    top_mutation = df_mutations.iloc[0]
    print(f"   - Most mutated feature: {top_mutation['feature']} (rate: {top_mutation['mutation_rate']:.2%})")
    print(f"   - Mean mutation rate: {df_mutations['mutation_rate'].mean():.2%}")

# The subgraph and depth tables are optional: they only exist when their
# cells were executed, hence the name lookups.
if 'df_subgraph_stats' in locals() and len(df_subgraph_stats) > 0:
    print("\n4. SUBGRAPH ANALYSIS:")
    print(f"   - Subgraphs analyzed: {len(df_subgraph_stats)}")
    print(f"   - Mean subgraph similarity: {df_subgraph_stats['mean_similarity'].mean():.3f}")

if 'df_depth_drift' in locals() and len(df_depth_drift) > 0:
    print("\n5. DEPTH ANALYSIS:")
    print(f"   - Lineage paths analyzed: {df_depth_drift['root_id'].nunique()}")
    print(f"   - Max depth analyzed: {df_depth_drift['depth'].max()}")

print("\n6. OUTPUT FILES GENERATED:")
output_files = [
    'config_similarity_summary.csv',
    'config_drift_pairs.csv',
    'config_mutation_rates.csv',
]
if 'df_subgraph_stats' in locals():
    output_files.append('subgraph_similarity_stats.csv')
if 'df_depth_drift' in locals():
    output_files.append('config_drift_by_depth.csv')

for path in output_files:
    print(f"   - {path}")

print("\n7. FIGURES GENERATED:")
figures = [
    'figures/config_similarity_embedding.png',
    'figures/similarity_metrics_comparison.png',
    'figures/config_drift_analysis.png',
    'figures/mutational_landscape.png',
]
if 'df_subgraph_stats' in locals():
    figures.append('figures/subgraph_similarity_analysis.png')
if 'df_depth_drift' in locals():
    figures.append('figures/drift_by_depth.png')

for path in figures:
    print(f"   - {path}")

print("\n" + banner)
print("ANALYSIS COMPLETE")
print(banner)

In [None]:
# Compute the pairwise Gower distance matrix for every model in df_sample.
# The result is a symmetric (n, n) array with zeros on the diagonal.
print("Computing Gower distance matrix (this may take a few minutes)...")

n = len(df_sample)
distance_matrix = np.zeros((n, n))

# Restrict the rows to the features selected for similarity computation.
feature_vectors = df_sample[available_features].copy()

# Materialise each row once up front; calling .iloc inside the inner loop
# would rebuild the same row Series for every pair.
feature_rows = [feature_vectors.iloc[k] for k in range(n)]

# Only the upper triangle is computed; each value is mirrored below the
# diagonal.
for i in range(n):
    if i % 100 == 0:
        print(f"  Processing {i}/{n}...")
    row_i = feature_rows[i]
    for j in range(i + 1, n):
        dist = gower_distance(
            row_i,
            feature_rows[j],
            numeric_features,
            categorical_features,
            boolean_feature_list,
        )
        distance_matrix[i, j] = dist
        distance_matrix[j, i] = dist  # Symmetric

print(f"✓ Distance matrix computed: {distance_matrix.shape}")

# The reported minimum ignores zeros (the diagonal and any identical pairs).
# Guard against there being no positive entry at all (fewer than two models,
# or every pair identical), where .min() would raise.
positive_distances = distance_matrix[distance_matrix > 0]
if positive_distances.size > 0:
    print(f"Distance range: [{positive_distances.min():.3f}, {distance_matrix.max():.3f}]")
else:
    print("Distance range: n/a (no non-zero distances)")

## 5. Build Similarity Graph

In [None]:
# Build the similarity graph: one node per model, and an edge between two
# models whenever their Gower distance is below the threshold.
# Lower distance = higher similarity, so we use distance < threshold.

SIMILARITY_THRESHOLD = 0.3  # Connect models with Gower distance < 0.3 (70%+ similar)

G_similarity = nx.Graph()

# Add nodes, attaching every column of the row as a node attribute. The ids
# are collected in row order so that position k in `model_ids` corresponds to
# row/column k of distance_matrix.
model_ids = []
for idx, row in df_sample.iterrows():
    model_id = row['modelId']
    G_similarity.add_node(model_id, **row.to_dict())
    model_ids.append(model_id)

# Add edges for similar models. Only the upper triangle (j > i) is scanned,
# and the threshold test is applied to a whole row slice at once instead of
# pulling a DataFrame row for every pair.
edge_count = 0
for i in range(n):
    model_i = model_ids[i]
    close_columns = np.flatnonzero(distance_matrix[i, i + 1:] < SIMILARITY_THRESHOLD) + i + 1
    for j in close_columns:
        j = int(j)
        pair_distance = distance_matrix[i, j]
        G_similarity.add_edge(model_i, model_ids[j], weight=1 - pair_distance, distance=pair_distance)
        edge_count += 1

node_total = len(G_similarity.nodes)
edge_total = len(G_similarity.edges)
# An empty graph has no meaningful average degree; report 0 rather than
# dividing by zero.
average_degree = 2 * edge_total / node_total if node_total else 0.0

print(f"✓ Similarity graph created")
print(f"  Nodes: {node_total:,}")
print(f"  Edges: {edge_total:,}")
print(f"  Average degree: {average_degree:.2f}")
print(f"  Connected components: {nx.number_connected_components(G_similarity)}")

## 6. Analyze Graph Structure

In [None]:
# Structural statistics of the similarity graph (skipped when it has no nodes).
if len(G_similarity.nodes) > 0:
    # Node set of the largest connected component, and the subgraph it induces.
    largest_cc = max(nx.connected_components(G_similarity), key=len)
    G_largest = G_similarity.subgraph(largest_cc)
    print(f"Largest connected component: {len(largest_cc):,} nodes")

    # Whole-graph clustering and density.
    print(f"\nGraph statistics:")
    avg_clustering_coeff = nx.average_clustering(G_similarity)
    print(f"  Average clustering coefficient: {avg_clustering_coeff:.3f}")
    graph_density = nx.density(G_similarity)
    print(f"  Density: {graph_density:.6f}")

    # Degree distribution: mapping node -> degree, then summary figures.
    degrees = dict(G_similarity.degree())
    per_node_degree = list(degrees.values())
    print(f"\nDegree distribution:")
    print(f"  Min degree: {min(per_node_degree)}")
    print(f"  Max degree: {max(per_node_degree)}")
    print(f"  Mean degree: {np.mean(per_node_degree):.2f}")
    print(f"  Median degree: {np.median(per_node_degree):.2f}")

## 7. Visualize Similarity Clusters

In [None]:
# Project the models into 2D with MDS on the precomputed distance matrix and
# plot the embedding next to the degree distribution of the similarity graph.

# Similarity is the complement of the distance.
similarity_matrix = 1 - distance_matrix

from sklearn.manifold import MDS

# 2D embedding; the fixed random_state keeps the layout reproducible.
mds = MDS(n_components=2, dissimilarity='precomputed', random_state=42)
embedding = mds.fit_transform(distance_matrix)

# Create visualization
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Color by family if available
if 'family' in df_sample.columns:
    families = df_sample['family'].values
    unique_families = pd.Series(families).unique()
    colors = plt.cm.tab20(np.linspace(0, 1, len(unique_families)))
    family_color_map = {fam: colors[i] for i, fam in enumerate(unique_families)}
    node_colors = [family_color_map.get(fam, 'gray') for fam in families]
else:
    node_colors = 'steelblue'

# Plot 1: Full similarity graph embedding
scatter = axes[0].scatter(embedding[:, 0], embedding[:, 1], c=node_colors, alpha=0.6, s=20)
axes[0].set_xlabel('MDS Dimension 1', fontsize=11)
axes[0].set_ylabel('MDS Dimension 2', fontsize=11)
axes[0].set_title(f'Config Similarity Embedding (n={len(df_sample):,})', fontsize=13)
axes[0].grid(True, alpha=0.3)

# Overlay a capped number of edges (to avoid clutter)
if len(G_similarity.edges) > 0:
    # At most the first 500 edges in the graph's iteration order.
    edge_sample = list(G_similarity.edges(data=True))[:500]

    # Map each modelId to the row position of its first occurrence once,
    # instead of scanning df_sample with a boolean mask for every edge.
    # Row positions line up with the rows of `embedding`.
    row_position = {}
    for position, sample_model_id in enumerate(df_sample['modelId']):
        row_position.setdefault(sample_model_id, position)

    for model_i, model_j, _edge_data in edge_sample:
        i_pos = row_position.get(model_i)
        j_pos = row_position.get(model_j)
        if i_pos is None or j_pos is None:
            # Endpoint not present in df_sample: nothing to draw.
            continue
        axes[0].plot([embedding[i_pos, 0], embedding[j_pos, 0]],
                     [embedding[i_pos, 1], embedding[j_pos, 1]],
                     'gray', alpha=0.1, linewidth=0.5)

# Plot 2: Degree distribution
degrees = dict(G_similarity.degree())
degree_values = list(degrees.values())
axes[1].hist(degree_values, bins=30, color='steelblue', alpha=0.7, edgecolor='white')
axes[1].set_xlabel('Node Degree (Number of Similar Models)', fontsize=11)
axes[1].set_ylabel('Count', fontsize=11)
axes[1].set_title('Similarity Graph Degree Distribution', fontsize=13)
axes[1].grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.savefig('figures/config_similarity_embedding.png', dpi=300, bbox_inches='tight')
plt.show()

## 8. Find Nearest Neighbors

In [None]:
# Find nearest neighbors for a sample of models
def get_nearest_neighbors(model_id, k=5):
    """Return the k models closest to ``model_id`` by precomputed distance.

    Reads the module-level ``df_sample`` (for the ``modelId`` column) and
    ``distance_matrix`` (row/column k corresponds to row k of ``df_sample``).

    Args:
        model_id: Value to look up in ``df_sample['modelId']``. When it occurs
            more than once, the first occurrence is used.
        k: Maximum number of neighbors to return.

    Returns:
        ``None`` if ``model_id`` is not in ``df_sample``; otherwise a list of
        at most ``k`` dicts with keys ``'modelId'``, ``'distance'`` and
        ``'similarity'`` (``1 - distance``), ordered by increasing distance.
    """
    model_ids = df_sample['modelId'].to_numpy()
    matches = np.flatnonzero(model_ids == model_id)
    if matches.size == 0:
        return None
    model_pos = int(matches[0])

    # Distances from the query model to every model (itself included).
    distances = distance_matrix[model_pos, :]

    # Exclude the query model by position rather than assuming it sorts first:
    # another model with an identical config is also at distance 0 and may be
    # ordered ahead of it. The stable sort keeps ties in row order.
    order = np.argsort(distances, kind='stable')
    nearest_indices = order[order != model_pos][:k]

    return [
        {
            'modelId': model_ids[idx],
            'distance': distances[idx],
            'similarity': 1 - distances[idx],
        }
        for idx in nearest_indices
    ]

# Demo: take the first five model ids and print neighbors for the first three.
sample_models = df_sample['modelId'].head(5).tolist()

print("Nearest neighbors examples:\n")
for model_id in sample_models[:3]:
    neighbors = get_nearest_neighbors(model_id, k=5)
    if not neighbors:
        # None (unknown id) or an empty list: nothing to report.
        continue

    print(f"\nModel: {model_id}")
    print(f"  Nearest neighbors:")
    rank = 0
    for neighbor in neighbors:
        rank += 1
        print(f"    {rank}. {neighbor['modelId']} (similarity: {neighbor['similarity']:.3f}, distance: {neighbor['distance']:.3f})")

## 9. Compare Similarity Metrics

In [None]:
# Compare Gower distance with L2, L1 and cosine distance on a small sample.
COMPARE_SAMPLE = 100
df_compare = df_sample.head(COMPARE_SAMPLE).copy()
# df_sample may hold fewer than COMPARE_SAMPLE rows; report the real size.
n_compare = len(df_compare)

# L2/L1/cosine only use the numeric features; missing values become 0.
numeric_data = df_compare[numeric_features].copy()
numeric_data = numeric_data.fillna(0)

# Standardize so that no single feature dominates by scale.
scaler = StandardScaler()
numeric_scaled = scaler.fit_transform(numeric_data)

# Compute different distance metrics
from scipy.spatial.distance import pdist, squareform

l2_distances = squareform(pdist(numeric_scaled, metric='euclidean'))
l1_distances = squareform(pdist(numeric_scaled, metric='cityblock'))
cosine_distances = squareform(pdist(numeric_scaled, metric='cosine'))

# Compare with Gower (on same sample): the top-left corner of the full matrix.
gower_sample = distance_matrix[:COMPARE_SAMPLE, :COMPARE_SAMPLE]


def _scale_to_unit_max(dist_mat):
    """Divide by the matrix maximum, leaving an all-zero matrix unchanged."""
    peak = dist_mat.max()
    return dist_mat / (peak if peak != 0 else 1.0)


def _format_range(dist_mat):
    """Format '[smallest positive, max]'; 'n/a' when nothing is positive."""
    positive = dist_mat[dist_mat > 0]
    lowest = f"{positive.min():.3f}" if positive.size > 0 else "n/a"
    return f"[{lowest}, {dist_mat.max():.3f}]"


# Visualize comparison: one heatmap per metric. L2 and L1 are rescaled to a
# maximum of 1 for display; Gower and cosine are plotted as computed.
fig, axes = plt.subplots(2, 2, figsize=(14, 12))

metrics = [
    ('Gower Distance', gower_sample, 'viridis'),
    ('L2 (Euclidean)', _scale_to_unit_max(l2_distances), 'plasma'),
    ('L1 (Manhattan)', _scale_to_unit_max(l1_distances), 'magma'),
    ('Cosine Distance', cosine_distances, 'cividis'),
]

# axes.flat walks the 2x2 grid row by row, matching the order of `metrics`.
for ax, (name, dist_mat, cmap) in zip(axes.flat, metrics):
    im = ax.imshow(dist_mat, cmap=cmap, aspect='auto', interpolation='nearest')
    ax.set_title(f'{name} Matrix (n={n_compare})', fontsize=11)
    plt.colorbar(im, ax=ax)

plt.tight_layout()
plt.savefig('figures/similarity_metrics_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

# Ranges are reported on the raw (unscaled) matrices.
print("\nSimilarity metrics comparison:")
print(f"  Gower: range {_format_range(gower_sample)}")
print(f"  L2: range {_format_range(l2_distances)}")
print(f"  L1: range {_format_range(l1_distances)}")
print(f"  Cosine: range {_format_range(cosine_distances)}")

## 10. Summary Statistics

In [None]:
# Print a summary of the dataset, the similarity graph and the distance
# distribution, then persist the headline numbers to CSV.
print("="*80)
print("CONFIG SIMILARITY GRAPH SUMMARY")
print("="*80)
print(f"\nDataset:")
print(f"  Total models analyzed: {len(df_sample):,}")
print(f"  Features used: {len(available_features)}")
print(f"    - Numeric: {len(numeric_features)}")
print(f"    - Categorical: {len(categorical_features)}")
print(f"    - Boolean: {len(boolean_feature_list)}")

node_total = len(G_similarity.nodes)
edge_total = len(G_similarity.edges)

print(f"\nSimilarity Graph:")
print(f"  Nodes: {node_total:,}")
print(f"  Edges: {edge_total:,}")
print(f"  Similarity threshold: {SIMILARITY_THRESHOLD}")
# Average degree divides by the node count, so it needs the same non-empty
# guard as the clustering/density metrics below.
if node_total > 0:
    print(f"  Average degree: {2 * edge_total / node_total:.2f}")
print(f"  Connected components: {nx.number_connected_components(G_similarity)}")

if node_total > 0:
    print(f"  Average clustering: {nx.average_clustering(G_similarity):.3f}")
    print(f"  Graph density: {nx.density(G_similarity):.6f}")

# Distance statistics ignore zero entries (the diagonal and identical pairs).
# Select them once and reuse the result for every figure.
positive_distances = distance_matrix[distance_matrix > 0]
has_positive_distances = positive_distances.size > 0

print(f"\nDistance Statistics:")
if has_positive_distances:
    mean_distance = float(positive_distances.mean())
    median_distance = float(np.median(positive_distances))
    print(f"  Mean distance: {mean_distance:.3f}")
    print(f"  Median distance: {median_distance:.3f}")
    print(f"  Min distance: {positive_distances.min():.3f}")
    print(f"  Max distance: {distance_matrix.max():.3f}")
else:
    # No positive entry to summarise; .min() on the empty selection would raise.
    mean_distance = float('nan')
    median_distance = float('nan')
    print("  No non-zero distances available")

# Save summary
summary = {
    'n_models': len(df_sample),
    'n_features': len(available_features),
    'n_nodes': node_total,
    'n_edges': edge_total,
    'similarity_threshold': SIMILARITY_THRESHOLD,
    'mean_distance': mean_distance,
    'median_distance': median_distance,
}

pd.DataFrame([summary]).to_csv('config_similarity_summary.csv', index=False)
print("\n✓ Summary saved to config_similarity_summary.csv")