# Config Subgraph Analysis

**Goal**: Analyze architectural similarity within descendant clusters (subgraphs) and drift along lineage paths.

**Key Questions**:
1. How architecturally coherent are descendant clusters?
2. Do some families maintain architectural similarity better than others?
3. How does config drift accumulate along lineage paths?
4. Are architectures stable after the root, or do they drift continuously?

**Contents**:
- Subgraph similarity analysis (coherence within descendant clusters)
- Config drift by depth in lineage trees
- Cumulative drift along lineage paths
- Family-level coherence comparisons

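**Dependencies**: Uses a Gower-style distance function (redefined in Section 1 so the notebook is self-contained) and requires the directed family graph `G_family` to be available before the subgraph and drift cells run.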

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from scipy.spatial.distance import pdist, squareform
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
import warnings
warnings.filterwarnings('ignore')

# Plot defaults, kept in line with the main repo's figures
plt.rcParams.update({
    'figure.figsize': (12, 6),
    'font.size': 10,
})

In [None]:
# Read the expanded model-config table; low_memory=False makes pandas parse
# the file in one pass instead of chunk-by-chunk.
config_csv = 'data/model_configs_expanded.csv'
df = pd.read_csv(config_csv, low_memory=False)
print(f"Loaded {len(df):,} models with config.json")
print(f"Total columns: {len(df.columns)}")

## 1. Setup: Feature Preparation and Gower Distance

**Note**: This section redefines the Gower distance function for self-contained execution.

In [None]:
# Candidate config features, grouped by theme (same as notebook 08/09)
architecture_features = [
    'config_model_type',
    'config_hidden_size',
    'config_num_hidden_layers',
    'config_num_attention_heads',
    'config_intermediate_size',
]
capacity_features = [
    'config_vocab_size',
    'config_max_position_embeddings',
    'config_num_key_value_heads',
]
precision_features = [
    'config_torch_dtype',
    'config_rope_theta',
    'config_rope_scaling_type',
]
boolean_features = ['uses_moe', 'uses_gqa', 'uses_rope', 'uses_quantization']

all_features = [
    *architecture_features,
    *capacity_features,
    *precision_features,
    *boolean_features,
]
# Keep only the candidates that actually exist as columns in df
available_features = [name for name in all_features if name in df.columns]

# String spellings accepted as boolean-like when a column is not numeric
_BOOLEAN_TOKENS = {'true', 'false', '1', '0', 'yes', 'no', 'nan'}

# Buckets consumed by gower_distance: numeric / categorical / boolean
numeric_features, categorical_features, boolean_feature_list = [], [], []

for feat in available_features:
    # Type is inferred from the first 100 non-null values only
    observed = df[feat].dropna().head(100)
    if observed.empty:
        # Column with no data: left out of every bucket
        continue

    try:
        pd.to_numeric(observed, errors='raise')
    except (ValueError, TypeError):
        # Not numeric: boolean-like if it has at most two distinct values,
        # all of them recognised tokens; otherwise categorical.
        distinct = observed.unique()
        looks_boolean = (
            len(distinct) <= 2
            and {str(v).lower() for v in distinct} <= _BOOLEAN_TOKENS
        )
        bucket = boolean_feature_list if looks_boolean else categorical_features
    else:
        bucket = numeric_features
    bucket.append(feat)

print(f"Features prepared: {len(available_features)} total")

In [None]:
# Gower distance function
def _coerce_bool(value):
    """Interpret *value* as a boolean, understanding common string spellings.

    ``bool('False')`` is ``True`` in Python, so string-encoded flags must be
    mapped explicitly. Strings other than true/1/yes and false/0/no, and all
    non-string values, fall back to ordinary truthiness.
    """
    if isinstance(value, str):
        token = value.strip().lower()
        if token in ('true', '1', 'yes'):
            return True
        if token in ('false', '0', 'no'):
            return False
    return bool(value)


def gower_distance(x, y, numeric_cols, categorical_cols, boolean_cols):
    """Compute a Gower-style distance between two config vectors.

    Each feature present in both vectors and non-missing in both contributes
    a partial distance in [0, 1]; the result is the mean of those partials.

    Args:
        x, y: Label-indexed vectors (e.g. ``pandas.Series``) supporting
            ``col in v.index`` and ``v[col]``.
        numeric_cols: Columns compared as ``|x - y| / max(|x|, |y|)``. Note
            the scale is the larger magnitude of the *pair*, not the
            dataset-wide feature range of the textbook Gower coefficient.
        categorical_cols: Columns compared by string equality (0 or 1).
        boolean_cols: Columns compared as booleans (0 or 1); string
            spellings such as ``'True'``/``'false'`` are understood.

    Returns:
        Mean partial distance over the comparable features, or ``1.0``
        (maximal distance) when no feature could be compared.
    """
    total = 0.0
    compared = 0

    for col in numeric_cols:
        if col not in x.index or col not in y.index:
            continue
        x_val, y_val = x[col], y[col]
        if pd.isna(x_val) or pd.isna(y_val):
            continue
        try:
            x_num, y_num = float(x_val), float(y_val)
        except (ValueError, TypeError):
            # Value cannot be read as a number: skip this feature
            continue
        # inf or a parsed 'nan' string would turn the whole result into NaN
        if not (np.isfinite(x_num) and np.isfinite(y_num)):
            continue
        scale = max(abs(x_num), abs(y_num))
        if scale > 0:
            total += abs(x_num - y_num) / scale
        # Both zero: distance 0, but the feature still counts as compared
        compared += 1

    for col in categorical_cols:
        if col not in x.index or col not in y.index:
            continue
        x_val, y_val = x[col], y[col]
        if pd.isna(x_val) or pd.isna(y_val):
            continue
        if str(x_val) != str(y_val):
            total += 1.0
        compared += 1

    for col in boolean_cols:
        if col not in x.index or col not in y.index:
            continue
        x_val, y_val = x[col], y[col]
        if pd.isna(x_val) or pd.isna(y_val):
            continue
        if _coerce_bool(x_val) != _coerce_bool(y_val):
            total += 1.0
        compared += 1

    return total / compared if compared > 0 else 1.0

print("✓ Gower distance function defined")

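## 2. Load Family Graph

**Note**: The analysis cells below expect a directed NetworkX graph named `G_family` (set it to `None` to skip them). No cell in this section defines it, so it must be loaded before those cells are run.

## 3. Subgraph Similarity Analysis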

In [None]:
# Analyze config similarity within subgraphs (descendant clusters).
# For each selected root, every pair of config-bearing nodes in its
# descendant set is compared with gower_distance; similarity = 1 - distance.
if G_family is not None:
    print("Analyzing config similarity within descendant subgraphs...")

    # Ids that have a config row. A set gives O(1) membership tests instead
    # of rescanning df['modelId'].values for every graph node.
    model_ids_with_config = set(df['modelId'])

    # First 'family' value per model id; empty lookup if the column is absent
    if 'family' in df.columns:
        family_by_id = df.drop_duplicates('modelId').set_index('modelId')['family']
    else:
        family_by_id = pd.Series(dtype=object)

    # Find root nodes (nodes with no incoming edges)
    root_nodes = [n for n in G_family.nodes() if G_family.in_degree(n) == 0]
    print(f"Found {len(root_nodes):,} root nodes")

    # Count descendants for the first 20 roots in graph iteration order.
    # NOTE(review): these are not necessarily the 20 largest roots overall;
    # ranking every root would change which subgraphs are analysed and could
    # make the O(n^2) pairwise loop below very slow. Confirm the intent.
    root_descendant_counts = {
        root: len(nx.descendants(G_family, root))
        for root in root_nodes[:20]
    }

    # Ten candidates with the most descendants (also used by the drift cell)
    top_roots = sorted(root_descendant_counts.items(), key=lambda x: x[1], reverse=True)[:10]

    subgraph_stats = []

    for root_id, n_descendants in top_roots:
        if n_descendants < 5:  # Skip small subgraphs
            continue

        # Subgraph = root plus everything reachable from it, restricted to
        # nodes that have config data
        descendants = nx.descendants(G_family, root_id) | {root_id}
        subgraph_nodes = [n for n in descendants if n in model_ids_with_config]

        if len(subgraph_nodes) < 5:
            continue

        # Config vectors for the subgraph, one row per model
        subgraph_df = df[df['modelId'].isin(subgraph_nodes)][available_features]

        # Pull each row out once, rather than on every pair comparison
        config_rows = [row for _, row in subgraph_df.iterrows()]

        # Pairwise similarities over all unordered pairs
        subgraph_similarities = []
        for i, vec_i in enumerate(config_rows):
            for vec_j in config_rows[i + 1:]:
                dist = gower_distance(
                    vec_i, vec_j,
                    numeric_features,
                    categorical_features,
                    boolean_feature_list
                )
                subgraph_similarities.append(1 - dist)

        if not subgraph_similarities:
            continue

        subgraph_stats.append({
            'root_id': root_id,
            'n_nodes': len(subgraph_nodes),
            'n_descendants': n_descendants,
            'mean_similarity': np.mean(subgraph_similarities),
            'median_similarity': np.median(subgraph_similarities),
            # 'Unknown' when the root itself has no row in df
            'family': family_by_id.get(root_id, 'Unknown')
        })

    df_subgraph_stats = pd.DataFrame(subgraph_stats)

    if len(df_subgraph_stats) > 0:
        print(f"\n✓ Analyzed {len(df_subgraph_stats)} subgraphs")
        print("\nSubgraph similarity statistics:")
        print(df_subgraph_stats[['root_id', 'n_nodes', 'mean_similarity', 'family']].head(10).to_string(index=False))

        # Visualize
        fig, axes = plt.subplots(1, 2, figsize=(16, 6))

        # Scatter: subgraph size vs similarity
        axes[0].scatter(df_subgraph_stats['n_nodes'], df_subgraph_stats['mean_similarity'],
                        alpha=0.6, s=100, c='steelblue')
        axes[0].set_xlabel('Subgraph Size (Number of Nodes)', fontsize=11)
        axes[0].set_ylabel('Mean Config Similarity', fontsize=11)
        axes[0].set_title('Subgraph Size vs Config Similarity', fontsize=13)
        axes[0].grid(True, alpha=0.3)

        # Similarity by family (only drawn when more than one family is present)
        if 'family' in df_subgraph_stats.columns and df_subgraph_stats['family'].nunique() > 1:
            family_similarity = (
                df_subgraph_stats.groupby('family')['mean_similarity']
                .agg(['mean', 'count'])
                .sort_values('count', ascending=False)
                .head(10)
            )

            x_pos = np.arange(len(family_similarity))
            axes[1].bar(x_pos, family_similarity['mean'], color='coral', alpha=0.7)
            axes[1].set_xticks(x_pos)
            axes[1].set_xticklabels(family_similarity.index, rotation=45, ha='right')
            axes[1].set_ylabel('Mean Config Similarity', fontsize=11)
            axes[1].set_title('Mean Subgraph Similarity by Family', fontsize=13)
            axes[1].grid(True, alpha=0.3, axis='y')

            # Add count labels just above each bar
            for i, (family_mean, n_subgraphs) in enumerate(
                zip(family_similarity['mean'], family_similarity['count'])
            ):
                axes[1].text(i, family_mean + 0.01, f"n={int(n_subgraphs)}", ha='center', fontsize=8)

        plt.tight_layout()
        plt.savefig('figures/subgraph_similarity_analysis.png', dpi=300, bbox_inches='tight')
        plt.show()

        # Save subgraph stats
        df_subgraph_stats.to_csv('subgraph_similarity_stats.csv', index=False)
        print("\n✓ Subgraph stats saved to subgraph_similarity_stats.csv")
    else:
        print("No subgraph statistics computed")
else:
    print("Family graph not available - skipping subgraph analysis")

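## 4. Drift by Depth: Config Drift Curves

**Goal**: Measure cumulative config drift along lineage paths (root → ... → leaf), parallel to trait drift curves in the original paper. Drift at a node is the Gower distance between that node's config and the config of the root of its lineage.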

In [None]:
# Analyze config drift by depth in lineage trees.
# Drift of a node = gower_distance(root config, node config); it is recorded
# against the node's position on a shortest root-to-leaf path.
if G_family is not None:
    print("Analyzing config drift by depth...")

    # Find some deep lineages
    def get_lineage_paths(root, max_depth=10, max_paths=100):
        """Return shortest root-to-leaf paths in ``G_family``.

        Args:
            root: Node to start from.
            max_depth: Maximum number of *nodes* allowed in a returned path
                (paths with ``len(path) > max_depth`` are dropped).
            max_paths: Number of reachable leaves considered, in graph
                iteration order. The depth filter is applied afterwards, so
                fewer than ``max_paths`` paths may be returned.

        Returns:
            A list of node lists, each starting at ``root`` and ending at a
            node with no outgoing edges.
        """
        # One traversal from the root instead of a has_path query per leaf.
        # The root is included so an out-degree-0 root is still its own leaf.
        reachable = nx.descendants(G_family, root) | {root}
        leaves = [
            n for n in G_family.nodes()
            if n in reachable and G_family.out_degree(n) == 0
        ]

        paths = []
        for leaf in leaves[:max_paths]:
            try:
                path = nx.shortest_path(G_family, root, leaf)
            except (nx.NetworkXNoPath, nx.NodeNotFound):
                continue
            if len(path) <= max_depth:
                paths.append(path)

        return paths

    # Config row per model id (first occurrence), indexed for direct lookup
    # instead of filtering the whole DataFrame for every node on every path
    config_by_id = df.drop_duplicates('modelId').set_index('modelId')[available_features]

    # Analyze drift along paths for top roots
    depth_drift_data = []

    for root_id, _ in top_roots[:5]:  # Top 5 roots
        if root_id not in config_by_id.index:
            continue

        root_config = config_by_id.loc[root_id]

        for path in get_lineage_paths(root_id, max_depth=8, max_paths=50):
            # Number of nodes on this path that have config data
            n_with_config = sum(1 for n in path if n in config_by_id.index)

            if n_with_config < 2:
                continue

            # depth is the node's true position on the path (root = 0), so
            # ancestors without config data no longer shrink it.
            # NOTE(review): a node shared by several root-to-leaf paths is
            # recorded once per path, so nodes near the root carry more
            # weight in the per-depth statistics below.
            for depth, node_id in enumerate(path[1:], 1):
                if node_id not in config_by_id.index:
                    continue

                drift = gower_distance(
                    root_config, config_by_id.loc[node_id],
                    numeric_features,
                    categorical_features,
                    boolean_feature_list
                )

                depth_drift_data.append({
                    'root_id': root_id,
                    'node_id': node_id,
                    'depth': depth,
                    'cumulative_drift': drift,
                    # Count of config-bearing nodes on the path
                    'path_length': n_with_config
                })

    df_depth_drift = pd.DataFrame(depth_drift_data)

    if len(df_depth_drift) > 0:
        print(f"\n✓ Analyzed drift along {df_depth_drift['root_id'].nunique()} lineage trees")

        # Visualize drift by depth
        fig, axes = plt.subplots(1, 2, figsize=(16, 6))

        # Average drift by depth
        depth_stats = df_depth_drift.groupby('depth')['cumulative_drift'].agg(['mean', 'median', 'std', 'count']).reset_index()

        axes[0].plot(depth_stats['depth'], depth_stats['mean'], marker='o', linewidth=2, label='Mean', color='steelblue')
        # Shaded band: mean +/- one standard deviation
        axes[0].fill_between(depth_stats['depth'],
                             depth_stats['mean'] - depth_stats['std'],
                             depth_stats['mean'] + depth_stats['std'],
                             alpha=0.2, color='steelblue')
        axes[0].plot(depth_stats['depth'], depth_stats['median'], marker='s', linewidth=2, label='Median', color='coral', linestyle='--')
        axes[0].set_xlabel('Depth in Lineage Tree', fontsize=11)
        axes[0].set_ylabel('Cumulative Config Drift', fontsize=11)
        axes[0].set_title('Config Drift vs Depth in Lineage Trees', fontsize=13)
        axes[0].legend()
        axes[0].grid(True, alpha=0.3)

        # Distribution of drift at different depths
        depth_samples = [1, 3, 5, 7]
        available_depths = [d for d in depth_samples if d in df_depth_drift['depth'].values]

        if len(available_depths) > 0:
            drift_by_depth = [df_depth_drift[df_depth_drift['depth'] == d]['cumulative_drift'].values for d in available_depths]
            # Tick labels are set separately: boxplot's `labels` keyword is
            # deprecated in newer Matplotlib releases.
            axes[1].boxplot(drift_by_depth)
            axes[1].set_xticklabels([f'Depth {d}' for d in available_depths])
            axes[1].set_xlabel('Depth', fontsize=11)
            axes[1].set_ylabel('Cumulative Config Drift', fontsize=11)
            axes[1].set_title('Distribution of Config Drift at Different Depths', fontsize=13)
            axes[1].grid(True, alpha=0.3, axis='y')

        plt.tight_layout()
        plt.savefig('figures/drift_by_depth.png', dpi=300, bbox_inches='tight')
        plt.show()

        # Save depth drift data
        df_depth_drift.to_csv('config_drift_by_depth.csv', index=False)
        print("\n✓ Depth drift data saved to config_drift_by_depth.csv")
    else:
        print("No depth drift data computed")
else:
    print("Family graph not available - skipping depth analysis")