# Config Similarity Basics

**Goal**: Core similarity computation and graph construction for config-based architectural similarity.

**Key Questions**:
1. Which models are architecturally similar?
2. How do models cluster in architecture space?
3. What are the nearest neighbors in architecture space?
4. How do different similarity metrics compare?

**Contents**:
- Feature preparation and categorization
- Gower distance implementation (handles mixed data types)
- Similarity matrix computation
- Similarity graph construction
- Graph visualization and statistics
- Nearest neighbors analysis
- Similarity metrics comparison (Gower, L2, L1, Cosine)

**This is the foundational notebook** - other notebooks build on these utilities.

## Setup and Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
from sklearn.preprocessing import StandardScaler
from scipy.spatial.distance import pdist, squareform
import warnings
import os
# Suppress every warning category process-wide from this point on.
warnings.filterwarnings('ignore')

# Plot defaults (chosen to match the main repo's style).
plt.rcParams.update({
    'figure.figsize': (12, 6),
    'font.size': 10,
})

# Plots are saved under ./figures; exist_ok makes re-running this cell safe.
os.makedirs('figures', exist_ok=True)

## 1. Load Data

In [None]:
# Read the expanded config table (one row per model).
# low_memory=False makes pandas parse the file in one pass instead of in
# chunks, which avoids mixed-dtype inference within a column.
df = pd.read_csv('../data/model_configs_expanded.csv', low_memory=False)
n_rows, n_cols = df.shape
print(f"Loaded {n_rows:,} models with config.json")
print(f"Total columns: {n_cols}")

# The cells below look models up by modelId, so fail fast when it is absent.
if 'modelId' not in df:
    raise ValueError("modelId column not found in dataframe")

## 2. Prepare Config Features for Similarity Computation

In [None]:
# Candidate config columns, grouped by the role they play in an architecture.
# The weight tiers in the comments are descriptive only: the cells visible in
# this notebook average all features equally and never apply group weights.

# Core architecture (high weight)
architecture_features = [
    'config_model_type',
    'config_hidden_size',
    'config_num_hidden_layers',
    'config_num_attention_heads',
    'config_intermediate_size',
]

# Capacity (medium weight)
capacity_features = [
    'config_vocab_size',
    'config_max_position_embeddings',
    'config_num_key_value_heads',
]

# Precision / compute (lower weight)
precision_features = [
    'config_torch_dtype',
    'config_rope_theta',
    'config_rope_scaling_type',
]

# Boolean flags (low weight)
boolean_features = [
    'uses_moe',
    'uses_gqa',
    'uses_rope',
    'uses_quantization',
]

# Every candidate feature, in group order.
all_features = [
    *architecture_features,
    *capacity_features,
    *precision_features,
    *boolean_features,
]

# Keep only the candidates this dataframe actually provides (order preserved).
present_columns = set(df.columns)
available_features = [name for name in all_features if name in present_columns]
print(f"Available features for similarity: {len(available_features)}")
print(f"Features: {available_features}")

## 3. Categorize Features by Type

In [None]:
# Sort each available feature into a numeric / categorical / boolean bucket so
# gower_distance knows how to compare it. The decision is made from the first
# 100 non-null values of the column, not from the whole column.

# String spellings accepted as boolean-like (compared case-insensitively).
_BOOLEAN_TOKENS = {'true', 'false', '1', '0', 'yes', 'no', 'nan'}


def _infer_feature_kind(values):
    """Classify non-null sample values as 'numeric', 'boolean' or 'categorical'."""
    # NOTE(review): pd.to_numeric appears to accept real bool values, so columns
    # holding genuine booleans likely end up in the numeric bucket - confirm.
    try:
        pd.to_numeric(values, errors='raise')
    except (ValueError, TypeError):
        pass
    else:
        return 'numeric'

    distinct = values.unique()
    tokens = {str(v).lower() for v in distinct if pd.notna(v)}
    if len(distinct) <= 2 and tokens <= _BOOLEAN_TOKENS:
        return 'boolean'
    return 'categorical'


numeric_features = []
categorical_features = []
boolean_feature_list = []
_bucket_for_kind = {
    'numeric': numeric_features,
    'categorical': categorical_features,
    'boolean': boolean_feature_list,
}

for feat in available_features:
    if feat not in df.columns:
        continue

    sample_values = df[feat].dropna().head(100)
    if sample_values.empty:
        # Nothing to inspect: the feature is left out of every bucket.
        continue

    _bucket_for_kind[_infer_feature_kind(sample_values)].append(feat)

print(f"Numeric features ({len(numeric_features)}): {numeric_features}")
print(f"\nCategorical features ({len(categorical_features)}): {categorical_features}")
print(f"\nBoolean features ({len(boolean_feature_list)}): {boolean_feature_list}")

## 4. Implement Gower Distance Function

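Gower distance handles mixed data types (numeric, categorical, boolean) and is well suited to config.json similarity. Note that the implementation below scales each numeric difference by the larger magnitude of the two values being compared, rather than by the feature's range over the whole dataset as in the classic Gower definition.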

In [None]:
# String spellings recognised when normalising boolean-like config values.
_TRUE_TOKENS = frozenset({'true', '1', 'yes'})
_FALSE_TOKENS = frozenset({'false', '0', 'no'})


def _as_bool(value):
    """Interpret a boolean-like value, parsing common string spellings."""
    if isinstance(value, str):
        token = value.strip().lower()
        if token in _TRUE_TOKENS:
            return True
        if token in _FALSE_TOKENS:
            return False
    # Real bools, numbers and unrecognised strings use plain truthiness.
    return bool(value)


def _present_pairs(x, y, cols):
    """Yield (x[col], y[col]) for columns present in both vectors and non-missing in both."""
    for col in cols:
        if col not in x.index or col not in y.index:
            continue
        x_val, y_val = x[col], y[col]
        if pd.isna(x_val) or pd.isna(y_val):
            continue
        yield x_val, y_val


def gower_distance(x, y, numeric_cols, categorical_cols, boolean_cols):
    """
    Compute a Gower-style distance between two config vectors.

    Parameters:
    -----------
    x, y : pandas Series
        Config vectors to compare
    numeric_cols : list
        List of numeric feature column names
    categorical_cols : list
        List of categorical feature column names
    boolean_cols : list
        List of boolean feature column names

    Returns:
    --------
    float
        Mean per-feature distance over the features that could be compared
        (0 = identical, 1 = completely different). Returns 1.0 when no
        feature could be compared at all.

    Notes:
    ------
    - Numeric: |x - y| / max(|x|, |y|), capped at 1 so that values of opposite
      sign cannot push the result above the documented maximum. The scaling is
      per pair, not by the feature's range over the dataset.
    - Categorical: 0 if the string forms are equal, 1 otherwise
    - Boolean: 0 if same, 1 if different; string spellings such as
      'True'/'false'/'yes'/'0' are parsed rather than tested for truthiness
      (every non-empty string is truthy, so 'False' would otherwise be True)
    - Missing values, columns absent from either vector, and numeric values
      that cannot be converted to float are ignored in that dimension
    """
    total = 0.0
    compared = 0

    # Numeric features: pairwise-normalized absolute difference
    for x_val, y_val in _present_pairs(x, y, numeric_cols):
        try:
            x_num = float(x_val)
            y_num = float(y_val)
        except (ValueError, TypeError):
            # Not numeric after all (classification is sample-based); skip it.
            continue
        scale = max(abs(x_num), abs(y_num))
        if scale > 0:
            total += min(1.0, abs(x_num - y_num) / scale)
        # scale == 0 means both values are 0: contributes a distance of 0
        compared += 1

    # Categorical features: exact match on the string representation
    for x_val, y_val in _present_pairs(x, y, categorical_cols):
        if str(x_val) != str(y_val):
            total += 1.0
        compared += 1

    # Boolean features: compare the normalized truth values
    for x_val, y_val in _present_pairs(x, y, boolean_cols):
        if _as_bool(x_val) != _as_bool(y_val):
            total += 1.0
        compared += 1

    if compared == 0:
        return 1.0  # No common features = maximum distance
    return total / compared

print("✓ Gower distance function defined")

## 5. Sample Data for MVP

For computational efficiency, we sample a subset of models. Increase SAMPLE_SIZE for more comprehensive analysis.

In [None]:
# Keep models that define at least one of the key architecture fields.
key_features = ['config_hidden_size', 'config_num_hidden_layers', 'config_model_type']
# Only test the key columns that exist, so one absent column does not abort
# the cell with a KeyError.
present_key_features = [col for col in key_features if col in df.columns]
if not present_key_features:
    raise ValueError(f"None of the key features {key_features} found in dataframe")
df_complete = df[df[present_key_features].notna().any(axis=1)].copy()

# Sample for MVP (can increase later)
SAMPLE_SIZE = 1000  # Start with 1000 models for MVP

if len(df_complete) > SAMPLE_SIZE:
    # Stratified sample by family if available
    if 'family' in df_complete.columns and df_complete['family'].notna().sum() > 0:
        # groupby drops rows whose key is NaN, which would silently exclude
        # every model without a family label. Group on a filled copy of the
        # column so those models form their own stratum instead.
        strata = df_complete['family'].fillna('__no_family__')
        per_stratum = max(1, SAMPLE_SIZE // strata.nunique())
        df_sample = df_complete.groupby(strata, group_keys=False).apply(
            lambda grp: grp.sample(min(len(grp), per_stratum), random_state=42)
        ).head(SAMPLE_SIZE)  # only truncates when there are more strata than SAMPLE_SIZE
    else:
        df_sample = df_complete.sample(n=SAMPLE_SIZE, random_state=42)
else:
    df_sample = df_complete.copy()

# Reset index so positional (iloc) and label indexing agree in later cells
df_sample = df_sample.reset_index(drop=True)

print(f"Sampled {len(df_sample):,} models for similarity computation")
print(f"This will compute {len(df_sample) * (len(df_sample) - 1) // 2:,} pairwise distances")

# Verify we have the required columns
if 'modelId' not in df_sample.columns:
    raise ValueError("modelId column not found in sampled dataframe")

## 6. Compute Similarity Matrix

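Compute pairwise Gower distances between all sampled models. Note that, despite its name, `similarity_matrix` stores *distances* (0 = identical configs); the corresponding similarity is `1 - distance`.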

In [None]:
# Compute the pairwise Gower distance matrix.
# Despite its name, similarity_matrix holds distances (0 = identical).
print("Computing similarity matrix...")
print(f"Processing {len(df_sample):,} models...")

n_models = len(df_sample)
similarity_matrix = np.zeros((n_models, n_models))

# Extract every model's feature vector once; re-slicing the dataframe inside
# the O(n^2) pair loop repeats the same work for every pair.
feature_rows = [df_sample.iloc[i][available_features] for i in range(n_models)]

# Compute pairwise distances (upper triangle only, mirrored into the lower one)
for i in range(n_models):
    if i % 100 == 0:
        print(f"  Processed {i}/{n_models} models...")

    vec_i = feature_rows[i]
    for j in range(i + 1, n_models):
        distance = gower_distance(
            vec_i, feature_rows[j],
            numeric_features,
            categorical_features,
            boolean_feature_list
        )

        # Store in symmetric matrix
        similarity_matrix[i, j] = distance
        similarity_matrix[j, i] = distance

# Diagonal is 0 (distance of a model to itself)
np.fill_diagonal(similarity_matrix, 0.0)

# Summarize over distinct pairs (strict upper triangle). Masking with "> 0"
# would also drop pairs of different models with identical configs, and would
# raise on an empty selection.
pairwise_distances = similarity_matrix[np.triu_indices(n_models, k=1)]

print(f"\n✓ Computed similarity matrix: {n_models}x{n_models}")
if pairwise_distances.size > 0:
    print(f"  Mean distance: {pairwise_distances.mean():.3f}")
    print(f"  Median distance: {np.median(pairwise_distances):.3f}")
    print(f"  Min distance: {pairwise_distances.min():.3f}")
    print(f"  Max distance: {pairwise_distances.max():.3f}")
else:
    print("  Not enough models to compute pairwise statistics")

## 7. Build Similarity Graph

Construct a graph where edges connect architecturally similar models.

In [None]:
# Build similarity graph
# Each model proposes edges to its K nearest neighbours; a proposed edge is
# kept only if its distance is also at or below the global threshold.

# Threshold: median of the non-zero distances
distance_threshold = np.median(similarity_matrix[similarity_matrix > 0])
K_NEAREST = 5  # Each model connects to (at most) its 5 most similar neighbors

print(f"Building similarity graph...")
print(f"  Distance threshold: {distance_threshold:.3f}")
print(f"  K-nearest neighbors: {K_NEAREST}")

G_similarity = nx.Graph()

# Add nodes, carrying every dataframe column along as a node attribute
for _, row in df_sample.iterrows():
    G_similarity.add_node(row['modelId'], **row.to_dict())

# Add edges based on similarity
model_ids = df_sample['modelId'].tolist()
edges_added = 0  # counts add_edge calls; a mutual pair is counted twice
for i, model_i in enumerate(model_ids):
    # Distances from model i to every model (including itself)
    distances = similarity_matrix[i, :]

    # Exclude self by index instead of assuming it sorts first: models with
    # identical configs are also at distance 0, so self is not guaranteed to
    # be position 0 of the argsort. Slicing [1:K+1] could then keep self
    # (a self-loop) and drop a genuine neighbour.
    order = np.argsort(distances, kind='stable')
    nearest_indices = order[order != i][:K_NEAREST]

    for j in nearest_indices:
        model_j = model_ids[j]
        distance = distances[j]

        # Add edge if below threshold
        if distance <= distance_threshold:
            G_similarity.add_edge(model_i, model_j, weight=1-distance, distance=distance)
            edges_added += 1

print(f"\n✓ Built similarity graph:")
print(f"  Nodes: {len(G_similarity.nodes):,}")
print(f"  Edges: {len(G_similarity.edges):,}")
if len(G_similarity.nodes) > 0:
    print(f"  Average degree: {2*len(G_similarity.edges)/len(G_similarity.nodes):.2f}")
    print(f"  Connected components: {nx.number_connected_components(G_similarity)}")

## 8. Visualize Similarity Clusters

Visualize the similarity graph and identify architectural clusters.

In [None]:
# Visualize the similarity graph: the full graph on the left and its largest
# connected component on the right.
if len(G_similarity.nodes) > 0:
    print("Computing graph layout...")

    # Use spring layout for visualization (seeded so the figure is reproducible)
    pos = nx.spring_layout(G_similarity, k=1, iterations=50, seed=42)

    # Get node colors by family if available
    if 'family' in df_sample.columns and df_sample['family'].notna().sum() > 0:
        families = df_sample.set_index('modelId')['family'].to_dict()
        unique_families = df_sample['family'].dropna().unique()
        family_colors = plt.cm.tab20(np.linspace(0, 1, len(unique_families)))
        family_color_map = dict(zip(unique_families, family_colors))
        # Nodes whose family is missing or unmapped fall back to gray
        node_colors = [family_color_map.get(families.get(node, 'Unknown'), 'gray') for node in G_similarity.nodes()]
    else:
        node_colors = 'steelblue'

    fig, axes = plt.subplots(1, 2, figsize=(20, 8))

    # Left: Full graph
    nx.draw(G_similarity, pos, ax=axes[0],
            node_color=node_colors,
            node_size=30,
            edge_color='gray',
            alpha=0.6,
            with_labels=False,
            width=0.5)
    axes[0].set_title(f'Similarity Graph ({len(G_similarity.nodes):,} nodes, {len(G_similarity.edges):,} edges)', fontsize=13)

    # Right: Largest connected component
    if nx.number_connected_components(G_similarity) > 0:
        largest_cc = max(nx.connected_components(G_similarity), key=len)
        G_largest = G_similarity.subgraph(largest_cc).copy()

        if len(G_largest.nodes) > 0:
            pos_largest = nx.spring_layout(G_largest, k=1, iterations=50, seed=42)
            if isinstance(node_colors, list):
                # Map node -> color once; list(...).index(node) per node would
                # rebuild and scan the full node list every time (quadratic).
                color_by_node = dict(zip(G_similarity.nodes(), node_colors))
                node_colors_largest = [color_by_node[node] for node in G_largest.nodes]
            else:
                node_colors_largest = node_colors

            nx.draw(G_largest, pos_largest, ax=axes[1],
                   node_color=node_colors_largest,
                   node_size=50,
                   edge_color='gray',
                   alpha=0.7,
                   with_labels=False,
                   width=0.8)
            axes[1].set_title(f'Largest Connected Component ({len(G_largest.nodes):,} nodes)', fontsize=13)

    plt.tight_layout()
    plt.savefig('figures/similarity_graph.png', dpi=300, bbox_inches='tight')
    plt.show()

    # Graph statistics (the graph is known to be non-empty in this branch)
    print(f"\nGraph Statistics:")
    print(f"  Number of nodes: {len(G_similarity.nodes):,}")
    print(f"  Number of edges: {len(G_similarity.edges):,}")
    print(f"  Number of connected components: {nx.number_connected_components(G_similarity)}")
    print(f"  Average clustering coefficient: {nx.average_clustering(G_similarity):.3f}")
    print(f"  Density: {nx.density(G_similarity):.4f}")
else:
    print("No graph to visualize")

## 9. Find Nearest Neighbors

For each model, find its architecturally most similar neighbors.

In [None]:
# Find the nearest neighbors of the first few models in the sample
print("Finding nearest neighbors...")

N_NEIGHBORS = 5  # neighbors recorded per model
nearest_neighbors = []
n_show = min(20, len(df_sample))  # Only the first 20 models are processed

for i in range(n_show):
    model_id = df_sample.iloc[i]['modelId']
    distances = similarity_matrix[i, :]

    # Drop self by index rather than assuming it sorts first: other models
    # with an identical config are also at distance 0, so slicing [1:6] could
    # report the model as its own neighbour and lose a real one.
    order = np.argsort(distances, kind='stable')
    nearest_indices = order[order != i][:N_NEIGHBORS]

    neighbors_info = [
        {
            'neighbor': df_sample.iloc[j]['modelId'],
            'distance': distances[j],
            'similarity': 1 - distances[j],
        }
        for j in nearest_indices
    ]

    nearest_neighbors.append({
        'model': model_id,
        'neighbors': neighbors_info
    })

# Display sample: top 3 neighbors for the first 5 models
print(f"\nSample nearest neighbors (showing first 5 models):")
for i, nn_info in enumerate(nearest_neighbors[:5]):
    print(f"\n{i+1}. {nn_info['model']}")
    for j, neighbor in enumerate(nn_info['neighbors'][:3]):
        print(f"   {j+1}. {neighbor['neighbor']} (similarity: {neighbor['similarity']:.3f}, distance: {neighbor['distance']:.3f})")

## 10. Compare Similarity Metrics

Compare Gower distance with other similarity metrics (L2, L1, Cosine).

In [None]:
# Compare Gower distance with L2 / L1 / cosine distance on a few model pairs
print("Comparing similarity metrics...")

# Pairs of consecutive sampled models: (0, 1), (1, 2), ...
n_pairs = min(10, len(df_sample) // 2)
comparison_pairs = [(i, i+1) for i in range(n_pairs)]

# Standardize every numeric feature with statistics taken over the whole
# sample (population std; zero-variance columns keep a scale of 1).
# Fitting a scaler on one vector reshaped to a single row, as was done before,
# subtracts each value from itself: every scaled vector is all zeros, so
# L2 = L1 = 0 and the cosine distance falls back to 1 for every pair.
numeric_frame = df_sample[numeric_features].apply(pd.to_numeric, errors='coerce').astype(float)
column_means = numeric_frame.mean()
column_stds = numeric_frame.std(ddof=0).replace(0, 1.0)
scaled_numeric = (numeric_frame - column_means) / column_stds

metrics_comparison = []

for i, j in comparison_pairs:
    vec_i = df_sample.iloc[i][available_features]
    vec_j = df_sample.iloc[j][available_features]

    # L2/L1/cosine only use numeric features present (non-missing) in both models
    scaled_i = scaled_numeric.iloc[i]
    scaled_j = scaled_numeric.iloc[j]
    shared = scaled_i.notna() & scaled_j.notna()

    if shared.any():
        vec_i_scaled = scaled_i[shared].to_numpy()
        vec_j_scaled = scaled_j[shared].to_numpy()

        # Compute metrics (Gower uses all feature types, the others numeric only)
        gower_dist = gower_distance(vec_i, vec_j, numeric_features, categorical_features, boolean_feature_list)
        l2_dist = np.linalg.norm(vec_i_scaled - vec_j_scaled)
        l1_dist = np.sum(np.abs(vec_i_scaled - vec_j_scaled))

        # Cosine distance = 1 - cosine similarity; undefined for a zero vector,
        # in which case the maximum-dissimilarity value 1.0 is used
        norm_i = np.linalg.norm(vec_i_scaled)
        norm_j = np.linalg.norm(vec_j_scaled)
        if norm_i > 0 and norm_j > 0:
            cosine_sim = np.dot(vec_i_scaled, vec_j_scaled) / (norm_i * norm_j)
            cosine_dist = 1 - cosine_sim
        else:
            cosine_dist = 1.0

        metrics_comparison.append({
            'pair': f"{df_sample.iloc[i]['modelId']} vs {df_sample.iloc[j]['modelId']}",
            'gower': gower_dist,
            'l2': l2_dist,
            'l1': l1_dist,
            'cosine': cosine_dist
        })

if len(metrics_comparison) > 0:
    df_metrics = pd.DataFrame(metrics_comparison)
    print(f"\n✓ Compared {len(df_metrics)} pairs")
    print(f"\nMetric comparison (mean distances):")
    print(f"  Gower: {df_metrics['gower'].mean():.3f}")
    print(f"  L2 (Euclidean): {df_metrics['l2'].mean():.3f}")
    print(f"  L1 (Manhattan): {df_metrics['l1'].mean():.3f}")
    print(f"  Cosine distance: {df_metrics['cosine'].mean():.3f}")

    # Visualize comparison: one group of four bars per pair
    fig, ax = plt.subplots(figsize=(12, 6))
    x_pos = np.arange(len(df_metrics))
    width = 0.2

    ax.bar(x_pos - 1.5*width, df_metrics['gower'], width, label='Gower', alpha=0.7)
    ax.bar(x_pos - 0.5*width, df_metrics['l2'], width, label='L2', alpha=0.7)
    ax.bar(x_pos + 0.5*width, df_metrics['l1'], width, label='L1', alpha=0.7)
    ax.bar(x_pos + 1.5*width, df_metrics['cosine'], width, label='Cosine', alpha=0.7)

    ax.set_xlabel('Model Pair', fontsize=11)
    ax.set_ylabel('Distance', fontsize=11)
    ax.set_title('Similarity Metric Comparison', fontsize=13)
    ax.set_xticks(x_pos)
    ax.set_xticklabels([f"Pair {i+1}" for i in range(len(df_metrics))], rotation=45, ha='right')
    ax.legend()
    ax.grid(True, alpha=0.3, axis='y')

    plt.tight_layout()
    plt.savefig('figures/similarity_metrics_comparison.png', dpi=300, bbox_inches='tight')
    plt.show()
else:
    print("No valid pairs for metric comparison")

## 11. Summary Statistics

Summary of similarity analysis results.

In [None]:
# Summary statistics for the dataset, the distance matrix and the graph
print("="*60)
print("CONFIG SIMILARITY ANALYSIS SUMMARY")
print("="*60)
print(f"\nDataset:")
print(f"  Total models analyzed: {len(df_sample):,}")
print(f"  Features used: {len(available_features)}")
print(f"    - Numeric: {len(numeric_features)}")
print(f"    - Categorical: {len(categorical_features)}")
print(f"    - Boolean: {len(boolean_feature_list)}")

# Summarize over distinct model pairs (strict upper triangle). Masking with
# "> 0" would also drop pairs of different models with identical configs, and
# would raise on an empty selection.
pairwise_distances = similarity_matrix[np.triu_indices(similarity_matrix.shape[0], k=1)]

print(f"\nSimilarity Matrix:")
print(f"  Size: {similarity_matrix.shape[0]}x{similarity_matrix.shape[1]}")
if pairwise_distances.size > 0:
    print(f"  Mean distance: {pairwise_distances.mean():.3f}")
    print(f"  Median distance: {np.median(pairwise_distances):.3f}")
    print(f"  Min distance: {pairwise_distances.min():.3f}")
    print(f"  Max distance: {pairwise_distances.max():.3f}")
else:
    print("  Not enough models to compute pairwise statistics")

print(f"\nSimilarity Graph:")
print(f"  Nodes: {len(G_similarity.nodes):,}")
print(f"  Edges: {len(G_similarity.edges):,}")
print(f"  Connected components: {nx.number_connected_components(G_similarity)}")
if len(G_similarity.nodes) > 0:
    # These ratios are only computed for a non-empty graph
    print(f"  Average degree: {2*len(G_similarity.edges)/len(G_similarity.nodes):.2f}")
    print(f"  Density: {nx.density(G_similarity):.4f}")
    print(f"  Average clustering: {nx.average_clustering(G_similarity):.3f}")

print("\n" + "="*60)

## 12. Optional: Config Drift Analysis

**Goal**: Measure how architecture drifts along family tree edges, replicating the trait drift analysis from the AI Ecosystem paper.

**Key Questions**:
- Do fine-tunes preserve architecture?
- Which families mutate architecture the most?
- What is the distribution of config drift within vs between families?

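**Note**: This section prefers a pickled family graph. If no graph file is found, it falls back to the parent columns in the dataframe; if neither source yields any parent-child pairs, the drift analysis is skipped.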

In [None]:
# Collect (parent, child) model pairs, preferring a pickled family graph and
# falling back to the parent columns of the dataframe.
import ast
import pickle
import random

# Try to load the family graph (first existing path wins)
G_family = None
try:
    graph_paths = [
        '../data/ai_ecosystem_graph_finetune_fulljson.pkl',
        '../data/ai_ecosystem_graph_nomerges.pkl',
        '../data/ai_ecosystem_graph.pkl'
    ]
    for path in graph_paths:
        if os.path.exists(path):
            # SECURITY: unpickling can execute arbitrary code - only point
            # these paths at files from a trusted source.
            with open(path, 'rb') as f:
                G_family = pickle.load(f)
            print(f"✓ Loaded family graph from {path}")
            print(f"  Nodes: {len(G_family.nodes):,}")
            print(f"  Edges: {len(G_family.edges):,}")
            break
except Exception as e:
    # Best effort: any load failure falls through to the dataframe fallback
    print(f"Could not load graph: {e}")
    print("Will compute drift from parent_model columns in dataframe")

# Build the id lookup once; testing "x in df['modelId'].values" scans the
# whole column for every edge.
known_model_ids = set(df['modelId'])

if G_family is not None:
    # Keep only edges whose two endpoints both have a config row
    parent_child_pairs = [
        (parent, child)
        for parent, child in G_family.edges()
        if parent in known_model_ids and child in known_model_ids
    ]

    print(f"Found {len(parent_child_pairs):,} parent-child pairs in graph")
else:
    # Fallback: use parent_model columns from dataframe
    print("Using parent_model columns from dataframe")
    parent_child_pairs = []

    # Check for parent columns
    parent_cols = ['parent_model', 'finetune_parent', 'quantized_parent', 'adapter_parent', 'merge_parent']
    available_parent_cols = [col for col in parent_cols if col in df.columns]

    if len(available_parent_cols) > 0:
        for _, row in df.iterrows():
            model_id = row['modelId']
            for col in available_parent_cols:
                raw_parents = row[col]
                if isinstance(raw_parents, str):
                    # Cells are expected to hold a Python list literal.
                    # literal_eval parses literals only, unlike eval(), which
                    # would execute any expression stored in the CSV.
                    try:
                        parents = ast.literal_eval(raw_parents)
                    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
                        continue  # not a parseable literal; ignore this cell
                else:
                    parents = raw_parents
                # Missing values (NaN) and anything else that is not a list are ignored
                if not isinstance(parents, list):
                    continue
                for parent in parents:
                    try:
                        is_known = parent in known_model_ids
                    except TypeError:
                        continue  # unhashable entry cannot be a model id
                    if is_known:
                        parent_child_pairs.append((parent, model_id))

        print(f"Found {len(parent_child_pairs):,} parent-child pairs from dataframe columns")
    else:
        print("⚠ No parent columns found in dataframe")

# Sample pairs for analysis (if too many); seeded for reproducibility
MAX_PAIRS = 5000
if len(parent_child_pairs) > MAX_PAIRS:
    random.seed(42)
    parent_child_pairs = random.sample(parent_child_pairs, MAX_PAIRS)
    print(f"Sampled {len(parent_child_pairs):,} pairs for analysis")

print(f"\nTotal parent-child pairs to analyze: {len(parent_child_pairs):,}")

In [None]:
# Compute config drift (Gower distance) for each parent-child pair.
# df_drift is only defined when there is at least one pair to process.
if len(parent_child_pairs) > 0:
    print("Computing config drift for parent-child pairs...")

    # Index the configs by modelId once (first row per id, matching the
    # previous "first match" semantics) instead of running two boolean scans
    # over the whole dataframe for every pair.
    config_by_id = df.drop_duplicates(subset='modelId', keep='first').set_index('modelId')

    drift_data = []
    for i, (parent_id, child_id) in enumerate(parent_child_pairs):
        if i % 500 == 0 and i > 0:
            print(f"  Processed {i}/{len(parent_child_pairs)} pairs...")

        # Skip pairs where either model has no config row
        if parent_id not in config_by_id.index or child_id not in config_by_id.index:
            continue

        parent_record = config_by_id.loc[parent_id]
        child_record = config_by_id.loc[child_id]

        # Compute Gower distance (drift)
        drift = gower_distance(
            parent_record[available_features],
            child_record[available_features],
            numeric_features,
            categorical_features,
            boolean_feature_list
        )

        # Get family info if available ('Unknown' only when the column is absent)
        parent_family = parent_record.get('family', 'Unknown')
        child_family = child_record.get('family', 'Unknown')
        same_family = parent_family == child_family and parent_family != 'Unknown'

        drift_data.append({
            'parent_id': parent_id,
            'child_id': child_id,
            'drift': drift,
            'similarity': 1 - drift,
            'parent_family': parent_family,
            'child_family': child_family,
            'same_family': same_family
        })

    df_drift = pd.DataFrame(drift_data)
    print(f"\n✓ Computed drift for {len(df_drift):,} parent-child pairs")
    if len(df_drift) > 0:
        print(f"  Mean drift: {df_drift['drift'].mean():.3f}")
        print(f"  Median drift: {df_drift['drift'].median():.3f}")
    else:
        print("  ⚠ No drift data computed - check parent-child pairs extraction")
else:
    print("⚠ No parent-child pairs found - skipping drift analysis")

In [None]:
# Plot four views of the drift data (overall histogram, within- vs
# between-family, per-family bars, cumulative distribution), then save the
# figure and the pair table. Skipped when no drift data was computed.
if 'df_drift' in locals() and len(df_drift) > 0:
    fig, axes = plt.subplots(2, 2, figsize=(16, 12))
    (ax_overall, ax_families), (ax_top, ax_cdf) = axes

    drift_values = df_drift['drift']
    median_drift = drift_values.median()

    # 1. Overall drift distribution, with the median marked
    ax_overall.hist(drift_values, bins=50, color='steelblue', alpha=0.7, edgecolor='white')
    ax_overall.axvline(median_drift, color='red', linestyle='--', linewidth=2, label=f'Median: {median_drift:.3f}')
    ax_overall.set_xlabel('Config Drift (Gower Distance)', fontsize=11)
    ax_overall.set_ylabel('Count', fontsize=11)
    ax_overall.set_title('Distribution of Config Drift Along Parent-Child Edges', fontsize=13)
    ax_overall.legend()
    ax_overall.grid(True, alpha=0.3, axis='y')

    # 2. Within-family vs between-family drift (only drawn when at least one
    #    pair stays inside a family)
    if df_drift['same_family'].sum() > 0:
        stays_in_family = df_drift['same_family'].eq(True)
        leaves_family = df_drift['same_family'].eq(False)
        within_family = df_drift[stays_in_family]['drift']
        between_family = df_drift[leaves_family]['drift']

        ax_families.hist(
            [within_family, between_family],
            bins=30,
            label=['Within Family', 'Between Families'],
            alpha=0.7,
            color=['seagreen', 'coral'],
            edgecolor='white',
        )
        ax_families.set_xlabel('Config Drift', fontsize=11)
        ax_families.set_ylabel('Count', fontsize=11)
        ax_families.set_title('Config Drift: Within vs Between Families', fontsize=13)
        ax_families.legend()
        ax_families.grid(True, alpha=0.3, axis='y')

        print(f"\nWithin-family drift: mean={within_family.mean():.3f}, median={within_family.median():.3f}")
        print(f"Between-family drift: mean={between_family.mean():.3f}, median={between_family.median():.3f}")

    # 3. Mean and median drift for the 10 parent families with the most pairs
    if 'parent_family' in df_drift.columns:
        family_drift = df_drift.groupby('parent_family')['drift'].agg(['mean', 'median', 'count']).sort_values('count', ascending=False)
        top_families = family_drift.head(10)

        if len(top_families) > 0:
            x_pos = np.arange(len(top_families))
            width = 0.35
            half = width / 2
            ax_top.bar(x_pos - half, top_families['mean'], width, label='Mean', color='steelblue', alpha=0.7)
            ax_top.bar(x_pos + half, top_families['median'], width, label='Median', color='coral', alpha=0.7)
            ax_top.set_xticks(x_pos)
            ax_top.set_xticklabels(top_families.index, rotation=45, ha='right')
            ax_top.set_ylabel('Config Drift', fontsize=11)
            ax_top.set_title('Config Drift by Family (Top 10)', fontsize=13)
            ax_top.legend()
            ax_top.grid(True, alpha=0.3, axis='y')

    # 4. Empirical cumulative distribution of drift, with the median marked
    sorted_drift = np.sort(drift_values)
    n_pairs_plotted = len(sorted_drift)
    cumulative = np.arange(1, n_pairs_plotted + 1) / n_pairs_plotted
    ax_cdf.plot(sorted_drift, cumulative, linewidth=2, color='purple')
    ax_cdf.axvline(median_drift, color='red', linestyle='--', linewidth=2, alpha=0.7)
    ax_cdf.set_xlabel('Config Drift', fontsize=11)
    ax_cdf.set_ylabel('Cumulative Fraction', fontsize=11)
    ax_cdf.set_title('Cumulative Distribution of Config Drift', fontsize=13)
    ax_cdf.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.savefig('figures/config_drift_analysis.png', dpi=300, bbox_inches='tight')
    plt.show()

    # Persist the per-pair drift table next to the notebook
    df_drift.to_csv('config_drift_pairs.csv', index=False)
    print("\n✓ Drift data saved to config_drift_pairs.csv")
else:
    print("No drift data to visualize")