# Config Drift Analysis

**Goal**: Analyze how architecture drifts along parent-child relationships in the model ecosystem.

**Key Questions**:
1. How much does architecture change between parent and child models?
2. Do fine-tunes preserve architecture more than other relationship types?
3. Which config fields mutate most frequently?
4. Are some families more architecturally stable than others?

**Contents**:
- Config drift computation for parent-child pairs
- Within-family vs between-family drift comparison
- Mutational landscape analysis (mutation rates per config field)
- Family-specific mutation profiles

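**Dependencies**: Uses a Gower-style distance function for mixed numeric/categorical/boolean config fields. The function is also defined in notebook 08; it is redefined in Section 1 below so that this notebook can run on its own.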

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from scipy.spatial.distance import pdist, squareform
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
import warnings
warnings.filterwarnings('ignore')

# Plot defaults, kept in line with the main repo's figure style
plt.rcParams.update({
    'figure.figsize': (12, 6),
    'font.size': 10,
})

In [None]:
# Read the expanded per-model config table (one row per model) and report its size
df = pd.read_csv('data/model_configs_expanded.csv', low_memory=False)
print(f"Loaded {df.shape[0]:,} models with config.json")
print(f"Total columns: {df.shape[1]}")

## 1. Setup: Feature Preparation and Gower Distance

**Note**: This section redefines the Gower distance function and feature preparation for self-contained execution. If you've already run notebook 08, you can skip this and import those utilities instead.

In [None]:
# Define feature groups for similarity computation
architecture_features = [
    'config_model_type', 'config_hidden_size', 'config_num_hidden_layers',
    'config_num_attention_heads', 'config_intermediate_size'
]
capacity_features = [
    'config_vocab_size', 'config_max_position_embeddings', 'config_num_key_value_heads'
]
precision_features = [
    'config_torch_dtype', 'config_rope_theta', 'config_rope_scaling_type'
]
boolean_features = ['uses_moe', 'uses_gqa', 'uses_rope', 'uses_quantization']

all_features = architecture_features + capacity_features + precision_features + boolean_features
# Keep only the features that exist as columns in the loaded table
available_features = [f for f in all_features if f in df.columns]

# Textual spellings accepted as boolean values when a column is stored as text
_BOOLEAN_TOKENS = {'true', 'false', '1', '0', 'yes', 'no', 'nan'}


def _classify_feature(sample_values):
    """Classify a feature from a non-empty sample of its non-null values.

    Returns one of 'boolean', 'numeric' or 'categorical'.
    """
    # Real booleans must be detected before the numeric probe: pd.to_numeric
    # accepts True/False, so they would otherwise be reported as numeric.
    if all(isinstance(value, (bool, np.bool_)) for value in sample_values):
        return 'boolean'
    try:
        pd.to_numeric(sample_values, errors='raise')
        return 'numeric'
    except (ValueError, TypeError):
        pass
    # Text column: boolean if it holds at most two values that all look like flags
    unique_vals = sample_values.unique()
    if len(unique_vals) <= 2 and {str(v).lower() for v in unique_vals} <= _BOOLEAN_TOKENS:
        return 'boolean'
    return 'categorical'


# Categorize features by type
numeric_features = []
categorical_features = []
boolean_feature_list = []
_feature_buckets = {
    'numeric': numeric_features,
    'categorical': categorical_features,
    'boolean': boolean_feature_list,
}

for feat in available_features:
    # Type is inferred from the first 100 non-null values only
    sample_values = df[feat].dropna().head(100)
    if sample_values.empty:
        # Entirely-null column: left unclassified (it stays in available_features)
        continue
    _feature_buckets[_classify_feature(sample_values)].append(feat)

print(f"Features: {len(available_features)} total ({len(numeric_features)} numeric, {len(categorical_features)} categorical, {len(boolean_feature_list)} boolean)")

In [None]:
# Gower distance function (for mixed numeric + categorical similarity)
_TRUE_TOKENS = frozenset({'true', '1', 'yes'})
_FALSE_TOKENS = frozenset({'false', '0', 'no'})


def _coerce_bool(value):
    """Interpret *value* as a boolean, understanding textual spellings.

    Strings such as 'True', 'no' or '0' are parsed case-insensitively; a plain
    bool() would treat every non-empty string (including 'False') as True.
    Anything else falls back to Python truthiness.
    """
    if isinstance(value, str):
        token = value.strip().lower()
        if token in _TRUE_TOKENS:
            return True
        if token in _FALSE_TOKENS:
            return False
    return bool(value)


def gower_distance(x, y, numeric_cols, categorical_cols, boolean_cols):
    """Compute a Gower-style distance between two config vectors.

    Args:
        x, y: pandas Series (indexed by feature name) holding the two configs.
        numeric_cols: features compared as |x - y| / max(|x|, |y|). The
            difference is scaled by the larger magnitude of the pair itself,
            not by a dataset-wide range.
        categorical_cols: features compared by string equality (0 or 1).
        boolean_cols: features compared as booleans (0 or 1); textual values
            such as 'True'/'False' are parsed rather than tested for truthiness.

    Returns:
        The mean per-feature distance over the features present and non-null
        in both vectors, or 1.0 when no feature could be compared.
    """
    distance = 0.0
    count = 0

    def _comparable_pairs(cols):
        # Yield (x_val, y_val) for every column present and non-null on both sides
        for col in cols:
            if col in x.index and col in y.index:
                x_val, y_val = x[col], y[col]
                if pd.isna(x_val) or pd.isna(y_val):
                    continue
                yield x_val, y_val

    # Numeric features: normalized absolute difference
    for x_val, y_val in _comparable_pairs(numeric_cols):
        try:
            x_num, y_num = float(x_val), float(y_val)
        except (ValueError, TypeError):
            # Non-numeric value in a numeric column: skip the feature entirely
            continue
        max_val = max(abs(x_num), abs(y_num))
        if max_val > 0:
            distance += abs(x_num - y_num) / max_val
        # Both values zero contributes a distance of 0 but still counts
        count += 1

    # Categorical features: 0 if same, 1 if different
    for x_val, y_val in _comparable_pairs(categorical_cols):
        if str(x_val) != str(y_val):
            distance += 1.0
        count += 1

    # Boolean features: 0 if same, 1 if different
    for x_val, y_val in _comparable_pairs(boolean_cols):
        if _coerce_bool(x_val) != _coerce_bool(y_val):
            distance += 1.0
        count += 1

    return distance / count if count > 0 else 1.0

print("✓ Gower distance function defined")

## 2. Load Family Graph and Extract Parent-Child Pairs

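This section loads the family graph (falling back to the parent columns of the config table when no graph file can be loaded) and collects the parent-child pairs used in the rest of the notebook.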

In [None]:
# Load family graph to analyze parent-child relationships
import ast
import os
import pickle
import random

# Cap on the number of pairs analysed, and a fixed seed so the sampled subset
# (and everything derived from it) is the same on every run.
MAX_PAIRS = 5000
SAMPLE_SEED = 42

# Build the membership set once; testing `x in df['modelId'].values` inside the
# loops below would rescan the whole column for every candidate.
known_model_ids = set(df['modelId'])


def _is_known_model(candidate):
    """Return True when *candidate* is a modelId present in ``df``."""
    try:
        return candidate in known_model_ids
    except TypeError:
        # Unhashable value (e.g. a nested list) cannot be a modelId
        return False


def _parse_parent_field(raw_value):
    """Return the list of parent ids stored in one parent-column cell.

    Text cells are parsed as Python literals (e.g. "['org/a', 'org/b']");
    text that is not a literal is treated as a single parent id.
    """
    parsed = raw_value
    if isinstance(raw_value, str):
        try:
            # literal_eval only parses literals; eval() would execute whatever
            # code happens to be stored in the data file.
            parsed = ast.literal_eval(raw_value)
        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
            # Not a literal (e.g. a bare "org/model" id): single parent
            return [raw_value]
    if isinstance(parsed, str):
        return [parsed]
    return parsed if isinstance(parsed, list) else []


# Try to load the family graph, preferring the first candidate that works
G_family = None
graph_paths = [
    'data/ai_ecosystem_graph_finetune_fulljson.pkl',
    'data/ai_ecosystem_graph_nomerges.pkl',
    'data/ai_ecosystem_graph.pkl'
]
graph_load_failed = False
for path in graph_paths:
    if not os.path.exists(path):
        continue
    try:
        # NOTE(security): unpickling runs code embedded in the file; only load
        # graph files that were produced locally or come from a trusted source.
        with open(path, 'rb') as f:
            G_family = pickle.load(f)
        print(f"Loaded family graph from {path}")
        print(f"  Nodes: {len(G_family.nodes):,}")
        print(f"  Edges: {len(G_family.edges):,}")
        break
    except Exception as e:
        # Unpickling can fail in many ways; report it and try the next candidate
        print(f"Could not load graph: {e}")
        G_family = None
        graph_load_failed = True

if G_family is None and graph_load_failed:
    print("Will compute drift from parent_model columns in dataframe")

# Compute config drift for parent-child pairs
if G_family is not None:
    # Extract parent-child pairs from graph, keeping edges whose two endpoints
    # both have a config row
    parent_child_pairs = [
        (parent, child)
        for parent, child in G_family.edges()
        if parent in known_model_ids and child in known_model_ids
    ]

    print(f"Found {len(parent_child_pairs):,} parent-child pairs in graph")
else:
    # Fallback: use parent_model columns from dataframe
    print("Using parent_model columns from dataframe")
    parent_child_pairs = []

    # Check for parent columns
    parent_cols = ['parent_model', 'finetune_parent', 'quantized_parent', 'adapter_parent', 'merge_parent']
    available_parent_cols = [col for col in parent_cols if col in df.columns]

    if available_parent_cols:
        for _, row in df[['modelId', *available_parent_cols]].iterrows():
            model_id = row['modelId']
            for col in available_parent_cols:
                if pd.isna(row[col]):
                    continue
                for parent in _parse_parent_field(row[col]):
                    if _is_known_model(parent):
                        parent_child_pairs.append((parent, model_id))

        print(f"Found {len(parent_child_pairs):,} parent-child pairs from dataframe columns")

# Sample pairs for analysis (if too many)
if len(parent_child_pairs) > MAX_PAIRS:
    # A dedicated seeded generator keeps the sample reproducible without
    # touching the global random state
    parent_child_pairs = random.Random(SAMPLE_SEED).sample(parent_child_pairs, MAX_PAIRS)
    print(f"Sampled {len(parent_child_pairs):,} pairs for analysis")

print(f"\nTotal parent-child pairs to analyze: {len(parent_child_pairs):,}")

In [None]:
# Compute config drift for each parent-child pair
print("Computing config drift for parent-child pairs...")

# Index the config table by modelId once so each lookup is a hash probe, not a
# full-column scan. keep='first' selects the same row the previous
# `df[df['modelId'] == id].iloc[0]` lookup returned when an id is duplicated.
configs_by_id = df.drop_duplicates(subset='modelId', keep='first').set_index('modelId')

DRIFT_COLUMNS = [
    'parent_id', 'child_id', 'drift', 'similarity',
    'parent_family', 'child_family', 'same_family',
]

drift_data = []
for i, (parent_id, child_id) in enumerate(parent_child_pairs):
    if i % 500 == 0 and i > 0:
        print(f"  Processed {i}/{len(parent_child_pairs)} pairs...")

    # Skip pairs for which either model has no config row
    if parent_id not in configs_by_id.index or child_id not in configs_by_id.index:
        continue

    # Get config vectors
    parent_row = configs_by_id.loc[parent_id]
    child_row = configs_by_id.loc[child_id]
    parent_vec = parent_row[available_features]
    child_vec = child_row[available_features]

    # Compute Gower distance (drift)
    drift = gower_distance(
        parent_vec,
        child_vec,
        numeric_features,
        categorical_features,
        boolean_feature_list
    )

    # Get family info if available; pairs with an unknown family never count
    # as "same family"
    parent_family = parent_row.get('family', 'Unknown')
    child_family = child_row.get('family', 'Unknown')
    same_family = parent_family == child_family and parent_family != 'Unknown'

    drift_data.append({
        'parent_id': parent_id,
        'child_id': child_id,
        'drift': drift,
        'similarity': 1 - drift,
        'parent_family': parent_family,
        'child_family': child_family,
        'same_family': same_family
    })

# Explicit columns keep the frame well-formed even when no pair was usable
df_drift = pd.DataFrame(drift_data, columns=DRIFT_COLUMNS)
print(f"\n✓ Computed drift for {len(df_drift):,} parent-child pairs")
if len(df_drift) > 0:
    print(f"  Mean drift: {df_drift['drift'].mean():.3f}")
    print(f"  Median drift: {df_drift['drift'].median():.3f}")
else:
    print("  No comparable pairs found; drift statistics unavailable")

In [None]:
# Visualize config drift distributions
import os

if len(df_drift) > 0:
    # savefig does not create missing directories
    os.makedirs('figures', exist_ok=True)

    median_drift = df_drift['drift'].median()
    fig, axes = plt.subplots(2, 2, figsize=(16, 12))

    # 1. Overall drift distribution
    axes[0,0].hist(df_drift['drift'], bins=50, color='steelblue', alpha=0.7, edgecolor='white')
    axes[0,0].axvline(median_drift, color='red', linestyle='--', linewidth=2, label=f'Median: {median_drift:.3f}')
    axes[0,0].set_xlabel('Config Drift (Gower Distance)', fontsize=11)
    axes[0,0].set_ylabel('Count', fontsize=11)
    axes[0,0].set_title('Distribution of Config Drift Along Parent-Child Edges', fontsize=13)
    axes[0,0].legend()
    axes[0,0].grid(True, alpha=0.3, axis='y')

    # 2. Within-family vs between-family drift (panel left empty when no
    #    pair stays inside a single known family)
    same_family_mask = df_drift['same_family'].astype(bool)
    if same_family_mask.any():
        within_family = df_drift.loc[same_family_mask, 'drift']
        between_family = df_drift.loc[~same_family_mask, 'drift']

        axes[0,1].hist([within_family, between_family], bins=30, label=['Within Family', 'Between Families'],
                      alpha=0.7, color=['seagreen', 'coral'], edgecolor='white')
        axes[0,1].set_xlabel('Config Drift', fontsize=11)
        axes[0,1].set_ylabel('Count', fontsize=11)
        axes[0,1].set_title('Config Drift: Within vs Between Families', fontsize=13)
        axes[0,1].legend()
        axes[0,1].grid(True, alpha=0.3, axis='y')

        print(f"\nWithin-family drift: mean={within_family.mean():.3f}, median={within_family.median():.3f}")
        print(f"Between-family drift: mean={between_family.mean():.3f}, median={between_family.median():.3f}")

    # 3. Drift by family (top families, ranked by number of pairs)
    if 'parent_family' in df_drift.columns:
        family_drift = df_drift.groupby('parent_family')['drift'].agg(['mean', 'median', 'count']).sort_values('count', ascending=False)
        top_families = family_drift.head(10)

        x_pos = np.arange(len(top_families))
        width = 0.35
        axes[1,0].bar(x_pos - width/2, top_families['mean'], width, label='Mean', color='steelblue', alpha=0.7)
        axes[1,0].bar(x_pos + width/2, top_families['median'], width, label='Median', color='coral', alpha=0.7)
        axes[1,0].set_xticks(x_pos)
        axes[1,0].set_xticklabels(top_families.index, rotation=45, ha='right')
        axes[1,0].set_ylabel('Config Drift', fontsize=11)
        axes[1,0].set_title('Config Drift by Family (Top 10)', fontsize=13)
        axes[1,0].legend()
        axes[1,0].grid(True, alpha=0.3, axis='y')

    # 4. Cumulative drift distribution (empirical CDF)
    sorted_drift = np.sort(df_drift['drift'])
    cumulative = np.arange(1, len(sorted_drift) + 1) / len(sorted_drift)
    axes[1,1].plot(sorted_drift, cumulative, linewidth=2, color='purple')
    axes[1,1].axvline(median_drift, color='red', linestyle='--', linewidth=2, alpha=0.7)
    axes[1,1].set_xlabel('Config Drift', fontsize=11)
    axes[1,1].set_ylabel('Cumulative Fraction', fontsize=11)
    axes[1,1].set_title('Cumulative Distribution of Config Drift', fontsize=13)
    axes[1,1].grid(True, alpha=0.3)

    plt.tight_layout()
    plt.savefig('figures/config_drift_analysis.png', dpi=300, bbox_inches='tight')
    plt.show()

    # Save drift data
    df_drift.to_csv('config_drift_pairs.csv', index=False)
    print("\n✓ Drift data saved to config_drift_pairs.csv")
else:
    print("No drift data to visualize")

## 3. Mutational Landscape: Which Config Fields Drift Most?

**Goal**: Identify which config fields mutate most frequently along parent-child edges, parallel to trait mutation analysis in the original paper.

In [None]:
# Analyze which config fields change most frequently
print("Analyzing mutational landscape...")

MUTATION_COLUMNS = [
    'feature', 'mutation_rate', 'n_changed', 'n_unchanged', 'n_missing', 'total_pairs',
]

mutation_counts = {feat: {'changed': 0, 'unchanged': 0, 'missing': 0} for feat in available_features}
numeric_feature_set = set(numeric_features)


def _values_differ(feat, parent_val, child_val):
    """Return True when *feat* changed between parent and child.

    Numeric features are compared as floats with an absolute tolerance of
    1e-6; everything else (and numeric values that cannot be converted) is
    compared by string representation.
    """
    if feat in numeric_feature_set:
        try:
            return abs(float(parent_val) - float(child_val)) > 1e-6
        except (ValueError, TypeError):
            pass  # not convertible: fall through to the string comparison
    return str(parent_val) != str(child_val)


# Index the feature columns by modelId once so each lookup is a hash probe, not
# a full-column scan. keep='first' selects the same row the previous
# `df[df['modelId'] == id].iloc[0]` lookup returned when an id is duplicated.
feature_configs = df.drop_duplicates(subset='modelId', keep='first').set_index('modelId')[available_features]

# At most the first 5000 pairs are inspected
for parent_id, child_id in parent_child_pairs[:5000]:
    if parent_id not in feature_configs.index or child_id not in feature_configs.index:
        continue

    parent_vec = feature_configs.loc[parent_id]
    child_vec = feature_configs.loc[child_id]

    for feat in available_features:
        parent_val = parent_vec[feat]
        child_val = child_vec[feat]

        # A value missing on either side is tallied separately and excluded
        # from the mutation rate
        if pd.isna(parent_val) or pd.isna(child_val):
            mutation_counts[feat]['missing'] += 1
        elif _values_differ(feat, parent_val, child_val):
            mutation_counts[feat]['changed'] += 1
        else:
            mutation_counts[feat]['unchanged'] += 1

# Compute mutation rates (features never observed on both sides are omitted)
mutation_rates = []
for feat, counts in mutation_counts.items():
    total = counts['changed'] + counts['unchanged']
    if total > 0:
        mutation_rates.append({
            'feature': feat,
            'mutation_rate': counts['changed'] / total,
            'n_changed': counts['changed'],
            'n_unchanged': counts['unchanged'],
            'n_missing': counts['missing'],
            'total_pairs': total
        })

# Explicit columns keep sort_values and the column selection below valid even
# when no feature could be compared
df_mutations = pd.DataFrame(mutation_rates, columns=MUTATION_COLUMNS).sort_values('mutation_rate', ascending=False)
print(f"\n✓ Analyzed mutations for {len(df_mutations)} features")
print("\nTop 10 most frequently mutated features:")
print(df_mutations.head(10)[['feature', 'mutation_rate', 'n_changed', 'total_pairs']].to_string(index=False))

In [None]:
# Visualize mutational landscape
import os

if len(df_mutations) > 0:
    # savefig does not create missing directories
    os.makedirs('figures', exist_ok=True)

    fig, axes = plt.subplots(2, 1, figsize=(14, 10))

    # Top: Mutation rates for top features (bars above 50% are highlighted)
    top_mutations = df_mutations.head(20)
    colors = ['coral' if rate > 0.5 else 'steelblue' for rate in top_mutations['mutation_rate']]

    axes[0].barh(range(len(top_mutations)), top_mutations['mutation_rate'], color=colors, alpha=0.7)
    axes[0].set_yticks(range(len(top_mutations)))
    axes[0].set_yticklabels(top_mutations['feature'], fontsize=9)
    axes[0].invert_yaxis()
    axes[0].set_xlabel('Mutation Rate', fontsize=11)
    axes[0].set_title('Top 20 Most Frequently Mutated Config Features', fontsize=13)
    axes[0].axvline(0.5, color='red', linestyle='--', alpha=0.5, label='50% threshold')
    axes[0].legend()
    axes[0].grid(True, alpha=0.3, axis='x')

    # Add value labels just to the right of each bar
    for i, rate in enumerate(top_mutations['mutation_rate']):
        axes[0].text(rate + 0.01, i, f"{rate:.2%}", va='center', fontsize=8)

    # Bottom: Mutation counts by feature type; anything that is neither
    # numeric nor categorical is labelled Boolean
    df_mutations['feature_type'] = df_mutations['feature'].apply(
        lambda x: 'Numeric' if x in numeric_features else ('Categorical' if x in categorical_features else 'Boolean')
    )

    type_summary = df_mutations.groupby('feature_type').agg({
        'mutation_rate': 'mean',
        'n_changed': 'sum',
        'total_pairs': 'sum'
    }).reset_index()

    # "Mean" averages the per-feature rates; "Overall" pools all comparisons
    # of that type before dividing
    x_pos = np.arange(len(type_summary))
    width = 0.35
    axes[1].bar(x_pos - width/2, type_summary['mutation_rate'], width, label='Mean Mutation Rate', color='steelblue', alpha=0.7)
    axes[1].bar(x_pos + width/2, type_summary['n_changed'] / type_summary['total_pairs'], width,
               label='Overall Mutation Rate', color='coral', alpha=0.7)
    axes[1].set_xticks(x_pos)
    axes[1].set_xticklabels(type_summary['feature_type'])
    axes[1].set_ylabel('Mutation Rate', fontsize=11)
    axes[1].set_title('Mutation Rates by Feature Type', fontsize=13)
    axes[1].legend()
    axes[1].grid(True, alpha=0.3, axis='y')

    plt.tight_layout()
    plt.savefig('figures/mutational_landscape.png', dpi=300, bbox_inches='tight')
    plt.show()

    # Save mutation data (includes the feature_type column added above)
    df_mutations.to_csv('config_mutation_rates.csv', index=False)
    print("\n✓ Mutation data saved to config_mutation_rates.csv")