# Config Similarity Basics

**Goal**: Compute pairwise architectural similarity between models from their `config.json` fields and build the resulting similarity graph.

**Key Questions**:
1. Which models are architecturally similar?
2. How do models cluster in architecture space?
3. What are the nearest neighbors in architecture space?
4. How do different similarity metrics compare?

**Contents**:
- Feature preparation and categorization
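- Gower-style distance implementation (handles mixed numeric, categorical, and boolean features; numeric differences are scaled by the larger magnitude in each pair rather than by the feature's overall range)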
- Similarity matrix computation
- Similarity graph construction
- Graph visualization and statistics
- Nearest neighbors analysis
- Similarity metrics comparison (Gower, L2, L1, Cosine)

**This is the foundational notebook** - other notebooks build on these utilities.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from scipy.spatial.distance import pdist, squareform
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
import warnings
warnings.filterwarnings('ignore')

# Figure defaults, kept in line with the main repo's plots
plt.rcParams.update({
    'figure.figsize': (12, 6),
    'font.size': 10,
})

In [None]:
# Read the expanded per-model config table into `df`
df = pd.read_csv('data/model_configs_expanded.csv', low_memory=False)
print(f"Loaded {df.shape[0]:,} models with config.json")
print(f"Total columns: {df.shape[1]}")

## 1. Prepare Config Features for Similarity Computation

In [None]:
# Feature groups that feed the similarity computation.
# NOTE: the weight labels on each group are descriptive only; nothing in
# this cell attaches numeric weights to the groups.

# Core architecture (labelled: high weight)
architecture_features = [
    'config_model_type', 'config_hidden_size', 'config_num_hidden_layers',
    'config_num_attention_heads', 'config_intermediate_size',
]

# Capacity (labelled: medium weight)
capacity_features = [
    'config_vocab_size', 'config_max_position_embeddings',
    'config_num_key_value_heads',
]

# Precision / compute (labelled: lower weight)
precision_features = [
    'config_torch_dtype', 'config_rope_theta', 'config_rope_scaling_type',
]

# Boolean flags (labelled: low weight)
boolean_features = ['uses_moe', 'uses_gqa', 'uses_rope', 'uses_quantization']

# Every candidate feature, in group order
all_features = [
    *architecture_features,
    *capacity_features,
    *precision_features,
    *boolean_features,
]

# Keep only the candidates that are actual columns of the loaded dataframe
available_features = [name for name in all_features if name in df.columns]
print(f"Available features for similarity: {len(available_features)}")
print(f"Features: {available_features[:10]}...")

## 2. Implement Gower Distance (Recommended for Mixed Data Types)

In [None]:
def _present_pair(x, y, col):
    """Return ``(x[col], y[col])``, or ``None`` if either side is absent or NaN."""
    if col not in x.index or col not in y.index:
        return None
    x_val, y_val = x[col], y[col]
    if pd.isna(x_val) or pd.isna(y_val):
        return None
    return x_val, y_val


def _coerce_bool(value):
    """Interpret *value* as a boolean.

    Strings are matched case-insensitively against 'true'/'false',
    'yes'/'no' and '1'/'0', because ``bool("false")`` is ``True`` in Python.
    Anything else (real booleans, numbers, unrecognised strings) falls back
    to ordinary truthiness.
    """
    if isinstance(value, str):
        token = value.strip().lower()
        if token in ('true', 'yes', '1'):
            return True
        if token in ('false', 'no', '0'):
            return False
    return bool(value)


def gower_distance(x, y, numeric_cols, categorical_cols, boolean_cols):
    """
    Compute a Gower-style distance between two config vectors.

    Each feature present and non-missing in both vectors contributes a value
    in [0, 1]; the result is the mean of those contributions.

    For numeric: ``|x - y| / max(|x|, |y|)`` (0 when both are 0). This scales
        by the larger magnitude of the pair, not by the feature's range, and
        is capped at 1 so that values of opposite sign cannot exceed the
        per-feature maximum.
    For categorical: 0 if the string forms are equal, 1 otherwise.
    For boolean: 0 if same, 1 if different (string spellings such as
        'true'/'false' are parsed rather than treated as truthy).
    Missing values: the feature is ignored for this pair. Values that cannot
        be converted to a finite float are likewise ignored for numeric
        features.

    Args:
        x, y: pandas Series (or Series-like objects with ``.index`` and
            label-based ``[]`` access) holding one model's features each.
        numeric_cols: labels to compare as numbers.
        categorical_cols: labels to compare as categories.
        boolean_cols: labels to compare as booleans.

    Returns:
        float in [0, 1]; 0 means identical on every comparable feature.
        Returns 1.0 when the two vectors share no comparable feature.
    """
    distance = 0.0
    count = 0

    # Numeric features: relative absolute difference
    for col in numeric_cols:
        pair = _present_pair(x, y, col)
        if pair is None:
            continue
        try:
            x_num = float(pair[0])
            y_num = float(pair[1])
        except (ValueError, TypeError):
            continue
        # float('nan') / float('inf') parse fine but would turn the sum into NaN
        if not (np.isfinite(x_num) and np.isfinite(y_num)):
            continue
        scale = max(abs(x_num), abs(y_num))
        if scale > 0:
            # Cap at 1: with opposite signs |x - y| can reach 2 * scale
            distance += min(abs(x_num - y_num) / scale, 1.0)
        count += 1

    # Categorical features: 0 if same, 1 if different (compared as strings)
    for col in categorical_cols:
        pair = _present_pair(x, y, col)
        if pair is None:
            continue
        if str(pair[0]) != str(pair[1]):
            distance += 1.0
        count += 1

    # Boolean features: 0 if same, 1 if different
    for col in boolean_cols:
        pair = _present_pair(x, y, col)
        if pair is None:
            continue
        if _coerce_bool(pair[0]) != _coerce_bool(pair[1]):
            distance += 1.0
        count += 1

    # Average over the features that could be compared
    if count > 0:
        return distance / count
    return 1.0  # No common features = maximum distance

print("Gower distance function defined")

## 3. Categorize Features by Type

In [None]:
# Categorize each available feature as numeric, categorical, or boolean.
# The decision is based on the first 100 non-null values of the column, so a
# column whose later rows differ in type is classified by its leading values.
numeric_features = []
categorical_features = []
boolean_feature_list = []

# String spellings accepted as boolean-like (compared case-insensitively)
_BOOLEAN_TOKENS = {'true', 'false', '1', '0', 'yes', 'no', 'nan'}

for feat in available_features:
    sample_values = df[feat].dropna().head(100)

    # Columns with no data cannot be classified and are left out entirely
    if sample_values.empty:
        continue

    # Real True/False values must be detected before the numeric test:
    # pd.to_numeric accepts booleans without raising, which would otherwise
    # file flag columns under numeric_features.
    if pd.api.types.infer_dtype(sample_values, skipna=True) == 'boolean':
        boolean_feature_list.append(feat)
        continue

    try:
        pd.to_numeric(sample_values, errors='raise')
    except (ValueError, TypeError):
        # Not numeric: boolean-like strings vs. general categories
        tokens = {str(v).strip().lower() for v in sample_values.unique()}
        if len(tokens) <= 2 and tokens <= _BOOLEAN_TOKENS:
            boolean_feature_list.append(feat)
        else:
            categorical_features.append(feat)
    else:
        numeric_features.append(feat)

print(f"Numeric features: {len(numeric_features)}")
print(f"  {numeric_features[:5]}...")
print(f"\nCategorical features: {len(categorical_features)}")
print(f"  {categorical_features[:5]}...")
print(f"\nBoolean features: {len(boolean_feature_list)}")
print(f"  {boolean_feature_list}")

## 4. Compute Similarity Matrix (Sample for MVP)

In [None]:
# For the MVP, work on a subset of models so the pairwise computation
# (n * (n - 1) / 2 distances) stays feasible.

# Keep models that have at least one of the key architecture fields populated
key_features = ['config_hidden_size', 'config_num_hidden_layers', 'config_model_type']
present_key_features = [col for col in key_features if col in df.columns]
if not present_key_features:
    raise KeyError(f"None of the key features {key_features} are columns of the dataframe")
df_complete = df[df[present_key_features].notna().any(axis=1)].copy()

# Sample for MVP (can increase later)
SAMPLE_SIZE = 1000  # Start with 1000 models for MVP
RANDOM_STATE = 42   # Same seed on every path so reruns give the same sample

if len(df_complete) > SAMPLE_SIZE:
    if 'family' in df_complete.columns:
        # Stratified sample: an equal quota per family. Models without a
        # family label are kept as their own group instead of being dropped.
        n_families = df_complete['family'].nunique(dropna=False)
        per_family = SAMPLE_SIZE // n_families + 1

        # Shuffling and then taking the first `per_family` rows of each group
        # draws a random subset per family without groupby.apply.
        shuffled = df_complete.sample(frac=1, random_state=RANDOM_STATE)
        df_sample = shuffled.groupby('family', dropna=False).head(per_family)

        # The +1 in the quota can overshoot; trim at random rather than with
        # head(), which would cut rows from whichever families come last.
        if len(df_sample) > SAMPLE_SIZE:
            df_sample = df_sample.sample(n=SAMPLE_SIZE, random_state=RANDOM_STATE)
    else:
        df_sample = df_complete.sample(n=SAMPLE_SIZE, random_state=RANDOM_STATE)
else:
    df_sample = df_complete.copy()

print(f"Sampling {len(df_sample):,} models for similarity computation")
print(f"This will compute {len(df_sample) * (len(df_sample) - 1) // 2:,} pairwise distances")

## 11. Architecture Phylogeny: Config Drift Along Parent-Child Edges

**Goal**: Measure how architecture drifts along family tree edges, replicating the trait drift analysis from the AI Ecosystem paper.

**Key Questions**:
- Do fine-tunes preserve architecture?
- Which families mutate architecture the most?
- What is the distribution of config drift within vs between families?

In [None]:
# Load the family graph and collect (parent, child) model pairs.
# If no graph file can be loaded, pairs are rebuilt from the parent columns
# of the dataframe instead.
import ast
import os
import pickle
import random


def _parse_parent_list(raw):
    """Return the list of parent ids stored in one cell, or [] if unreadable.

    Cells are expected to hold either a list or the string form of a Python
    list literal; anything else (NaN, plain text, other literals) yields [].
    """
    if isinstance(raw, list):
        return raw
    if not isinstance(raw, str):
        return []
    try:
        # literal_eval only accepts Python literals, so a crafted cell value
        # cannot execute code the way eval() would.
        parsed = ast.literal_eval(raw)
    except (ValueError, SyntaxError):
        return []
    return parsed if isinstance(parsed, list) else []


# Try each candidate graph file in order; the first one that loads wins
G_family = None
graph_paths = [
    'data/ai_ecosystem_graph_finetune_fulljson.pkl',
    'data/ai_ecosystem_graph_nomerges.pkl',
    'data/ai_ecosystem_graph.pkl'
]
for path in graph_paths:
    if not os.path.exists(path):
        continue
    try:
        # SECURITY NOTE: pickle.load can execute arbitrary code contained in
        # the file. Only load graph files produced by a trusted pipeline.
        with open(path, 'rb') as f:
            loaded_graph = pickle.load(f)
        n_nodes = len(loaded_graph.nodes)
        n_edges = len(loaded_graph.edges)
    except Exception as e:
        # A broken file should not prevent trying the remaining candidates
        print(f"Could not load graph from {path}: {e}")
        continue
    G_family = loaded_graph
    print(f"Loaded family graph from {path}")
    print(f"  Nodes: {n_nodes:,}")
    print(f"  Edges: {n_edges:,}")
    break

# Build the id set once; membership tests against a set are O(1), whereas
# `x in df['modelId'].values` scans the whole column on every check.
known_model_ids = set(df['modelId'])

# Collect parent-child pairs where both models have a config row
if G_family is not None:
    # Edges are read as (parent, child), as in the original analysis
    parent_child_pairs = [
        (parent, child)
        for parent, child in G_family.edges()
        if parent in known_model_ids and child in known_model_ids
    ]

    print(f"Found {len(parent_child_pairs):,} parent-child pairs in graph")
else:
    # Fallback: use parent_model columns from dataframe
    print("Using parent_model columns from dataframe")
    parent_child_pairs = []

    # Check for parent columns
    parent_cols = ['parent_model', 'finetune_parent', 'quantized_parent', 'adapter_parent', 'merge_parent']
    available_parent_cols = [col for col in parent_cols if col in df.columns]

    if available_parent_cols:
        columns_needed = ['modelId', *available_parent_cols]
        for record in df[columns_needed].itertuples(index=False, name=None):
            model_id, *raw_cells = record
            for raw in raw_cells:
                for parent in _parse_parent_list(raw):
                    try:
                        is_known = parent in known_model_ids
                    except TypeError:
                        # Unhashable entry (e.g. a nested list) cannot be a model id
                        continue
                    if is_known:
                        parent_child_pairs.append((parent, model_id))

        print(f"Found {len(parent_child_pairs):,} parent-child pairs from dataframe columns")
    else:
        print("No parent columns found in dataframe")

# Sample pairs for analysis (if too many); seeded so reruns pick the same pairs
MAX_PAIRS = 5000
if len(parent_child_pairs) > MAX_PAIRS:
    parent_child_pairs = random.Random(42).sample(parent_child_pairs, MAX_PAIRS)
    print(f"Sampled {len(parent_child_pairs):,} pairs for analysis")

print(f"\nTotal parent-child pairs to analyze: {len(parent_child_pairs):,}")

In [None]:
# Compute config drift (Gower distance) for each parent-child pair
print("Computing config drift for parent-child pairs...")

# Index configs by modelId once. The first row per id is kept, matching the
# previous `.iloc[0]` lookup, and each pair no longer scans the whole frame.
configs_by_id = df.drop_duplicates(subset='modelId', keep='first').set_index('modelId')


def _family_label(row):
    """Return the row's family, or 'Unknown' when the column or value is missing."""
    family = row.get('family', 'Unknown')
    return 'Unknown' if pd.isna(family) else family


drift_columns = [
    'parent_id', 'child_id', 'drift', 'similarity',
    'parent_family', 'child_family', 'same_family',
]
drift_data = []
for i, (parent_id, child_id) in enumerate(parent_child_pairs):
    if i % 500 == 0 and i > 0:
        print(f"  Processed {i}/{len(parent_child_pairs)} pairs...")

    # Skip pairs where either model has no config row
    if parent_id not in configs_by_id.index or child_id not in configs_by_id.index:
        continue

    parent_row = configs_by_id.loc[parent_id]
    child_row = configs_by_id.loc[child_id]

    # Compute Gower distance (drift) over the selected config features
    drift = gower_distance(
        parent_row[available_features],
        child_row[available_features],
        numeric_features,
        categorical_features,
        boolean_feature_list
    )

    # A pair only counts as same-family when the family is actually known
    parent_family = _family_label(parent_row)
    child_family = _family_label(child_row)
    same_family = parent_family == child_family and parent_family != 'Unknown'

    drift_data.append({
        'parent_id': parent_id,
        'child_id': child_id,
        'drift': drift,
        'similarity': 1 - drift,
        'parent_family': parent_family,
        'child_family': child_family,
        'same_family': same_family
    })

# Passing the column list keeps the schema intact even when no pair was scored
df_drift = pd.DataFrame(drift_data, columns=drift_columns)
print(f"\n✓ Computed drift for {len(df_drift):,} parent-child pairs")
if not df_drift.empty:
    print(f"  Mean drift: {df_drift['drift'].mean():.3f}")
    print(f"  Median drift: {df_drift['drift'].median():.3f}")

In [None]:
# Visualize config drift distributions (2x2 panel) and save the pair table
import os

if len(df_drift) > 0:
    fig, axes = plt.subplots(2, 2, figsize=(16, 12))
    median_drift = df_drift['drift'].median()

    # 1. Overall drift distribution
    axes[0,0].hist(df_drift['drift'], bins=50, color='steelblue', alpha=0.7, edgecolor='white')
    # The label is built from a precomputed value: escaped quotes inside an
    # f-string expression are a syntax error.
    axes[0,0].axvline(median_drift, color='red', linestyle='--', linewidth=2, label=f'Median: {median_drift:.3f}')
    axes[0,0].set_xlabel('Config Drift (Gower Distance)', fontsize=11)
    axes[0,0].set_ylabel('Count', fontsize=11)
    axes[0,0].set_title('Distribution of Config Drift Along Parent-Child Edges', fontsize=13)
    axes[0,0].legend()
    axes[0,0].grid(True, alpha=0.3, axis='y')

    # 2. Within-family vs between-family drift (only if any same-family pair exists)
    is_within = df_drift['same_family'].astype(bool)
    if is_within.any():
        within_family = df_drift.loc[is_within, 'drift']
        between_family = df_drift.loc[~is_within, 'drift']

        axes[0,1].hist([within_family, between_family], bins=30, label=['Within Family', 'Between Families'],
                      alpha=0.7, color=['seagreen', 'coral'], edgecolor='white')
        axes[0,1].set_xlabel('Config Drift', fontsize=11)
        axes[0,1].set_ylabel('Count', fontsize=11)
        axes[0,1].set_title('Config Drift: Within vs Between Families', fontsize=13)
        axes[0,1].legend()
        axes[0,1].grid(True, alpha=0.3, axis='y')

        print(f"\nWithin-family drift: mean={within_family.mean():.3f}, median={within_family.median():.3f}")
        print(f"Between-family drift: mean={between_family.mean():.3f}, median={between_family.median():.3f}")

    # 3. Drift by parent family (the 10 families with the most pairs)
    if 'parent_family' in df_drift.columns:
        family_drift = df_drift.groupby('parent_family')['drift'].agg(['mean', 'median', 'count']).sort_values('count', ascending=False)
        top_families = family_drift.head(10)

        x_pos = np.arange(len(top_families))
        width = 0.35
        axes[1,0].bar(x_pos - width/2, top_families['mean'], width, label='Mean', color='steelblue', alpha=0.7)
        axes[1,0].bar(x_pos + width/2, top_families['median'], width, label='Median', color='coral', alpha=0.7)
        axes[1,0].set_xticks(x_pos)
        axes[1,0].set_xticklabels(top_families.index, rotation=45, ha='right')
        axes[1,0].set_ylabel('Config Drift', fontsize=11)
        axes[1,0].set_title('Config Drift by Family (Top 10)', fontsize=13)
        axes[1,0].legend()
        axes[1,0].grid(True, alpha=0.3, axis='y')

    # 4. Cumulative drift distribution (empirical CDF)
    sorted_drift = np.sort(df_drift['drift'])
    cumulative = np.arange(1, len(sorted_drift) + 1) / len(sorted_drift)
    axes[1,1].plot(sorted_drift, cumulative, linewidth=2, color='purple')
    axes[1,1].axvline(median_drift, color='red', linestyle='--', linewidth=2, alpha=0.7)
    axes[1,1].set_xlabel('Config Drift', fontsize=11)
    axes[1,1].set_ylabel('Cumulative Fraction', fontsize=11)
    axes[1,1].set_title('Cumulative Distribution of Config Drift', fontsize=13)
    axes[1,1].grid(True, alpha=0.3)

    plt.tight_layout()
    # savefig does not create missing directories
    os.makedirs('figures', exist_ok=True)
    plt.savefig('figures/config_drift_analysis.png', dpi=300, bbox_inches='tight')
    plt.show()

    # Save drift data
    df_drift.to_csv('config_drift_pairs.csv', index=False)
    print("\n✓ Drift data saved to config_drift_pairs.csv")
else:
    print("No drift data to visualize")