# Architectural Fitness: Which Configs Produce More Descendants?

**Goal**: Identify which architectural configurations are "fittest" in the ecosystem - i.e., which lead to more descendants, usage, or adoption.

**Core Question**: Which architectures are "fittest" in the ecosystem – i.e., which config patterns lead to more descendants, usage, or adoption?

**Analysis Approach**:
1. Compute architectural traits (hidden_size, num_layers, context length, dtype, model_type)
2. Attach ecosystem stats (#children, #downloads, #likes)
3. Regress "#descendants" or "#downloads" on architectural traits
4. Compare "fitness" landscapes by architecture family
5. Look for non-linear effects and sweet spots (e.g., 7B models being more "reproductively fit" than 13B or 70B)

In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import warnings
warnings.filterwarnings('ignore')

# Plot defaults shared by every figure in this notebook (matches the main repo)
plt.rcParams.update({
    'figure.figsize': (12, 6),
    'font.size': 10,
})

In [8]:
# Load config data
df_configs = pd.read_csv('data/model_configs_expanded.csv', low_memory=False)
print(f"Loaded {len(df_configs):,} models with config.json")

# Load main dataset for ecosystem stats.
# Sources are tried in order: HuggingFace dataset (modelbiome/ai_ecosystem),
# then a local CSV export, then a placeholder frame built from df_configs.
# df_main is always defined by the time this cell finishes.
print("Loading from HuggingFace dataset (modelbiome/ai_ecosystem)...")


def _empty_list_column(n_rows):
    """Return ``n_rows`` independent empty lists.

    ``[[]] * n_rows`` would put the *same* list object in every row, so a
    later in-place edit of one cell would show up in all of them.
    """
    return [[] for _ in range(n_rows)]


def _minimal_main_frame():
    """Build a placeholder df_main: zeroed stats and no parent links."""
    frame = pd.DataFrame({'modelId': df_configs['modelId']})
    frame['downloads'] = 0
    frame['likes'] = 0
    frame['parent_model'] = _empty_list_column(len(frame))
    frame['finetune_parent'] = _empty_list_column(len(frame))
    return frame


# Stays None until one of the sources below succeeds.
df_main = None

try:
    from datasets import load_dataset
    DATASETS_AVAILABLE = True
except ImportError:
    print("ModuleNotFoundError: No module named 'datasets'")
    DATASETS_AVAILABLE = False

if DATASETS_AVAILABLE:
    try:
        dataset = load_dataset("modelbiome/ai_ecosystem", split="train")
        print(f"Dataset loaded with {len(dataset):,} models")
        print(f"Available columns: {dataset.column_names[:20]}...")

        # Build DataFrame column by column, checking if each exists
        data_dict = {'modelId': dataset['model_id']}

        # Columns we need for fitness analysis
        numeric_cols = ['downloads', 'likes']
        list_cols = ['parent_model', 'finetune_parent', 'quantized_parent',
                     'adapter_parent', 'merge_parent']

        # Numeric columns default to 0 when the dataset does not provide them
        for col in numeric_cols:
            if col in dataset.column_names:
                data_dict[col] = dataset[col]
            else:
                data_dict[col] = [0] * len(dataset)

        # Parent-relationship columns default to an empty list per row
        for col in list_cols:
            if col in dataset.column_names:
                data_dict[col] = dataset[col]
            else:
                data_dict[col] = _empty_list_column(len(dataset))

        df_main = pd.DataFrame(data_dict)
        print(f"Loaded {len(df_main):,} models from HuggingFace")
        print(f"Columns in df_main: {df_main.columns.tolist()}")

    except Exception as e:
        # Best-effort: any failure here just moves on to the local fallback.
        print(f"Error loading from HuggingFace: {e}")
        import traceback
        traceback.print_exc()
        df_main = None

if df_main is None:
    print("\nTrying alternative: load from expanded dataset if available...")
    # Try to find an expanded dataset CSV
    try:
        import os
        alt_paths = [
            'data/ai_ecosystem_expanded.csv',
            'data/ai_ecosystem.csv',
            '../ai_ecosystem.csv'
        ]
        for path in alt_paths:
            if not os.path.exists(path):
                continue
            candidate = pd.read_csv(path, low_memory=False)
            if 'modelId' not in candidate.columns and 'model_id' in candidate.columns:
                candidate['modelId'] = candidate['model_id']
            # Ensure required columns exist
            for col in ['downloads', 'likes', 'parent_model', 'finetune_parent']:
                if col not in candidate.columns:
                    if col in ['parent_model', 'finetune_parent']:
                        candidate[col] = _empty_list_column(len(candidate))
                    else:
                        candidate[col] = 0
            df_main = candidate
            print(f"Loaded {len(df_main):,} models from {path}")
            break

        if df_main is None:
            # Last resort: create minimal dataframe
            df_main = _minimal_main_frame()
            print("Created minimal dataframe - fitness analysis will use graph data")
    except Exception as e2:
        print(f"Error in fallback: {e2}")
        df_main = _minimal_main_frame()

# Left-join ecosystem stats onto the config table, taking only the stat
# columns that df_main actually provides.
optional_stat_cols = ('downloads', 'likes', 'parent_model', 'finetune_parent')
cols_to_merge = ['modelId'] + [c for c in optional_stat_cols if c in df_main.columns]

df = df_configs.merge(df_main[cols_to_merge], on='modelId', how='left')
print(f"\nJoined dataset: {len(df):,} models")
if 'downloads' in df.columns:
    downloads_col = df['downloads']
    print(f"Models with downloads data: {downloads_col.notna().sum():,}")
    print(f"Total downloads: {downloads_col.sum():,}")

Loaded 14,557 models with config.json
Loading from HuggingFace dataset (modelbiome/ai_ecosystem)...


ModuleNotFoundError: No module named 'datasets'

In [None]:
# Count descendants - try graph first, then fallback to parent_model counts
import pickle
import networkx as nx

def count_descendants_from_graph(model_id, G):
    """Count every node reachable from ``model_id`` along successor edges.

    Args:
        model_id: Identifier of the node to start from.
        G: Graph-like object supporting ``node in G`` and
            ``G.successors(node)``.

    Returns:
        The number of distinct nodes reachable from ``model_id``, not
        counting ``model_id`` itself. Returns 0 when ``model_id`` is not
        in ``G``.
    """
    if model_id not in G:
        return 0

    # Seed the visited set with the start node so that a cycle leading back
    # to it does not count the model as one of its own descendants.
    visited = {model_id}
    to_process = [model_id]
    while to_process:
        current = to_process.pop()
        if current not in G:
            continue
        for child in G.successors(current):
            if child not in visited:
                visited.add(child)
                to_process.append(child)
    return len(visited) - 1

def count_descendants_from_parents(df_with_parents):
    """Count direct children per parent from the ``parent_model`` column.

    Each cell of ``parent_model`` may be a list/tuple/NumPy array of parent
    ids, or the string representation of such a collection. Cells of any
    other kind (including strings that cannot be parsed) contribute nothing.
    Falsy entries such as empty strings are skipped.

    Args:
        df_with_parents: DataFrame that may contain a ``parent_model`` column.

    Returns:
        dict mapping parent id -> number of rows listing it as a parent.
        Empty when the column is absent.
    """
    import ast
    from collections import Counter

    if 'parent_model' not in df_with_parents.columns:
        return {}

    parent_counts = Counter()
    # Iterate the single column directly; iterrows() would build a Series per row.
    for parents in df_with_parents['parent_model']:
        # Handle both string representation of lists and actual lists
        if isinstance(parents, str):
            try:
                parents = ast.literal_eval(parents)
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                # Not a parseable literal: treat as "no parents" rather than fail.
                continue
        if isinstance(parents, np.ndarray):
            parents = parents.tolist()
        if isinstance(parents, (list, tuple)):
            parent_counts.update(parent for parent in parents if parent)
    # Return a plain dict so Series.map() yields NaN (not 0) for unknown ids,
    # matching what callers then fill with fillna(0).
    return dict(parent_counts)

# Prefer full descendant counts from the pickled lineage graph. If anything in
# that path fails, fall back to direct-child counts from the parent_model column.
# NOTE: pickle.load can execute arbitrary code - only load graph files you trust.
try:
    with open('data/ai_ecosystem_graph_finetune_fulljson.pkl', 'rb') as graph_file:
        G = pickle.load(graph_file)
    print(f"Loaded graph with {len(G.nodes):,} nodes")

    print("Counting descendants from graph (this may take a few minutes)...")

    def _descendants_in_loaded_graph(model_id):
        return count_descendants_from_graph(model_id, G)

    df['num_descendants'] = df['modelId'].apply(_descendants_in_loaded_graph)
    print("‚úì Descendant counts computed from graph")

except Exception as graph_error:
    print(f"Could not load graph: {graph_error}")
    print("Using parent_model counts as proxy...")

    if 'parent_model' not in df.columns:
        print("No parent_model column available - setting descendants to 0")
        df['num_descendants'] = 0
    else:
        # Fallback: number of rows that list each model as a parent
        parent_counts = count_descendants_from_parents(df)
        df['num_descendants'] = df['modelId'].map(parent_counts).fillna(0)
        print(f"‚úì Using parent counts as proxy: {df['num_descendants'].sum():,} total parent relationships")

In [None]:
# Architectural features: coerce the raw config columns to numbers
# (values that cannot be parsed become NaN).
numeric_sources = [
    ('hidden_size', 'config_hidden_size'),
    ('num_layers', 'config_num_hidden_layers'),
    ('context_length', 'config_max_position_embeddings'),
    ('approx_params', 'config_approx_params_billions'),
]
for target_col, source_col in numeric_sources:
    df[target_col] = pd.to_numeric(df[source_col], errors='coerce')

# log1p transforms used by the regression
log_sources = [
    ('log_hidden_size', 'hidden_size'),
    ('log_num_layers', 'num_layers'),
    ('log_context_length', 'context_length'),
    ('log_params', 'approx_params'),
    ('log_downloads', 'downloads'),
    ('log_descendants', 'num_descendants'),
]
for target_col, source_col in log_sources:
    df[target_col] = np.log1p(df[source_col])

# Model family: start everyone at 'other', then let each is_*family* flag
# column overwrite the label for the rows where it is set.
family_cols = [name for name in df.columns if name.startswith('is_') and 'family' in name]
df['model_family'] = 'other'
for col in family_cols:
    family_name = col.replace('is_', '').replace('_family', '')
    flag = df[col]
    is_member = (flag == True) | (flag == 'True')
    df.loc[is_member, 'model_family'] = family_name

print(f"\nModels by family:")
print(df['model_family'].value_counts().head(10))

## 1. Fitness Landscape: Downloads vs Architectural Traits

In [None]:
# Keep models that have both size traits and a strictly positive download count
has_traits = df['hidden_size'].notna() & df['num_layers'].notna()
has_downloads = df['downloads'].notna() & (df['downloads'] > 0)
df_fit = df[has_traits & has_downloads].copy()

print(f"Models with complete data: {len(df_fit):,}")

# Regression: downloads ~ architectural traits
features = ['log_hidden_size', 'log_num_layers', 'log_context_length', 'log_params']
X = df_fit[features].fillna(0)  # missing traits are imputed as 0
y = df_fit['log_downloads']

# Keep only rows where every feature and the target are present
valid_mask = X.notna().all(axis=1) & y.notna()
X_clean = X[valid_mask]
y_clean = y[valid_mask]

# Only fit when there are more than 100 usable rows
if len(X_clean) > 100:
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_clean)

    model = LinearRegression()
    model.fit(X_scaled, y_clean)
    r_squared = model.score(X_scaled, y_clean)

    banner = "=" * 80
    print("\n" + banner)
    print("REGRESSION: log(downloads) ~ Architectural Traits")
    print(banner)
    print(f"\nR¬≤ Score: {r_squared:.4f}")
    print(f"\nCoefficients:")
    for feat, coef in zip(features, model.coef_):
        print(f"  {feat}: {coef:.4f}")
    print(f"\nIntercept: {model.intercept_:.4f}")

## 2. Fitness by Model Size (Parameter Count)

In [None]:
# Bucket models by approximate parameter count
size_order = ['<1B', '1-3B', '3-7B', '7-13B', '13-30B', '30-70B', '70-200B', '>200B']
param_edges = [0, 1, 3, 7, 13, 30, 70, 200, np.inf]
df_fit['param_bin'] = pd.cut(df_fit['approx_params'], bins=param_edges, labels=size_order)

# Per-bin summary of the fitness metrics
size_metrics = {
    'num_descendants': ['mean', 'median', 'count'],
    'downloads': ['mean', 'median'],
    'likes': ['mean', 'median'],
}
fitness_by_size = df_fit.groupby('param_bin').agg(size_metrics).round(2)

banner = "=" * 80
print("\n" + banner)
print("FITNESS BY MODEL SIZE")
print(banner)
print(fitness_by_size)


def _size_bar_panel(ax, series, cmap, ylabel, title, log_scale=False):
    """Draw one bar panel with a bar per size bin, coloured along ``cmap``."""
    positions = range(len(series))
    ax.bar(positions, series.values, color=cmap(np.linspace(0, 1, len(series))))
    ax.set_xticks(positions)
    ax.set_xticklabels(series.index, rotation=45, ha='right')
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    if log_scale:
        ax.set_yscale('log')
    ax.grid(True, alpha=0.3, axis='y')


# Plot fitness landscape
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Rows whose param_bin is one of the labelled bins
size_data = df_fit[df_fit['param_bin'].isin(size_order)]
grouped_sizes = size_data.groupby('param_bin')

# Average descendants by size
desc_by_size = grouped_sizes['num_descendants'].mean()
_size_bar_panel(axes[0], desc_by_size, plt.cm.viridis,
                'Average # Descendants', 'Reproductive Fitness by Model Size')

# Average downloads by size
dl_by_size = grouped_sizes['downloads'].mean()
_size_bar_panel(axes[1], dl_by_size, plt.cm.plasma,
                'Average Downloads', 'Usage Fitness by Model Size', log_scale=True)

plt.tight_layout()
plt.savefig('figures/fitness_by_size.png', dpi=300, bbox_inches='tight')
plt.show()

## 3. Fitness by Architecture Family

In [None]:
# Restrict to the families compared in this section
main_families = ['llama', 'mistral', 'qwen', 'gpt', 'bert', 'deepseek']
in_main_family = df_fit['model_family'].isin(main_families)
df_family = df_fit[in_main_family].copy()

family_metrics = {
    'num_descendants': ['mean', 'median', 'count'],
    'downloads': ['mean', 'median'],
    'likes': ['mean', 'median'],
    'approx_params': 'mean',
}
family_fitness = df_family.groupby('model_family').agg(family_metrics).round(2)

banner = "=" * 80
print("\n" + banner)
print("FITNESS BY ARCHITECTURE FAMILY")
print(banner)
print(family_fitness)

# Plot: both panels list families in descending order of mean descendants
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

family_order = family_fitness.sort_values(('num_descendants', 'mean'), ascending=False).index
y_positions = range(len(family_order))
family_labels = [name.title() for name in family_order]
colour_stops = np.linspace(0, 1, len(family_order))

desc_means = [family_fitness.loc[name, ('num_descendants', 'mean')] for name in family_order]
dl_means = [family_fitness.loc[name, ('downloads', 'mean')] for name in family_order]

left_ax, right_ax = axes[0], axes[1]

left_ax.barh(y_positions, desc_means, color=plt.cm.Set3(colour_stops))
left_ax.set_yticks(y_positions)
left_ax.set_yticklabels(family_labels)
left_ax.set_xlabel('Average # Descendants')
left_ax.set_title('Reproductive Fitness by Family')
left_ax.grid(True, alpha=0.3, axis='x')

right_ax.barh(y_positions, dl_means, color=plt.cm.Set2(colour_stops))
right_ax.set_yticks(y_positions)
right_ax.set_yticklabels(family_labels)
right_ax.set_xlabel('Average Downloads')
right_ax.set_title('Usage Fitness by Family')
right_ax.set_xscale('log')
right_ax.grid(True, alpha=0.3, axis='x')

plt.tight_layout()
plt.savefig('figures/fitness_by_family.png', dpi=300, bbox_inches='tight')
plt.show()

## 4. Context Length Fitness by Family

In [None]:
# Compare returns to context length by family
ctx_labels = ['<2K', '2-8K', '8-32K', '32-128K', '>128K']
df_ctx = df_family[df_family['context_length'].notna()].copy()
df_ctx['ctx_bin'] = pd.cut(df_ctx['context_length'],
                           bins=[0, 2048, 8192, 32768, 131072, np.inf],
                           labels=ctx_labels)

# Plot context length vs fitness by family
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# The x axis is defined by the bin labels, not by whichever family was plotted
# last, so the ticks are valid even when no family passes the size filter.
x_positions = range(len(ctx_labels))
plotted_any = False

for family in ['llama', 'mistral', 'qwen']:
    family_df = df_ctx[df_ctx['model_family'] == family]
    # Skip families with 10 or fewer models
    if len(family_df) <= 10:
        continue

    # observed=False keeps empty bins (as NaN) and reindex pins the bin order,
    # so position i means the same context bin for every family.
    by_bin = family_df.groupby('ctx_bin', observed=False)
    ctx_fitness = by_bin['num_descendants'].mean().reindex(ctx_labels)
    ctx_dl = by_bin['downloads'].mean().reindex(ctx_labels)

    axes[0].plot(x_positions, ctx_fitness.values,
                 marker='o', label=family.title(), linewidth=2, markersize=6)
    axes[1].plot(x_positions, ctx_dl.values,
                 marker='o', label=family.title(), linewidth=2, markersize=6)
    plotted_any = True

axes[0].set_xticks(x_positions)
axes[0].set_xticklabels(ctx_labels, rotation=45, ha='right')
axes[0].set_ylabel('Average # Descendants')
axes[0].set_title('Context Length Fitness: Descendants')
axes[0].grid(True, alpha=0.3)

axes[1].set_xticks(x_positions)
axes[1].set_xticklabels(ctx_labels, rotation=45, ha='right')
axes[1].set_ylabel('Average Downloads')
axes[1].set_title('Context Length Fitness: Downloads')
axes[1].set_yscale('log')
axes[1].grid(True, alpha=0.3)

# A legend only makes sense when at least one family line was drawn
if plotted_any:
    axes[0].legend()
    axes[1].legend()

plt.tight_layout()
plt.savefig('figures/fitness_context_by_family.png', dpi=300, bbox_inches='tight')
plt.show()

## 5. Sweet Spot Analysis: Non-Linear Effects

In [None]:
# Look for sweet spots in parameter count
# Reference sizes (in billions of parameters): 1B, 3B, 7B, 13B, 30B, 70B
sweet_spots = [1, 3, 7, 13, 30, 70]

# Keep models strictly between 0.5B and 200B parameters
params = df_fit['approx_params']
in_range = params.notna() & (params > 0.5) & (params < 200)
df_sweet = df_fit[in_range].copy()


def find_nearest_sweet_spot(params):
    """Return the label ("~7B", ...) of the reference size closest to ``params``.

    Returns None for missing values. Ties go to the smaller reference size.
    """
    if pd.isna(params):
        return None
    distances = [abs(spot - params) for spot in sweet_spots]
    nearest = sweet_spots[distances.index(min(distances))]
    return f"~{nearest}B"


df_sweet['sweet_spot'] = df_sweet['approx_params'].apply(find_nearest_sweet_spot)

# Calculate fitness by sweet spot
sweet_metrics = {
    'num_descendants': ['mean', 'median', 'count'],
    'downloads': ['mean', 'median'],
}
sweet_fitness = df_sweet.groupby('sweet_spot').agg(sweet_metrics).round(2)

banner = "=" * 80
print("\n" + banner)
print("SWEET SPOT ANALYSIS")
print(banner)
print(sweet_fitness)

# Plot
fig, ax = plt.subplots(figsize=(12, 6))

# Labels in ascending size order, limited to the ones present in the table
spot_order = [
    label
    for label in (f"~{size}B" for size in sweet_spots)
    if label in sweet_fitness.index
]
x_positions = range(len(spot_order))

desc_means = [sweet_fitness.loc[label, ('num_descendants', 'mean')] for label in spot_order]
ax.plot(x_positions, desc_means, marker='o', markersize=10,
        linewidth=3, label='Average Descendants', color='#2E86AB')

ax.set_xticks(x_positions)
ax.set_xticklabels(spot_order)
ax.set_ylabel('Average # Descendants')
ax.set_xlabel('Model Size (Approx. Parameters)')
ax.set_title('Reproductive Fitness: Sweet Spot Analysis', fontsize=14)
ax.grid(True, alpha=0.3)
ax.legend()

plt.tight_layout()
plt.savefig('figures/fitness_sweet_spots.png', dpi=300, bbox_inches='tight')
plt.show()

# Identify sweetest spot: the label with the highest mean descendant count
mean_descendants = sweet_fitness[('num_descendants', 'mean')]
best_spot = mean_descendants.idxmax()
best_fitness = sweet_fitness.loc[best_spot, ('num_descendants', 'mean')]
print(f"\nüèÜ Sweetest Spot: {best_spot} with {best_fitness:.1f} average descendants")

## 6. Feature Impact on Fitness

In [None]:
# Compare fitness for models with/without key features
features_to_test = ['uses_moe', 'uses_gqa', 'uses_rope', 'uses_quantization']

# Fixed schema: impact_df keeps these columns even when no feature qualifies,
# so later column lookups do not raise KeyError on an empty result.
impact_columns = [
    'feature',
    'with_feature_desc', 'without_feature_desc',
    'with_feature_dl', 'without_feature_dl',
    'n_with', 'n_without',
]

feature_impact = []
for feat in features_to_test:
    if feat not in df_fit.columns:
        continue

    # Accept boolean, string and 0/1 encodings of the flag
    df_feat = df_fit[df_fit[feat].isin([True, 'True', 1])]
    df_no_feat = df_fit[df_fit[feat].isin([False, 'False', 0])]

    # Require more than 10 models on each side of the comparison
    if len(df_feat) > 10 and len(df_no_feat) > 10:
        feature_impact.append({
            'feature': feat.replace('uses_', ''),
            'with_feature_desc': df_feat['num_descendants'].mean(),
            'without_feature_desc': df_no_feat['num_descendants'].mean(),
            'with_feature_dl': df_feat['downloads'].mean(),
            'without_feature_dl': df_no_feat['downloads'].mean(),
            'n_with': len(df_feat),
            'n_without': len(df_no_feat)
        })

impact_df = pd.DataFrame(feature_impact, columns=impact_columns)

print("\n" + "="*80)
print("FEATURE IMPACT ON FITNESS")
print("="*80)
print(impact_df.round(2))

if impact_df.empty:
    print("No feature had more than 10 models both with and without it - skipping plot")
else:
    # Plot
    fig, axes = plt.subplots(1, 2, figsize=(16, 6))

    x_pos = np.arange(len(impact_df))
    width = 0.35

    # (axis, with-column, without-column, y label, title, log-scale y axis?)
    panels = [
        (axes[0], 'with_feature_desc', 'without_feature_desc',
         'Average # Descendants', 'Feature Impact: Descendants', False),
        (axes[1], 'with_feature_dl', 'without_feature_dl',
         'Average Downloads', 'Feature Impact: Downloads', True),
    ]
    for ax, with_col, without_col, ylabel, title, log_scale in panels:
        ax.bar(x_pos - width/2, impact_df[with_col], width,
               label='With Feature', color='#2E86AB', alpha=0.8)
        ax.bar(x_pos + width/2, impact_df[without_col], width,
               label='Without Feature', color='#A23B72', alpha=0.8)
        ax.set_xticks(x_pos)
        ax.set_xticklabels(impact_df['feature'], rotation=45, ha='right')
        ax.set_ylabel(ylabel)
        ax.set_title(title)
        if log_scale:
            ax.set_yscale('log')
        ax.legend()
        ax.grid(True, alpha=0.3, axis='y')

    plt.tight_layout()
    plt.savefig('figures/fitness_feature_impact.png', dpi=300, bbox_inches='tight')
    plt.show()

## 7. Summary: Evolutionary Landscape

In [None]:
# Final summary: dataset totals, best size bin, best family, feature insights
print("\n" + "="*80)
print("ARCHITECTURAL FITNESS SUMMARY")
print("="*80)

print(f"\nüìä Dataset:")
print(f"   Models analyzed: {len(df_fit):,}")
print(f"   Total descendants tracked: {df_fit['num_descendants'].sum():,}")
print(f"   Total downloads: {df_fit['downloads'].sum():,}")

print(f"\nüèÜ Most Fit by Size:")
best_size = fitness_by_size[('num_descendants', 'mean')].idxmax()
best_size_mean = fitness_by_size.loc[best_size, ('num_descendants', 'mean')]
print(f"   Size bin: {best_size}")
print(f"   Average descendants: {best_size_mean:.1f}")

print(f"\nüëë Most Fit Family:")
best_family = family_fitness[('num_descendants', 'mean')].idxmax()
best_family_mean = family_fitness.loc[best_family, ('num_descendants', 'mean')]
print(f"   Family: {best_family.title()}")
print(f"   Average descendants: {best_family_mean:.1f}")

print(f"\nüí° Key Insights:")
# Report a feature only if it made it into impact_df; the table can be empty
# (or lack the 'feature' column entirely) when no feature qualified.
reported_features = set(impact_df['feature']) if 'feature' in impact_df.columns else set()
for feature_key, feature_label in [('gqa', 'GQA'), ('moe', 'MoE')]:
    if feature_key not in reported_features:
        continue
    impact_row = impact_df[impact_df['feature'] == feature_key].iloc[0]
    print(f"   ‚Ä¢ Models with {feature_label}: {impact_row['with_feature_desc']:.1f} avg descendants")
    print(f"   ‚Ä¢ Models without {feature_label}: {impact_row['without_feature_desc']:.1f} avg descendants")