# Config.json Analysis: Contributors & Summary Statistics

**Goal**: Understand who contributes config.json files and get an overview of the architectural landscape.

**Key Questions**:
1. Which organizations contribute the most models with config.json?
2. What's the distribution of architecture features across contributors?
3. What are the overall summary statistics for model configurations?

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
import warnings
# Keep notebook output free of library warnings.
# NOTE(review): 'ignore' with no category silences every warning, including
# deprecation notices from pandas/matplotlib - confirm that is intended.
warnings.filterwarnings('ignore')

# Shared plot defaults (style matching main repo)
plt.rcParams.update({
    'figure.figsize': (12, 6),
    'font.size': 10,
})

In [None]:
# Read the expanded config dataset into `df`, the frame every later cell uses.
# low_memory=False makes pandas read the file in one pass instead of chunks.
df = pd.read_csv('data/model_configs_expanded.csv', low_memory=False)

row_count, column_count = df.shape
print(f"Total models with config.json: {row_count:,}")
print(f"Total columns: {column_count}")

# Preview the first rows (rendered as the cell output in a notebook)
df.head(3)

## 1. Extract Organizations from Model IDs

In [None]:
# Extract organization/username from modelId
def extract_org(model_id):
    """Return the owner prefix of a model id.

    The owner is the text before the first '/'. Missing ids (None/NaN)
    map to 'unknown'; ids that contain no '/' map to 'community'.
    """
    if pd.isna(model_id):
        return 'unknown'
    owner, slash, _ = str(model_id).partition('/')
    return owner if slash else 'community'

# Derive the owner of every model and store it as a new column on `df`
org_labels = df['modelId'].apply(extract_org)
df['organization'] = org_labels

distinct_orgs = org_labels.nunique()
print(f"Unique organizations: {distinct_orgs:,}")

In [None]:
# Top 30 contributors by number of models with config.json.
# `top_orgs` is a Series (index = organization, value = model count) in the
# descending order produced by value_counts(); the insights cell reuses it.
top_orgs = df['organization'].value_counts().head(30)

fig, ax = plt.subplots(figsize=(14, 8))
colors = plt.cm.viridis(np.linspace(0, 0.9, len(top_orgs)))
bars = ax.barh(range(len(top_orgs)), top_orgs.values, color=colors)
ax.set_yticks(range(len(top_orgs)))
ax.set_yticklabels(top_orgs.index)
ax.invert_yaxis()  # put the largest contributor at the top of the chart
ax.set_xlabel('Number of Models with config.json')
ax.set_title('Top 30 Organizations Contributing Models with config.json', fontsize=14)

# Write each bar's count just past its right end, vertically centered.
# (The loop index was never used, so the bars and values are zipped directly.)
for bar, val in zip(bars, top_orgs.values):
    ax.text(val + 5, bar.get_y() + bar.get_height()/2, f'{val:,}', va='center', fontsize=9)

plt.tight_layout()
plt.savefig('figures/top_contributors.png', dpi=300, bbox_inches='tight')
plt.show()

## 2. Summary Statistics Overview

In [None]:
# Key numeric columns to summarize
numeric_cols = [
    'config_hidden_size', 'config_intermediate_size', 'config_num_hidden_layers',
    'config_num_attention_heads', 'config_num_key_value_heads', 'config_vocab_size',
    'config_max_position_embeddings', 'config_rope_theta', 'config_approx_params_billions',
    'config_head_dimension', 'config_gqa_ratio'
]

# Coerce to numeric before describing. describe() only summarizes numeric
# columns by default, so a column that was read as object dtype (e.g. because
# of a stray non-numeric entry) would otherwise vanish from the table.
# Unparseable entries become NaN, the same handling the insights cell applies
# to these columns.
numeric_values = df[numeric_cols].apply(pd.to_numeric, errors='coerce')

# Calculate summary stats (one row per parameter after the transpose)
summary_stats = numeric_values.describe().T
# `non_null` counts raw non-missing entries; describe()'s `count` column counts
# the entries that actually parsed as numbers.
summary_stats['non_null'] = df[numeric_cols].notna().sum()
summary_stats['non_null_pct'] = (summary_stats['non_null'] / len(df) * 100).round(1)

print("\n" + "="*80)
print("SUMMARY STATISTICS FOR KEY PARAMETERS")
print("="*80)
summary_stats

In [None]:
# How many models enable each architecture feature
feature_cols = ['uses_moe', 'uses_gqa', 'uses_rope', 'uses_quantization', 'uses_lora']

# Short feature name -> number of rows flagged True.
# Flags may be stored as booleans or as the string 'True', so both are counted;
# feature columns absent from the dataset are skipped.
feature_counts = {
    col.replace('uses_', ''): ((df[col] == True) | (df[col] == 'True')).sum()
    for col in feature_cols
    if col in df.columns
}

features = list(feature_counts)
counts = [feature_counts[name] for name in features]
percentages = [count/len(df)*100 for count in counts]

fig, ax = plt.subplots(figsize=(10, 5))
colors = plt.cm.Set2(np.linspace(0, 1, len(features)))
bars = ax.bar(features, counts, color=colors)
ax.set_ylabel('Number of Models')
ax.set_title('Architecture Feature Adoption', fontsize=14)

# Label every bar with its share of all models, centered above the bar
for bar, pct in zip(bars, percentages):
    label_x = bar.get_x() + bar.get_width()/2
    label_y = bar.get_height() + 50
    ax.text(label_x, label_y, f'{pct:.1f}%', ha='center', fontsize=10)

plt.tight_layout()
plt.savefig('figures/feature_adoption.png', dpi=300, bbox_inches='tight')
plt.show()

## 3. Model Families Distribution

In [None]:
# Model family counts, taken from the boolean-style `is_*family*` columns
family_cols = [name for name in df.columns if name.startswith('is_') and 'family' in name]

# Display name -> number of rows flagged True; families with no models are left out
family_counts = {}
for col in family_cols:
    true_count = ((df[col] == True) | (df[col] == 'True')).sum()
    if true_count <= 0:
        continue
    family_name = col.replace('is_', '').replace('_family', '').title()
    family_counts[family_name] = true_count

# Re-key the dict from the most to the least common family
ranked_names = sorted(family_counts, key=family_counts.get, reverse=True)
family_counts = {name: family_counts[name] for name in ranked_names}

family_names = list(family_counts)
family_totals = list(family_counts.values())

fig, ax = plt.subplots(figsize=(10, 6))
colors = plt.cm.coolwarm(np.linspace(0, 1, len(family_counts)))
bars = ax.barh(family_names, family_totals, color=colors)
ax.invert_yaxis()
ax.set_xlabel('Number of Models')
ax.set_title('Model Family Distribution', fontsize=14)

# Print the count next to the end of each bar
for bar, val in zip(bars, family_totals):
    label_y = bar.get_y() + bar.get_height()/2
    ax.text(val + 10, label_y, f'{val:,}', va='center')

plt.tight_layout()
plt.savefig('figures/model_families.png', dpi=300, bbox_inches='tight')
plt.show()

## 4. Size & Context Categories

In [None]:
def _ordered_counts(column, order):
    """Count the values of df[column], keeping only the labels in `order`, in that order."""
    tallies = df[column].value_counts()
    return tallies.reindex([label for label in order if label in tallies.index])


fig, axes = plt.subplots(1, 2, figsize=(14, 5))
size_ax, context_ax = axes

# Left pie: size category distribution
size_order = ['small', 'medium', 'large', 'xlarge']
size_counts = _ordered_counts('size_category', size_order)

colors1 = plt.cm.YlOrRd(np.linspace(0.2, 0.8, len(size_counts)))
size_ax.pie(
    size_counts.values,
    labels=size_counts.index,
    autopct='%1.1f%%',
    colors=colors1,
    explode=[0.02]*len(size_counts),
)
size_ax.set_title('Model Size Distribution', fontsize=13)

# Right pie: context category distribution
context_order = ['short', 'medium', 'long', 'very_long']
context_counts = _ordered_counts('context_category', context_order)

colors2 = plt.cm.Blues(np.linspace(0.3, 0.9, len(context_counts)))
context_ax.pie(
    context_counts.values,
    labels=context_counts.index,
    autopct='%1.1f%%',
    colors=colors2,
    explode=[0.02]*len(context_counts),
)
context_ax.set_title('Context Length Distribution', fontsize=13)

plt.tight_layout()
plt.savefig('figures/size_context_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

## 5. Top Organizations by Architecture Features

In [None]:
# Heatmap: for each of the 20 largest organizations, the percentage of its
# models that enable each architecture feature
top_20_orgs = df['organization'].value_counts().head(20).index.tolist()
df_top = df[df['organization'].isin(top_20_orgs)].copy()

# One record per organization: its model count plus one rate per feature
feature_rates = []
for org in top_20_orgs:
    org_df = df_top[df_top['organization'] == org]
    rates = {'organization': org, 'count': len(org_df)}

    for feat in ['uses_moe', 'uses_gqa', 'uses_rope', 'uses_quantization']:
        if feat not in org_df.columns:
            continue
        # Flags may be booleans or the string 'True'; count both
        enabled = (org_df[feat] == True) | (org_df[feat] == 'True')
        rates[feat.replace('uses_', '')] = enabled.sum() / len(org_df) * 100

    feature_rates.append(rates)

# Rows = organizations, columns = features; the helper 'count' column is not plotted
rates_df = pd.DataFrame(feature_rates).set_index('organization')
rates_df = rates_df.drop(columns='count')

fig, ax = plt.subplots(figsize=(10, 10))
im = ax.imshow(rates_df.values, cmap='YlGnBu', aspect='auto', vmin=0, vmax=100)
ax.set_xticks(range(len(rates_df.columns)))
ax.set_xticklabels(rates_df.columns)
ax.set_yticks(range(len(rates_df.index)))
ax.set_yticklabels(rates_df.index)

# Write the numeric rate inside every cell (x = feature column, y = organization row)
for row_pos, row_values in enumerate(rates_df.values):
    for col_pos, cell_rate in enumerate(row_values):
        ax.text(col_pos, row_pos, f'{cell_rate:.1f}', ha='center', va='center', fontsize=8)

plt.colorbar(im, ax=ax, label='Adoption Rate (%)')
ax.set_title('Architecture Feature Adoption by Organization (Top 20)', fontsize=14)
ax.set_xlabel('Feature')
ax.set_ylabel('Organization')

plt.tight_layout()
plt.savefig('figures/feature_adoption_by_org.png', dpi=300, bbox_inches='tight')
plt.show()

## 6. Model Type Distribution

In [None]:
# The 20 most frequent `config_model_type` values, most common first
model_types = df['config_model_type'].value_counts().head(20)
type_count = len(model_types)
bar_positions = range(type_count)

fig, ax = plt.subplots(figsize=(12, 8))
colors = plt.cm.Spectral(np.linspace(0, 1, type_count))
bars = ax.barh(bar_positions, model_types.values, color=colors)
ax.set_yticks(bar_positions)
ax.set_yticklabels(model_types.index)
ax.invert_yaxis()
ax.set_xlabel('Number of Models')
ax.set_title('Top 20 Model Architecture Types (from config.json)', fontsize=14)

# Print the count next to the end of each bar
for bar, val in zip(bars, model_types.values):
    label_y = bar.get_y() + bar.get_height()/2
    ax.text(val + 5, label_y, f'{val:,}', va='center', fontsize=9)

plt.tight_layout()
plt.savefig('figures/model_types.png', dpi=300, bbox_inches='tight')
plt.show()

## 7. Summary Table: Key Insights

In [None]:
# Print a plain-text recap of the headline numbers.
# Relies on `top_orgs`, `feature_counts`, `size_order` and `size_counts`
# having been defined by the earlier cells of this notebook.
# The section-header emoji were mis-encoded (UTF-8 bytes shown as Mac Roman
# text); they are restored to the intended characters here.
print("="*80)
print("KEY INSIGHTS SUMMARY")
print("="*80)

total_models = len(df)

print("\n📊 DATASET OVERVIEW")
print(f"   Total models with config.json: {total_models:,}")
print(f"   Unique organizations: {df['organization'].nunique():,}")
print(f"   Unique model types: {df['config_model_type'].nunique():,}")

print("\n🏢 TOP CONTRIBUTORS")
for i, (org, count) in enumerate(top_orgs.head(5).items(), 1):
    pct = count / total_models * 100
    print(f"   {i}. {org}: {count:,} models ({pct:.1f}%)")

print("\n🔧 ARCHITECTURE FEATURES")
for feat, count in feature_counts.items():
    pct = count / total_models * 100
    print(f"   {feat.upper()}: {count:,} models ({pct:.1f}%)")

print("\n📏 SIZE DISTRIBUTION")
# Percentages here are relative to the models that have one of the listed
# size categories (the sum of `size_counts`), not to every row in `df`.
sized_total = size_counts.sum()
for size in size_order:
    if size in size_counts.index:
        count = size_counts[size]
        pct = count / sized_total * 100
        print(f"   {size.title()}: {count:,} ({pct:.1f}%)")

print("\n📐 PARAMETER STATISTICS")
# Coerce to numeric and drop unparseable/missing entries before summarizing.
# These three Series are reused by the CSV export cell, so keep their names.
hidden_sizes = pd.to_numeric(df['config_hidden_size'], errors='coerce').dropna()
layers = pd.to_numeric(df['config_num_hidden_layers'], errors='coerce').dropna()
vocab = pd.to_numeric(df['config_vocab_size'], errors='coerce').dropna()

for label, values in (('Hidden Size', hidden_sizes),
                      ('Num Layers', layers),
                      ('Vocab Size', vocab)):
    print(f"   {label}: median={values.median():.0f}, range=[{values.min():.0f}, {values.max():.0f}]")

In [None]:
# Save key stats to CSV for reference.
# Relies on `feature_counts`, `hidden_sizes`, `layers` and `vocab` from the
# earlier cells. Each metric is declared next to its value so the two CSV
# columns cannot drift out of alignment when a row is added or removed.
stats_rows = [
    ('total_models', len(df)),
    ('unique_orgs', df['organization'].nunique()),
    ('unique_model_types', df['config_model_type'].nunique()),
    # Feature counts default to 0 when the feature column was absent
    ('moe_count', feature_counts.get('moe', 0)),
    ('gqa_count', feature_counts.get('gqa', 0)),
    ('rope_count', feature_counts.get('rope', 0)),
    ('quantization_count', feature_counts.get('quantization', 0)),
    ('median_hidden_size', hidden_sizes.median()),
    ('median_layers', layers.median()),
    ('median_vocab_size', vocab.median()),
]
stats_summary = {
    'metric': [metric for metric, _ in stats_rows],
    'value': [value for _, value in stats_rows],
}

pd.DataFrame(stats_summary).to_csv('summary_stats.csv', index=False)
# The check mark was mis-encoded in the original message; restored here.
print("\n✓ Summary stats saved to summary_stats.csv")