# S-Curve Analysis: Architectural Feature Diffusion Over Time

**Goal**: Track how architectural innovations diffuse across the Hugging Face ecosystem over time.

**Core Question**: How do certain architectural innovations diffuse over time across the Hugging Face ecosystem?

**Key Features to Track**:
1. **torch_dtype**: fp16, bf16, fp32, 4-bit, 8-bit quantization
2. **max_position_embeddings**: Context length expansions (≥8k, ≥32k, ≥128k)
3. **num_key_value_heads**: Grouped-query attention (GQA/MQA) adoption
4. **transformers_version**: Proxy for library-era / feature availability

**Analysis**:
- Fraction of new models per month adopting each feature
- Compare across task tags (LLM vs vision vs audio) or licenses
- Identify S-curve patterns of diffusion

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Style matching main repo: set the default figure size and base font size once
plt.rcParams.update({
    'figure.figsize': (12, 6),
    'font.size': 10,
})

In [None]:
# Load config data
df_configs = pd.read_csv('data/model_configs_expanded.csv', low_memory=False)
print(f"Loaded {len(df_configs):,} models with config.json")

# Load main dataset for timestamps and metadata.
# Preferred source is the HuggingFace dataset (modelbiome/ai_ecosystem); if the
# `datasets` library cannot be imported, the local-CSV fallback below is used.
print("Loading from HuggingFace dataset (modelbiome/ai_ecosystem)...")

try:
    from datasets import load_dataset
except ImportError as e:
    # Report the actual failure: ImportError also covers a broken or partial
    # install, not only a missing module.
    print(f"Could not import 'datasets' ({type(e).__name__}: {e}); "
          "falling back to local CSV files")
    DATASETS_AVAILABLE = False
else:
    DATASETS_AVAILABLE = True

if DATASETS_AVAILABLE:
    try:
        dataset = load_dataset("modelbiome/ai_ecosystem", split="train")
        print(f"Dataset loaded with {len(dataset):,} models")
        print(f"Available columns: {dataset.column_names[:15]}...")

        n_rows = len(dataset)

        # Target column name -> names it may carry in the dataset, in priority order
        aliases = {
            'createdAt': ['createdAt', 'created_at'],
            'downloads': ['downloads'],
            'likes': ['likes'],
            'pipeline_tag': ['pipeline_tag', 'pipelineTag'],
        }
        # Columns padded with None when absent; the remaining ones are padded with 0
        none_filled = ('createdAt', 'pipeline_tag')

        # Assemble the frame one column at a time so a missing column is tolerated
        data_dict = {'modelId': dataset['model_id']}
        for target_col, candidates in aliases.items():
            source_col = next(
                (name for name in candidates if name in dataset.column_names),
                None,
            )
            if source_col is not None:
                data_dict[target_col] = dataset[source_col]
            else:
                filler = None if target_col in none_filled else 0
                data_dict[target_col] = [filler] * n_rows

        df_main = pd.DataFrame(data_dict)
        print(f"Loaded {len(df_main):,} models from HuggingFace")
        print(f"Columns in df_main: {df_main.columns.tolist()}")

    except Exception as e:
        # Any failure here (download, schema, ...) hands over to the CSV fallback
        print(f"Error loading from HuggingFace: {e}")
        import traceback
        traceback.print_exc()
        print("\nTrying alternative: load from expanded dataset if available...")
        DATASETS_AVAILABLE = False  # fallback below

if not DATASETS_AVAILABLE:
    def _minimal_metadata():
        """Return a placeholder metadata frame with one row per config modelId."""
        frame = pd.DataFrame({'modelId': df_configs['modelId']})
        frame['createdAt'] = None
        frame['downloads'] = 0
        frame['likes'] = 0
        frame['pipeline_tag'] = None
        return frame

    # Use the first local CSV export that exists; otherwise use placeholders
    try:
        import os
        alt_paths = [
            'data/ai_ecosystem_expanded.csv',
            'data/ai_ecosystem.csv',
            '../ai_ecosystem.csv'
        ]
        csv_path = next((p for p in alt_paths if os.path.exists(p)), None)

        if csv_path is None:
            # Last resort: no metadata source at all
            df_main = _minimal_metadata()
            print("Created minimal dataframe - time-based analysis will be limited")
        else:
            df_main = pd.read_csv(csv_path, low_memory=False)
            # Accept the snake_case key column as the join key
            if 'modelId' not in df_main.columns and 'model_id' in df_main.columns:
                df_main['modelId'] = df_main['model_id']
            # Pad any required column the CSV does not provide
            for col in ['createdAt', 'downloads', 'likes', 'pipeline_tag']:
                if col not in df_main.columns:
                    df_main[col] = None if col in ['createdAt', 'pipeline_tag'] else 0
            print(f"Loaded {len(df_main):,} models from {csv_path}")
    except Exception as e2:
        print(f"Error in fallback: {e2}")
        df_main = _minimal_metadata()

# Join configs with main dataset - only use columns that exist
cols_to_merge = ['modelId'] + [
    col for col in ['createdAt', 'downloads', 'likes', 'pipeline_tag']
    if col in df_main.columns
]

# Keep a single metadata row per modelId. With a left join, a repeated key on the
# right side would repeat the matching config row and inflate every per-month
# count computed from `df` afterwards.
df_meta = df_main[cols_to_merge].drop_duplicates(subset='modelId', keep='first')

# Left join: every config row is kept, metadata is NaN where no match exists
df = df_configs.merge(df_meta, on='modelId', how='left')
print(f"\nJoined dataset: {len(df):,} models with both config and metadata")
if 'createdAt' in df.columns:
    print(f"Models with createdAt: {df['createdAt'].notna().sum():,}")
else:
    print("Warning: createdAt column not available - time-based analysis will be limited")

In [None]:
# Parse createdAt timestamps
def parse_date(date_str):
    """Convert a single ``createdAt`` value to a ``pandas.Timestamp``.

    Args:
        date_str: A scalar timestamp value (typically a string), or a missing
            value such as ``None`` / ``NaN``.

    Returns:
        The parsed ``pandas.Timestamp``, or ``None`` when the value is missing
        or cannot be parsed.
    """
    if pd.isna(date_str):
        return None
    try:
        return pd.to_datetime(date_str)
    except (ValueError, TypeError, OverflowError):
        # Unparseable or out-of-range input is treated as "no date"; anything
        # else (e.g. KeyboardInterrupt) is deliberately left to propagate.
        return None

# Parse each createdAt value, then coerce the result to a real datetime column.
# Without the coercion, an all-missing createdAt (the minimal-dataframe fallback)
# stays object-dtype and the `.dt` accessor below raises AttributeError.
# utc=True also puts naive and offset-aware timestamps on one common timezone.
df['created_date'] = pd.to_datetime(
    df['createdAt'].apply(parse_date), errors='coerce', utc=True
)
# Monthly bucket (UTC calendar month); the timezone is dropped explicitly
# because Period values carry none.
df['year_month'] = df['created_date'].dt.tz_convert(None).dt.to_period('M')

# Filter to models with valid dates
df_dated = df[df['created_date'].notna()].copy()
print(f"Models with valid dates: {len(df_dated):,}")
print(f"Date range: {df_dated['created_date'].min()} to {df_dated['created_date'].max()}")

## 1. Torch Dtype Adoption Over Time

In [None]:
# Categorize torch_dtype
def categorize_dtype(dtype_str):
    """Map a raw torch_dtype config value to a coarse precision label.

    Returns 'unknown' for missing values, one of 'bf16', 'fp16', 'fp32',
    '4-bit', '8-bit' when a matching substring is found (case-insensitive),
    and 'other' for everything else.
    """
    if pd.isna(dtype_str):
        return 'unknown'

    text = str(dtype_str).lower()

    # Checked in order: 'bfloat16' contains 'float16', so bf16 has to come first.
    buckets = (
        ('bf16', ('bfloat16', 'bf16')),
        ('fp16', ('float16', 'fp16')),
        ('fp32', ('float32', 'fp32')),
        ('4-bit', ('int4', '4-bit')),
        ('8-bit', ('int8', '8-bit')),
    )
    for label, needles in buckets:
        if any(needle in text for needle in needles):
            return label
    return 'other'

import os

df_dated['dtype_category'] = df_dated['config_torch_dtype'].apply(categorize_dtype)

# Calculate monthly adoption rates: per month, the number of models and a
# {dtype label: count} dict built from value_counts()
monthly_stats = df_dated.groupby('year_month').agg({
    'modelId': 'count',
    'dtype_category': lambda x: x.value_counts().to_dict()
}).reset_index()
monthly_stats.columns = ['year_month', 'total_models', 'dtype_counts']

# Extract adoption rates for each dtype ('unknown' and 'other' are not tracked)
dtype_types = ['bf16', 'fp16', 'fp32', '4-bit', '8-bit']
for dtype in dtype_types:
    monthly_stats[f'{dtype}_count'] = monthly_stats['dtype_counts'].apply(
        lambda x: x.get(dtype, 0) if isinstance(x, dict) else 0
    )
    # Percentage of that month's models, including those with unknown dtype
    monthly_stats[f'{dtype}_rate'] = monthly_stats[f'{dtype}_count'] / monthly_stats['total_models'] * 100

# Plot S-curves
fig, ax = plt.subplots(figsize=(14, 7))

month_labels = monthly_stats['year_month'].astype(str)
for dtype in dtype_types:
    # Skip dtypes that never occur so the legend only lists real series
    if monthly_stats[f'{dtype}_count'].sum() > 0:
        ax.plot(month_labels, monthly_stats[f'{dtype}_rate'],
                marker='o', markersize=3, label=f'{dtype.upper()}', linewidth=2, alpha=0.8)

ax.set_xlabel('Month')
ax.set_ylabel('Adoption Rate (% of new models)')
ax.set_title('Torch Dtype Adoption Over Time (S-Curve)', fontsize=14)
ax.legend(loc='best')
ax.grid(True, alpha=0.3)
plt.xticks(rotation=45, ha='right')

plt.tight_layout()
# savefig does not create missing folders; make sure the output directory
# exists before the first figure is written.
os.makedirs('figures', exist_ok=True)
plt.savefig('figures/scurve_dtype_adoption.png', dpi=300, bbox_inches='tight')
plt.show()

## 2. Context Length Expansion Over Time

In [None]:
# Categorize context lengths; non-numeric values become NaN and therefore
# fail every >= comparison below (counted as "not adopted")
df_dated['max_pos'] = pd.to_numeric(df_dated['config_max_position_embeddings'], errors='coerce')

# Column suffix -> minimum max_position_embeddings for that tier
ctx_thresholds = {'8k': 8192, '32k': 32768, '128k': 131072}
for suffix, min_positions in ctx_thresholds.items():
    df_dated[f'ctx_{suffix}'] = (df_dated['max_pos'] >= min_positions).astype(int)

# Monthly adoption rates
ctx_monthly = df_dated.groupby('year_month').agg({
    'modelId': 'count',
    'ctx_8k': 'sum',
    'ctx_32k': 'sum',
    'ctx_128k': 'sum'
}).reset_index()
ctx_monthly.columns = ['year_month', 'total', 'count_8k', 'count_32k', 'count_128k']

for suffix in ctx_thresholds:
    ctx_monthly[f'rate_{suffix}'] = ctx_monthly[f'count_{suffix}'] / ctx_monthly['total'] * 100

# Plot
fig, ax = plt.subplots(figsize=(14, 7))

ctx_month_labels = ctx_monthly['year_month'].astype(str)
# Legend labels use a real "≥" sign (the previous text was mis-encoded)
ax.plot(ctx_month_labels, ctx_monthly['rate_8k'],
        marker='o', markersize=3, label='≥8K context', linewidth=2, alpha=0.8)
ax.plot(ctx_month_labels, ctx_monthly['rate_32k'],
        marker='s', markersize=3, label='≥32K context', linewidth=2, alpha=0.8)
ax.plot(ctx_month_labels, ctx_monthly['rate_128k'],
        marker='^', markersize=3, label='≥128K context', linewidth=2, alpha=0.8)

ax.set_xlabel('Month')
ax.set_ylabel('Adoption Rate (% of new models)')
ax.set_title('Context Length Expansion Over Time', fontsize=14)
ax.legend(loc='best')
ax.grid(True, alpha=0.3)
plt.xticks(rotation=45, ha='right')

plt.tight_layout()
plt.savefig('figures/scurve_context_length.png', dpi=300, bbox_inches='tight')
plt.show()

## 3. Grouped-Query Attention (GQA) Adoption

In [None]:
# Detect GQA/MQA: fewer key/value heads than attention heads
df_dated['num_heads'] = pd.to_numeric(df_dated['config_num_attention_heads'], errors='coerce')
df_dated['num_kv_heads'] = pd.to_numeric(df_dated['config_num_key_value_heads'], errors='coerce')

# Flag a model only when both head counts are present and kv heads < attention heads
attn_heads = df_dated['num_heads']
kv_heads = df_dated['num_kv_heads']
both_present = kv_heads.notna() & attn_heads.notna()
df_dated['uses_gqa'] = (both_present & (kv_heads < attn_heads)).astype(int)

# Monthly adoption: models per month and how many of them are flagged
gqa_monthly = (
    df_dated.groupby('year_month')
    .agg({'modelId': 'count', 'uses_gqa': 'sum'})
    .reset_index()
    .rename(columns={'modelId': 'total', 'uses_gqa': 'gqa_count'})
)
gqa_monthly['gqa_rate'] = gqa_monthly['gqa_count'] / gqa_monthly['total'] * 100

# Plot
fig, ax = plt.subplots(figsize=(14, 7))

gqa_month_labels = gqa_monthly['year_month'].astype(str)
ax.plot(
    gqa_month_labels,
    gqa_monthly['gqa_rate'],
    marker='o',
    markersize=4,
    label='GQA/MQA',
    linewidth=2.5,
    color='#2E86AB',
)

ax.set_xlabel('Month')
ax.set_ylabel('Adoption Rate (% of new models)')
ax.set_title('Grouped-Query Attention (GQA) Adoption Over Time', fontsize=14)
ax.legend(loc='best')
ax.grid(True, alpha=0.3)
plt.xticks(rotation=45, ha='right')

plt.tight_layout()
plt.savefig('figures/scurve_gqa_adoption.png', dpi=300, bbox_inches='tight')
plt.show()

## 4. Transformers Version Evolution

In [None]:
# Extract major.minor version
def extract_version(version_str):
    """Reduce a version string to its ``"major.minor"`` prefix.

    Args:
        version_str: A version value such as ``"4.36.2"``; may be missing
            (``None`` / ``NaN``).

    Returns:
        The first two dot-separated components joined by ``"."``, or ``None``
        when the value is missing or has fewer than two components.
    """
    if pd.isna(version_str):
        return None
    # str() + split() have no expected failure mode here, so no blanket
    # try/except: a real error should surface instead of being swallowed.
    parts = str(version_str).split('.')
    if len(parts) < 2:
        return None
    return f"{parts[0]}.{parts[1]}"

df_dated['transformers_major_minor'] = df_dated['config_transformers_version'].apply(extract_version)

# Top versions (by number of models)
top_versions = df_dated['transformers_major_minor'].value_counts().head(10).index.tolist()

# Denominator: all dated models per month, whatever their version
monthly_totals = df_dated.groupby('year_month').size()

# Monthly adoption by version.
# Reindex to every month in monthly_totals: a month with models but none on a
# top version is 0% adoption, not a missing value. Without this the division
# below yields NaN rows for those months and the plotted lines get gaps.
version_monthly = (
    df_dated[df_dated['transformers_major_minor'].isin(top_versions)]
    .groupby(['year_month', 'transformers_major_minor'])
    .size()
    .unstack(fill_value=0)
    .reindex(monthly_totals.index, fill_value=0)
)

# Calculate rates (% of that month's models)
version_monthly_pct = version_monthly.div(monthly_totals, axis=0) * 100

# Plot
fig, ax = plt.subplots(figsize=(14, 7))

for version in top_versions[:6]:  # Top 6 versions
    if version in version_monthly_pct.columns:
        ax.plot(version_monthly_pct.index.astype(str), version_monthly_pct[version],
                marker='o', markersize=3, label=f'v{version}', linewidth=2, alpha=0.7)

ax.set_xlabel('Month')
ax.set_ylabel('Adoption Rate (% of new models)')
ax.set_title('Transformers Library Version Adoption Over Time', fontsize=14)
ax.legend(loc='best', ncol=2)
ax.grid(True, alpha=0.3)
plt.xticks(rotation=45, ha='right')

plt.tight_layout()
plt.savefig('figures/scurve_transformers_version.png', dpi=300, bbox_inches='tight')
plt.show()

## 5. Comparison by Task Type

In [None]:
# GQA adoption split by pipeline_tag, one panel per task
task_types = ['text-generation', 'image-to-image', 'image-classification', 'automatic-speech-recognition']
df_task = df_dated[df_dated['pipeline_tag'].isin(task_types)].copy()

fig, axes = plt.subplots(2, 2, figsize=(16, 10))
axes = axes.flatten()

for panel, task in zip(axes, task_types):
    task_df = df_task[df_task['pipeline_tag'] == task]
    if len(task_df) == 0:
        # No models carry this tag: the panel is left blank
        continue

    task_monthly = (
        task_df.groupby('year_month')
        .agg({'modelId': 'count', 'uses_gqa': 'sum'})
        .reset_index()
    )
    # 'modelId' holds the per-month model count after aggregation
    task_monthly['gqa_rate'] = task_monthly['uses_gqa'] / task_monthly['modelId'] * 100

    panel.plot(
        task_monthly['year_month'].astype(str),
        task_monthly['gqa_rate'],
        marker='o',
        markersize=3,
        linewidth=2,
        color='#2E86AB',
    )
    panel.set_title(task.replace("-", " ").title())
    panel.set_ylabel('GQA Adoption Rate (%)')
    panel.grid(True, alpha=0.3)
    panel.tick_params(axis='x', rotation=45)

plt.suptitle('GQA Adoption by Task Type', fontsize=14, y=1.02)
plt.tight_layout()
plt.savefig('figures/scurve_gqa_by_task.png', dpi=300, bbox_inches='tight')
plt.show()

## 6. Combined S-Curve Summary

In [None]:
# Create a comprehensive S-curve plot
fig, ax = plt.subplots(figsize=(16, 8))

# Every rate is already a percentage of that month's new models, so the curves
# share one 0-100 axis as-is; no rescaling is applied here.
# Each entry: (legend label, monthly rate series, line color)
features_to_plot = [
    ('bf16', monthly_stats['bf16_rate'], '#FF6B6B'),
    ('fp16', monthly_stats['fp16_rate'], '#4ECDC4'),
    ('GQA', gqa_monthly['gqa_rate'], '#2E86AB'),
    ('≥32K context', ctx_monthly['rate_32k'], '#A23B72'),
    ('≥128K context', ctx_monthly['rate_128k'], '#F18F01')
]

for name, rates, color in features_to_plot:
    # Skip series that are empty or never rise above zero
    if len(rates) > 0 and rates.max() > 0:
        # Take the month labels from the same frame the rates came from
        if name == 'GQA':
            x_vals = gqa_monthly['year_month'].astype(str)
        elif 'context' in name:
            x_vals = ctx_monthly['year_month'].astype(str)
        else:
            x_vals = monthly_stats['year_month'].astype(str)

        ax.plot(x_vals, rates, marker='o', markersize=4, label=name,
                linewidth=2.5, color=color, alpha=0.8)

ax.set_xlabel('Month', fontsize=12)
ax.set_ylabel('Adoption Rate (% of new models)', fontsize=12)
ax.set_title('Architectural Feature Diffusion: S-Curves Over Time', fontsize=16)
ax.legend(loc='best', fontsize=11)
ax.grid(True, alpha=0.3)
plt.xticks(rotation=45, ha='right')

plt.tight_layout()
plt.savefig('figures/scurve_comprehensive.png', dpi=300, bbox_inches='tight')
plt.show()

# Text summary: highest monthly adoption rate reached by each feature
print("\n" + "="*80)
print("KEY INSIGHTS")
print("="*80)
print("\n📈 Feature Adoption Summary:")
print(f"   BF16: Peak adoption {monthly_stats['bf16_rate'].max():.1f}%")
print(f"   FP16: Peak adoption {monthly_stats['fp16_rate'].max():.1f}%")
print(f"   GQA: Peak adoption {gqa_monthly['gqa_rate'].max():.1f}%")
print(f"   ≥32K context: Peak adoption {ctx_monthly['rate_32k'].max():.1f}%")
print(f"   ≥128K context: Peak adoption {ctx_monthly['rate_128k'].max():.1f}%")