# 01 - Data Exploration & Entropy Analysis

This notebook explores collected GitHub data and calculates entropy metrics for Stadium project classification.

**Prerequisites:**
- Run `00_setup_and_test.ipynb` first to collect data
- Data files should exist in `data/raw/`

**Goals:**
1. Load and explore collected project data
2. Calculate contributor entropy (Shannon entropy)
3. Analyze dominance patterns
4. Validate Stadium classification criteria

## Setup

In [None]:
import json
import sys
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Put the sibling `src/` directory first on the import path so the project's
# `analysis` package resolves (the path is relative to the notebook's working directory).
sys.path.insert(0, '../src')

from analysis.entropy_calculation import EntropyCalculator

# Plot defaults shared by every figure in this notebook
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': [10, 6]})

print("✅ Setup complete!")

## 1. Load Collected Data

In [None]:
# Find all collected data files.
# Path.glob yields matches in arbitrary, filesystem-dependent order; sorting keeps the
# listing (and anything derived from `data_files`) reproducible from run to run.
data_dir = Path("../data/raw")
data_files = sorted(data_dir.glob("*_data.json"))

print(f"Found {len(data_files)} data file(s):")
for f in data_files:
    size_kb = f.stat().st_size / 1024  # bytes -> KB
    print(f"  - {f.name} ({size_kb:.1f} KB)")

In [None]:
# Load all project data into a dict keyed by data['repository']['full_name'].
projects = {}

for file_path in data_files:
    # Read with an explicit encoding so loading does not depend on the platform's
    # locale default. Assumes the collector wrote UTF-8 (ASCII-escaped JSON is a
    # subset of it) — confirm against the collection code.
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    repo_name = data['repository']['full_name']
    if repo_name in projects:
        # Two files describing the same repository: the later file wins, but say so
        # instead of silently dropping the earlier one.
        print(f"⚠️ Duplicate data for {repo_name}: {file_path.name} replaces the earlier file")
    projects[repo_name] = data
    print(f"Loaded: {repo_name}")

print(f"\n✅ Loaded {len(projects)} project(s)")

## 2. Explore Project Metrics

In [None]:
# Create summary DataFrame: one row of headline metrics per loaded project.
summary_data = []

for repo_name, data in projects.items():
    repo = data['repository']
    maintainers = data['maintainers']['statistics']
    pr_stats = data['pull_requests']['statistics']
    issue_stats = data['issues']['statistics']

    total_prs = pr_stats.get('total_prs', 0)

    summary_data.append({
        'repository': repo_name,
        'stars': repo.get('stargazers_count', 0),
        'forks': repo.get('forks_count', 0),
        # `.get(key, default)` only applies the default when the key is missing. If the
        # key is present with a null value (presumably what the GitHub API reports for
        # repositories with no detected language), None would leak into the table, so
        # fall back explicitly.
        'language': repo.get('language') or 'Unknown',
        'contributors': len(data['contributors']),
        'active_maintainers': maintainers.get('active_maintainers_6mo', 0),
        'commits': len(data['recent_commits']),
        'total_prs': total_prs,
        # max(..., 1) avoids dividing by zero for projects without pull requests.
        'merge_rate': pr_stats.get('merged_count', 0) / max(total_prs, 1),
        'avg_merge_time_hrs': pr_stats.get('avg_time_to_merge', 0),
        'conflict_rate': pr_stats.get('conflict_rate', 0),
        'total_issues': issue_stats.get('total_issues', 0),
        'avg_close_time_hrs': issue_stats.get('avg_time_to_close', 0),
    })

df_summary = pd.DataFrame(summary_data)
df_summary

## 3. Calculate Contributor Entropy

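**Shannon entropy** ($H = -\sum_i p_i \log_2 p_i$, where $p_i$ is contributor $i$'s share of all contributions) measures how evenly contributions are distributed across contributors: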
- **Low entropy** → Concentrated contributions (few dominant contributors) → Stadium characteristic
- **High entropy** → Distributed contributions (many equal contributors) → Federation characteristic

In [None]:
# Initialize entropy calculator
entropy_calc = EntropyCalculator()

# Calculate entropy and dominance metrics for each project.
# Projects with no contributors are skipped, so they get no row in df_entropy.
entropy_results = []

for repo_name, data in projects.items():
    contributors = data['contributors']

    if len(contributors) == 0:
        continue

    contribution_counts = [c['contributions'] for c in contributors]
    total_contributions = sum(contribution_counts)

    if total_contributions <= 0:
        # Every percentage below divides by the total; skip instead of raising
        # ZeroDivisionError on a degenerate record.
        print(f"\n{repo_name}: skipped (no recorded contributions)")
        continue

    # Calculate contributor entropy
    entropy, normalized_entropy = entropy_calc.contributor_entropy(contributors)

    # Calculate dominance metrics.
    # Rank explicitly rather than trusting the stored order: the "top" contributor must
    # be the one with the most contributions even if the list is not pre-sorted.
    # sorted() is stable, so an already-sorted list keeps its order.
    ranked = sorted(contributors, key=lambda c: c['contributions'], reverse=True)
    top_contributor = ranked[0]
    top_contributor_pct = top_contributor['contributions'] / total_contributions * 100
    top_2_contributions = sum(c['contributions'] for c in ranked[:2])

    entropy_results.append({
        'repository': repo_name,
        'entropy': entropy,
        'normalized_entropy': normalized_entropy,
        # Entropy of a perfectly uniform distribution over this many contributors (bits).
        'max_possible_entropy': np.log2(len(contributors)),
        'top_contributor': top_contributor['login'],
        'top_contributor_pct': top_contributor_pct,
        'top_2_pct': top_2_contributions / total_contributions * 100,
        'gini_coefficient': entropy_calc.gini_coefficient(contribution_counts)
    })

    print(f"\n{repo_name}:")
    print(f"  Shannon Entropy: {entropy:.3f} bits")
    print(f"  Normalized Entropy: {normalized_entropy:.3f} (0=concentrated, 1=uniform)")
    print(f"  Top Contributor: {top_contributor['login']} ({top_contributor_pct:.1f}%)")

df_entropy = pd.DataFrame(entropy_results)
df_entropy

## 4. Visualize Contribution Distribution

In [None]:
# Plot contribution distribution for each project:
# left panel = bar chart of the top contributors, right panel = cumulative share curve.
for repo_name, data in projects.items():
    # Rank explicitly instead of relying on the stored order, so the top-20 slice and
    # the highlighted bar are right even if the list is not pre-sorted.
    ranked = sorted(data['contributors'], key=lambda c: c['contributions'], reverse=True)
    contributors = ranked[:20]  # Top 20

    if len(contributors) == 0:
        continue

    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    # Bar chart of top contributors
    names = [c['login'][:15] for c in contributors]
    contributions = [c['contributions'] for c in contributors]

    ax1 = axes[0]
    # Use numeric bar positions plus tick labels: passing the truncated logins as
    # categorical keys would merge bars whenever two logins share their first 15 chars.
    positions = np.arange(len(contributors))
    bars = ax1.barh(positions, contributions[::-1], color='steelblue')
    ax1.set_yticks(positions)
    ax1.set_yticklabels(names[::-1])
    ax1.set_xlabel('Contributions')
    ax1.set_title(f'{repo_name} - Top 20 Contributors')

    # Highlight top contributor (last bar, because the order is reversed for display)
    bars[-1].set_color('coral')

    # Cumulative contribution curve (Lorenz-like), largest contributors first
    all_contributions = [c['contributions'] for c in ranked]
    cumulative = np.cumsum(all_contributions) / sum(all_contributions) * 100
    n_contributors = len(cumulative)
    contributor_rank = range(1, n_contributors + 1)

    ax2 = axes[1]
    ax2.plot(contributor_rank, cumulative, 'b-', linewidth=2, label='Actual')
    # Under perfect equality the first k of n contributors hold k/n of the total, so the
    # reference line runs through the origin (it is clipped at x=1 by the axis limits).
    ax2.plot([0, n_contributors], [0, 100], 'r--', alpha=0.5, label='Perfect equality')
    ax2.fill_between(contributor_rank, cumulative, alpha=0.3)
    ax2.set_xlabel('Number of Contributors')
    ax2.set_ylabel('Cumulative % of Contributions')
    ax2.set_title(f'{repo_name} - Contribution Concentration')
    ax2.legend()
    # Show at most the first 50 contributors; keep the axis at least one unit wide so a
    # single-contributor project does not produce identical (singular) limits.
    ax2.set_xlim(1, max(2, min(50, n_contributors)))
    ax2.set_ylim(0, 100)

    # Add annotation for key percentages
    for threshold in [50, 80, 90]:
        # First position whose cumulative share reaches the threshold (0-based index).
        idx = np.searchsorted(cumulative, threshold)
        if idx < len(cumulative):
            ax2.axhline(y=threshold, color='gray', linestyle=':', alpha=0.5)
            ax2.annotate(f'{threshold}% by {idx+1} contributors',
                        xy=(idx+1, threshold), fontsize=9)

    plt.tight_layout()
    plt.show()
    # Release the figure so memory does not grow with the number of projects.
    plt.close(fig)

## 5. Stadium Classification Analysis

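Based on our research framework, a **Stadium project** exhibits:
- Low normalized entropy (< 0.5)
- High top contributor dominance (> 40%)
- Few active maintainers (≤ 3, or high concentration despite more)

The classifier below does not use the maintainer count directly. It scores four concentration indicators — normalized entropy < 0.5, top contributor share > 40%, top-2 share > 60%, and Gini coefficient > 0.7 — and labels a project by how many are met (3–4: Stadium (Strong), 2: Stadium (Likely), 1: Hybrid/Uncertain, 0: Federation/Club).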

In [None]:
def classify_project(row):
    """Label a project by how many Stadium concentration indicators it meets.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) providing
            'normalized_entropy', 'top_contributor_pct', 'top_2_pct'
            and 'gini_coefficient'.

    Returns:
        'Stadium (Strong)' for 3-4 indicators, 'Stadium (Likely)' for 2,
        'Hybrid/Uncertain' for 1 and 'Federation/Club' for none.
    """
    # Each entry is True when the metric points towards a Stadium project.
    # All comparisons are strict, so a value exactly on a threshold does not count.
    indicators = (
        row['normalized_entropy'] < 0.5,   # contributions are concentrated
        row['top_contributor_pct'] > 40,   # one contributor dominates
        row['top_2_pct'] > 60,             # the top two dominate together
        row['gini_coefficient'] > 0.7,     # highly unequal distribution
    )
    stadium_score = sum(indicators)

    if stadium_score == 0:
        return 'Federation/Club'
    if stadium_score == 1:
        return 'Hybrid/Uncertain'
    if stadium_score == 2:
        return 'Stadium (Likely)'
    return 'Stadium (Strong)'

# Classify every project and print a per-project report of the deciding metrics.
if len(df_entropy) > 0:
    df_entropy['classification'] = df_entropy.apply(classify_project, axis=1)

    banner = "=" * 60
    print("\n" + banner)
    print("PROJECT CLASSIFICATION RESULTS")
    print(banner)

    for record in df_entropy.to_dict('records'):
        report_lines = [
            f"\n{record['repository']}:",
            f"  Classification: {record['classification']}",
            f"  Normalized Entropy: {record['normalized_entropy']:.3f}",
            f"  Top Contributor: {record['top_contributor']} ({record['top_contributor_pct']:.1f}%)",
            f"  Top 2 Contributors: {record['top_2_pct']:.1f}%",
            f"  Gini Coefficient: {record['gini_coefficient']:.3f}",
        ]
        print("\n".join(report_lines))

    print("\n" + banner)

## 6. Governance Patterns Analysis

In [None]:
# Analyze governance files presence
# Maps each output column label to the key looked up in a project's
# `governance_files` mapping; a missing key counts as "not present".
governance_keys = {
    'GOVERNANCE.md': 'GOVERNANCE.md',
    'CONTRIBUTING.md': 'CONTRIBUTING.md',
    'CODE_OF_CONDUCT.md': 'CODE_OF_CONDUCT.md',
    'SECURITY.md': 'SECURITY.md',
    'MAINTAINERS.md': 'MAINTAINERS.md',
    'CODEOWNERS': '.github/CODEOWNERS',
}

governance_data = []
for repo_name, data in projects.items():
    gov_files = data.get('governance_files', {})
    record = {'repository': repo_name}
    for label, key in governance_keys.items():
        record[label] = gov_files.get(key, False)
    governance_data.append(record)

df_governance = pd.DataFrame(governance_data)

print("Governance Files Present:")
print("─" * 60)
# Every column after 'repository' is one governance file flag.
file_columns = df_governance.columns[1:]
for _, row in df_governance.iterrows():
    print(f"\n{row['repository']}:")
    for col in file_columns:
        status = "✓" if row[col] else "✗"
        print(f"  {status} {col}")

## 7. Export Analysis Results

In [None]:
# Merge all analysis into single DataFrame and export it.
# The merge is an inner join on 'repository', so only projects present in both
# df_summary and df_entropy are exported.
if len(df_entropy) > 0 and len(df_summary) > 0:
    df_analysis = df_summary.merge(df_entropy, on='repository')

    # Save to CSV (create ../data/processed on first run)
    output_path = Path("../data/processed/analysis_results.csv")
    output_path.parent.mkdir(parents=True, exist_ok=True)
    df_analysis.to_csv(output_path, index=False)

    print(f"✅ Analysis saved to: {output_path}")

    # Display final table.
    # Jupyter only auto-renders the last *top-level* expression of a cell; an expression
    # inside this `if` body is evaluated and discarded, so render it explicitly.
    # `display` is provided by the IPython kernel without an import.
    display_cols = ['repository', 'stars', 'contributors', 'active_maintainers',
                    'normalized_entropy', 'top_contributor_pct', 'gini_coefficient',
                    'classification']
    display(df_analysis[display_cols])

## Next Steps

1. **Collect more Stadium candidates** from `data/stadium_candidates.md`
2. **Compare with Federation/Club projects** for statistical validation
3. **Run hypothesis tests** (H1-H6) once sample sizes are sufficient
4. **Temporal entropy analysis** - how entropy changes over time