# 02 - Batch Data Collection

Efficiently collect data for multiple Stadium project candidates.

**Goals:**
1. Load Stadium candidates from `data/stadium_candidates.md`
2. Quick verification of maintainer counts
3. Batch collection with rate limit management
4. Progress tracking and error handling

## Setup

In [None]:
import os
import sys
import json
import time
from pathlib import Path
from datetime import datetime

import pandas as pd
from dotenv import load_dotenv

# Add src to path
sys.path.insert(0, '../src')
from collection.github_collector import GitHubCollector

# Load environment from .env file.
# Prefer the project-level .env one directory above the notebook; otherwise
# fall back to python-dotenv's default lookup.
env_path = Path("../.env")
if env_path.exists():
    load_dotenv(env_path)
    print(f"✅ Loaded .env from {env_path.resolve()}")
else:
    load_dotenv()
    print("⚠️  No .env file found, trying default locations")

GITHUB_TOKEN = os.getenv('GITHUB_TOKEN')

# Fail fast with setup instructions instead of hitting the API unauthenticated.
if not GITHUB_TOKEN:
    raise ValueError(
        "GITHUB_TOKEN not found!\n"
        "1. Copy .env.example to .env: cp ../.env.example ../.env\n"
        "2. Edit .env and add your GitHub token\n"
        "3. Get a token at: https://github.com/settings/tokens"
    )

# The value shipped in .env.example is a placeholder, not a usable token.
if GITHUB_TOKEN == "your_github_token_here":
    raise ValueError(
        "GITHUB_TOKEN is still the placeholder!\n"
        "Edit ../.env and replace with your actual token"
    )

# Initialize collector
collector = GitHubCollector(token=GITHUB_TOKEN)

print("✅ Setup complete!")
# Show the remaining API budget up front so the batch size can be planned.
rate = collector.get_rate_limit()
print(f"   Rate limit: {rate['core']['remaining']}/{rate['core']['limit']}")

## 1. Define Stadium Candidates

Based on research criteria:
- High usage/downloads
- Few maintainers (≤3 ideal, or high dominance)
- Critical infrastructure packages

In [None]:
# Pull the shared candidate lists from the data directory
import sys
sys.path.insert(0, '../data')
from candidates import (
    STADIUM_ALL, STADIUM_COLLECTED, STADIUM_HIGH_PRIORITY,
    FEDERATION_CANDIDATES, CLUB_CANDIDATES, TOY_CANDIDATES,
    get_uncollected, print_status
)

# Report the collection progress of every category
print_status()

# Wrap each centralized Stadium repo in the record shape the later cells expect
all_candidates = [dict(repo=name, ecosystem="mixed") for name in STADIUM_ALL]

# Overview: total, already collected, still outstanding
print(f"\nTotal Stadium candidates: {len(all_candidates)}")
print(f"Already collected: {len(STADIUM_COLLECTED)}")
print(f"Remaining: {len(get_uncollected('stadium'))}")

## 2. Quick Verification - Check Maintainer Counts

Before full collection, quickly verify candidates meet Stadium criteria.

In [None]:
def quick_verify(repo_name: str, max_maintainers: int = 3, dominance_threshold: float = 40.0) -> dict:
    """Quick verification of Stadium criteria for a single repository.

    Fetches repository metrics, maintainer statistics and the top contributors
    through the notebook-level ``collector`` and derives a rough Stadium
    verdict from them.

    Args:
        repo_name: Repository identifier passed straight to the collector.
        max_maintainers: A repo with at most this many active maintainers
            (``active_maintainers_6mo``) counts as Stadium-likely.
        dominance_threshold: A repo whose top contributor holds strictly more
            than this percentage of contributions counts as Stadium-likely.

    Returns:
        On success, a dict with ``repo``, ``stars``, ``language``,
        ``active_maintainers``, ``top_contributor``, ``top_contributor_pct``,
        ``stadium_likely`` and ``error`` (``None``).
        On any failure, a dict with only ``repo`` and ``error`` (the exception
        text); the function never raises, so a batch loop keeps going.
    """
    try:
        # Get basic metrics
        metrics = collector.collect_repository_metrics(repo_name)
        maintainers = collector.collect_maintainer_data(repo_name)

        # Get top contributor dominance.
        # NOTE(review): the first entry is treated as the top contributor, which
        # assumes the collector returns contributors sorted by contributions
        # in descending order - confirm against GitHubCollector.
        contributors = collector.collect_contributor_data(repo_name, max_contributors=10)
        top = contributors[0] if contributors else None

        # Dominance is the top contributor's share among the fetched
        # contributors only (at most 10), not among all contributors.
        dominance = 0
        if top is not None:
            total = sum(c['contributions'] for c in contributors)
            if total > 0:
                dominance = top['contributions'] / total * 100

        # Look the maintainer count up once and reuse it for the verdict.
        active = maintainers['statistics'].get('active_maintainers_6mo', 0)

        return {
            "repo": repo_name,
            "stars": metrics.get('stargazers_count', 0),
            # `or` also covers a key that is present but null, which a plain
            # .get() default would pass through as None.
            "language": metrics.get('language') or 'Unknown',
            "active_maintainers": active,
            "top_contributor": top['login'] if top is not None else 'N/A',
            "top_contributor_pct": round(dominance, 1),
            "stadium_likely": active <= max_maintainers or dominance > dominance_threshold,
            "error": None
        }
    except Exception as e:
        # Deliberate best-effort boundary: report the failure to the caller
        # instead of aborting the whole verification run.
        return {
            "repo": repo_name,
            "error": str(e)
        }

In [None]:
# Quick verify all candidates (uses ~50-100 API calls per repo)
print("Quick verification of Stadium candidates...")
print("="*70)

verification_results = []

for position, candidate in enumerate(all_candidates, start=1):
    repo = candidate['repo']
    # Keep the cursor on the same line so the verdict is appended after the repo name
    print(f"[{position}/{len(all_candidates)}] Checking {repo}...", end=" ")

    result = quick_verify(repo)
    result['ecosystem'] = candidate['ecosystem']
    verification_results.append(result)

    # quick_verify never raises; a failure is reported through the 'error' key
    if result.get('error'):
        print(f"❌ Error: {result['error'][:50]}")
    elif result.get('stadium_likely'):
        print(f"✅ Stadium likely ({result['active_maintainers']} maintainers, {result['top_contributor_pct']}% dominance)")
    else:
        print(f"⚠️  Maybe not Stadium ({result['active_maintainers']} maintainers, {result['top_contributor_pct']}% dominance)")

    # Rate limit check every 5 repos; pause when the remaining budget runs low
    if position % 5 == 0:
        rate = collector.get_rate_limit()
        print(f"    [Rate limit: {rate['core']['remaining']}/{rate['core']['limit']}]")
        if rate['core']['remaining'] < 500:
            print("⚠️  Rate limit low, pausing...")
            time.sleep(60)

print("\n" + "="*70)
print("Verification complete!")

In [None]:
# Display verification results
display_cols = ['repo', 'ecosystem', 'stars', 'active_maintainers', 'top_contributor', 'top_contributor_pct', 'stadium_likely']

df_verify = pd.DataFrame(verification_results)

# Failed verifications only carry 'repo', 'error' and 'ecosystem', and an empty
# run produces no columns at all. Make sure every column used below exists so
# the cell does not die with a KeyError.
df_verify = df_verify.reindex(
    columns=df_verify.columns.union(display_cols + ['error'], sort=False)
)

# Filter successful verifications
df_success = df_verify[df_verify['error'].isna()].copy()

# When any row failed, 'stadium_likely' is an object column (True/False/NaN).
# `~` on object dtype inverts the Python bools bitwise (~True == -2), which
# corrupts the "Uncertain" count - cast to a real boolean column first.
df_success['stadium_likely'] = df_success['stadium_likely'].astype(bool)

print(f"\nSuccessfully verified: {len(df_success)}/{len(df_verify)}")
print(f"Stadium likely: {df_success['stadium_likely'].sum()}")
print(f"Uncertain: {(~df_success['stadium_likely']).sum()}")

# Display table (last expression of the cell is rendered by the notebook)
df_success[display_cols].sort_values('stadium_likely', ascending=False)

## 3. Select Confirmed Stadium Projects

In [None]:
# Repos not yet marked as collected in the centralized candidate lists
uncollected = get_uncollected("stadium")

# High-priority repos lead the queue; the other uncollected repos follow
# without repeating any high-priority entry.
remaining_after_priority = [name for name in uncollected if name not in STADIUM_HIGH_PRIORITY]
confirmed_stadium = STADIUM_HIGH_PRIORITY + remaining_after_priority

# Preview: list the first 10 repos, then summarise how many were left out
print(f"Confirmed Stadium projects to collect ({len(confirmed_stadium)}):")
for repo in confirmed_stadium[:10]:
    if repo in STADIUM_HIGH_PRIORITY:
        priority = "HIGH"
    else:
        priority = ""
    print(f"  - {repo} {priority}")
if len(confirmed_stadium) > 10:
    print(f"  ... and {len(confirmed_stadium) - 10} more")

## 4. Batch Collection - Full Dataset

Collect complete data for confirmed Stadium projects.

In [None]:
def collect_with_retry(repo_name: str, since_days: int = 365, max_retries: int = 3,
                       rate_limit_wait: float = 60, retry_wait: float = 5) -> dict:
    """Collect the complete dataset for a repository, retrying on failure.

    Args:
        repo_name: Repository identifier passed straight to the collector.
        since_days: Look-back window forwarded to ``collect_complete_dataset``.
        max_retries: Maximum number of collection attempts.
        rate_limit_wait: Seconds to sleep before retrying after an error whose
            message mentions "rate limit".
        retry_wait: Seconds to sleep before retrying after any other error.

    Returns:
        ``{"success": True, "data": <dataset>, "error": None}`` on success,
        otherwise ``{"success": False, "data": None, "error": <message>}``.
        The message is the text of the last exception, or
        ``"Max retries exceeded"`` when no attempt was made at all
        (``max_retries <= 0``). The function never raises.
    """
    last_error = "Max retries exceeded"
    for attempt in range(1, max_retries + 1):
        try:
            data = collector.collect_complete_dataset(repo_name, since_days=since_days)
        except Exception as e:
            # Deliberate best-effort boundary: a single failing repo must not
            # abort the whole batch, so the error is reported, not raised.
            last_error = str(e)
            if attempt == max_retries:
                # No attempts left: sleeping now would only waste time, and the
                # caller should see the real error rather than a generic one.
                break
            if "rate limit" in last_error.lower():
                print(f"      Rate limit hit, waiting {rate_limit_wait}s...")
                time.sleep(rate_limit_wait)
            else:
                print(f"      Retry {attempt}/{max_retries}...")
                time.sleep(retry_wait)
        else:
            return {"success": True, "data": data, "error": None}
    return {"success": False, "data": None, "error": last_error}

In [None]:
# Check which projects already have data
data_dir = Path("../data/raw")

# Files are written as "<owner>_<repo>_data.json" ('/' replaced by '_').
# To recover the repo name, strip only the trailing "_data" and turn only the
# FIRST underscore back into '/': repo names may themselves contain
# underscores, and replacing all of them would produce names that never match.
existing_files = {
    f.stem[:-len('_data')].replace('_', '/', 1): f
    for f in data_dir.glob("*_data.json")
}

print(f"Existing data files: {len(existing_files)}")
for repo in existing_files:
    print(f"  ✓ {repo}")

# Filter to only collect missing ones. Compare on the exact file name the
# collection step writes, so the check does not depend on reversing the name.
existing_names = {f.name for f in existing_files.values()}
to_collect = [
    repo for repo in confirmed_stadium
    if f"{repo.replace('/', '_')}_data.json" not in existing_names
]
print(f"\nNeed to collect: {len(to_collect)}")

In [None]:
# Batch collection
collection_results = []
start_time = datetime.now()

# Make sure the output directory exists before the first save, so a long
# collection run is not lost on a missing folder.
data_dir.mkdir(parents=True, exist_ok=True)

print(f"Starting batch collection for {len(to_collect)} projects...")
print("="*70)

for position, repo in enumerate(to_collect, start=1):
    print(f"\n[{position}/{len(to_collect)}] Collecting {repo}...")

    # Check rate limit before starting; pause once when the budget is low
    rate = collector.get_rate_limit()
    if rate['core']['remaining'] < 500:
        wait_time = 60
        print(f"    ⏳ Rate limit low ({rate['core']['remaining']}), waiting {wait_time}s...")
        time.sleep(wait_time)

    # Collect data (collect_with_retry reports failures instead of raising)
    result = collect_with_retry(repo, since_days=365)

    if result['success']:
        # Save to file: "<owner>_<repo>_data.json" inside data_dir
        output_path = data_dir / f"{repo.replace('/', '_')}_data.json"
        collector.save_data(result['data'], output_path)

        stars = result['data']['repository'].get('stargazers_count', 0)
        contributors = len(result['data']['contributors'])
        print(f"    ✅ Success! ({stars:,} stars, {contributors} contributors)")

        collection_results.append({
            "repo": repo,
            "success": True,
            "stars": stars,
            "contributors": contributors,
            "file": str(output_path)
        })
    else:
        # Only the first 50 characters are shown; the full text is kept below
        print(f"    ❌ Failed: {result['error'][:50]}")
        collection_results.append({
            "repo": repo,
            "success": False,
            "error": result['error']
        })

elapsed = datetime.now() - start_time
print("\n" + "="*70)
print(f"Batch collection complete! Time: {elapsed}")

In [None]:
# Summary
df_results = pd.DataFrame(collection_results)

print("\n" + "="*60)
print("COLLECTION SUMMARY")
print("="*60)

if not df_results.empty:
    success_count = df_results['success'].sum()
    print(f"Successful: {success_count}/{len(df_results)}")
    print(f"Failed: {len(df_results) - success_count}")

    # 'stars' only exists when at least one collection succeeded
    if 'stars' in df_results.columns:
        df_success = df_results[df_results['success']]
        # Failed rows leave NaN in these columns, which turns them into floats;
        # cast the totals back to int so they print as "1,234" not "1,234.0".
        total_stars = int(df_success['stars'].sum())
        total_contributors = int(df_success['contributors'].sum())
        print(f"\nTotal stars collected: {total_stars:,}")
        print(f"Total contributors: {total_contributors:,}")

# Show all collected data (files from this run and from earlier runs)
all_data_files = list(data_dir.glob("*_data.json"))
print(f"\nTotal data files: {len(all_data_files)}")
total_size = sum(f.stat().st_size for f in all_data_files) / 1024  # bytes -> KB
print(f"Total size: {total_size:.1f} KB")

## 5. Final Rate Limit Check

In [None]:
# Report the API budget left after the batch run
rate = collector.get_rate_limit()
print("\n📊 Final Rate Limit Status:")
print(f"   Core API: {rate['core']['remaining']}/{rate['core']['limit']} remaining")
print(f"   Resets at: {rate['core']['reset']}")

## Next Steps

1. Run `01_data_exploration.ipynb` to analyze collected data
2. Run `03_statistical_analysis.ipynb` for hypothesis testing
3. Add Federation/Club control projects for comparison