In [1]:
# M Science Analysis - Part 2B: GitHub Developer Intelligence
# Real Alternative Data from GitHub API - Developer Adoption Signals
# Predicting Software Company Performance Through Developer Community Health

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import requests
import json
from datetime import datetime, timedelta
import time
import warnings
warnings.filterwarnings('ignore')

plt.style.use('seaborn-v0_8')
plt.rcParams['figure.figsize'] = (14, 8)

print("üë®‚Äçüíª GITHUB DEVELOPER INTELLIGENCE ANALYSIS")
print("=" * 50)
print("Using real GitHub API data to track developer adoption signals")
print("Alternative data approach for predicting software company performance")
print()

üë®‚Äçüíª GITHUB DEVELOPER INTELLIGENCE ANALYSIS
Using real GitHub API data to track developer adoption signals
Alternative data approach for predicting software company performance



In [2]:
# =============================================================================
# 1. GITHUB API SETUP & COMPANY MAPPING
# =============================================================================

# Charles Rogers' coverage companies mapped to their GitHub presence
COMPANY_GITHUB_MAPPING = {
    # Cloud Infrastructure & DevTools
    'MDB': {
        'name': 'MongoDB',
        'ticker': 'MDB',
        'main_repo': 'mongodb/mongo',
        'additional_repos': ['mongodb/mongoid', 'mongodb/node-mongodb-native', 'mongodb/motor'],
        'org': 'mongodb',
        'category': 'Database'
    },
    'NET': {
        'name': 'Cloudflare',
        'ticker': 'NET', 
        'main_repo': 'cloudflare/workers-sdk',
        'additional_repos': ['cloudflare/cloudflare-go', 'cloudflare/terraform-provider-cloudflare', 'cloudflare/wrangler'],
        'org': 'cloudflare',
        'category': 'CDN/Edge'
    },
    'GTLB': {
        'name': 'GitLab',
        'ticker': 'GTLB',
        'main_repo': 'gitlabhq/gitlabhq',
        'additional_repos': ['gitlab-org/gitlab-runner', 'gitlab-org/gitlab-foss'],
        'org': 'gitlabhq',
        'category': 'DevOps'
    },
    'DOCN': {
        'name': 'DigitalOcean',
        'ticker': 'DOCN',
        'main_repo': 'digitalocean/doctl',
        'additional_repos': ['digitalocean/terraform-provider-digitalocean', 'digitalocean/sample-django'],
        'org': 'digitalocean',
        'category': 'Cloud Platform'
    },
    'TEAM': {
        'name': 'Atlassian',
        'ticker': 'TEAM',
        'main_repo': 'atlassian/react-beautiful-dnd',
        'additional_repos': ['atlassian/atlaskit-mk-2', 'atlassian/design-system'],
        'org': 'atlassian',
        'category': 'DevTools'
    },
    'AKAM': {
        'name': 'Akamai',
        'ticker': 'AKAM',
        'main_repo': 'akamai/cli',
        'additional_repos': ['akamai/terraform-provider-akamai', 'akamai/boomerang'],
        'org': 'akamai',
        'category': 'CDN'
    },
    'FSLY': {
        'name': 'Fastly',
        'ticker': 'FSLY',
        'main_repo': 'fastly/cli',
        'additional_repos': ['fastly/terraform-provider-fastly', 'fastly/compute-starter-kit-javascript-default'],
        'org': 'fastly',
        'category': 'CDN/Edge'
    }
}

# GitHub API configuration
GITHUB_API_BASE = 'https://api.github.com'
# Note: For production, you'd want to add your GitHub token for higher rate limits
# GITHUB_TOKEN = 'your_token_here'  # 5000 requests/hour with token vs 60 without

def get_github_data(endpoint, params=None):
    """
    Fetch data from GitHub API with error handling and rate limiting
    """
    url = f"{GITHUB_API_BASE}/{endpoint}"
    headers = {
        'Accept': 'application/vnd.github.v3+json',
        'User-Agent': 'M-Science-Analysis'
    }
    
    # Add token if available (uncomment if you have a token)
    # if 'GITHUB_TOKEN' in globals():
    #     headers['Authorization'] = f'token {GITHUB_TOKEN}'
    
    try:
        response = requests.get(url, headers=headers, params=params)
        if response.status_code == 403:
            print(f"‚ö†Ô∏è  Rate limited. Waiting 60 seconds...")
            time.sleep(60)
            response = requests.get(url, headers=headers, params=params)
        
        response.raise_for_status()
        return response.json()
    
    except requests.exceptions.RequestException as e:
        print(f"‚ùå Error fetching {endpoint}: {e}")
        return None

print("üîß Setting up GitHub API connection...")
print("üìù Note: Using unauthenticated API (60 requests/hour limit)")
print("   For production: Add GitHub token for 5000 requests/hour")

üîß Setting up GitHub API connection...
üìù Note: Using unauthenticated API (60 requests/hour limit)
   For production: Add GitHub token for 5000 requests/hour


In [3]:
# =============================================================================
# 2. REPOSITORY INTELLIGENCE COLLECTION
# =============================================================================

def collect_repo_intelligence(repo_path):
    """
    Collect comprehensive intelligence about a GitHub repository
    """
    print(f"   üìä Analyzing {repo_path}...")
    
    # Get basic repository information
    repo_data = get_github_data(f'repos/{repo_path}')
    if not repo_data:
        return None
    
    # Get contributor statistics (limited by API rate limits)
    contributors_data = get_github_data(f'repos/{repo_path}/contributors', {'per_page': 30})
    
    # Get recent releases
    releases_data = get_github_data(f'repos/{repo_path}/releases', {'per_page': 10})
    
    # Get issue statistics
    issues_data = get_github_data(f'repos/{repo_path}/issues', {'state': 'all', 'per_page': 1})
    
    # Calculate derived metrics
    intelligence = {
        'repo': repo_path,
        'stars': repo_data.get('stargazers_count', 0),
        'forks': repo_data.get('forks_count', 0),
        'watchers': repo_data.get('watchers_count', 0),
        'open_issues': repo_data.get('open_issues_count', 0),
        'size_kb': repo_data.get('size', 0),
        'created_at': repo_data.get('created_at'),
        'updated_at': repo_data.get('updated_at'),
        'pushed_at': repo_data.get('pushed_at'),
        'language': repo_data.get('language'),
        'has_wiki': repo_data.get('has_wiki', False),
        'has_projects': repo_data.get('has_projects', False),
        'archived': repo_data.get('archived', False),
        'disabled': repo_data.get('disabled', False),
        'contributors_count': len(contributors_data) if contributors_data else 0,
        'recent_releases': len(releases_data) if releases_data else 0,
        'fork_to_star_ratio': repo_data.get('forks_count', 0) / max(repo_data.get('stargazers_count', 1), 1),
        'activity_score': 0  # Will calculate based on recent activity
    }
    
    # Calculate activity score based on recent updates
    try:
        last_push = datetime.fromisoformat(repo_data.get('pushed_at', '').replace('Z', '+00:00'))
        days_since_push = (datetime.now(last_push.tzinfo) - last_push).days
        intelligence['days_since_last_push'] = days_since_push
        intelligence['activity_score'] = max(0, 100 - days_since_push)  # Score decreases with inactivity
    except:
        intelligence['days_since_last_push'] = 999
        intelligence['activity_score'] = 0
    
    # Add a small delay to respect rate limits
    time.sleep(1)
    
    return intelligence

print("\n" + "="*60)
print("1Ô∏è‚É£  COLLECTING GITHUB REPOSITORY INTELLIGENCE")
print("="*60)

# Collect data for all companies
company_github_data = {}

for ticker, company_info in COMPANY_GITHUB_MAPPING.items():
    print(f"\nüè¢ {company_info['name']} ({ticker}) - {company_info['category']}")
    
    company_repos = []
    
    # Analyze main repository
    main_repo_data = collect_repo_intelligence(company_info['main_repo'])
    if main_repo_data:
        main_repo_data['repo_type'] = 'main'
        company_repos.append(main_repo_data)
    
    # Analyze additional repositories (limit to 2 to manage rate limits)
    for additional_repo in company_info['additional_repos'][:2]:
        additional_repo_data = collect_repo_intelligence(additional_repo)
        if additional_repo_data:
            additional_repo_data['repo_type'] = 'additional'
            company_repos.append(additional_repo_data)
    
    company_github_data[ticker] = {
        'company_info': company_info,
        'repositories': company_repos
    }

print(f"\n‚úÖ Collected data for {len(company_github_data)} companies")


1Ô∏è‚É£  COLLECTING GITHUB REPOSITORY INTELLIGENCE

üè¢ MongoDB (MDB) - Database
   üìä Analyzing mongodb/mongo...
   üìä Analyzing mongodb/mongoid...
   üìä Analyzing mongodb/node-mongodb-native...

üè¢ Cloudflare (NET) - CDN/Edge
   üìä Analyzing cloudflare/workers-sdk...
   üìä Analyzing cloudflare/cloudflare-go...
   üìä Analyzing cloudflare/terraform-provider-cloudflare...

üè¢ GitLab (GTLB) - DevOps
   üìä Analyzing gitlabhq/gitlabhq...
   üìä Analyzing gitlab-org/gitlab-runner...
‚ùå Error fetching repos/gitlab-org/gitlab-runner: 404 Client Error: Not Found for url: https://api.github.com/repos/gitlab-org/gitlab-runner
   üìä Analyzing gitlab-org/gitlab-foss...
‚ùå Error fetching repos/gitlab-org/gitlab-foss: 404 Client Error: Not Found for url: https://api.github.com/repos/gitlab-org/gitlab-foss

üè¢ DigitalOcean (DOCN) - Cloud Platform
   üìä Analyzing digitalocean/doctl...
   üìä Analyzing digitalocean/terraform-provider-digitalocean...
   üìä Analyzing digit