# Wingspan Data Reconnaissance

This notebook investigates the sparsity and availability of wingspan data from the NBA Draft Combine API across multiple seasons.

## Objectives
1. Quantify data coverage across different seasons
2. Understand the data structure and completeness
3. Determine the best approach for integrating sparse wingspan data


In [None]:
import sys
from pathlib import Path
import pandas as pd
import json
from typing import Dict, List, Any

# Put the project root at the front of sys.path so the `src.nba_stats`
# imports below resolve.
# NOTE(review): the current working directory is taken as the project root,
# so the notebook presumably has to be launched from the repository root — confirm.
project_root = Path.cwd()
sys.path.insert(0, str(project_root))

from src.nba_stats.api.nba_stats_client import NBAStatsClient
from src.nba_stats.api.response_models import DraftCombineAnthroResponse


In [None]:
# Seasons to probe, newest first. The sample steps back roughly five years at
# a time so that coverage can be compared across eras.
seasons_to_test = [
    "2024-25",  # current season
    "2023-24",  # previous season
    "2020-21",  # pandemic-affected season
    "2015-16",  # middle of the 2010s
    "2010-11",  # start of the 2010s
    "2005-06",  # middle of the 2000s
    "2000-01",  # start of the 2000s
]

# Shared client used by the later cells to request draft combine data.
client = NBAStatsClient()

print(f"Testing {len(seasons_to_test)} seasons for wingspan data availability...")


In [None]:
# Probe each season once and record how many draft combine rows carry a
# wingspan measurement.
#
# Populates `results` with exactly one dict per season, with keys:
#   season                - the season string that was queried
#   total_players         - number of rows in the first result set
#   wingspan_available    - rows whose wingspan column is not None
#   wingspan_coverage_pct - wingspan_available / total_players * 100 (0 if no rows)
#   status                - 'success', 'no_data', or 'error: <message>'
results = []

# Position of the wingspan column, used only when the result set does not
# provide a 'WINGSPAN' header to look the position up from.
DEFAULT_WINGSPAN_INDEX = 11

for season in seasons_to_test:
    print(f"\nTesting season: {season}")

    # Start from the "nothing found" record; it is replaced below on success
    # or on error, and appended exactly once at the end of the iteration.
    record = {
        'season': season,
        'total_players': 0,
        'wingspan_available': 0,
        'wingspan_coverage_pct': 0,
        'status': 'no_data'
    }

    try:
        # Fetch data for this season
        response = client.get_draft_combine_anthro(season)

        # An empty `resultSets` list counts as "no data" instead of raising
        # IndexError on the [0] lookup.
        if response and 'resultSets' in response and response['resultSets']:
            result_set = response['resultSets'][0]
            rows = result_set.get('rowSet') or []
            headers = result_set.get('headers') or []
            row_count = len(rows)

            # Prefer locating the column by name so a changed column order
            # cannot silently count the wrong field.
            if 'WINGSPAN' in headers:
                wingspan_idx = headers.index('WINGSPAN')
            else:
                wingspan_idx = DEFAULT_WINGSPAN_INDEX

            # Count non-null wingspan values; short rows are skipped.
            wingspan_count = sum(
                1 for row in rows
                if len(row) > wingspan_idx and row[wingspan_idx] is not None
            )

            # Computed once with the zero-row guard and reused for both the
            # record and the printout, so an empty season cannot raise
            # ZeroDivisionError after its record has been built.
            coverage_pct = (wingspan_count / row_count * 100) if row_count > 0 else 0

            record = {
                'season': season,
                'total_players': row_count,
                'wingspan_available': wingspan_count,
                'wingspan_coverage_pct': coverage_pct,
                'status': 'success'
            }

            print(f"  Total players: {row_count}")
            print(f"  Wingspan available: {wingspan_count}")
            print(f"  Coverage: {coverage_pct:.1f}%")
        else:
            print("  No data available")

    except Exception as e:
        # Deliberately broad: one failing season must not stop the remaining
        # seasons from being probed. The failure is kept in `status`.
        record = {
            'season': season,
            'total_players': 0,
            'wingspan_available': 0,
            'wingspan_coverage_pct': 0,
            'status': f'error: {str(e)}'
        }
        print(f"  Error: {e}")

    results.append(record)

print(f"\nCompleted testing {len(seasons_to_test)} seasons")


In [None]:
# Summarise the per-season records collected in `results`.
# The explicit column list keeps the frame well-formed, and the column
# lookups below valid, even when `results` is empty.
df_results = pd.DataFrame(
    results,
    columns=[
        'season',
        'total_players',
        'wingspan_available',
        'wingspan_coverage_pct',
        'status',
    ],
)

print("=== WINGSPAN DATA RECONNAISSANCE SUMMARY ===")
print(df_results.to_string(index=False))

# Aggregates are computed once and reused by the printouts below.
successful_seasons = df_results[df_results['status'] == 'success']
total_players = df_results['total_players'].sum()
total_wingspans = df_results['wingspan_available'].sum()

# Guard the division: when every season failed or returned no rows the total
# is 0, and the unguarded division would print "nan%". Falling back to 0
# mirrors how the per-season coverage is recorded for empty seasons.
overall_coverage = (total_wingspans / total_players * 100) if total_players > 0 else 0.0

print("\n=== KEY FINDINGS ===")
print(f"Seasons with data: {len(successful_seasons)}")
print(f"Total players across all seasons: {total_players}")
print(f"Total wingspan records: {total_wingspans}")
print(f"Overall coverage: {overall_coverage:.1f}%")

# Average of the per-season percentages (unweighted), over seasons whose
# fetch succeeded; skipped entirely when there are none.
if len(successful_seasons) > 0:
    avg_coverage = successful_seasons['wingspan_coverage_pct'].mean()
    print(f"Average coverage (successful seasons): {avg_coverage:.1f}%")


In [None]:
# Inspect the raw response layout (headers, one sample row, per-column
# non-null counts) for the first probed season that returned players.
print("\n=== DATA STRUCTURE ANALYSIS ===")

# First season, in probe order, that succeeded with at least one row;
# None when there is no such season.
successful_season = next(
    (
        r['season']
        for r in results
        if r['status'] == 'success' and r['total_players'] > 0
    ),
    None,
)

if successful_season:
    print(f"Examining data structure for season: {successful_season}")

    # The season is fetched again because the earlier probe kept only counts.
    response = client.get_draft_combine_anthro(successful_season)

    # An empty `resultSets` list is handled here instead of raising IndexError.
    if response and 'resultSets' in response and response['resultSets']:
        result_set = response['resultSets'][0]
        headers = result_set.get('headers') or []
        rows = result_set.get('rowSet') or []
        total_count = len(rows)  # loop-invariant, so computed once
        sample_row = rows[0] if rows else []

        print(f"Headers ({len(headers)}): {headers}")
        print(f"Sample row: {sample_row}")

        # Check data quality: non-null count per column, matched by position.
        print("\nData Quality Check:")
        for i, header in enumerate(headers):
            non_null_count = sum(
                1 for row in rows
                if i < len(row) and row[i] is not None
            )
            # The re-fetch may return headers without rows; report 0.0%
            # instead of dividing by zero.
            pct = (non_null_count / total_count * 100) if total_count > 0 else 0.0
            print(f"  {header}: {non_null_count}/{total_count} ({pct:.1f}%)")
    else:
        # Previously this path printed nothing, hiding a failed re-fetch.
        print(f"  No data returned when re-fetching season {successful_season}")
else:
    print("No successful seasons found to examine structure")


## Conclusions and Recommendations

Based on this reconnaissance, we can make informed decisions about how to integrate wingspan data:

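1. **Data Sparsity**: Wingspan data is highly sparse because it exists only for draft combine attendees
2. **Coverage**: Even in seasons where the fetch succeeds, coverage is limited to combine participants; the percentages above measure non-null wingspan values among combine rows, not among all players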
3. **Design Implications**: Wingspan should be treated as an enhancement metric, not a core requirement
4. **Database Design**: A separate, nullable table is the correct approach
5. **Integration Strategy**: Defer complex imputation and make data available but optional
