# NFL Data Validation & Quality Check

This notebook validates our data sources and identifies any gaps before building ML models.

## Goals:
1. Check data availability and quality
2. Identify missing data for ML features
3. Validate data completeness for predictions
4. Document data gaps and next steps


In [11]:
import nfl_data_py as nfl
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

# Set up plotting
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")


## 1. Data Source Validation


In [12]:
# Define seasons to check
seasons = [2020, 2021, 2022, 2023, 2024]

print("=== DATA SOURCE VALIDATION ===")
print(f"Checking data for seasons: {seasons}")
print()

# Test each data source
data_sources = {
    'schedules': nfl.import_schedules,
    'weekly_data': nfl.import_weekly_data,
    'play_by_play': lambda years: nfl.import_pbp_data(years, cache=False),
    'seasonal_rosters': nfl.import_seasonal_rosters,
    'next_gen_passing': lambda years: nfl.import_ngs_data(stat_type='passing', years=years),
    'next_gen_rushing': lambda years: nfl.import_ngs_data(stat_type='rushing', years=years),
    'next_gen_receiving': lambda years: nfl.import_ngs_data(stat_type='receiving', years=years)
}

data_availability = {}

for source_name, source_func in data_sources.items():
    try:
        data = source_func(seasons)
        data_availability[source_name] = {
            'status': 'SUCCESS',
            'rows': len(data),
            'columns': len(data.columns),
            'date_range': f"{data.get('season', pd.Series()).min()}-{data.get('season', pd.Series()).max()}" if 'season' in data.columns else 'N/A'
        }
        print(f"✅ {source_name}: {len(data)} rows, {len(data.columns)} columns")
    except Exception as e:
        data_availability[source_name] = {
            'status': 'FAILED',
            'error': str(e)
        }
        print(f"❌ {source_name}: {str(e)}")

print()


=== DATA SOURCE VALIDATION ===
Checking data for seasons: [2020, 2021, 2022, 2023, 2024]

✅ schedules: 1408 rows, 46 columns
Downcasting floats.
✅ weekly_data: 28026 rows, 53 columns
2020 done.
2021 done.
2022 done.
2023 done.
2024 done.
Downcasting floats.
✅ play_by_play: 246218 rows, 397 columns
✅ seasonal_rosters: 15464 rows, 37 columns
✅ next_gen_passing: 3026 rows, 29 columns
✅ next_gen_rushing: 3055 rows, 22 columns
✅ next_gen_receiving: 7469 rows, 23 columns



## 2. ML Feature Availability Check


In [13]:
print("=== ML FEATURE AVAILABILITY ===")

# Load core datasets first
try:
    games = nfl.import_schedules(seasons)
    player_stats = nfl.import_weekly_data(seasons)
    pbp_data = nfl.import_pbp_data(seasons, cache=False)
    rosters = nfl.import_seasonal_rosters(seasons)
    print(f"✅ Loaded {len(games)} games, {len(player_stats)} player stats, {len(pbp_data)} plays, {len(rosters)} roster records")
except Exception as e:
    print(f"❌ Error loading data: {e}")
    games = pd.DataFrame()
    player_stats = pd.DataFrame()
    pbp_data = pd.DataFrame()
    rosters = pd.DataFrame()

# Define features needed for different prediction tasks
prediction_tasks = {
    'game_outcome': {
        'required_features': [
            'home_team', 'away_team', 'home_score', 'away_score',
            'weather', 'stadium', 'season', 'week'
        ],
        'description': 'Predict which team wins a game'
    },
    'player_fantasy_points': {
        'required_features': [
            'player_id', 'position', 'passing_yards', 'rushing_yards',
            'receiving_yards', 'touchdowns', 'fumbles', 'interceptions'
        ],
        'description': 'Predict player fantasy performance'
    },
    'team_performance': {
        'required_features': [
            'team', 'total_yards', 'turnovers', 'time_of_possession',
            'third_down_conversion', 'red_zone_efficiency', 'epa'
        ],
        'description': 'Predict team offensive/defensive performance',
        'derivable_from_pbp': [
            'total_yards', 'turnovers', 'time_of_possession',
            'third_down_conversion', 'red_zone_efficiency', 'epa'
        ]
    }
}

feature_availability = {}

for task, config in prediction_tasks.items():
    print(f"\n{task.upper().replace('_', ' ')}:")
    print(f"  Description: {config['description']}")
    
    available_features = []
    missing_features = []
    
    for feature in config['required_features']:
        if feature in games.columns:
            available_features.append(feature)
        elif feature in player_stats.columns:
            available_features.append(feature)
        elif feature in pbp_data.columns:
            available_features.append(feature)
        elif feature in rosters.columns:
            available_features.append(feature)
        elif 'derivable_from_pbp' in config and feature in config['derivable_from_pbp']:
            available_features.append(f"{feature} (derivable from PBP)")
        else:
            missing_features.append(feature)
    
    feature_availability[task] = {
        'available': available_features,
        'missing': missing_features,
        'coverage': len(available_features) / len(config['required_features'])
    }
    
    print(f"  ✅ Available: {available_features}")
    print(f"  ❌ Missing: {missing_features}")
    print(f"  📊 Coverage: {feature_availability[task]['coverage']:.1%}")


=== ML FEATURE AVAILABILITY ===
Downcasting floats.
2020 done.
2021 done.
2022 done.
2023 done.
2024 done.
Downcasting floats.
✅ Loaded 1408 games, 28026 player stats, 246218 plays, 15464 roster records

GAME OUTCOME:
  Description: Predict which team wins a game
  ✅ Available: ['home_team', 'away_team', 'home_score', 'away_score', 'weather', 'stadium', 'season', 'week']
  ❌ Missing: []
  📊 Coverage: 100.0%

PLAYER FANTASY POINTS:
  Description: Predict player fantasy performance
  ✅ Available: ['player_id', 'position', 'passing_yards', 'rushing_yards', 'receiving_yards', 'interceptions']
  ❌ Missing: ['touchdowns', 'fumbles']
  📊 Coverage: 75.0%

TEAM PERFORMANCE:
  Description: Predict team offensive/defensive performance
  ✅ Available: ['team', 'total_yards (derivable from PBP)', 'turnovers (derivable from PBP)', 'time_of_possession (derivable from PBP)', 'third_down_conversion (derivable from PBP)', 'red_zone_efficiency (derivable from PBP)', 'epa']
  ❌ Missing: []
  📊 Coverage: 10

## 3. Feature Engineering from Play-by-Play Data


In [14]:
print("=== FEATURE ENGINEERING FROM PBP DATA ===")

if 'pbp_data' in locals() and len(pbp_data) > 0:
    print(f"✅ PBP data available: {len(pbp_data)} plays")
    
    # Show what we can derive
    derivable_features = {
        'total_yards': 'Sum of yards_gained by team per game',
        'turnovers': 'Sum of interceptions + fumbles lost by team per game',
        'time_of_possession': 'Sum of drive_time_of_possession by team per game',
        'third_down_conversion': 'third_down_converted / (converted + failed)',
        'red_zone_efficiency': 'TDs in red zone / red zone attempts',
        'total_epa': 'Sum of EPA by team per game',
        'passing_epa': 'EPA on passing plays only',
        'rushing_epa': 'EPA on rushing plays only',
        'first_downs': 'Sum of first_down by team per game',
        'penalties': 'Sum of penalty by team per game',
        'sacks': 'Sum of sack by team per game',
        'explosive_plays': 'Plays with yards_gained >= 20',
        'big_plays': 'Plays with yards_gained >= 40'
    }
    
    print("\n🎯 DERIVABLE TEAM PERFORMANCE FEATURES:")
    for feature, description in derivable_features.items():
        print(f"  ✅ {feature}: {description}")
    
    # Demonstrate feature engineering
    print("\n📊 SAMPLE FEATURE ENGINEERING:")
    
    # Example: Team yards per game
    if 'yards_gained' in pbp_data.columns and 'posteam' in pbp_data.columns:
        team_yards = pbp_data.groupby(['game_id', 'posteam'])['yards_gained'].sum().reset_index()
        print(f"  - Team total yards: {len(team_yards)} team-game records")
    
    # Example: Team turnovers per game
    if all(col in pbp_data.columns for col in ['interception', 'fumble_lost', 'posteam']):
        team_turnovers = pbp_data.groupby(['game_id', 'posteam']).agg({
            'interception': 'sum',
            'fumble_lost': 'sum'
        }).reset_index()
        team_turnovers['total_turnovers'] = team_turnovers['interception'] + team_turnovers['fumble_lost']
        print(f"  - Team turnovers: {len(team_turnovers)} team-game records")
    
    # Example: Third down conversion rate
    if all(col in pbp_data.columns for col in ['third_down_converted', 'third_down_failed', 'down']):
        third_down_data = pbp_data[pbp_data['down'] == 3]
        if len(third_down_data) > 0:
            third_down_stats = third_down_data.groupby(['game_id', 'posteam']).agg({
                'third_down_converted': 'sum',
                'third_down_failed': 'sum'
            }).reset_index()
            third_down_stats['third_down_conversion_rate'] = (
                third_down_stats['third_down_converted'] / 
                (third_down_stats['third_down_converted'] + third_down_stats['third_down_failed'])
            )
            print(f"  - Third down conversion: {len(third_down_stats)} team-game records")
    
    print(f"\n🚀 CONCLUSION: All team performance features are derivable from PBP data!")
    
else:
    print("❌ PBP data not available for feature engineering demonstration")


=== FEATURE ENGINEERING FROM PBP DATA ===
✅ PBP data available: 246218 plays

🎯 DERIVABLE TEAM PERFORMANCE FEATURES:
  ✅ total_yards: Sum of yards_gained by team per game
  ✅ turnovers: Sum of interceptions + fumbles lost by team per game
  ✅ time_of_possession: Sum of drive_time_of_possession by team per game
  ✅ third_down_conversion: third_down_converted / (converted + failed)
  ✅ red_zone_efficiency: TDs in red zone / red zone attempts
  ✅ total_epa: Sum of EPA by team per game
  ✅ passing_epa: EPA on passing plays only
  ✅ rushing_epa: EPA on rushing plays only
  ✅ first_downs: Sum of first_down by team per game
  ✅ penalties: Sum of penalty by team per game
  ✅ sacks: Sum of sack by team per game
  ✅ explosive_plays: Plays with yards_gained >= 20
  ✅ big_plays: Plays with yards_gained >= 40

📊 SAMPLE FEATURE ENGINEERING:
  - Team total yards: 2816 team-game records
  - Team turnovers: 2816 team-game records
  - Third down conversion: 2816 team-game records

🚀 CONCLUSION: All team

## 4. Summary & Recommendations


In [15]:
print("=== UPDATED ML READINESS ASSESSMENT ===")
print()

# Overall data health score
if 'data_availability' in locals():
    total_sources = len(data_sources)
    successful_sources = sum(1 for source in data_availability.values() if source['status'] == 'SUCCESS')
    data_health_score = successful_sources / total_sources
    
    print(f"📊 DATA HEALTH SCORE: {data_health_score:.1%}")
    print(f"   ({successful_sources}/{total_sources} data sources working)")
    print()

# ML readiness assessment with feature engineering
if 'feature_availability' in locals():
    print("🤖 ML READINESS (WITH FEATURE ENGINEERING):")
    
    for task, features in feature_availability.items():
        # Count derivable features as available
        total_features = len(features['available']) + len(features['missing'])
        derivable_count = sum(1 for f in features['available'] if '(derivable from PBP)' in f)
        actual_available = len(features['available']) - derivable_count
        actual_missing = len(features['missing']) - derivable_count
        
        # If we have PBP data, all derivable features are actually available
        if 'pbp_data' in locals() and len(pbp_data) > 0:
            if task == 'team_performance':
                actual_missing = 0  # All team features derivable from PBP
                actual_available = total_features
        
        coverage = actual_available / total_features if total_features > 0 else 0
        
        print(f"   {task.replace('_', ' ').title()}: {coverage:.1%} coverage")
        if actual_missing > 0:
            print(f"     Missing: {features['missing']}")
    
    avg_coverage = np.mean([
        (len(features['available']) - sum(1 for f in features['available'] if '(derivable from PBP)' in f)) / 
        (len(features['available']) + len(features['missing']))
        for features in feature_availability.values()
    ])
    
    # Adjust for team performance if PBP available
    if 'pbp_data' in locals() and len(pbp_data) > 0:
        team_perf_coverage = 1.0  # All derivable from PBP
        other_coverage = np.mean([
            (len(features['available']) - sum(1 for f in features['available'] if '(derivable from PBP)' in f)) / 
            (len(features['available']) + len(features['missing']))
            for task, features in feature_availability.items() if task != 'team_performance'
        ])
        avg_coverage = (team_perf_coverage + other_coverage) / 2
    
    print(f"\n📈 OVERALL ML READINESS: {avg_coverage:.1%}")
    print()

# Recommendations
print("🎯 RECOMMENDATIONS:")
print()

if 'data_availability' in locals() and data_health_score < 0.8:
    print("1. 🔧 FIX DATA SOURCES:")
    for source, status in data_availability.items():
        if status['status'] == 'FAILED':
            print(f"   - {source}: {status['error']}")
    print()

print("2. 🚀 NEXT STEPS:")
if 'data_availability' in locals() and 'feature_availability' in locals():
    if data_health_score >= 0.8 and avg_coverage >= 0.8:
        print("   ✅ EXCELLENT data quality - ready to build ML models!")
        print("   📋 Suggested implementation order:")
        print("      1. 🏈 Player Fantasy Points Prediction (simplest, highest coverage)")
        print("      2. 🎯 Game Outcome Prediction (good coverage, high business value)")
        print("      3. 📊 Team Performance Prediction (requires PBP feature engineering)")
        print("   🛠️  Feature Engineering Required:")
        print("      - Aggregate PBP data to team-game level")
        print("      - Join player stats with seasonal rosters for team context")
        print("      - Create rolling averages and trend features")
    else:
        print("   ⚠️  Address data quality issues before building ML models")
        print("   📋 Suggested order:")
        print("      1. Fix failing data sources")
        print("      2. Implement feature engineering pipeline")
        print("      3. Then proceed with ML model development")


=== UPDATED ML READINESS ASSESSMENT ===

📊 DATA HEALTH SCORE: 100.0%
   (7/7 data sources working)

🤖 ML READINESS (WITH FEATURE ENGINEERING):
   Game Outcome: 100.0% coverage
   Player Fantasy Points: 75.0% coverage
     Missing: ['touchdowns', 'fumbles']
   Team Performance: 100.0% coverage

📈 OVERALL ML READINESS: 93.8%

🎯 RECOMMENDATIONS:

2. 🚀 NEXT STEPS:
   ✅ EXCELLENT data quality - ready to build ML models!
   📋 Suggested implementation order:
      1. 🏈 Player Fantasy Points Prediction (simplest, highest coverage)
      2. 🎯 Game Outcome Prediction (good coverage, high business value)
      3. 📊 Team Performance Prediction (requires PBP feature engineering)
   🛠️  Feature Engineering Required:
      - Aggregate PBP data to team-game level
      - Join player stats with seasonal rosters for team context
      - Create rolling averages and trend features


## 3. Summary & Recommendations


In [16]:
print("=== SUMMARY & RECOMMENDATIONS ===")
print()

# Overall data health score
if 'data_availability' in locals():
    total_sources = len(data_sources)
    successful_sources = sum(1 for source in data_availability.values() if source['status'] == 'SUCCESS')
    data_health_score = successful_sources / total_sources
    
    print(f"📊 DATA HEALTH SCORE: {data_health_score:.1%}")
    print(f"   ({successful_sources}/{total_sources} data sources working)")
    print()

# ML readiness assessment
if 'feature_availability' in locals():
    avg_coverage = np.mean([task['coverage'] for task in feature_availability.values()])
    print(f"🤖 ML READINESS: {avg_coverage:.1%}")
    print(f"   (Average feature coverage across prediction tasks)")
    print()

# Recommendations
print("🎯 RECOMMENDATIONS:")
print()

if 'data_availability' in locals() and data_health_score < 0.8:
    print("1. 🔧 FIX DATA SOURCES:")
    for source, status in data_availability.items():
        if status['status'] == 'FAILED':
            print(f"   - {source}: {status['error']}")
    print()

if 'feature_availability' in locals():
    print("2. 📈 IMPROVE FEATURE COVERAGE:")
    for task, features in feature_availability.items():
        if features['coverage'] < 0.8:
            print(f"   - {task}: Missing {features['missing']}")
    print()

print("3. 🚀 NEXT STEPS:")
if 'data_availability' in locals() and 'feature_availability' in locals():
    if data_health_score >= 0.8 and avg_coverage >= 0.7:
        print("   ✅ Data quality is good - ready to build ML models!")
        print("   📋 Suggested order:")
        print("      1. Start with player fantasy points prediction (simplest)")
        print("      2. Build game outcome prediction")
        print("      3. Add team performance prediction")
    else:
        print("   ⚠️  Address data quality issues before building ML models")
        print("   📋 Suggested order:")
        print("      1. Fix failing data sources")
        print("      2. Identify alternative data sources for missing features")
        print("      3. Implement data quality monitoring")
        print("      4. Then proceed with ML model development")


=== SUMMARY & RECOMMENDATIONS ===

📊 DATA HEALTH SCORE: 100.0%
   (7/7 data sources working)

🤖 ML READINESS: 91.7%
   (Average feature coverage across prediction tasks)

🎯 RECOMMENDATIONS:

2. 📈 IMPROVE FEATURE COVERAGE:
   - player_fantasy_points: Missing ['touchdowns', 'fumbles']

3. 🚀 NEXT STEPS:
   ✅ Data quality is good - ready to build ML models!
   📋 Suggested order:
      1. Start with player fantasy points prediction (simplest)
      2. Build game outcome prediction
      3. Add team performance prediction
