# NBA Over/Under Predictor - Exploratory Data Analysis

**Project**: NBA Player Over/Under Point Predictor  
**Team**: Marcelo, Isiah, Gustavo (prod-2 environment)  
**Created**: November 20, 2025

## Objectives

This notebook performs initial exploratory data analysis (EDA) on our NBA datasets to:

1. **Data Quality Assessment**: Check data completeness, missing values, and consistency
2. **Structure Understanding**: Understand the schema and relationships between datasets
3. **Target Player Analysis**: Focus on our 13 selected players and their statistics
4. **Threshold Definition**: Determine appropriate point thresholds for each player
5. **Feature Discovery**: Identify potential features for our ML models

## Selected Target Players

Our prediction system will focus on these 13 NBA players:
- Mikal Bridges
- Buddy Hield 
- Harrison Barnes
- Nikola Jokić
- James Harden
- Rudy Gobert
- Nikola Vučević
- Tobias Harris
- Devin Booker
- Karl-Anthony Towns
- Jrue Holiday
- Stephen Curry
- Kevin Durant

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import warnings
# Silence every library warning so the notebook output stays compact
warnings.filterwarnings('ignore')

# Pandas display: never truncate columns, cap printed rows at 100
for option_name, option_value in (('display.max_columns', None),
                                  ('display.max_rows', 100)):
    pd.set_option(option_name, option_value)

# Plot defaults shared by every figure in this notebook
plt.style.use('default')
sns.set_palette("husl")
plt.rcParams.update({'figure.figsize': (12, 8)})

# Confirm setup and stamp the run time
print("Libraries imported successfully!")
run_timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
print(f"Analysis performed on: {run_timestamp}")

In [None]:
# Define our target players.
# These names are compared verbatim against `firstName + ' ' + lastName` from
# PlayerStatistics.csv, so diacritics must be real Unicode characters (the
# previous mis-encoded spellings could never match any row).
# NOTE(review): assumes the dataset spells these names with diacritics -
# confirm via the exact-match / fuzzy-match cells below.
TARGET_PLAYERS = [
    'Mikal Bridges',
    'Buddy Hield',
    'Harrison Barnes',
    'Nikola Jokić',
    'James Harden',
    'Rudy Gobert',
    'Nikola Vučević',
    'Tobias Harris',
    'Devin Booker',
    'Karl-Anthony Towns',
    'Jrue Holiday',
    'Stephen Curry',
    'Kevin Durant',
]

# Echo the numbered list so the selection is visible in the notebook output
print(f"Target Players ({len(TARGET_PLAYERS)}):")
for i, player in enumerate(TARGET_PLAYERS, 1):
    print(f"{i:2d}. {player}")

## 1. Data Loading and Initial Inspection

In [None]:
# Define data file paths
data_path = '../data/raw/'

# Logical dataset name -> CSV file name inside `data_path`
files_to_load = {
    'player_stats': 'PlayerStatistics.csv',
    'games': 'Games.csv',
    'players': 'Players.csv',
    'team_stats': 'TeamStatistics.csv'
}

print("Loading datasets...")
data = {}

for name, filename in files_to_load.items():
    filepath = f"{data_path}{filename}"
    print(f"Loading {filename}...", end=' ')

    # Best-effort load: a file that is missing or unreadable is reported and
    # skipped so the remaining datasets still load. OSError covers missing
    # files / permissions; pandas' parser and empty-file errors, as well as
    # decoding errors, are ValueError subclasses.
    try:
        df = pd.read_csv(filepath)
    except (OSError, ValueError) as e:
        print(f"✗ Error: {e}")
    else:
        data[name] = df
        print(f"✓ {df.shape[0]:,} rows, {df.shape[1]} columns")

print(f"\nSuccessfully loaded {len(data)} datasets.")

# Name the datasets that failed so a later KeyError on `data[...]` is explained
failed = [name for name in files_to_load if name not in data]
if failed:
    print(f"Failed to load: {failed}")

In [None]:
# One summary line per loaded dataset: row count, column count, in-memory size
BYTES_PER_MB = 1024**2
overview_rule = "=" * 60

print("Dataset Overview:")
print(overview_rule)

for dataset_name, frame in data.items():
    n_rows, n_cols = frame.shape
    size_mb = frame.memory_usage(deep=True).sum() / BYTES_PER_MB  # deep=True also counts string contents
    print(f"{dataset_name.upper():15s} | {n_rows:8,} rows | {n_cols:2d} cols | {size_mb:6.1f} MB")

print(overview_rule)

## 2. PlayerStatistics.csv - Primary Dataset Analysis

In [None]:
# PlayerStatistics is the primary training dataset; keep a direct handle to it
player_stats = data['player_stats']

print("PlayerStatistics.csv - Detailed Analysis")
print("=" * 50)
print(f"Shape: {player_stats.shape}")
frame_mb = player_stats.memory_usage(deep=True).sum() / (1024**2)
print(f"Memory usage: {frame_mb:.1f} MB")
game_dates_raw = player_stats['gameDate']  # still in its as-loaded dtype here
print(f"Date range: {game_dates_raw.min()} to {game_dates_raw.max()}")

# Per-column report: position, name, dtype and share of missing values
print("\nColumn Information:")
print("-" * 30)
total_rows = len(player_stats)
for position, column in enumerate(player_stats.columns, start=1):
    series = player_stats[column]
    missing_share = (total_rows - series.count()) / total_rows * 100
    print(f"{position:2d}. {column:25s} | {str(series.dtype):10s} | {missing_share:5.1f}% missing")

In [None]:
# Peek at the first three rows to see what a record looks like
print("Sample PlayerStatistics records:")
preview_rows = player_stats.head(3)
display(preview_rows)

# Then list the dtype pandas inferred for every column
print("\nData types:")
column_dtypes = player_stats.dtypes
display(column_dtypes)

In [None]:
# Build a single "First Last" column so rows can be matched against TARGET_PLAYERS
first_names = player_stats['firstName'].astype(str)
last_names = player_stats['lastName'].astype(str)
player_stats['fullName'] = first_names + ' ' + last_names

# Descriptive statistics for the headline box-score columns
key_stats = [
    'points',
    'numMinutes',
    'assists',
    'reboundsTotal',
    'fieldGoalsMade',
    'fieldGoalsAttempted',
]

print("Key Statistics Summary:")
key_stats_summary = player_stats[key_stats].describe()
display(key_stats_summary)

## 3. Target Player Analysis

In [None]:
# Find our target players in the dataset by exact full-name match.
# Populates:
#   found_players   - dict: player name -> games, avg/std points, first/last game date
#   missing_players - list of target names with no matching rows
print("Finding target players in dataset...")
print("=" * 40)

found_players = {}
missing_players = []

for target_player in TARGET_PLAYERS:
    # Exact, case- and accent-sensitive comparison
    matches = player_stats[player_stats['fullName'] == target_player]

    if matches.empty:
        missing_players.append(target_player)
        print(f"✗ {target_player:20s} | NOT FOUND")
        continue

    # Compute the mean once and reuse it for both the record and the printout
    avg_points = matches['points'].mean()
    found_players[target_player] = {
        'games': len(matches),
        'avg_points': avg_points,
        'std_points': matches['points'].std(),
        'min_date': matches['gameDate'].min(),
        'max_date': matches['gameDate'].max()
    }
    print(f"✓ {target_player:20s} | {len(matches):4d} games | avg: {avg_points:5.1f} pts")

print(f"\nFound {len(found_players)}/{len(TARGET_PLAYERS)} target players")
if missing_players:
    print(f"Missing players: {missing_players}")

In [None]:
# For players without an exact match, list dataset names that contain the same
# last name. The comparison ignores case AND accents, because a diacritic
# mismatch (e.g. "Jokić" vs "Jokic") would otherwise leave the search with
# nothing to suggest.
import unicodedata


def _fold_name(text):
    """Return `text` case-folded with combining accents removed ('Jokić' -> 'jokic')."""
    decomposed = unicodedata.normalize('NFKD', text)
    return ''.join(ch for ch in decomposed if not unicodedata.combining(ch)).casefold()


if missing_players:
    print("\nSearching for missing players with fuzzy matching...")
    print("=" * 50)

    # Games per name, computed once; value_counts() is sorted most-games-first,
    # so iterating its index yields the most relevant candidates first.
    games_per_player = player_stats['fullName'].value_counts()
    folded_names = [(name, _fold_name(str(name))) for name in games_per_player.index]

    for missing_player in missing_players:
        parts = missing_player.split()
        if len(parts) < 2:
            # A single-token name has no separate last name to search for
            continue

        last_name = _fold_name(parts[-1])

        # Candidates whose (folded) full name contains the (folded) last name
        possible_matches = [name for name, folded in folded_names if last_name in folded]

        print(f"\nSearching for '{missing_player}':")
        if possible_matches:
            print("Possible matches:")
            for match in possible_matches[:5]:  # Top 5 by games played
                match_points = player_stats.loc[player_stats['fullName'] == match, 'points']
                count = len(match_points)
                avg_pts = match_points.mean()
                print(f"  - {match:25s} ({count:3d} games, {avg_pts:5.1f} avg pts)")
        else:
            print("  No potential matches found")

In [None]:
# Restrict to rows of the matched players; .copy() gives an independent frame
# so columns added to it later do not touch player_stats
is_target_row = player_stats['fullName'].isin(found_players.keys())
target_player_data = player_stats[is_target_row].copy()

n_target_games = len(target_player_data)
print(f"Target Player Dataset: {n_target_games:,} games from {len(found_players)} players")
target_dates = target_player_data['gameDate']
print(f"Date range: {target_dates.min()} to {target_dates.max()}")

# Per-player scoring summary, average minutes and first/last game date
print("\nTarget Player Summary Statistics:")
summary_aggregations = {
    'points': ['count', 'mean', 'std', 'min', 'max'],
    'numMinutes': 'mean',
    'gameDate': ['min', 'max'],
}
per_player = target_player_data.groupby('fullName')
summary_stats = per_player.agg(summary_aggregations).round(2)

display(summary_stats)

## 4. Data Quality Assessment

In [None]:
# Check for missing values in key columns
print("Missing Values Analysis - Target Players")
print("=" * 45)

# 'reboundsTotal' is the spelling used by the other cells; the previous
# 'rebounds Total' (with a space) was silently dropped by the existence filter.
key_columns = [
    'points', 'numMinutes', 'assists', 'reboundsTotal',
    'fieldGoalsMade', 'fieldGoalsAttempted', 'threePointersMade', 'threePointersAttempted',
    'freeThrowsMade', 'freeThrowsAttempted', 'home', 'win'
]

# Filter to columns that actually exist, and say which ones were skipped so a
# misspelled or renamed column does not go unnoticed
existing_key_columns = [col for col in key_columns if col in target_player_data.columns]
skipped_key_columns = [col for col in key_columns if col not in target_player_data.columns]
if skipped_key_columns:
    print(f"Columns not present in dataset (skipped): {skipped_key_columns}")

# Null count per column in one vectorized pass
missing_counts = target_player_data[existing_key_columns].isnull().sum()
row_count = max(len(target_player_data), 1)  # avoid dividing by zero on an empty frame

# Building from a dict keeps the three columns present even when no key column
# exists, so sort_values() cannot fail on a missing 'missing_pct'
missing_df = pd.DataFrame({
    'column': missing_counts.index,
    'missing_count': missing_counts.values,
    'missing_pct': missing_counts.values / row_count * 100,
}).sort_values('missing_pct', ascending=False)
display(missing_df)

In [None]:
# Check for data anomalies
print("Data Quality Checks - Target Players")
print("=" * 40)

total_games = len(target_player_data)


def _pct_of_games(count):
    """Return `count` as a percentage of all target-player rows (0.0 if there are none)."""
    return count / total_games * 100 if total_games else 0.0


# Games with 0 minutes played
zero_minutes = target_player_data[target_player_data['numMinutes'] == 0]
print(f"Games with 0 minutes played: {len(zero_minutes):,} ({_pct_of_games(len(zero_minutes)):.1f}%)")

# Games with 0 points
zero_points = target_player_data[target_player_data['points'] == 0]
print(f"Games with 0 points: {len(zero_points):,} ({_pct_of_games(len(zero_points)):.1f}%)")

# Games with negative values (shouldn't exist)
negative_points = target_player_data[target_player_data['points'] < 0]
negative_minutes = target_player_data[target_player_data['numMinutes'] < 0]
print(f"Games with negative points: {len(negative_points)}")
print(f"Games with negative minutes: {len(negative_minutes)}")

# Very high scoring games (potential outliers).
# ">=" so that a game with exactly 50 points is counted, as the "50+" label promises.
high_scoring = target_player_data[target_player_data['points'] >= 50]
print(f"Games with 50+ points: {len(high_scoring)}")

if len(high_scoring) > 0:
    print("\nHigh-scoring games:")
    display(high_scoring[['fullName', 'gameDate', 'points', 'numMinutes']].head(10))

## 5. Points Distribution Analysis

In [None]:
# Four-panel overview of scoring for the target players
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

points = target_player_data['points']
mean_points = points.mean()

# 1. Overall distribution with the mean marked
axes[0,0].hist(points, bins=50, alpha=0.7, edgecolor='black')
axes[0,0].set_title('Overall Points Distribution - All Target Players')
axes[0,0].set_xlabel('Points')
axes[0,0].set_ylabel('Frequency')
axes[0,0].axvline(mean_points, color='red', linestyle='--',
                  label=f'Mean: {mean_points:.1f}')
axes[0,0].legend()

# 2. Box plot by player
target_player_data.boxplot(column='points', by='fullName', ax=axes[0,1], rot=45)
axes[0,1].set_title('Points Distribution by Player')
axes[0,1].set_xlabel('Player')
# boxplot(by=...) adds its own "Boxplot grouped by ..." figure title; clear it
fig.suptitle('')

# 3. Points vs Minutes scatter
axes[1,0].scatter(target_player_data['numMinutes'], points, alpha=0.5)
axes[1,0].set_title('Points vs Minutes Played')
axes[1,0].set_xlabel('Minutes Played')
axes[1,0].set_ylabel('Points')

# 4. Games by points ranges.
# right=False makes the bins [0,10), [10,20), ... so they agree with the labels:
# a 10-point game lands in '10-19' and a 0-point game in '0-9' (with the default
# right-closed bins, 10 fell into '0-9' and 0 was dropped). The last bin is
# open-ended so no score is excluded from '40+'.
point_bins = [0, 10, 20, 30, 40, np.inf]
point_labels = ['0-9', '10-19', '20-29', '30-39', '40+']
point_ranges = pd.cut(points, bins=point_bins, labels=point_labels, right=False)
# sort_index() orders the bars by range rather than by count
point_ranges.value_counts().sort_index().plot(kind='bar', ax=axes[1,1])
axes[1,1].set_title('Games by Point Ranges')
axes[1,1].set_xlabel('Point Range')
axes[1,1].set_ylabel('Number of Games')
axes[1,1].tick_params(axis='x', rotation=0)

plt.tight_layout()
plt.show()

## 6. Individual Player Analysis & Threshold Definition

In [None]:
# Per-player scoring profile used to propose an over/under line for each player
print("Individual Player Analysis for Threshold Setting")
print("=" * 55)

# Rows where the player actually took the floor (more than 0 minutes)
played_mask = target_player_data['numMinutes'] > 0

player_thresholds = []

for player_name in found_players:
    is_player = target_player_data['fullName'] == player_name
    scores = target_player_data.loc[is_player & played_mask, 'points']

    # Nothing to summarise if the player never logged minutes
    n_games = len(scores)
    if n_games == 0:
        continue

    # The proposed line is the player's mean score over the games they played
    mean_score = scores.mean()
    n_over = (scores > mean_score).sum()
    n_under = (scores <= mean_score).sum()

    player_thresholds.append({
        'player': player_name,
        'games_played': n_games,
        'avg_points': mean_score,
        'median_points': scores.median(),
        'std_points': scores.std(),
        'q25': scores.quantile(0.25),
        'q75': scores.quantile(0.75),
        'proposed_threshold': mean_score,
        'over_games': n_over,
        'under_games': n_under,
        'over_percentage': n_over / n_games * 100
    })

# Tabulate, round for display and rank by scoring average
threshold_df = (
    pd.DataFrame(player_thresholds)
    .round(2)
    .sort_values('avg_points', ascending=False)
)

print("Player Statistics and Proposed Thresholds:")
threshold_view_columns = ['player', 'games_played', 'avg_points', 'std_points',
                          'proposed_threshold', 'over_percentage']
display(threshold_df[threshold_view_columns])

In [None]:
# One histogram per found player on a 3x5 grid, with the mean-based line drawn in
fig, axes = plt.subplots(3, 5, figsize=(20, 12))
axes = axes.flatten()

minutes_played = target_player_data['numMinutes'] > 0

# zip() stops at the shorter input, so players beyond the grid size are not drawn
for panel, player_name in zip(axes, found_players):
    belongs_to_player = target_player_data['fullName'] == player_name
    player_points = target_player_data[belongs_to_player & minutes_played]['points']

    # Score distribution
    panel.hist(player_points, bins=15, alpha=0.7, edgecolor='black')

    # Proposed threshold = mean of the plotted scores
    line_value = player_points.mean()
    panel.axvline(line_value, color='red', linestyle='--', linewidth=2,
                  label=f'Threshold: {line_value:.1f}')

    panel.set_title(f'{player_name}\n({len(player_points)} games)', fontsize=10)
    panel.set_xlabel('Points')
    panel.set_ylabel('Frequency')
    panel.legend(fontsize=8)

# Blank out grid cells that have no player
for spare_panel in axes[len(found_players):]:
    spare_panel.set_visible(False)

plt.tight_layout()
plt.suptitle('Individual Player Points Distributions and Proposed Thresholds',
             fontsize=16, y=1.02)
plt.show()

## 7. Temporal Analysis

In [None]:
# Parse gameDate into datetimes and derive calendar year / month columns
parsed_dates = pd.to_datetime(target_player_data['gameDate'])
target_player_data['gameDate'] = parsed_dates
target_player_data['year'] = target_player_data['gameDate'].dt.year
target_player_data['month'] = target_player_data['gameDate'].dt.month

# Text summary: number of rows per calendar year
print("Games by Year - Target Players")
print("=" * 35)
year_counts = target_player_data['year'].value_counts().sort_index()
for year, count in year_counts.items():
    print(f"{year}: {count:,} games")

# Four-panel figure of temporal trends
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
(ax_games_by_year, ax_points_by_year), (ax_games_by_month, ax_recent) = axes

# Panel 1: games per year
year_counts.plot(kind='bar', ax=ax_games_by_year)
ax_games_by_year.set_title('Games by Year')
ax_games_by_year.set_xlabel('Year')
ax_games_by_year.set_ylabel('Number of Games')
ax_games_by_year.tick_params(axis='x', rotation=45)

# Panel 2: mean points per year
yearly_avg = target_player_data.groupby('year')['points'].mean()
yearly_avg.plot(kind='line', marker='o', ax=ax_points_by_year)
ax_points_by_year.set_title('Average Points by Year')
ax_points_by_year.set_xlabel('Year')
ax_points_by_year.set_ylabel('Average Points')

# Panel 3: games per calendar month (seasonality)
month_counts = target_player_data['month'].value_counts().sort_index()
month_counts.plot(kind='bar', ax=ax_games_by_month)
ax_games_by_month.set_title('Games by Month')
ax_games_by_month.set_xlabel('Month')
ax_games_by_month.set_ylabel('Number of Games')

# Panel 4: availability of recent data (2023 onwards), only drawn if any exists
is_recent = target_player_data['year'] >= 2023
recent_data = target_player_data[is_recent]
if len(recent_data) > 0:
    recent_monthly = recent_data.groupby(['year', 'month']).size()
    recent_monthly.plot(kind='bar', ax=ax_recent)
    ax_recent.set_title('Recent Games (2023+) by Year-Month')
    ax_recent.set_xlabel('Year-Month')
    ax_recent.set_ylabel('Number of Games')
    ax_recent.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

## 8. Feature Exploration

In [None]:
# Rank candidate numeric features by their correlation with points scored
print("Feature Exploration - Correlations with Points")
print("=" * 50)

# Box-score columns that could serve as model inputs
feature_columns = [
    'numMinutes', 'assists', 'reboundsTotal', 'fieldGoalsMade', 'fieldGoalsAttempted',
    'threePointersMade', 'threePointersAttempted', 'freeThrowsMade', 'freeThrowsAttempted',
    'reboundsDefensive', 'reboundsOffensive', 'steals', 'blocks', 'turnovers', 'foulsPersonal'
]

# Keep only the candidates present in this dataset
available_columns = target_player_data.columns
existing_features = [col for col in feature_columns if col in available_columns]

# Correlation of every feature with points, strongest first; the trivial
# points-vs-points entry is dropped
corr_matrix = target_player_data[existing_features + ['points']].corr()
correlations = corr_matrix['points'].sort_values(ascending=False)
correlations = correlations.drop('points')

print("Top correlations with points:")
for feature_name, coefficient in correlations.items():
    print(f"{feature_name:25s}: {coefficient:6.3f}")

# Horizontal bar chart of the same ranking
plt.figure(figsize=(10, 8))
correlations.plot(kind='barh')
plt.title('Feature Correlations with Points Scored')
plt.xlabel('Correlation Coefficient')
plt.tight_layout()
plt.show()

In [None]:
# Compare scoring across categorical splits; each split is skipped if its column is absent
print("Categorical Feature Analysis")
print("=" * 35)


def _points_summary_by(column):
    """Return count / mean / std of points for each value of `column`."""
    return target_player_data.groupby(column)['points'].agg(['count', 'mean', 'std'])


present_columns = target_player_data.columns

# Home vs Away
if 'home' in present_columns:
    home_stats = _points_summary_by('home')
    print("Home vs Away Performance:")
    print(home_stats.round(2))
    print()

# Win vs Loss
if 'win' in present_columns:
    win_stats = _points_summary_by('win')
    print("Win vs Loss Performance:")
    print(win_stats.round(2))
    print()

# Game type, when the dataset carries it
if 'gameType' in present_columns:
    game_type_stats = _points_summary_by('gameType')
    print("Performance by Game Type:")
    print(game_type_stats.round(2))

## 9. Data Quality Summary and Next Steps

In [None]:
# Print the headline findings, recommendations and next steps
print("DATA QUALITY SUMMARY")
print("=" * 40)

# Parse dates locally so the date range prints even if the temporal-analysis
# cell (which converts gameDate in place) has not been run; a no-op if it has.
summary_dates = pd.to_datetime(target_player_data['gameDate'])
# Guard the per-player average against "no target players found"
avg_games_per_player = len(target_player_data) / len(found_players) if found_players else 0
# Guard the completeness ratio against an empty frame
points_completeness = (
    target_player_data['points'].count() / len(target_player_data) * 100
    if len(target_player_data) else 0.0
)

print(f"✓ Successfully found {len(found_players)}/{len(TARGET_PLAYERS)} target players")
print(f"✓ Total games for analysis: {len(target_player_data):,}")
if len(summary_dates):
    print(f"✓ Date range: {summary_dates.min().strftime('%Y-%m-%d')} to {summary_dates.max().strftime('%Y-%m-%d')}")
else:
    print("✓ Date range: n/a (no games)")
print(f"✓ Average games per player: {avg_games_per_player:.0f}")
print(f"✓ Data completeness: {points_completeness:.1f}% (points)")

points = target_player_data['points']
no_minutes = target_player_data['numMinutes'] == 0

print("\nKEY FINDINGS:")
print("=" * 15)
print(f"• Points range: {points.min():.0f} - {points.max():.0f}")
print(f"• Average points: {points.mean():.1f} ± {points.std():.1f}")
print(f"• Games with 0 minutes: {no_minutes.sum():,} ({no_minutes.mean()*100:.1f}%)")
print(f"• High scoring games (40+ pts): {(points >= 40).sum()}")

print("\nRECOMMENDATIONS:")
print("=" * 20)
print("1. Filter out games with 0 minutes played (DNP - Did Not Play)")
print("2. Use season averages as initial thresholds for over/under predictions")
print("3. Focus on games from 2020+ for model training (more recent patterns)")
print("4. Include minutes played, FG attempts, and assists as key features")
print("5. Consider home/away and win/loss as contextual features")

if missing_players:
    print(f"\n⚠️  Missing players to investigate: {missing_players}")
    print("   Recommendation: Manual name matching or alternative player selection")

print("\nNEXT STEPS:")
print("=" * 15)
print("1. Create player thresholds CSV file")
print("2. Clean and filter the dataset")
print("3. Engineer features (rolling averages, rest days)")
print("4. Create train/test splits")
print("5. Implement baseline models")

In [None]:
# Save player thresholds and the found-player list to CSV for the next phase
import os

output_path = '../data/processed/'
os.makedirs(output_path, exist_ok=True)  # create the folder on first run

# Save thresholds
thresholds_file = os.path.join(output_path, 'player_thresholds.csv')
threshold_df.to_csv(thresholds_file, index=False)
print(f"✓ Saved player thresholds to {thresholds_file}")

# Save found players list (name plus number of games available)
found_players_df = pd.DataFrame([
    {'player_name': name, 'games_available': info['games']}
    for name, info in found_players.items()
])
found_players_file = os.path.join(output_path, 'found_target_players.csv')
found_players_df.to_csv(found_players_file, index=False)
print(f"✓ Saved found players list to {found_players_file}")

print("\n📊 EDA Complete! Ready for data cleaning and feature engineering phase.")