# 02 — Exploratory Data Analysis (EDA)

**Goal:** Understand data distributions, relationships, and candidate features for modeling.

**Checklist (edit as you go):**
- [x] Load cleaned dataset
- [x] Summary stats (describe)
- [x] Points distribution
- [ ] Home vs Away scoring
- [x] Correlation heatmap
- [x] Save plots to `../img/`


In [None]:
# Setup & data load
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

# Input dataset and output folder for saved figures (both relative paths).
CLEAN_PATH = Path("../data/processed/team_games_clean.csv")
IMG_DIR = Path("../img")
IMG_DIR.mkdir(parents=True, exist_ok=True)  # no-op when the folder already exists

# Parse game_date while reading so it is a datetime column from the start.
df = pd.read_csv(CLEAN_PATH, parse_dates=["game_date"])
print(f"Loaded: {CLEAN_PATH} | shape: {df.shape}")

# Last expression of the cell: the notebook renders the first three rows.
df.head(3)


## 1) Summary stats

In [None]:
# Dataset overview: size, date coverage, and distinct team / game counts.
game_dates = df['game_date']
overview = [
    "Dataset overview:",
    f"Shape: {df.shape}",
    f"Date range: {game_dates.min()} to {game_dates.max()}",
    f"Unique teams: {df['team_abbreviation'].nunique()}",
    f"Unique games: {df['game_id'].nunique()}",
    "",  # blank separator line before the statistics table
]
print("\n".join(overview))

# Descriptive statistics for the key numeric columns, rounded to 2 decimals.
key_cols = ['pts', 'fg_pct', 'reb', 'ast', 'tov', 'home', 'rest_days']
key_summary = df[key_cols].describe().round(2)
print("Summary stats for key columns:")
print(key_summary)

## 2) Points distribution


In [None]:
# 2x2 overview of the points column, drawn with the object-oriented
# Matplotlib API (one Axes handle per panel instead of pyplot's current-axes state).
fig, axes = plt.subplots(2, 2, figsize=(12, 8))
hist_ax, box_ax, split_ax, trend_ax = axes.flat
points = df['pts']

# Top-left: histogram of all points values.
hist_ax.hist(points, bins=30, alpha=0.7, color='skyblue', edgecolor='black')
hist_ax.set_title('Distribution of Team Points')
hist_ax.set_xlabel('Points')
hist_ax.set_ylabel('Frequency')
hist_ax.grid(True, alpha=0.3)

# Top-right: box plot of the same values (filled box via patch_artist).
box_ax.boxplot(points, patch_artist=True, boxprops={'facecolor': 'lightgreen'})
box_ax.set_title('Points Box Plot')
box_ax.set_ylabel('Points')
box_ax.grid(True, alpha=0.3)

# Bottom-left: side-by-side histograms split on the `home` flag (1 vs 0).
home_pts = df.loc[df['home'] == 1, 'pts']
away_pts = df.loc[df['home'] == 0, 'pts']
split_ax.hist(
    [home_pts, away_pts],
    bins=25,
    alpha=0.7,
    label=['Home', 'Away'],
    color=['lightcoral', 'lightblue'],
)
split_ax.set_title('Points: Home vs Away')
split_ax.set_xlabel('Points')
split_ax.set_ylabel('Frequency')
split_ax.legend()
split_ax.grid(True, alpha=0.3)

# Bottom-right: centered 50-row rolling mean of points, rows ordered by game_date.
df_sorted = df.sort_values('game_date')
rolling_avg = df_sorted['pts'].rolling(window=50, center=True).mean()
trend_ax.plot(df_sorted['game_date'], rolling_avg, color='red', linewidth=2)
trend_ax.set_title('Points Rolling Average (50 games)')
trend_ax.set_xlabel('Date')
trend_ax.set_ylabel('Points')
for tick_label in trend_ax.get_xticklabels():
    tick_label.set_rotation(45)  # tilt date labels so they do not overlap
trend_ax.grid(True, alpha=0.3)

# Persist the figure before showing it.
fig.tight_layout()
fig.savefig(IMG_DIR / 'points_distribution.png', dpi=150, bbox_inches='tight')
plt.show()

# Text summary of the same distribution.
home_mean = home_pts.mean()
away_mean = away_pts.mean()
print("Points distribution summary:")
print(f"Mean: {points.mean():.1f}")
print(f"Median: {points.median():.1f}")
print(f"Std dev: {points.std():.1f}")
print(f"Min: {points.min()}")
print(f"Max: {points.max()}")
print(f"Home avg: {home_mean:.1f}")
print(f"Away avg: {away_mean:.1f}")
print(f"Home advantage: {home_mean - away_mean:.1f} points")


The data shows that teams score 113.9 points on average, with home teams scoring about 1.6 points more than away teams. Scoring ranges from 73 to 168 points, and the distribution looks approximately normal (bell-shaped).

## 3) Home vs Away scoring


In [None]:
# TODO: compare home vs. away scoring (analysis not yet implemented)

## 4) Correlation heatmap


In [None]:
import numpy as np

# Numerical columns to include in the correlation analysis.
numeric_cols = [
    'pts', 'fg_pct', 'fga', 'fgm', 'fg3_pct', 'ft_pct',
    'reb', 'ast', 'tov', 'stl', 'blk',
    'home', 'rest_days',
    'opponent_pts', 'opponent_fg_pct', 'opponent_reb', 'opponent_tov',
]

# Pairwise correlations between the selected columns (pandas default method).
corr_matrix = df[numeric_cols].corr()

plt.figure(figsize=(12, 10))
# Pin the colour scale to [-1, 1] so a correlation of 0 sits at the midpoint
# of the diverging colormap regardless of the observed range.
plt.imshow(corr_matrix, cmap='coolwarm', aspect='auto', vmin=-1, vmax=1)
plt.colorbar(label='Correlation')

tick_positions = range(len(corr_matrix.columns))
plt.xticks(tick_positions, corr_matrix.columns, rotation=45, ha='right')
plt.yticks(tick_positions, corr_matrix.columns)

# Annotate every cell with its value. imshow draws matrix row i on the y-axis
# and column j on the x-axis, hence text(j, i, ...).
for (i, j), value in np.ndenumerate(corr_matrix.to_numpy()):
    plt.text(j, i, f'{value:.2f}', ha='center', va='center', fontsize=8)

plt.title('Correlation Heatmap - NBA Team Stats')
plt.tight_layout()
plt.savefig(IMG_DIR / 'correlation_heatmap.png', dpi=150, bbox_inches='tight')
plt.show()

# Rank the other columns by their correlation with points. The 'pts' row is
# dropped first: its self-correlation is always 1.00 and would otherwise take
# the top slot of the ranking without carrying any information.
pts_corr = corr_matrix['pts'].drop('pts').sort_values(ascending=False)
print("Top correlations with points:")
print(pts_corr.head(10))
print()
print("Bottom correlations with points:")
print(pts_corr.tail(5))
