In [52]:
import pandas as pd
import numpy as np
import time
import os
import glob
from tqdm import tqdm
from nba_api.stats.endpoints import LeagueGameFinder
from nba_api.stats.static import teams
from datetime import datetime

In [45]:
# --- Helper Function to Get Team ID ---
def get_team_id(team_full_name):
    nba_teams = teams.get_teams()
    team = next((team for team in nba_teams if team['full_name'] == team_full_name), None)
    return team['id'] if team else None

In [46]:
# --- Define Team and Season ---
nba_teams = teams.get_teams()
team_names = [team["full_name"] for team in nba_teams]

team_ids = {}
for team_name in team_names:
    team_id = get_team_id(team_name)
    team_ids[team_name] = team_id

seasons = [
    "2024-25",
    "2023-24",
    "2022-23",
    "2021-22",
    "2020-21"
    ]

In [47]:
def get_team_games(team_id, seasons=seasons):
    lgf_dfs = []
    for season in seasons:
        # --- Retrieve Historical Game Data ---
        lgf = LeagueGameFinder(team_id_nullable=team_id, season_nullable=season)
        df_games = lgf.get_data_frames()[0]
        lgf_dfs.append(df_games)
        time.sleep(0.5)
        
    df_games = pd.concat(lgf_dfs, ignore_index=True)
    return df_games

In [48]:
data_teams = {}

for team_name, team_id in tqdm(team_ids.items(), desc="Processing Teams"):
    df_games = get_team_games(team_id)
    # Convert GAME_DATE to datetime and sort chronologically
    df_games["GAME_DATE"] = pd.to_datetime(df_games["GAME_DATE"])
    df_games = df_games.sort_values("GAME_DATE").reset_index(drop=True)
    data_teams[team_name] = df_games

Processing Teams: 100%|██████████| 30/30 [01:33<00:00,  3.12s/it]


In [49]:
# --- Feature Engineering: Pre-Game Rolling Averages and Rest Days ---
def compute_rolling_features(df, window=5):
    """
    For each game in df, compute rolling averages of key stats from the previous `window` games,
    and calculate days of rest since the previous game.
    """
    rolling_features = []
    for idx, row in df.iterrows():
        # All games before the current game
        previous_games = df.iloc[:idx]
        feature = {}
        if not previous_games.empty:
            # Use the last `window` games (or all if fewer than window)
            window_games = previous_games.tail(window)
            feature["rolling_pts"] = window_games["PTS"].mean()
            feature["rolling_reb"] = window_games["REB"].mean()
            feature["rolling_ast"] = window_games["AST"].mean()
            feature["rolling_fg_pct"] = window_games["FG_PCT"].mean()
            # Calculate rest days (difference between current and last game)
            last_game_date = previous_games["GAME_DATE"].iloc[-1]
            feature["rest_days"] = (row["GAME_DATE"] - last_game_date).days
        else:
            # For the first game, there are no previous stats
            feature["rolling_pts"] = 0
            feature["rolling_reb"] = 0
            feature["rolling_ast"] = 0
            feature["rolling_fg_pct"] = 0
            feature["rest_days"] = 0
        # Keep GAME_ID to merge features later
        feature["GAME_ID"] = row["GAME_ID"]
        rolling_features.append(feature)
    return pd.DataFrame(rolling_features)

In [50]:
for team_name, df_games in data_teams.items():
    # Compute rolling features with a 5-game window
    df_rolling = compute_rolling_features(df_games, window=5)
    
    # Merge the rolling features with the original game data
    df_model = pd.merge(df_games, df_rolling, on="GAME_ID", how="left")
    
    # Compute the union of columns from df_games and df_rolling
    expected_columns = set(df_games.columns).union(set(df_rolling.columns))
    # Get the columns of the merged DataFrame
    merged_columns = set(df_model.columns)
    # Check if they match
    print("Do we have all expected columns after merging?", expected_columns == merged_columns)
    
    # --- Pre-Game Context Features ---
    # Create a home/away indicator based on MATCHUP:
    # "vs." in the matchup usually indicates a home game; "@" indicates an away game.
    df_model["home_game"] = df_model["MATCHUP"].apply(lambda x: 1 if "vs." in x else 0)
    # Create the target variable:
    # We'll use the WL column ("W" for win, "L" for loss)
    df_model["win"] = df_model["WL"].apply(lambda x: 1 if x == "W" else 0)
    
    # --- Final DataFrame Preparation ---
    # We need to remove features that wouldn’t be available pre-game.
    # For instance, in-game performance stats (like PTS, REB, etc. for the current game) should be excluded.
    # We keep only the pre-game rolling averages, rest days, home indicator, and the target.
    features_to_keep = [
        "GAME_ID", "GAME_DATE", "home_game", "rolling_pts", "rolling_reb",
        "rolling_ast", "rolling_fg_pct", "rest_days", "win"
    ]
    df_model_final = df_model[features_to_keep].copy()
    
    print(f"Check if we have nan values:")
    nan_counts = df_model_final.isnull().sum()
    for col, count in nan_counts.items():
        if count > 0:
            print(f"{col} has {count} nan values")
        else:
            print("There are no nan values.")
    
    # Save model    
    df_model_final.to_csv(f"data/{team_name}_data.csv", index=False)
    print("Model saved!")

Do we have all expected columns after merging? True
Check if we have nan values:
There are no nan values.
There are no nan values.
There are no nan values.
There are no nan values.
There are no nan values.
There are no nan values.
There are no nan values.
There are no nan values.
There are no nan values.
Model saved!
Do we have all expected columns after merging? True
Check if we have nan values:
There are no nan values.
There are no nan values.
There are no nan values.
There are no nan values.
There are no nan values.
There are no nan values.
There are no nan values.
There are no nan values.
There are no nan values.
Model saved!
Do we have all expected columns after merging? True
Check if we have nan values:
There are no nan values.
There are no nan values.
There are no nan values.
There are no nan values.
There are no nan values.
There are no nan values.
There are no nan values.
There are no nan values.
There are no nan values.
Model saved!
Do we have all expected columns after mergi

In [55]:
# --- Step 0: Create a dictionary by reading CSV files from the directory ---
data_teams = {}
# Adjust the glob pattern if your directory structure differs.
csv_files = glob.glob(os.path.join("data", "*_data.csv"))
for file in csv_files:
    # Extract team name from the filename.
    # Assuming filenames like "Golden State Warriors Team_data.csv"
    base = os.path.basename(file)
    team_name = base.replace(" Team_data.csv", "")
    df = pd.read_csv(file)
    data_teams[team_name] = df

In [None]:
# --- (Optional) Display keys to check if the dictionary is built correctly ---
print("Teams loaded:")
print(list(data_teams.keys()))
print(len(data_teams))

Teams loaded:
['Los Angeles Clippers_data.csv', 'Phoenix Suns_data.csv', 'Portland Trail Blazers_data.csv', 'New Orleans Pelicans_data.csv', 'Dallas Mavericks_data.csv', 'Chicago Bulls_data.csv', 'Oklahoma City Thunder_data.csv', 'New York Knicks_data.csv', 'Milwaukee Bucks_data.csv', 'Sacramento Kings_data.csv', 'Atlanta Hawks_data.csv', 'Miami Heat_data.csv', 'Utah Jazz_data.csv', 'Los Angeles Lakers_data.csv', 'Memphis Grizzlies_data.csv', 'Philadelphia 76ers_data.csv', 'Minnesota Timberwolves_data.csv', 'Orlando Magic_data.csv', 'Golden State Warriors_data.csv', 'Denver Nuggets_data.csv', 'Brooklyn Nets_data.csv', 'Indiana Pacers_data.csv', 'Charlotte Hornets_data.csv', 'Toronto Raptors_data.csv', 'Houston Rockets_data.csv', 'San Antonio Spurs_data.csv', 'Cleveland Cavaliers_data.csv', 'Boston Celtics_data.csv', 'Detroit Pistons_data.csv', 'Washington Wizards_data.csv']
30


In [58]:
# --- Step 1: Combine all team DataFrames into one master DataFrame ---
df_all = pd.concat(data_teams.values(), ignore_index=True)

# --- Step 2: Split the master DataFrame into home and away games ---
df_home = df_all[df_all["home_game"] == 1].copy()
df_away = df_all[df_all["home_game"] == 0].copy()

# --- Step 3: Rename feature columns for clarity ---
# These are the columns you mentioned.
features = ["rolling_pts", "rolling_reb", "rolling_ast", "rolling_fg_pct", "rest_days", "win"]

# Rename features in the home games DataFrame (append '_home')
df_home = df_home.rename(columns={col: col + "_home" for col in features})
# Rename features in the away games DataFrame (append '_away')
df_away = df_away.rename(columns={col: col + "_away" for col in features})

# --- Step 4: Merge the home and away datasets on GAME_ID and GAME_DATE ---
df_games_merged = pd.merge(
    df_home, df_away, 
    on=["GAME_ID", "GAME_DATE"],
    how="inner"
)

In [59]:
print(f"Check if we have nan values:")
nan_counts = df_games_merged.isnull().sum()
for col, count in nan_counts.items():
    if count > 0:
        print(f"{col} has {count} nan values")
    else:
        print("There are no nan values.")

Check if we have nan values:
There are no nan values.
There are no nan values.
There are no nan values.
There are no nan values.
There are no nan values.
There are no nan values.
There are no nan values.
There are no nan values.
There are no nan values.
There are no nan values.
There are no nan values.
There are no nan values.
There are no nan values.
There are no nan values.
There are no nan values.
There are no nan values.


In [60]:
print(df_games_merged.columns)

Index(['GAME_ID', 'GAME_DATE', 'home_game_x', 'rolling_pts_home',
       'rolling_reb_home', 'rolling_ast_home', 'rolling_fg_pct_home',
       'rest_days_home', 'win_home', 'home_game_y', 'rolling_pts_away',
       'rolling_reb_away', 'rolling_ast_away', 'rolling_fg_pct_away',
       'rest_days_away', 'win_away'],
      dtype='object')


In [None]:
# Save model
df_games_merged.to_csv("data/DATA.csv", index=False)

In [66]:
csv_files = glob.glob("data/DATA.csv")
if csv_files:
    df = pd.read_csv(csv_files[0])
    print("Check if we have nan values:")
    nan_counts = df.isnull().sum()
    for col, count in nan_counts.items():
        if count > 0:
            print(f"{col} has {count} nan values")
        else:
            print(f"{col}: There are no nan values.")
else:
    print("No CSV file found.")


Check if we have nan values:
GAME_ID: There are no nan values.
GAME_DATE: There are no nan values.
home_game_x: There are no nan values.
rolling_pts_home: There are no nan values.
rolling_reb_home: There are no nan values.
rolling_ast_home: There are no nan values.
rolling_fg_pct_home: There are no nan values.
rest_days_home: There are no nan values.
win_home: There are no nan values.
home_game_y: There are no nan values.
rolling_pts_away: There are no nan values.
rolling_reb_away: There are no nan values.
rolling_ast_away: There are no nan values.
rolling_fg_pct_away: There are no nan values.
rest_days_away: There are no nan values.
win_away: There are no nan values.
