In [1]:
import numpy as np
import pandas as pd
import matplotlib 
import time
import json
import requests
from nba_api.stats.static import teams, players
from nba_api.stats.endpoints import boxscoreadvancedv2, TeamGameLog, TeamGameLogs, leaguegamefinder

The goal of this project is to collect the advanced stats of past nba games, calculate a rolling average of those stats, and use the resulting data on a model to predict the winner of a game.
Let's first take a look at the regular stats of a nba game

In [None]:
def get_season_games(season, season_type="Regular Season"):
    """
    Fetch all NBA games for a specific season using a single API call
    
    Parameters:
    - season: Season string (e.g., "2022-23")
    - season_type: Type of season (e.g., "Regular Season", "Playoffs")
    
    Returns:
    - DataFrame containing all games for the season
    """
    # Create the game finder object
    game_finder = leaguegamefinder.LeagueGameFinder(
        player_or_team_abbreviation = 'T',
        season_nullable=season,
        season_type_nullable=season_type,
        league_id_nullable='00'  # NBA league ID
    )
    
    # Get the games dataframe
    games_df = game_finder.get_data_frames()[0]
    
    return games_df
season_games = get_season_games('2023-24')

def combine_home_away_games(all_home_games, all_away_games):
    all_away_games = all_away_games.drop(columns=['SEASON_ID', 'MIN', 'GAME_DATE', 'MATCHUP', 'WL', 'TEAM_NAME'])
    all_away_games = all_away_games.add_prefix('AWAY_')
    all_away_games.rename(columns={'AWAY_GAME_ID':'GAME_ID'}, inplace=True)
    
    return pd.merge(all_home_games, all_away_games, on="GAME_ID")

#using teamGameLogs
def get_advanced_stats(season):
    advanced_stats = TeamGameLogs(season_nullable = season, season_type_nullable = 'Regular Season', measure_type_player_game_logs_nullable = 'Advanced')
    df = advanced_stats.get_data_frames()[0]
    df = df.loc[:, ~df.columns.str.contains('RANK')]
    return df



In [4]:
# get averages of the following stats of  a team prior to a game:
# - OFF_RATING, DEF_RATING, NET_RATING, EFG_PCT, TS_PCT, PIE
# - num_games: number of games to consider before the game
# - game_id: the game to consider
# - team_abbreviation: the team to consider
def get_rolling_averages(stats_pd, column_names, num_games=10):
    rolling_average_pd = stats_pd.loc[:, ['GAME_DATE','GAME_ID', 'TEAM_ABBREVIATION']].copy()
    for team_id, stats in stats_pd.groupby(['TEAM_ID']):
        stats = stats.sort_values(by='GAME_DATE')
        for column in column_names:
            rolling_average_pd.loc[stats.index, column] = stats[column].shift(1).rolling(window=num_games, min_periods=10).mean().round(5)
    rolling_average_pd = rolling_average_pd.drop(columns='GAME_DATE')
    return rolling_average_pd

In [5]:
feature_names = ['NET_RATING', 'TS_PCT', 'PIE', '3PT_PCT']
start_year = 2014
end_year = 2024

In [6]:
def get_all_stats(start_year, end_year, features, season_type="Regular Season"):
    all_games_df = pd.DataFrame()
    advanced_df = pd.DataFrame()
    advanced_rolling_df = pd.DataFrame()
    
    for i in range(start_year, end_year):
        time.sleep(1)
        season = str(i) + "-" + str(i+1)[2:]
        print(season)
        games_season = get_season_games(season=season, season_type=season_type)
        advanced_season = get_advanced_stats(season)

        games_season = games_season.sort_values(by=['MATCHUP', 'GAME_DATE'])
        advanced_season = advanced_season.sort_values(by=['MATCHUP', 'GAME_DATE'])
        advanced_season['3PT_PCT'] = games_season['FG3_PCT']

        rolling_season = get_rolling_averages(advanced_season, feature_names)
        all_games_df = pd.concat([all_games_df, games_season])
        advanced_df = pd.concat([advanced_df, advanced_season])
        advanced_rolling_df = pd.concat([advanced_rolling_df, rolling_season])
    all_games_df = all_games_df.sort_values(by='GAME_DATE')
    advanced_df = advanced_df.sort_values(by=['TEAM_ABBREVIATION', 'GAME_DATE'])

    home_games = all_games_df[~all_games_df['MATCHUP'].str.contains('@')]
    away_games = all_games_df[all_games_df['MATCHUP'].str.contains('@')]
    unique_games = combine_home_away_games(home_games, away_games)
    return unique_games, advanced_df, advanced_rolling_df

unique_games, advanced_stats_df, rolling_df = get_all_stats(start_year, end_year, feature_names)


2014-15


  for team_id, stats in stats_pd.groupby(['TEAM_ID']):


2015-16


  for team_id, stats in stats_pd.groupby(['TEAM_ID']):


2016-17


  for team_id, stats in stats_pd.groupby(['TEAM_ID']):


2017-18


  for team_id, stats in stats_pd.groupby(['TEAM_ID']):


2018-19


  for team_id, stats in stats_pd.groupby(['TEAM_ID']):


2019-20


  for team_id, stats in stats_pd.groupby(['TEAM_ID']):


2020-21


  for team_id, stats in stats_pd.groupby(['TEAM_ID']):


2021-22


  for team_id, stats in stats_pd.groupby(['TEAM_ID']):


2022-23


  for team_id, stats in stats_pd.groupby(['TEAM_ID']):


2023-24


  for team_id, stats in stats_pd.groupby(['TEAM_ID']):


In [8]:
unique_games.describe()

Unnamed: 0,TEAM_ID,MIN,PTS,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,...,AWAY_FT_PCT,AWAY_OREB,AWAY_DREB,AWAY_REB,AWAY_AST,AWAY_STL,AWAY_BLK,AWAY_TOV,AWAY_PF,AWAY_PLUS_MINUS
count,11979.0,11979.0,11979.0,11979.0,11979.0,11979.0,11979.0,11979.0,11979.0,11979.0,...,11978.0,11979.0,11979.0,11979.0,11979.0,11979.0,11979.0,11979.0,11979.0,11979.0
mean,1610613000.0,241.678688,109.975541,40.570415,87.05351,0.466962,11.138075,30.743134,0.361171,17.696636,...,0.769266,10.191335,33.255781,43.447116,23.565907,7.640287,4.714333,13.503715,20.173804,-2.265598
std,8.651633,7.151068,13.064024,5.325264,7.354049,0.055312,4.099159,8.261164,0.090771,6.019321,...,0.104221,3.769506,5.377596,6.491445,5.186944,2.905149,2.439833,3.878762,4.327452,14.258937
min,1610613000.0,237.0,64.0,19.0,60.0,0.269,0.0,4.0,0.0,1.0,...,0.176,0.0,16.0,20.0,6.0,0.0,0.0,2.0,7.0,-73.0
25%,1610613000.0,240.0,101.0,37.0,82.0,0.429,8.0,25.0,0.3,13.0,...,0.704,7.0,30.0,39.0,20.0,6.0,3.0,11.0,17.0,-11.0
50%,1610613000.0,240.0,110.0,40.0,87.0,0.466,11.0,31.0,0.36,17.0,...,0.778,10.0,33.0,43.0,23.0,7.0,4.0,13.0,20.0,-3.0
75%,1610613000.0,241.0,119.0,44.0,92.0,0.505,14.0,36.0,0.419,22.0,...,0.842,13.0,37.0,48.0,27.0,9.0,6.0,16.0,23.0,8.0
max,1610613000.0,341.0,175.0,65.0,125.0,0.684,28.0,70.0,0.842,44.0,...,1.0,38.0,60.0,81.0,47.0,20.0,19.0,30.0,42.0,57.0


In [9]:
rolling_df.describe()

Unnamed: 0,NET_RATING,TS_PCT,PIE,3PT_PCT
count,20958.0,20958.0,20958.0,20958.0
mean,0.011717,0.5621,0.500086,0.357578
std,6.423475,0.029484,0.038784,0.029194
min,-22.41,0.4658,0.3441,0.2373
25%,-4.26,0.5416,0.4741,0.3378
50%,0.11,0.5616,0.5012,0.3572
75%,4.25,0.5826,0.5259,0.377
max,22.35,0.6644,0.6376,0.4837


In [8]:
features_df = unique_games[['GAME_DATE', 'GAME_ID', 'TEAM_ABBREVIATION', 'WL', 'AWAY_TEAM_ABBREVIATION']].copy()
features_df = features_df.merge(rolling_df, left_on=['GAME_ID', 'TEAM_ABBREVIATION'], right_on=['GAME_ID', 'TEAM_ABBREVIATION'])
away_stats_df = rolling_df.rename(columns={col: f'AWAY_{col}' for col in rolling_df.columns if col != 'GAME_ID'})

features_df = features_df.merge(away_stats_df, left_on=['GAME_ID', 'AWAY_TEAM_ABBREVIATION'], right_on=['GAME_ID', 'AWAY_TEAM_ABBREVIATION'])
print(features_df.columns)
features_df = features_df.loc[:, ~features_df.columns.duplicated()]
features_df = features_df.dropna()
display(features_df)
    

Index(['GAME_DATE', 'GAME_ID', 'TEAM_ABBREVIATION', 'WL',
       'AWAY_TEAM_ABBREVIATION', 'NET_RATING', 'TS_PCT', 'PIE', '3PT_PCT',
       'AWAY_NET_RATING', 'AWAY_TS_PCT', 'AWAY_PIE', 'AWAY_3PT_PCT'],
      dtype='object')


Unnamed: 0,GAME_DATE,GAME_ID,TEAM_ABBREVIATION,WL,AWAY_TEAM_ABBREVIATION,NET_RATING,TS_PCT,PIE,3PT_PCT,AWAY_NET_RATING,AWAY_TS_PCT,AWAY_PIE,AWAY_3PT_PCT
145,2014-11-17,0021400150,DET,L,ORL,-4.02,0.4970,0.4511,0.3246,-1.62,0.5464,0.4766,0.3650
149,2014-11-17,0021400146,CHA,L,DAL,-3.86,0.5119,0.4889,0.3513,9.73,0.5795,0.5752,0.3485
151,2014-11-17,0021400151,MEM,W,HOU,4.87,0.5276,0.5434,0.3454,9.93,0.5455,0.5433,0.3487
154,2014-11-18,0021400157,UTA,W,OKC,-5.55,0.5598,0.4942,0.3480,-3.97,0.5105,0.4835,0.3467
155,2014-11-18,0021400156,MIL,W,NYK,-0.38,0.5080,0.5071,0.4517,-2.84,0.5353,0.4678,0.3034
...,...,...,...,...,...,...,...,...,...,...,...,...,...
11974,2024-04-14,0022301186,BOS,W,WAS,9.26,0.5907,0.5466,0.3678,-4.51,0.5712,0.4681,0.3669
11975,2024-04-14,0022301190,NYK,W,CHI,7.82,0.6008,0.5166,0.3546,0.14,0.5786,0.5143,0.3941
11976,2024-04-14,0022301199,LAC,L,HOU,3.51,0.5789,0.5106,0.3043,-5.47,0.5617,0.4631,0.3118
11977,2024-04-14,0022301200,SAC,W,POR,-0.52,0.5527,0.4915,0.3911,-12.56,0.5030,0.4177,0.3620


Classical ML

Predict Wins and Losses of NBA games using classical ML techniques like logistic regression and random forest regression by analyzing advanced NBA stats

In [9]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import confusion_matrix, roc_auc_score, accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

In [10]:
features_df['HOME_WIN'] = features_df['WL'].apply(lambda x: 1 if x == 'W' else 0) 
all_features = feature_names + ['AWAY_' + feature_name for feature_name in feature_names]
scaler = StandardScaler()
features_df_scaled = features_df.copy()
features_df_scaled[all_features] = scaler.fit_transform(features_df[all_features])
display(features_df_scaled)


Unnamed: 0,GAME_DATE,GAME_ID,TEAM_ABBREVIATION,WL,AWAY_TEAM_ABBREVIATION,NET_RATING,TS_PCT,PIE,3PT_PCT,AWAY_NET_RATING,AWAY_TS_PCT,AWAY_PIE,AWAY_3PT_PCT,HOME_WIN
145,2014-11-17,0021400150,DET,L,ORL,-0.613737,-2.203425,-1.247204,-1.129231,-0.268773,-0.543896,-0.621667,0.251540,0
149,2014-11-17,0021400146,CHA,L,DAL,-0.588808,-1.696809,-0.274227,-0.217363,1.497028,0.575746,1.928154,-0.308552,0
151,2014-11-17,0021400151,MEM,W,HOU,0.771366,-1.162992,1.128611,-0.418862,1.528143,-0.574340,1.103212,-0.301763,1
154,2014-11-18,0021400157,UTA,W,OKC,-0.852118,-0.068157,-0.137804,-0.330066,-0.634379,-1.758252,-0.443231,-0.369653,1
155,2014-11-18,0021400156,MIL,W,NYK,-0.046608,-1.829413,0.194244,3.211532,-0.458577,-0.919365,-0.849237,-1.839470,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11974,2024-04-14,0022301186,BOS,W,WAS,1.455349,0.982477,1.210979,0.346150,-0.718391,0.294990,-0.841479,0.316036,1
11975,2024-04-14,0022301190,NYK,W,CHI,1.230990,1.325888,0.438775,-0.104661,0.005043,0.545303,0.353265,1.239339,1
11976,2024-04-14,0022301199,LAC,L,HOU,0.559472,0.581264,0.284334,-1.822524,-0.867745,-0.026358,-0.970780,-1.554332,0
11977,2024-04-14,0022301200,SAC,W,POR,-0.068421,-0.309564,-0.207302,1.141900,-1.970787,-2.011947,-2.144835,0.149705,1


In [11]:
X = features_df_scaled[all_features]
y = features_df_scaled['HOME_WIN']
# train test split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2, stratify=y)
print(y_train.sum()/y_train.size)

0.5691144708423326


In [12]:
def print_model_results(model, y_pred):
    cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
    print("Cross-validation scores:", cv_scores)
    print("Mean accuracy:", cv_scores.mean())
    display(confusion_matrix(y_true=y_test, y_pred=y_pred))
    print("AUC Score:", roc_auc_score(y_test, y_pred))

In [13]:
logistic_regression_model = LogisticRegressionCV(random_state=42, max_iter = 1000, solver='newton-cg')
logistic_regression_model.fit(X_train, y_train)

y_pred = logistic_regression_model.predict(X_test)
print_model_results(logistic_regression_model, y_pred)


Cross-validation scores: [0.63827235 0.63167367 0.64187163 0.64547091 0.63205282]
Mean accuracy: 0.6378682726839986


array([[394, 504],
       [262, 924]])

AUC Score: 0.6089210800091641


In [14]:
decision_tree_model = DecisionTreeClassifier(random_state=42, max_depth=5, class_weight='balanced')
decision_tree_model.fit(X_train, y_train)
y_pred_1 = decision_tree_model.predict(X_test)
print_model_results(decision_tree_model, y_pred_1)

Cross-validation scores: [0.61787642 0.61907618 0.61427714 0.6124775  0.58463385]
Mean accuracy: 0.6096682224179414


array([[559, 339],
       [511, 675]])

AUC Score: 0.5958171991722283


In [20]:
xgb_model = xgb.XGBClassifier(n_estimators=200, colsample_bytree=0.8, max_depth=5, learning_rate=0.1, objective="binary:logistic")
xgb_model.fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_test)
print_model_results(xgb_model, y_pred_xgb)


Cross-validation scores: [0.62987403 0.63047391 0.60827834 0.61907618 0.62484994]
Mean accuracy: 0.6225104798968177


array([[421, 477],
       [307, 879]])

AUC Score: 0.604983155372441


In [18]:
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.7, 0.8, 0.9]
}

grid_search = GridSearchCV(
    estimator=xgb.XGBClassifier(objective="binary:logistic", eval_metric="logloss"),
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1
)

grid_search.fit(X_train, y_train)
print("Best Parameters:", grid_search.best_params_)
y_pred_xgb = grid_search.predict(X_test)
print_model_results(grid_search, y_pred_xgb)

Best Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 200, 'subsample': 0.7}


Exception ignored on calling ctypes callback function: <bound method DataIter._next_wrapper of <xgboost.data.SingleBatchInternalIter object at 0x136cc3e10>>
Traceback (most recent call last):
  File "/Users/j52zhao/anaconda3/lib/python3.11/site-packages/xgboost/core.py", line 582, in _next_wrapper
    def _next_wrapper(self, this: None) -> int:  # pylint: disable=unused-argument

KeyboardInterrupt: 


KeyboardInterrupt: 

In [21]:
rf_model = RandomForestClassifier(n_estimators=100, random_state=42, max_depth=10)
rf_model.fit(X_train, y_train)

rf_pred= rf_model.predict(X_test)
print_model_results(rf_model, rf_pred)

Cross-validation scores: [0.63467307 0.64787043 0.63947211 0.63167367 0.62244898]
Mean accuracy: 0.6352276483478815


array([[418, 480],
       [290, 896]])

AUC Score: 0.6104797244767274
