# Collect data

In [1]:
%pip install nba_api

Note: you may need to restart the kernel to use updated packages.


In [2]:
from nba_api.stats.endpoints import leaguegamefinder
# Query the game log starting at 01/31/2020 (MM/DD/YYYY); the result holds one
# row per team per game, so every GAME_ID shows up twice.
# NOTE(review): league_id '00' is presumably the NBA — confirm against the nba_api docs.
gamefinder = leaguegamefinder.LeagueGameFinder(date_from_nullable='01/31/2020', league_id_nullable='00')
# get_data_frames() returns a list of frames; the first one is the game log used below.
games = gamefinder.get_data_frames()[0]
games.head()

Unnamed: 0,SEASON_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,MIN,PTS,...,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PLUS_MINUS
0,42020,1610612762,UTA,Utah Jazz,42000223,2021-06-12,UTA @ LAC,L,238,106,...,0.882,9,29,38,15,7,1,13,22,-26.0
1,42020,1610612746,LAC,LA Clippers,42000223,2021-06-12,LAC vs. UTA,W,239,132,...,0.813,10,31,41,21,6,2,9,18,26.0
2,42020,1610612743,DEN,Denver Nuggets,42000233,2021-06-11,DEN vs. PHX,L,240,102,...,0.625,18,28,46,21,4,3,14,21,-14.0
3,42020,1610612755,PHI,Philadelphia 76ers,42000203,2021-06-11,PHI @ ATL,W,240,127,...,0.641,10,26,36,28,8,4,10,26,16.0
4,42020,1610612737,ATL,Atlanta Hawks,42000203,2021-06-11,ATL vs. PHI,L,240,111,...,0.844,11,25,36,15,6,1,11,28,-16.0


# Clean and Explore data

In [3]:
games.columns

Index(['SEASON_ID', 'TEAM_ID', 'TEAM_ABBREVIATION', 'TEAM_NAME', 'GAME_ID',
       'GAME_DATE', 'MATCHUP', 'WL', 'MIN', 'PTS', 'FGM', 'FGA', 'FG_PCT',
       'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB',
       'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PLUS_MINUS'],
      dtype='object')

In [4]:
# Keep only the columns the modelling pipeline needs. The explicit .copy()
# makes `games` an independent frame, so the column assignments in later cells
# (e.g. the GAME_DATE conversion) do not act on a slice of the API result and
# do not trigger pandas' SettingWithCopyWarning.
games = games[['TEAM_NAME', 'GAME_ID', 'GAME_DATE', 'MATCHUP', 'WL', 'PLUS_MINUS']].copy()

In [5]:
games

Unnamed: 0,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,PLUS_MINUS
0,Utah Jazz,0042000223,2021-06-12,UTA @ LAC,L,-26.0
1,LA Clippers,0042000223,2021-06-12,LAC vs. UTA,W,26.0
2,Denver Nuggets,0042000233,2021-06-11,DEN vs. PHX,L,-14.0
3,Philadelphia 76ers,0042000203,2021-06-11,PHI @ ATL,W,16.0
4,Atlanta Hawks,0042000203,2021-06-11,ATL vs. PHI,L,-16.0
...,...,...,...,...,...,...
3299,Dallas Mavericks,0021900722,2020-01-31,DAL @ HOU,L,-7.0
3300,New Orleans Pelicans,0021900723,2020-01-31,NOP vs. MEM,W,28.0
3301,Denver Nuggets,0021900724,2020-01-31,DEN @ MIL,W,12.0
3302,Portland Trail Blazers,0021900726,2020-01-31,POR @ LAL,W,8.0


Goal: a final DataFrame with two columns: (1) the result of the game (the target) and (2) a score statistic comparing the two teams (the feature).

In [6]:
import pandas as pd

In [7]:
games.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3304 entries, 0 to 3303
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   TEAM_NAME   3304 non-null   object 
 1   GAME_ID     3304 non-null   object 
 2   GAME_DATE   3304 non-null   object 
 3   MATCHUP     3304 non-null   object 
 4   WL          3304 non-null   object 
 5   PLUS_MINUS  3304 non-null   float64
dtypes: float64(1), object(5)
memory usage: 155.0+ KB


In [8]:
# GAME_DATE arrives as strings (dtype object); parse it into datetime64 values.
games['GAME_DATE'] = pd.to_datetime(games['GAME_DATE'])

In [9]:
games['GAME_DATE']

0      2021-06-12
1      2021-06-12
2      2021-06-11
3      2021-06-11
4      2021-06-11
          ...    
3299   2020-01-31
3300   2020-01-31
3301   2020-01-31
3302   2020-01-31
3303   2020-01-31
Name: GAME_DATE, Length: 3304, dtype: datetime64[ns]

In [10]:
# Order the rows chronologically (oldest first); the per-team rolling average
# computed in the next step relies on this ordering.
games = games.sort_values('GAME_DATE')
games

Unnamed: 0,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,PLUS_MINUS
3303,Oklahoma City Thunder,0021900725,2020-01-31,OKC @ PHX,W,4.0
3290,Detroit Pistons,0021900720,2020-01-31,DET vs. TOR,L,-13.0
3291,Phoenix Suns,0021900725,2020-01-31,PHX vs. OKC,L,-4.0
3292,Toronto Raptors,0021900720,2020-01-31,TOR @ DET,W,13.0
3293,Milwaukee Bucks,0021900724,2020-01-31,MIL vs. DEN,L,-12.0
...,...,...,...,...,...,...
4,Atlanta Hawks,0042000203,2021-06-11,ATL vs. PHI,L,-16.0
3,Philadelphia 76ers,0042000203,2021-06-11,PHI @ ATL,W,16.0
2,Denver Nuggets,0042000233,2021-06-11,DEN vs. PHX,L,-14.0
1,LA Clippers,0042000223,2021-06-12,LAC vs. UTA,W,26.0


In [11]:
# For every team, average PLUS_MINUS over that team's 30 previous games.
# closed='left' leaves the current game out of its own window, so the feature
# only uses results known before the game is played. With the default
# min_periods the value is NaN until a team has 30 earlier games in the data.
games['avg_30_plus_minus'] = games.groupby('TEAM_NAME')['PLUS_MINUS'].transform(lambda x: x.rolling(30, closed='left').mean())

In [12]:
games

Unnamed: 0,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,PLUS_MINUS,avg_30_plus_minus
3303,Oklahoma City Thunder,0021900725,2020-01-31,OKC @ PHX,W,4.0,
3290,Detroit Pistons,0021900720,2020-01-31,DET vs. TOR,L,-13.0,
3291,Phoenix Suns,0021900725,2020-01-31,PHX vs. OKC,L,-4.0,
3292,Toronto Raptors,0021900720,2020-01-31,TOR @ DET,W,13.0,
3293,Milwaukee Bucks,0021900724,2020-01-31,MIL vs. DEN,L,-12.0,
...,...,...,...,...,...,...,...
4,Atlanta Hawks,0042000203,2021-06-11,ATL vs. PHI,L,-16.0,3.300000
3,Philadelphia 76ers,0042000203,2021-06-11,PHI @ ATL,W,16.0,7.433333
2,Denver Nuggets,0042000233,2021-06-11,DEN vs. PHX,L,-14.0,1.533333
1,LA Clippers,0042000223,2021-06-12,LAC vs. UTA,W,26.0,4.900000


In [13]:
games[games['TEAM_NAME']=='Toronto Raptors'].head(35)

Unnamed: 0,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,PLUS_MINUS,avg_30_plus_minus
3292,Toronto Raptors,21900720,2020-01-31,TOR @ DET,W,13.0,
3265,Toronto Raptors,21900740,2020-02-02,TOR vs. CHI,W,27.0,
3216,Toronto Raptors,21900758,2020-02-05,TOR vs. IND,W,1.0,
3205,Toronto Raptors,21900772,2020-02-07,TOR @ IND,W,9.0,
3173,Toronto Raptors,21900781,2020-02-08,TOR vs. BKN,W,1.0,
3149,Toronto Raptors,21900796,2020-02-10,TOR vs. MIN,W,11.0,
3127,Toronto Raptors,21900809,2020-02-12,TOR @ BKN,L,-10.0,
3078,Toronto Raptors,21900829,2020-02-21,TOR vs. PHX,W,17.0,
3053,Toronto Raptors,21900843,2020-02-23,TOR vs. IND,W,46.0,
3023,Toronto Raptors,21900858,2020-02-25,TOR vs. MIL,L,-11.0,


In [14]:
# A MATCHUP like 'TOR @ DET' is the visiting team's row; rows without '@'
# (e.g. 'DET vs. TOR') belong to the home team.
msk = games['MATCHUP'].str.contains('@')
games_home, games_away = games[~msk], games[msk]

In [15]:
games_home.shape

(1652, 7)

In [16]:
games_away.shape

(1652, 7)

In [17]:
games_home

Unnamed: 0,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,PLUS_MINUS,avg_30_plus_minus
3290,Detroit Pistons,0021900720,2020-01-31,DET vs. TOR,L,-13.0,
3291,Phoenix Suns,0021900725,2020-01-31,PHX vs. OKC,L,-4.0,
3293,Milwaukee Bucks,0021900724,2020-01-31,MIL vs. DEN,L,-12.0,
3295,Brooklyn Nets,0021900721,2020-01-31,BKN vs. CHI,W,15.0,
3297,Los Angeles Lakers,0021900726,2020-01-31,LAL vs. POR,L,-8.0,
...,...,...,...,...,...,...,...
8,Utah Jazz,0042000222,2021-06-10,UTA vs. LAC,W,6.0,8.833333
7,Milwaukee Bucks,0042000213,2021-06-10,MIL vs. BKN,W,3.0,4.933333
4,Atlanta Hawks,0042000203,2021-06-11,ATL vs. PHI,L,-16.0,3.300000
2,Denver Nuggets,0042000233,2021-06-11,DEN vs. PHX,L,-14.0,1.533333


In [18]:
games_away

Unnamed: 0,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,PLUS_MINUS,avg_30_plus_minus
3303,Oklahoma City Thunder,0021900725,2020-01-31,OKC @ PHX,W,4.0,
3292,Toronto Raptors,0021900720,2020-01-31,TOR @ DET,W,13.0,
3294,Memphis Grizzlies,0021900723,2020-01-31,MEM @ NOP,L,-28.0,
3302,Portland Trail Blazers,0021900726,2020-01-31,POR @ LAL,W,8.0,
3299,Dallas Mavericks,0021900722,2020-01-31,DAL @ HOU,L,-7.0,
...,...,...,...,...,...,...,...
9,LA Clippers,0042000222,2021-06-10,LAC @ UTA,L,-6.0,5.700000
6,Brooklyn Nets,0042000213,2021-06-10,BKN @ MIL,L,-3.0,6.566667
5,Phoenix Suns,0042000233,2021-06-11,PHX @ DEN,W,14.0,5.233333
3,Philadelphia 76ers,0042000203,2021-06-11,PHI @ ATL,W,16.0,7.433333


In [19]:
# Put each game's home row and away row side by side (inner join on GAME_ID);
# columns present on both sides receive the _home / _away suffixes.
games_merged = games_home.merge(
    games_away,
    on='GAME_ID',
    suffixes=('_home', '_away'),
)
games_merged

Unnamed: 0,TEAM_NAME_home,GAME_ID,GAME_DATE_home,MATCHUP_home,WL_home,PLUS_MINUS_home,avg_30_plus_minus_home,TEAM_NAME_away,GAME_DATE_away,MATCHUP_away,WL_away,PLUS_MINUS_away,avg_30_plus_minus_away
0,Detroit Pistons,0021900720,2020-01-31,DET vs. TOR,L,-13.0,,Toronto Raptors,2020-01-31,TOR @ DET,W,13.0,
1,Phoenix Suns,0021900725,2020-01-31,PHX vs. OKC,L,-4.0,,Oklahoma City Thunder,2020-01-31,OKC @ PHX,W,4.0,
2,Milwaukee Bucks,0021900724,2020-01-31,MIL vs. DEN,L,-12.0,,Denver Nuggets,2020-01-31,DEN @ MIL,W,12.0,
3,Brooklyn Nets,0021900721,2020-01-31,BKN vs. CHI,W,15.0,,Chicago Bulls,2020-01-31,CHI @ BKN,L,-15.0,
4,Los Angeles Lakers,0021900726,2020-01-31,LAL vs. POR,L,-8.0,,Portland Trail Blazers,2020-01-31,POR @ LAL,W,8.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1647,Utah Jazz,0042000222,2021-06-10,UTA vs. LAC,W,6.0,8.833333,LA Clippers,2021-06-10,LAC @ UTA,L,-6.0,5.700000
1648,Milwaukee Bucks,0042000213,2021-06-10,MIL vs. BKN,W,3.0,4.933333,Brooklyn Nets,2021-06-10,BKN @ MIL,L,-3.0,6.566667
1649,Atlanta Hawks,0042000203,2021-06-11,ATL vs. PHI,L,-16.0,3.300000,Philadelphia 76ers,2021-06-11,PHI @ ATL,W,16.0,7.433333
1650,Denver Nuggets,0042000233,2021-06-11,DEN vs. PHX,L,-14.0,1.533333,Phoenix Suns,2021-06-11,PHX @ DEN,W,14.0,5.233333


In [20]:
# Model feature: home team's 30-game average margin minus the away team's.
games_merged['avg_30_plus_minus_diff'] = games_merged['avg_30_plus_minus_home'].sub(
    games_merged['avg_30_plus_minus_away']
)

In [21]:
games_merged

Unnamed: 0,TEAM_NAME_home,GAME_ID,GAME_DATE_home,MATCHUP_home,WL_home,PLUS_MINUS_home,avg_30_plus_minus_home,TEAM_NAME_away,GAME_DATE_away,MATCHUP_away,WL_away,PLUS_MINUS_away,avg_30_plus_minus_away,avg_30_plus_minus_diff
0,Detroit Pistons,0021900720,2020-01-31,DET vs. TOR,L,-13.0,,Toronto Raptors,2020-01-31,TOR @ DET,W,13.0,,
1,Phoenix Suns,0021900725,2020-01-31,PHX vs. OKC,L,-4.0,,Oklahoma City Thunder,2020-01-31,OKC @ PHX,W,4.0,,
2,Milwaukee Bucks,0021900724,2020-01-31,MIL vs. DEN,L,-12.0,,Denver Nuggets,2020-01-31,DEN @ MIL,W,12.0,,
3,Brooklyn Nets,0021900721,2020-01-31,BKN vs. CHI,W,15.0,,Chicago Bulls,2020-01-31,CHI @ BKN,L,-15.0,,
4,Los Angeles Lakers,0021900726,2020-01-31,LAL vs. POR,L,-8.0,,Portland Trail Blazers,2020-01-31,POR @ LAL,W,8.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1647,Utah Jazz,0042000222,2021-06-10,UTA vs. LAC,W,6.0,8.833333,LA Clippers,2021-06-10,LAC @ UTA,L,-6.0,5.700000,3.133333
1648,Milwaukee Bucks,0042000213,2021-06-10,MIL vs. BKN,W,3.0,4.933333,Brooklyn Nets,2021-06-10,BKN @ MIL,L,-3.0,6.566667,-1.633333
1649,Atlanta Hawks,0042000203,2021-06-11,ATL vs. PHI,L,-16.0,3.300000,Philadelphia 76ers,2021-06-11,PHI @ ATL,W,16.0,7.433333,-4.133333
1650,Denver Nuggets,0042000233,2021-06-11,DEN vs. PHX,L,-14.0,1.533333,Phoenix Suns,2021-06-11,PHX @ DEN,W,14.0,5.233333,-3.700000


In [22]:
games_merged[['WL_home', 'avg_30_plus_minus_diff']]

Unnamed: 0,WL_home,avg_30_plus_minus_diff
0,L,
1,L,
2,L,
3,W,
4,L,
...,...,...
1647,W,3.133333
1648,W,-1.633333
1649,L,-4.133333
1650,L,-3.700000


In [23]:
# Reduce the table to target + feature and discard every game where either one
# is missing (the rolling average is NaN for each team's first games).
games_model = games_merged.loc[:, ['WL_home', 'avg_30_plus_minus_diff']].dropna(how='any')
games_model

Unnamed: 0,WL_home,avg_30_plus_minus_diff
399,L,-5.366667
400,W,1.933333
401,L,-6.233333
402,W,2.466667
403,L,-5.600000
...,...,...
1647,W,3.133333
1648,W,-1.633333
1649,L,-4.133333
1650,L,-3.700000


In [24]:
# Encode the target as 1 (home win) / 0 (home loss).
# The frame is rebuilt from games_merged rather than modified in place, so this
# cell can be re-run safely: applying the W/L mapping to an already encoded
# column would turn every value into NaN.
games_model = (
    games_merged[['WL_home', 'avg_30_plus_minus_diff']]
    .dropna()
    .assign(WL_home=lambda frame: frame['WL_home'].map({'W': 1, 'L': 0}))
)

In [25]:
games_model

Unnamed: 0,WL_home,avg_30_plus_minus_diff
399,0,-5.366667
400,1,1.933333
401,0,-6.233333
402,1,2.466667
403,0,-5.600000
...,...,...
1647,1,3.133333
1648,1,-1.633333
1649,0,-4.133333
1650,0,-3.700000


# Build the predictive model, hyperparameter tuning, evaluation

In [26]:
from sklearn.model_selection import train_test_split

In [27]:
# Hold out 20% of the games for testing. stratify keeps the share of home wins
# equal in both parts and random_state makes the split repeatable.
# NOTE(review): the split is random rather than chronological, and games close
# in time share most of their 30-game window — consider confirming the accuracy
# with a time-based split.
df_train, df_test = train_test_split(games_model, stratify=games_model['WL_home'], test_size=0.2, random_state=7)

In [28]:
df_train.shape

(919, 2)

In [29]:
df_test.shape

(230, 2)

In [30]:
# Separate the label column from the feature column for both partitions.
target = 'WL_home'
X_train, y_train = df_train.drop(columns=target), df_train[target]
X_test, y_test = df_test.drop(columns=target), df_test[target]

In [31]:
# %pip install xgboost

In [32]:
import xgboost as xgb

In [33]:
# Baseline classifier with the library's default hyperparameters; random_state fixes the seed.
# NOTE(review): use_label_encoder is reported as deprecated by newer xgboost
# releases — confirm whether it is still needed for the installed version.
clf = xgb.XGBClassifier(use_label_encoder=False, random_state=7)

In [34]:
clf.fit(X_train, y_train)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=4, num_parallel_tree=1, random_state=7,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', use_label_encoder=False,
              validate_parameters=1, verbosity=None)

In [35]:
from sklearn.metrics import accuracy_score
y_pred = clf.predict(X_test)

In [36]:
accuracy_score(y_test, y_pred)

0.6043478260869565

In [37]:
from sklearn.model_selection import RandomizedSearchCV
# loguniform is provided by scipy.stats (SciPy >= 1.4). The former
# sklearn.utils.fixes.loguniform was only a backport shim for older SciPy and
# is no longer shipped by recent scikit-learn releases.
from scipy.stats import loguniform

In [38]:
# Search space for the randomized search: learning_rate is drawn log-uniformly
# between 1e-4 and 1; the other parameters are picked from the listed values.
# NOTE(review): a very small learning rate combined with at most 200 trees may
# leave predicted probabilities close to the starting score — check that the
# selected configuration actually separates wins from losses.
hyp_params = {'learning_rate': loguniform(0.0001, 1),
              'max_depth': [2, 3, 4, 5, 6, 7, 8, 9],
              'subsample': [0.7, 0.8, 0.9, 1.0],
              'n_estimators': [50, 100, 150, 200]}

In [39]:
# Randomized search over hyp_params: 20 sampled settings, each scored by
# accuracy with 7-fold cross-validation; random_state makes the draw repeatable.
random_hyp = RandomizedSearchCV(
    estimator=clf,
    param_distributions=hyp_params,
    n_iter=20,
    cv=7,
    scoring='accuracy',
    random_state=7,
)

In [40]:
random_hyp.fit(X_train, y_train)



RandomizedSearchCV(cv=7,
                   estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                           colsample_bylevel=1,
                                           colsample_bynode=1,
                                           colsample_bytree=1, gamma=0,
                                           gpu_id=-1, importance_type='gain',
                                           interaction_constraints='',
                                           learning_rate=0.300000012,
                                           max_delta_step=0, max_depth=6,
                                           min_child_weight=1, missing=nan,
                                           monotone_constraints='()',
                                           n_estimators=100, n_jobs=4,
                                           num_pa...
                                           scale_pos_weight=1, subsample=1,
                                           tree_method='exac

In [41]:
random_hyp.best_params_

{'learning_rate': 0.0005227662863531309,
 'max_depth': 2,
 'n_estimators': 100,
 'subsample': 0.7}

In [42]:
model_hyp = random_hyp.best_estimator_

In [43]:
y_pred_hyp = model_hyp.predict(X_test)

In [44]:
accuracy_score(y_test, y_pred_hyp)

0.6391304347826087

# Deploy the model with FastAPI

In [45]:
from joblib import dump, load

# Write the tuned model to disk and read it straight back, so that the object
# used from here on is exactly what a serving process would load.
# joblib files are pickle-based: only load files from a trusted source.
dump(model_hyp, 'model_nba.joblib')
model_saved = load('model_nba.joblib')

In [46]:
accuracy_score(y_test, model_saved.predict(X_test))

0.6391304347826087

In [47]:
# The data-preparation steps from the cells above, gathered into a single cell.
from nba_api.stats.endpoints import leaguegamefinder
import pandas as pd

# Game log from 01/31/2020 onwards, one row per team per game.
gamefinder = leaguegamefinder.LeagueGameFinder(date_from_nullable='01/31/2020', league_id_nullable='00')
games = gamefinder.get_data_frames()[0]

# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of the API result (avoids SettingWithCopyWarning).
games = games[['TEAM_NAME', 'GAME_ID', 'GAME_DATE', 'MATCHUP', 'WL', 'PLUS_MINUS']].copy()
games['GAME_DATE'] = pd.to_datetime(games['GAME_DATE'])

# The rolling window below needs the rows in chronological order.
games = games.sort_values('GAME_DATE')

# Per team: mean PLUS_MINUS of the 30 previous games; closed='left' keeps the
# current game out of its own window.
games['avg_30_plus_minus'] = games.groupby('TEAM_NAME')['PLUS_MINUS'].transform(lambda x: x.rolling(30, closed='left').mean())

# '@' in MATCHUP marks the away team's row; every other row is the home team's.
msk = games['MATCHUP'].str.contains('@')
games_away = games[msk]
games_home = games[~msk]

# One row per game with the home and away columns side by side.
games_merged = pd.merge(games_home, games_away, on='GAME_ID', suffixes=('_home', '_away'))

# Model feature: home rolling average minus away rolling average.
games_merged['avg_30_plus_minus_diff'] = games_merged['avg_30_plus_minus_home'] - games_merged['avg_30_plus_minus_away']

In [48]:
# Step-by-step version of the prediction for one fixture (wrapped into
# predict_games in the next cell).
team_home='Toronto Raptors'
team_away='Boston Celtics'

import numpy as np
gamefinder = leaguegamefinder.LeagueGameFinder(date_from_nullable='01/01/2021',
                                           league_id_nullable='00')
games = gamefinder.get_data_frames()[0]
# .copy() so the GAME_DATE assignment acts on an independent frame
# (avoids SettingWithCopyWarning).
games = games[['TEAM_NAME', 'GAME_ID', 'GAME_DATE', 'MATCHUP', 'WL', 'PLUS_MINUS']].copy()
games['GAME_DATE'] = pd.to_datetime(games['GAME_DATE'])

# Mean PLUS_MINUS over each team's 30 most recent games.
msk_home = (games['TEAM_NAME'] == team_home)
games_30_home = games[msk_home].sort_values('GAME_DATE').tail(30)
home_plus_minus = games_30_home['PLUS_MINUS'].mean()

msk_away = (games['TEAM_NAME'] == team_away)
games_30_away = games[msk_away].sort_values('GAME_DATE').tail(30)
away_plus_minus = games_30_away['PLUS_MINUS'].mean()

games_diff=home_plus_minus - away_plus_minus

# The model expects input of shape (n_samples, n_features). Build one row with
# the same column name used for training instead of a 1-D array of shape (1,).
features = pd.DataFrame({'avg_30_plus_minus_diff': [games_diff]})

predict_home_win=model_saved.predict(features)[0]
predict_winning_probability=model_saved.predict_proba(features)[0][1]

In [49]:
def predict_games(team_home, team_away, model=None, games=None, window=30):
    """Predict whether the home team beats the away team.

    The single model feature is the difference between the two teams' mean
    PLUS_MINUS over their most recent ``window`` games; it is passed to the
    model under the training column name ``avg_30_plus_minus_diff``.

    Parameters
    ----------
    team_home, team_away : str
        Team names as they appear in the TEAM_NAME column
        (e.g. 'Toronto Raptors').
    model : optional
        Fitted classifier exposing ``predict`` and ``predict_proba``.
        Defaults to the notebook-level ``model_saved``.
    games : pandas.DataFrame, optional
        Game log with at least TEAM_NAME, GAME_DATE and PLUS_MINUS columns.
        When omitted, the log from 01/01/2021 onwards is fetched through
        ``leaguegamefinder``.
    window : int, default 30
        Number of most recent games averaged per team.

    Returns
    -------
    tuple of (int, float)
        ``(predicted label, predicted probability of a home win)`` where the
        label is 1 for a home win and 0 otherwise. Plain Python numbers are
        returned so the result can be serialized (e.g. to JSON) directly.

    Raises
    ------
    ValueError
        If either team has no game with a PLUS_MINUS value in ``games``.
    """
    if model is None:
        model = model_saved
    if games is None:
        gamefinder = leaguegamefinder.LeagueGameFinder(
            date_from_nullable='01/01/2021',
            league_id_nullable='00')
        games = gamefinder.get_data_frames()[0]

    # Independent copy: the caller's frame stays untouched and the column
    # assignment below cannot trigger SettingWithCopyWarning.
    games = games[['TEAM_NAME', 'GAME_DATE', 'PLUS_MINUS']].copy()
    games['GAME_DATE'] = pd.to_datetime(games['GAME_DATE'])

    def _recent_plus_minus(team):
        # Mean PLUS_MINUS over the team's latest `window` games that have a value.
        team_games = games[games['TEAM_NAME'] == team].dropna(subset=['PLUS_MINUS'])
        if team_games.empty:
            # Without this check the feature would silently become NaN.
            raise ValueError(f'No games found for team {team!r}')
        return team_games.sort_values('GAME_DATE').tail(window)['PLUS_MINUS'].mean()

    games_diff = _recent_plus_minus(team_home) - _recent_plus_minus(team_away)

    # The model expects input of shape (n_samples, n_features): one row, one
    # column, named like the training feature.
    features = pd.DataFrame({'avg_30_plus_minus_diff': [games_diff]})

    predict_home_win = int(model.predict(features)[0])
    predict_winning_probability = float(model.predict_proba(features)[0][1])
    return predict_home_win, predict_winning_probability

In [50]:
predict_games('Boston Celtics','Toronto Raptors')

(1, 0.5020537)