In [1]:
import pandas as pd
from itables import show
from sklearn.model_selection import TimeSeriesSplit
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import RidgeClassifier
from sklearn.preprocessing import MinMaxScaler
import warnings 
warnings.filterwarnings('ignore')

## Read and Clean


In [12]:
df = pd.read_csv("nba_games.csv",index_col=0)
df = df.sort_values(by='date')
df= df.reset_index(drop=True)

#removes extra columns
del df["mp.1"]
del df["mp_opp.1"]
del df["index_opp"] #

In [13]:
def add_target(team):
    team["target"] = team["won"].shift(-1)
    return team

In [14]:
# Group the data by team
df = df.groupby(by="team",group_keys=False).apply(add_target)

# For games where there is no next game
df["target"][pd.isnull(df["target"])] = 2

# To create a new column for target intead of using False/True in the won column
df["target"] = df["target"].astype(int, errors="ignore")

In [15]:
# Make sure that there's equal wins and losses
df["won"].value_counts()

False    8886
True     8886
Name: won, dtype: int64

In [16]:
# Check how many wins, losses and no next game
df["target"].value_counts()

1    8872
0    8870
2      30
Name: target, dtype: int64

In [17]:
nulls = pd.isnull(df) # check if there's null values
nulls = nulls.sum() # see how many nulls
nulls = nulls[nulls>0] 

In [18]:
# Only select columns that don't have nulls
valid_columns = df.columns[~df.columns.isin(nulls.index)]

# Select the correct columns
df = df[valid_columns].copy()

In [19]:
# Initialize Ridge Classifier
rr = RidgeClassifier(alpha=1)

# Split Data into 3 parts
split = TimeSeriesSplit(n_splits=3)

# Trains model using different features and picks the best features
sfs = SequentialFeatureSelector(rr, n_features_to_select=30,direction="forward",cv=split)

In [20]:
# Now we want to normalize our data and scale it from 0-1

# columns we don't want to scale as they arent' scaleable
removed_columns = ["season","date","won","target","team_opp","team"]
selected_columns = df.columns[~df.columns.isin(removed_columns)]

Scale the data using a scaler

In [21]:
# Initialize the Scaler
scaler = MinMaxScaler()

# Use the scaler on the selected columns
df[selected_columns] = scaler.fit_transform(df[selected_columns])

In [22]:
df.head() # What the scaled data looks like

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won,target
0,0.0,0.363636,0.338235,0.366029,0.206897,0.212121,0.395487,0.418605,0.412698,0.654609,...,0.277279,0.554502,0.317647,GSW,0.451923,1.0,2016,2015-10-27,False,0
1,0.0,0.431818,0.5,0.322967,0.310345,0.378788,0.368171,0.209302,0.253968,0.519253,...,0.160462,0.345972,0.317647,CHI,0.317308,1.0,2016,2015-10-27,False,1
2,0.0,0.409091,0.397059,0.373206,0.241379,0.227273,0.437055,0.348837,0.349206,0.645274,...,0.088575,0.232227,0.329412,CLE,0.298077,0.0,2016,2015-10-27,True,1
3,0.0,0.5,0.529412,0.37799,0.310345,0.393939,0.356295,0.44186,0.333333,0.893816,...,0.215661,0.530806,0.505882,NOP,0.298077,0.0,2016,2015-10-27,True,1
4,0.0,0.409091,0.323529,0.435407,0.275862,0.348485,0.351544,0.255814,0.222222,0.766628,...,0.019255,0.203791,0.317647,DET,0.403846,0.0,2016,2015-10-27,False,1


## Using a classifier to Select the best predictors

In [23]:
# fit the sequential features selector
# will pick the best 30 features now 
sfs.fit(df[selected_columns],df["target"])

In [25]:
# These are the best 30 features in a list
predictors = list(selected_columns[sfs.get_support()])
predictors

['mp',
 'fg%',
 '3p%',
 'orb',
 'ts%',
 'usg%',
 '3p%_max',
 'ft_max',
 'fta_max',
 '+/-_max',
 'drb%_max',
 'trb%_max',
 'tov%_max',
 'usg%_max',
 'mp_opp',
 'fg_opp',
 '3p_opp',
 'ft%_opp',
 'blk_opp',
 'usg%_opp',
 'fga_max_opp',
 '3p_max_opp',
 'ft_max_opp',
 'ft%_max_opp',
 'blk_max_opp',
 'pf_max_opp',
 'pts_max_opp',
 'drb%_max_opp',
 'blk%_max_opp',
 'usg%_max_opp']

##### "backtest" Function Purpose:
-split data by season
<br>
-use past seasons to predict the future seasons
<br>
-start input is min amount of season required

In [26]:
def backtest(data, model, predictors, start=2, step=1):
    all_predictions = []
    seasons = sorted(data["season"].unique())
    
    for i in range(start, len(seasons), step):
        curSeason = seasons[i]
        
        # split the data into train and test
        train = data[data["season"] < curSeason]
        test = data[data["season"]== curSeason]
        
        # fit the model 
        model.fit(train[predictors], train["target"])
        
        # use the model to predict 
        preds = model.predict(test[predictors]) #creates a numpy array
        preds = pd.Series(preds, index=test.index) #converts it into an easier to use pandas series
        
        # combines actuals and predictions to make it easier to compare
        combined = pd.concat([test["target"], preds], axis=1)
        combined.columns = ["actual","prediction"] #rename
        
        # combine to the initial df
        all_predictions.append(combined)
    return pd.concat(all_predictions)

In [27]:
# run the model 
predictions = backtest(df, rr, predictors)

In [28]:
from sklearn.metrics import accuracy_score

predictions = predictions[predictions["actual"] != 2] # take out 
accuracy_score(predictions["actual"],predictions["prediction"]) # Accuracy of the model

0.5485110470701249

In [29]:
# Check accuracy for home vs away team 
# Good to use for baseline accuracy
df.groupby("home").apply(lambda x: x[x["won"]== 1].shape[0]/x.shape[0])

home
0.0    0.428314
1.0    0.571686
dtype: float64

In [30]:
# add a few extra columns to help compute rolling averages
df_rolling = df[list(selected_columns) + ["won","team","season"]]

In [31]:
# Create function to take previous 10 rows and compute the moving average

def find_team_averages(team):
    rolling = team.rolling(10).mean() # groups rows and in our case previous 10 rows
    return rolling

# taking rolling averages specific to the team and season
df_rolling = df_rolling.groupby(["team","season"], group_keys=False).apply(find_team_averages)

In [32]:
df_rolling

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,stl%_max_opp,blk%_max_opp,tov%_max_opp,usg%_max_opp,ortg_max_opp,drtg_max_opp,total_opp,home_opp,won,season
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17767,0.0,0.381818,0.292647,0.428230,0.468966,0.477273,0.448100,0.434884,0.373016,0.764177,...,0.0570,0.1113,0.471908,0.170603,0.431754,0.522353,0.348077,0.5,0.6,2022.0
17768,0.0,0.502273,0.364706,0.517703,0.455172,0.481818,0.440736,0.320930,0.282540,0.757993,...,0.0716,0.1171,0.374109,0.321566,0.642654,0.564706,0.392308,0.4,0.7,2022.0
17769,0.0,0.354545,0.279412,0.404545,0.437931,0.465152,0.429572,0.434884,0.385714,0.736639,...,0.0591,0.1113,0.483229,0.174711,0.438863,0.483529,0.350000,0.5,0.5,2022.0
17770,0.0,0.354545,0.294118,0.389952,0.434483,0.459091,0.431710,0.406977,0.357143,0.754142,...,0.0572,0.1111,0.483229,0.172144,0.460190,0.472941,0.344231,0.5,0.5,2022.0


In [33]:
# Rename rolling columns to add a 10 so they'req distinquishable
rolling_cols = [f"{col}_10"for col in df_rolling.columns]
df_rolling.columns = rolling_cols

# add rolling df to the original df
df = pd.concat([df,df_rolling],axis=1)

# drop the first 10 games of the season
df = df.dropna()

df

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,stl%_max_opp_10,blk%_max_opp_10,tov%_max_opp_10,usg%_max_opp_10,ortg_max_opp_10,drtg_max_opp_10,total_opp_10,home_opp_10,won_10,season_10
243,0.0,0.522727,0.382353,0.523923,0.344828,0.333333,0.457245,0.255814,0.238095,0.708285,...,0.0628,0.0679,0.413522,0.124134,0.361611,0.449412,0.347115,0.4,0.8,2016.0
251,0.0,0.659091,0.426471,0.645933,0.620690,0.515152,0.562945,0.325581,0.238095,0.927655,...,0.0613,0.0772,0.469497,0.219641,0.394787,0.531765,0.324038,0.5,1.0,2016.0
252,0.0,0.386364,0.382353,0.358852,0.206897,0.181818,0.445368,0.511628,0.412698,0.827305,...,0.0625,0.1145,0.437841,0.138126,0.507109,0.360000,0.351923,0.6,0.4,2016.0
253,0.0,0.500000,0.382353,0.497608,0.344828,0.318182,0.475059,0.325581,0.349206,0.593932,...,0.0646,0.0759,0.512159,0.133633,0.277251,0.388235,0.308654,0.4,0.6,2016.0
256,0.0,0.318182,0.132353,0.500000,0.275862,0.272727,0.432304,0.581395,0.444444,0.879813,...,0.0741,0.0982,0.313312,0.179974,0.500000,0.471765,0.380769,0.5,0.4,2016.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17767,0.0,0.340909,0.367647,0.313397,0.517241,0.515152,0.469121,0.302326,0.285714,0.693116,...,0.0570,0.1113,0.471908,0.170603,0.431754,0.522353,0.348077,0.5,0.6,2022.0
17768,0.0,0.500000,0.411765,0.471292,0.310345,0.545455,0.267221,0.279070,0.222222,0.844807,...,0.0716,0.1171,0.374109,0.321566,0.642654,0.564706,0.392308,0.4,0.7,2022.0
17769,0.0,0.272727,0.220588,0.344498,0.379310,0.424242,0.408551,0.465116,0.476190,0.623104,...,0.0591,0.1113,0.483229,0.174711,0.438863,0.483529,0.350000,0.5,0.5,2022.0
17770,0.0,0.340909,0.294118,0.373206,0.379310,0.363636,0.466746,0.232558,0.174603,0.903151,...,0.0572,0.1111,0.483229,0.172144,0.460190,0.472941,0.344231,0.5,0.5,2022.0


In [34]:
# Creates a column for the next game 
def shift_col(team, col_name):
    next_col = team[col_name].shift(-1)
    return next_col

# adds the column
def add_col(df,col_name):
    return df.groupby("team",group_keys=False).apply(lambda x: shift_col(x, col_name))

df["home_next"] = add_col(df, "home")
df["team_opp_next"] = add_col(df, "team_opp")
df["date_next"] = add_col(df, "date")

In [35]:
df = df.copy()

In [36]:
# pull in opponent data
full = df.merge(
    df[rolling_cols + ["team_opp_next","date_next","team"]],
    left_on=["team","date_next"], 
    right_on=["team_opp_next","date_next"])

In [37]:
full

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,tov%_max_opp_10_y,usg%_max_opp_10_y,ortg_max_opp_10_y,drtg_max_opp_10_y,total_opp_10_y,home_opp_10_y,won_10_y,season_10_y,team_opp_next_y,team_y
0,0.00,0.477273,0.500000,0.375598,0.379310,0.348485,0.483373,0.441860,0.396825,0.730455,...,0.380294,0.273427,0.270616,0.478824,0.308654,0.6,0.7,2016.0,SAC,TOR
1,0.00,0.340909,0.250000,0.413876,0.310345,0.257576,0.509501,0.511628,0.412698,0.827305,...,0.437212,0.124904,0.404739,0.408235,0.428846,0.2,0.3,2016.0,TOR,SAC
2,0.50,0.409091,0.455882,0.330144,0.482759,0.515152,0.437055,0.372093,0.412698,0.568261,...,0.504403,0.153273,0.344076,0.384706,0.319231,0.7,0.5,2016.0,CLE,DET
3,0.25,0.545455,0.544118,0.416268,0.413793,0.454545,0.419240,0.186047,0.142857,0.883314,...,0.467505,0.276508,0.352607,0.482353,0.316346,0.7,0.6,2016.0,GSW,TOR
4,0.00,0.340909,0.558824,0.186603,0.206897,0.469697,0.203088,0.139535,0.111111,0.854142,...,0.413732,0.156739,0.470142,0.391765,0.436538,0.6,0.1,2016.0,DEN,NOP
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15769,0.00,0.545455,0.426471,0.511962,0.448276,0.469697,0.440618,0.372093,0.365079,0.659277,...,0.457128,0.235173,0.562085,0.552941,0.429808,0.4,0.6,2022.0,BOS,GSW
15770,0.00,0.477273,0.455882,0.409091,0.517241,0.590909,0.414489,0.255814,0.222222,0.766628,...,0.471908,0.170603,0.431754,0.522353,0.348077,0.5,0.6,2022.0,GSW,BOS
15771,0.00,0.340909,0.367647,0.313397,0.517241,0.515152,0.469121,0.302326,0.285714,0.693116,...,0.431761,0.242875,0.567773,0.575294,0.394231,0.4,0.7,2022.0,BOS,GSW
15772,0.00,0.500000,0.411765,0.471292,0.310345,0.545455,0.267221,0.279070,0.222222,0.844807,...,0.483229,0.174711,0.438863,0.483529,0.350000,0.5,0.5,2022.0,GSW,BOS


In [38]:
full[["team_x","team_opp_next_x","team_y","team_opp_next_y","date_next"]]

Unnamed: 0,team_x,team_opp_next_x,team_y,team_opp_next_y,date_next
0,SAC,TOR,TOR,SAC,2015-11-15
1,TOR,SAC,SAC,TOR,2015-11-15
2,CLE,DET,DET,CLE,2015-11-17
3,GSW,TOR,TOR,GSW,2015-11-17
4,DEN,NOP,NOP,DEN,2015-11-17
...,...,...,...,...,...
15769,BOS,GSW,GSW,BOS,2022-06-10
15770,GSW,BOS,BOS,GSW,2022-06-13
15771,BOS,GSW,GSW,BOS,2022-06-13
15772,GSW,BOS,BOS,GSW,2022-06-16


In [39]:
removed_columns = list(full.columns[full.dtypes == "object"]) + removed_columns

In [40]:
removed_columns

['team_x',
 'team_opp',
 'date',
 'team_opp_next_x',
 'date_next',
 'team_opp_next_y',
 'team_y',
 'season',
 'date',
 'won',
 'target',
 'team_opp',
 'team']

In [41]:
selected_columns = full.columns[~full.columns.isin(removed_columns)]

In [43]:
full2 = full.drop(columns=['team_x',
                           'team_opp',
                           'date',
                           'team_opp_next_x',
                           'date_next',
                          'team_opp_next_y',
                          'team_y',
                          'season',
                          'date',
                          'won',
                          'target',
                          'team_opp'])

In [44]:
sfs.fit(full2[selected_columns], full["target"])

In [46]:
predictors = list(selected_columns[sfs.get_support()])

In [47]:
predictors

['mp',
 'orb',
 'ast',
 'tov',
 'usg%',
 'pf_max',
 'trb%_max',
 'stl%_max',
 'mp_opp',
 'usg%_opp',
 'usg%_10_x',
 'ft%_max_10_x',
 '3par_max_10_x',
 'usg%_opp_10_x',
 'stl_max_opp_10_x',
 'won_10_x',
 'home_next',
 'drb_10_y',
 'trb%_10_y',
 'usg%_10_y',
 'ft_max_10_y',
 'efg%_max_10_y',
 'tov%_max_10_y',
 'trb%_opp_10_y',
 'usg%_opp_10_y',
 'fga_max_opp_10_y',
 'fta_max_opp_10_y',
 'ft%_max_opp_10_y',
 'orb%_max_opp_10_y',
 'won_10_y']

In [48]:
predictions = backtest(full,rr,predictors)

In [49]:
accuracy_score(predictions["actual"], predictions["prediction"])

0.6296296296296297

In [50]:
sfs.get_support().size

413

In [51]:
selected_columns

Index(['mp', 'fg', 'fga', 'fg%', '3p', '3pa', '3p%', 'ft', 'fta', 'ft%',
       ...
       'stl%_max_opp_10_y', 'blk%_max_opp_10_y', 'tov%_max_opp_10_y',
       'usg%_max_opp_10_y', 'ortg_max_opp_10_y', 'drtg_max_opp_10_y',
       'total_opp_10_y', 'home_opp_10_y', 'won_10_y', 'season_10_y'],
      dtype='object', length=413)

Next Steps<br>
-Use something else beside ridge classifier <br>
-try different number of features <br>
-try different rolling averages <br>
-compare last 5 games to last 10 games <br>
-predict future rows, insert up to date box scores