In [67]:
import torch
# ! pip install numpy==2.0
print(torch.__version__)

2.2.2


## Preparing Data For Machine Learning

In [127]:
import pandas as pd

# index_col indicates what column to use as indexes (row labels) of the dataframe. 
# 1 row = 1 game
df = pd.read_csv('nba_games.csv', index_col=0)
# sort values by column with header 'date', 8 seasons of data 2015-2022
df = df.sort_values('date')
# reset indexes for sorted values, need drop=True to reassign index values
df = df.reset_index(drop=True)
# delete unwanted and/or repeated columns
del df["mp.1"]
del df["mp_opp.1"]
del df["index_opp"]

# function when applied helps group data by individual team and shifts "won" category one column to the left
def add_target(team):
    team = team.copy()
    team["target"] = team["won"].shift(-1)
    return team

df = df.groupby("team", group_keys=False).apply(add_target)
df

  df = df.groupby("team", group_keys=False).apply(add_target)


Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won,target
0,240.0,35.0,83.0,0.422,6.0,18.0,0.333,19.0,27.0,0.704,...,43.7,206.0,104.0,GSW,111,1,2016,2015-10-27,False,False
1,240.0,38.0,94.0,0.404,9.0,29.0,0.310,10.0,17.0,0.588,...,34.6,162.0,104.0,CHI,97,1,2016,2015-10-27,False,True
2,240.0,37.0,87.0,0.425,7.0,19.0,0.368,16.0,23.0,0.696,...,29.0,138.0,105.0,CLE,95,0,2016,2015-10-27,True,True
3,240.0,41.0,96.0,0.427,9.0,30.0,0.300,20.0,22.0,0.909,...,38.9,201.0,120.0,NOP,95,0,2016,2015-10-27,True,True
4,240.0,37.0,82.0,0.451,8.0,27.0,0.296,12.0,15.0,0.800,...,23.6,132.0,104.0,DET,106,0,2016,2015-10-27,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17767,240.0,34.0,85.0,0.400,15.0,38.0,0.395,14.0,19.0,0.737,...,36.3,133.0,112.0,GSW,107,0,2022,2022-06-10,False,False
17768,240.0,41.0,88.0,0.466,9.0,40.0,0.225,13.0,15.0,0.867,...,94.4,300.0,112.0,BOS,94,0,2022,2022-06-13,True,True
17769,240.0,31.0,75.0,0.413,11.0,32.0,0.344,21.0,31.0,0.677,...,36.2,222.0,107.0,GSW,104,1,2022,2022-06-13,False,False
17770,240.0,34.0,80.0,0.425,11.0,28.0,0.393,11.0,12.0,0.917,...,31.5,186.0,111.0,GSW,103,0,2022,2022-06-16,False,


In [128]:
# example of only washington games being included
df[df["team"] == "WAS"]



Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won,target
11,240.0,33.0,84.0,0.393,7.0,28.0,0.250,15.0,25.0,0.600,...,25.2,134.0,98.0,ORL,87,1,2016,2015-10-28,True,True
63,240.0,35.0,68.0,0.515,12.0,21.0,0.571,36.0,44.0,0.818,...,32.4,138.0,122.0,MIL,113,1,2016,2015-10-30,True,False
69,240.0,38.0,90.0,0.422,6.0,22.0,0.273,28.0,33.0,0.848,...,28.7,153.0,108.0,NYK,117,0,2016,2015-10-31,False,True
124,240.0,42.0,87.0,0.483,8.0,23.0,0.348,10.0,14.0,0.714,...,30.0,160.0,109.0,SAS,99,0,2016,2015-11-04,True,False
161,240.0,36.0,88.0,0.409,8.0,25.0,0.320,18.0,23.0,0.783,...,41.6,146.0,103.0,BOS,118,1,2016,2015-11-06,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17471,240.0,38.0,82.0,0.463,8.0,21.0,0.381,18.0,20.0,0.900,...,34.4,245.0,116.0,BOS,144,1,2022,2022-04-03,False,True
17483,240.0,56.0,101.0,0.554,13.0,30.0,0.433,7.0,14.0,0.500,...,32.8,168.0,132.0,MIN,114,1,2022,2022-04-05,True,False
17510,240.0,42.0,86.0,0.488,10.0,35.0,0.286,9.0,11.0,0.818,...,60.2,159.0,114.0,ATL,118,1,2022,2022-04-06,False,False
17538,240.0,35.0,69.0,0.507,9.0,28.0,0.321,13.0,17.0,0.765,...,35.5,135.0,104.0,NYK,114,0,2022,2022-04-08,False,False


In [123]:
# Assign a 2 to null values for target column, meaning no data for the last game
df["target"][pd.isnull(df["target"])] = 2
# convert target data from boolean data type to an integer
# so 0 = loss, 1 = win, 2 means don't have data for the next game
df["target"] = df["target"].astype(int, errors="ignore")

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df["target"][pd.isnull(df["target"])] = 2
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["target"][pd.isnul

In [124]:
# get rid of columns with null values from dataframe
nulls = pd.isnull(df)
nulls = nulls.sum()
nulls = nulls[nulls > 0]
valid_columns = df.columns[~df.columns.isin(nulls.index)]
df = df[valid_columns].copy()
df

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won,target
0,240.0,35.0,83.0,0.422,6.0,18.0,0.333,19.0,27.0,0.704,...,43.7,206.0,104.0,GSW,111,1,2016,2015-10-27,False,0
1,240.0,38.0,94.0,0.404,9.0,29.0,0.310,10.0,17.0,0.588,...,34.6,162.0,104.0,CHI,97,1,2016,2015-10-27,False,1
2,240.0,37.0,87.0,0.425,7.0,19.0,0.368,16.0,23.0,0.696,...,29.0,138.0,105.0,CLE,95,0,2016,2015-10-27,True,1
3,240.0,41.0,96.0,0.427,9.0,30.0,0.300,20.0,22.0,0.909,...,38.9,201.0,120.0,NOP,95,0,2016,2015-10-27,True,1
4,240.0,37.0,82.0,0.451,8.0,27.0,0.296,12.0,15.0,0.800,...,23.6,132.0,104.0,DET,106,0,2016,2015-10-27,False,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17767,240.0,34.0,85.0,0.400,15.0,38.0,0.395,14.0,19.0,0.737,...,36.3,133.0,112.0,GSW,107,0,2022,2022-06-10,False,0
17768,240.0,41.0,88.0,0.466,9.0,40.0,0.225,13.0,15.0,0.867,...,94.4,300.0,112.0,BOS,94,0,2022,2022-06-13,True,1
17769,240.0,31.0,75.0,0.413,11.0,32.0,0.344,21.0,31.0,0.677,...,36.2,222.0,107.0,GSW,104,1,2022,2022-06-13,False,0
17770,240.0,34.0,80.0,0.425,11.0,28.0,0.393,11.0,12.0,0.917,...,31.5,186.0,111.0,GSW,103,0,2022,2022-06-16,False,2


## Selecting the best features for the model

In [None]:
# importing sklearn functions
# ensures you're using past games to predict future games
from sklearn.model_selection import TimeSeriesSplit
from sklearn.feature_selection import SequentialFeatureSelector
# importing a specific machine learnign model called RidgeClassifier
from sklearn.linear_model import RidgeClassifier
# import scaler to scale desired values to between 0 and 1
from sklearn.preprocessing import MinMaxScaler

# assign variables for utilizing functions
rr = RidgeClassifier(alpha=1)
split = TimeSeriesSplit(n_splits=3)
# uses 30/142 columns of features model will think are best to use
sfs = SequentialFeatureSelector(rr, n_features_to_select=30, direction="forward", cv=split)
scaler = MinMaxScaler()

# removed_columns are values you don't want to scale
removed_columns = ["season", "date", "won", "target", "team", "team_opp"]
# select remaining columns outside of removed_columns for scaling
selected_columns = df.columns[~df.columns.isin(removed_columns)]
df[selected_columns] = scaler.fit_transform(df[selected_columns])


In [75]:
# use Sequential Features Selector to fit best data from selected_columns and target
sfs.fit(df[selected_columns], df["target"])

In [80]:
# select best features of the selected columns
predictors = list(selected_columns[sfs.get_support()])

## Creating a baseline model

In [88]:
# function makes predictions for us
# ensures past seasons are used for predicting future seasons
# start=2 ensures we use at least 2 seasons of data before making predictions for the following season
# step=1 means we're only making predictions for one season at a time
def backtest(data, model, predictors, start=2, step=1):
    all_predictions = []

    # sorts all season data by season
    seasons = sorted(data["season"].unique())

    for i in range(start, len(seasons), step):
        # i starts at 2
        season = seasons[i]

        # grabs 2016, 2017
        train = data[data["season"] < season]
        # grabs 2018
        test = data[data["season"] == season]

        # utilize model per prediction interation
        model.fit(train[predictors], train["target"])

        # make predictions based on test set and help us understand
        # how good model is based on historical data
        preds = model.predict(test[predictors])
        preds = pd.Series(preds, index=test.index)

        # treat target and prediction as two separate columns wth axis=1
        combined = pd.concat([test["target"], preds], axis=1)
        combined.columns = ["actual", "prediction"]

        all_predictions.append(combined)

    return pd.concat(all_predictions)

In [94]:
# assign result of model run to predictions variable
predictions = backtest(df, rr, predictors)
predictions

Unnamed: 0,actual,prediction
5250,1,1
5251,1,1
5252,0,0
5253,1,0
5254,0,1
...,...,...
17767,0,0
17768,1,1
17769,0,1
17770,2,1


In [116]:
# current model is correct 54.7% of the time at predicting wins
from sklearn.metrics import accuracy_score

accuracy_score(predictions["actual"], predictions["prediction"])

0.5471969333972209

In [117]:
#group data by home games, won 57% of time at home
df.groupby("home").apply(lambda x: x[x["won"] == 1].shape[0] / x.shape[0])

  df.groupby("home").apply(lambda x: x[x["won"] == 1].shape[0] / x.shape[0])


home
0    0.428314
1    0.571686
dtype: float64

# Documented Attempts to Improving Model Performance and Those Results:
- As opposed to only using one game to predict the next game, we could take an average of how current team performed in last 10 games from current season, using rolling averages
- Factor in their home and away games, as home has a higher winning percentage
- Also merge in opponent's team performance in their last 10 games, performing similar operations to it such as removing invalid columns.
- Adding this info to backtest improves accuracy from 54.7 -> 62.9%

# Suggestions on how to further Improve Model Performance
- Use a more powerful model then a Ridge Regression Model
- Try out specific different features as opposed to model randomly selecting 30
- Try different rolling averages other than 10 games
- Identify Hot Streaks and Cold Streaks

