In [7]:
import pandas as pd

# Load the game-level dataset (presumably already feature-scaled, per the file name).
# The first CSV column is used as the DataFrame index rather than as a feature.
data = pd.read_csv(
    "../data/nba_games_scaled.csv",
    index_col=0,
)

In [8]:
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import RidgeClassifier
from sklearn.model_selection import TimeSeriesSplit

# The model itself: ridge regression wrapped as a classifier.
rr = RidgeClassifier(alpha=0.1)

# Cross-validation strategy: 3 time-series splits, so every fold trains on earlier rows
# and validates on later ones (past data predicts future outcomes, never the reverse).
# NOTE(review): this relies on the rows of `data` being in chronological order - confirm upstream.
split = TimeSeriesSplit(n_splits=3)

# Feature-selection strategy: forward selection that keeps 30 features, scoring
# candidate feature sets by training `rr` under the time-series splits above.
sfs = SequentialFeatureSelector(
    rr,
    n_features_to_select=30,
    direction="forward",
    cv=split,
)

In [9]:
# Columns that must never be offered to the model as predictors
# (identifiers, dates, and the outcome/label columns themselves).
# Per the original notes, these are also the columns that were left out when the
# remaining features were scaled to the 0-1 range for ridge regression.
removed_columns = [
    "season",
    "date",
    "won",
    "target",
    "team",
    "team_opp",
]

In [10]:
# Candidate predictors: every column of `data` that is not listed in `removed_columns`.
# A boolean mask (rather than a set difference) keeps the original column order.
is_removed = data.columns.isin(removed_columns)
selected_columns = data.columns[~is_removed]

In [11]:
# Fit the feature selector. Forward selection starts from no features and greedily adds
# the one that most improves the cross-validated score of `rr`, until 30 are selected.
# The label is the "target" column (per the original notes: the outcome of the NEXT game).
candidate_features = data[selected_columns]
labels = data["target"]
sfs.fit(candidate_features, labels)

SequentialFeatureSelector(cv=TimeSeriesSplit(gap=0, max_train_size=None, n_splits=3, test_size=None),
                          estimator=RidgeClassifier(alpha=0.1),
                          n_features_to_select=30)

In [12]:
import os

import joblib

# Make sure the output folder exists so the (expensive) fitted selector is not lost
# to a missing-directory error on a fresh checkout.
os.makedirs('../models/ridge_regression', exist_ok=True)

# Save the fitted feature selector to a file for later use.
joblib.dump(sfs, '../models/ridge_regression/feature_selector_unoptimized.pkl')

# SequentialFeatureSelector fits clones of its estimator, so `rr` itself is still
# unfitted at this point. Train it on the selected features before saving; otherwise
# the pickle would contain a model that cannot predict. Later code may still refit it
# on a different training window.
predictors = list(selected_columns[sfs.get_support()])
rr.fit(data[predictors], data["target"])

# Save the model to a file for later use (kept as the last expression so the cell
# still displays the written path).
joblib.dump(rr, '../models/ridge_regression/model_unoptimized.pkl')


['../models/ridge_regression/model_unoptimized.pkl']