In [2]:
import pandas as pd
# Load the games table with rolling averages; the first CSV column becomes the index.
ROLLING_AVERAGES_CSV = "../data/nba_games_rolling_averages.csv"
full = pd.read_csv(ROLLING_AVERAGES_CSV, index_col=0)

In [3]:
from sklearn.model_selection import TimeSeriesSplit     # import for feature selection (only want to train with certain data)
# ensures we are only using past data to predict future outcomes (not vice versa!)

from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import RidgeClassifier

# The model itself: a ridge classifier with regularisation strength alpha=0.1.
rr = RidgeClassifier(alpha=0.1)

# Cross-validation strategy: 3 time-ordered splits.
split = TimeSeriesSplit(n_splits=3)

# Feature selection strategy: forward selection that scores `rr` under the
# `split` cross-validation and stops once 30 features have been chosen.
sfs = SequentialFeatureSelector(
    rr,
    n_features_to_select=30,
    direction="forward",
    cv=split,
)

In [4]:
# Choose the candidate feature columns that are handed to the feature selector.
#
# Text columns are dropped by keeping only numeric/boolean dtypes. Selecting by
# dtype (rather than comparing dtypes to "object") also excludes text columns
# when pandas infers its dedicated string dtype instead of `object`.
model_ready_columns = set(full.select_dtypes(include=["number", "bool"]).columns)
non_numeric_columns = [col for col in full.columns if col not in model_ready_columns]

# These columns are never used as features, whatever their dtype
# (`target` is the label being predicted).
removed_columns = non_numeric_columns + ['season', 'date', 'won', 'target', 'team', 'team_opp']
selected_columns = full.columns[~full.columns.isin(removed_columns)] # keep every column not listed in removed_columns

In [None]:
# Refit the feature selector on the rolling-average data.
# NOTE(review): `sfs` cross-validates with TimeSeriesSplit, which splits by row
# position -- this assumes the rows of `full` are in chronological order; TODO confirm.
candidate_features = full[selected_columns]
labels = full["target"]
sfs.fit(candidate_features, labels)

In [None]:
import joblib

# Save the fitted feature selector to a file for later use.
joblib.dump(sfs, '../models/ridge_regression/feature_selector_optimized.pkl')

# SequentialFeatureSelector only fits clones of `rr`, so `rr` itself is still
# untrained at this point. Fit it on the selected features before saving;
# otherwise the pickle would hold a model that cannot predict.
predictors = list(selected_columns[sfs.get_support()])
rr.fit(full[predictors], full["target"])

# Save the trained model to a file for later use.
joblib.dump(rr, '../models/ridge_regression/model_optimized.pkl')