In [2]:
import joblib
# Restore the two persisted artifacts for the XGBoost experiment.
# NOTE: joblib.load unpickles its input, so only point it at files you trust.
xgb = joblib.load('../models/xg_boost/model_xgb.pkl')  # the saved model
sfs = joblib.load('../models/xg_boost/feature_selector_xgb.pkl')  # the saved feature selector

In [4]:
import pandas as pd

# Read the rolling-averages table; the first CSV column is used as the index.
full = pd.read_csv("../data/nba_games_rolling_averages.csv", index_col=0)

# Columns that are never candidate features: every object-dtype column,
# plus the columns named explicitly below.
removed_columns = full.columns[full.dtypes == "object"].tolist() + [
    'season',
    'date',
    'won',
    'target',
    'team',
    'team_opp',
]

# Remaining columns, in their original order.
selected_columns = full.columns[~full.columns.isin(removed_columns)]

# Keep only the candidates flagged by the loaded selector's support mask.
# NOTE(review): the mask length must equal len(selected_columns) - confirm the
# selector was fitted on exactly this column set.
predictors = selected_columns[sfs.get_support()].tolist()

In [5]:
def backtest(data, model, predictors, start=2, step=1):
    """Walk-forward evaluation of ``model``, one season at a time.

    The data is a time series, so ordinary shuffled cross-validation would let
    the model train on the future. Instead, every tested season is predicted
    by a model fitted only on the seasons strictly before it.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a "season" column, a "target" column and every column
        named in ``predictors``.
    model : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is refitted in
        place on every iteration, so after the call it holds the last fit.
    predictors : list
        Names of the columns passed to the model as inputs.
    start : int, default 2
        Position, within the sorted unique seasons, of the first season to
        predict. With the default, the two earliest seasons are training-only
        (ex: using 2016/2017 to predict 2018).
    step : int, default 1
        Stride through the sorted seasons; values above 1 skip seasons.

    Returns
    -------
    pandas.DataFrame
        Columns "actual" and "prediction", indexed like the tested rows of
        ``data``. Empty (same two columns) when no season can be tested, i.e.
        when there are ``start`` or fewer distinct seasons.
    """
    seasons = sorted(data["season"].unique())

    season_frames = []  # one actual-vs-prediction frame per tested season
    for position in range(start, len(seasons), step):
        season = seasons[position]
        train = data[data["season"] < season]  # everything BEFORE this season
        test = data[data["season"] == season]  # the season being predicted

        model.fit(train[predictors], train["target"])

        # Predict on rows the model has not been fitted on; re-attach the
        # test index so predictions line up with the actual outcomes.
        preds = pd.Series(model.predict(test[predictors]), index=test.index)

        combined = pd.concat([test["target"], preds], axis=1)
        combined.columns = ["actual", "prediction"]
        season_frames.append(combined)

    if not season_frames:
        # pd.concat([]) raises "No objects to concatenate"; return an empty
        # result with the documented columns instead.
        return pd.DataFrame(columns=["actual", "prediction"])

    return pd.concat(season_frames)

In [6]:
# Walk-forward backtest of the loaded XGBoost model on the selected predictor columns.
predictions = backtest(data=full, model=xgb, predictors=predictors)

In [None]:
from sklearn.metrics import accuracy_score
# Share of backtested rows whose prediction equals the actual outcome.
accuracy_score(y_true=predictions["actual"], y_pred=predictions["prediction"])


# ~63.2% accuracy with a lightweight XGBoost model - a slight DECLINE from Ridge Regression (64%)
# a greater number of trees (n_estimators) would likely have performed better
# however, the model's complexity and training time would also increase
# ex: my 500-tree (n_estimators=500) model ran for 10 hours without finishing
# trade-off: accuracy vs. training time!

0.6317923763179237