In [3]:
%pip install xgboost

Collecting xgboost
  Downloading xgboost-2.1.4-py3-none-win_amd64.whl (124.9 MB)
Installing collected packages: xgboost
Successfully installed xgboost-2.1.4
Note: you may need to restart the kernel to use updated packages.


In [4]:
from pathlib import Path

import joblib
import pandas as pd
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import TimeSeriesSplit
from xgboost import XGBClassifier

In [5]:
# 1. Read the rolling-averages table; the first CSV column becomes the index.
full = pd.read_csv("../data/nba_games_rolling_averages.csv", index_col=0)

# 2. Collect the columns that must not be offered to the feature selector:
#    every object-dtype (non-numeric) column, plus the explicitly named
#    season/date/team and target-related columns.
object_columns = full.dtypes[full.dtypes == "object"].index.tolist()
named_exclusions = ['season', 'date', 'won', 'target', 'team', 'team_opp']
removed_columns = object_columns + named_exclusions

# Whatever is left is a candidate feature column.
is_candidate = ~full.columns.isin(removed_columns)
selected_columns = full.columns[is_candidate]

In [12]:
# 3. Configure the XGBoost classifier used as the estimator during feature
#    selection. Hyperparameters are grouped by role and unpacked into the
#    constructor.
xgb_params = {
    # --- boosting schedule ---
    "n_estimators": 120,        # boosting rounds; kept small so SFS stays fast
    "learning_rate": 0.1,       # step-size shrinkage per boosting round
    # --- tree shape / split conservativeness ---
    "max_depth": 3,             # shallow trees: quicker to train, less prone to fit noise
    "min_child_weight": 5,      # minimum summed instance weight required in a child
    "gamma": 0,                 # minimum loss reduction to split; 0 allows any beneficial split
    # --- row / column subsampling ---
    "subsample": 0.8,           # fraction of training rows sampled per tree
    "colsample_bytree": 0.8,    # fraction of features sampled per tree
    # --- regularisation ---
    "reg_alpha": 0.0,           # L1 penalty
    "reg_lambda": 1.0,          # L2 penalty
    # --- runtime / reproducibility ---
    "tree_method": "hist",      # histogram-based tree construction
    "n_jobs": 4,                # training threads
    "random_state": 42,         # fixed seed
    "eval_metric": "logloss",   # log loss for binary classification
}
xgb = XGBClassifier(**xgb_params)

In [13]:
# 4. Cross-validation scheme used to score candidate feature sets.
#    NOTE(review): TimeSeriesSplit folds by row position, so this presumably
#    relies on the rows of `full` being in chronological order — confirm.
split = TimeSeriesSplit(n_splits=3)

# 5. Forward sequential feature selection driven by the XGBoost estimator:
#    features are added one at a time until 30 have been chosen.
sfs = SequentialFeatureSelector(
    estimator=xgb,
    cv=split,
    direction="forward",
    n_features_to_select=30,
)

In [14]:
# 6. Run the feature selection: candidate feature columns against the
#    "target" label. The fit call stays last so the fitted selector is displayed.
candidate_features = full[selected_columns]
labels = full["target"]
sfs.fit(candidate_features, labels)

SequentialFeatureSelector(cv=TimeSeriesSplit(gap=0, max_train_size=None, n_splits=3, test_size=None),
                          estimator=XGBClassifier(base_score=None, booster=None,
                                                  callbacks=None,
                                                  colsample_bylevel=None,
                                                  colsample_bynode=None,
                                                  colsample_bytree=0.8,
                                                  device=None,
                                                  early_stopping_rounds=None,
                                                  enable_categorical=False,
                                                  eval_metric='logloss',
                                                  feature_types=None, gamma=0,
                                                  grow_...one,
                                                  importance_type=None,
                            

In [15]:
# Persist the fitted feature selector and the XGBoost estimator.
# joblib.dump does not create missing parent directories, so make sure the
# output folder exists first; otherwise a fresh checkout fails here, after the
# expensive selection step has already run.
Path('../models/xg_boost').mkdir(parents=True, exist_ok=True)

joblib.dump(sfs, '../models/xg_boost/feature_selector_xgb.pkl')
# NOTE(review): `xgb` is never fitted in the cells shown here, and
# SequentialFeatureSelector presumably trains clones of its estimator, so this
# pickle appears to hold an unfitted (configured-only) XGBClassifier. Confirm
# that whatever loads it calls .fit() before predicting.
joblib.dump(xgb, '../models/xg_boost/model_xgb.pkl')

['../models/xg_boost/model_xgb.pkl']