Setup.
Load the engineered playoff features.

In [13]:
# from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, randint
import pandas as pd
import numpy as np


# Feature-name lists, grouped by type.
# season_start_year is deliberately left out of these lists: per the original note it is
# meant to be dropped after splitting, and the target is not listed either.
# NOTE(review): neither list is referenced by the code visible in this notebook, and
# 'rs_rating_diff' / 'po_rating_diff' do not appear among the columns shown by head()
# below -- confirm whether they are still needed.
numerical_features = [
    'rs_rating_diff',
    'po_rating_diff',
    'series_diff',
    'series_game_number',
]
categorical_features = ['team_a_home']

# Read the feature table, discard rows with any missing value, and remove the
# identifier columns. season_start_year stays for now because the seasonal
# CV / test splitting further down needs it.
games_df = (
    pd.read_csv("output/playoff_features.csv")
    .dropna()
    .drop(columns=['game_id', 'game_date', 'team_a_name', 'team_b_name'])
)
games_df.head()

Unnamed: 0,team_a_home,series_game_number,team_a_series_wins,team_b_series_wins,series_diff,season_start_year,team_a_win,team_a_po_rating,team_a_po_rating_var,team_b_po_rating,team_b_po_rating_var,team_a_rs_rating,team_a_rs_rating_var,team_b_rs_rating,team_b_rs_rating_var
0,1,1.0,0,0,0,2009,1,30.566319,9.632559,24.958216,8.52126,39.456857,6.396895,31.614513,5.668099
1,1,2.0,1,0,1,2009,1,30.403871,9.433143,24.817969,9.065302,39.456857,6.396895,31.614513,5.668099
2,0,3.0,2,0,2,2009,0,30.330679,9.263566,24.686693,8.728343,39.456857,6.396895,31.614513,5.668099
3,0,4.0,2,1,1,2009,1,32.113724,9.537321,28.347514,9.273886,39.456857,6.396895,31.614513,5.668099
4,1,5.0,3,1,2,2009,1,28.617827,8.617034,24.799651,8.607696,39.456857,6.396895,31.614513,5.668099


A preprocessing pipeline isn't needed for this model, so we define the classifier directly.

In [14]:
# Baseline gradient-boosted classifier with hand-picked starting hyperparameters;
# used for the first cross-validated score before any tuning.
model = XGBClassifier(
    objective='binary:logistic',
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,        # row subsampling per tree
    colsample_bytree=0.8, # column subsampling per tree
    scale_pos_weight=1,
    # `random_state` is the documented seed parameter of the scikit-learn wrapper.
    # The previous `seed=42` is not a declared parameter in current XGBoost releases
    # and was only forwarded through **kwargs, which XGBoost documents as not
    # guaranteed to interact properly with scikit-learn utilities.
    random_state=42,
)

Set up the train/test split and the rolling-window cross-validation splits, then fit the model. (No normalization pipeline is applied in the code below.)
This gives the first basic result.

In [15]:
# Hold out the most recent season as the test set; every earlier season is
# available for training and cross-validation.
seasons = sorted(games_df['season_start_year'].unique())
train_seasons = seasons[:-1]  # every season except the latest
test_season = seasons[-1]     # the latest season

train_df = games_df.loc[games_df['season_start_year'].isin(train_seasons)]
test_df = games_df.loc[games_df['season_start_year'] == test_season]

# Separate the target column from the test features.
y_test = test_df["team_a_win"]
X_test = test_df.drop(columns="team_a_win")


# rolling-window cross-validation splits: train on `train_size` consecutive seasons, validate on the season that follows
def rolling_window_splits(df : pd.DataFrame, season_col="season_start_year", train_size=10, n_splits=5):
    """Yield rolling-window (train, validation) index pairs, one per CV fold.

    Fold ``i`` trains on the ``train_size`` consecutive seasons starting at the
    ``i``-th distinct season (in sorted order) and validates on the single
    season immediately after that window.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing ``season_col``. Its row order must match the ``X`` / ``y``
        passed to scikit-learn together with these splits.
    season_col : str
        Name of the column holding the season identifier.
    train_size : int
        Number of consecutive seasons in each training window.
    n_splits : int
        Number of folds to generate.

    Yields
    ------
    (train_idx, val_idx) : tuple of np.ndarray
        *Positional* (0-based) row indices into ``df``. scikit-learn interprets
        ``cv`` splits as positions, so index labels must not be used here: after
        ``dropna()`` or boolean filtering the labels of ``df`` no longer coincide
        with row positions. Use ``df.iloc[...]`` to select rows manually.

    Raises
    ------
    IndexError
        If ``df`` has fewer than ``train_size + n_splits`` distinct seasons.
        Raised before the first fold is produced.
    """
    season_values = df[season_col]

    # sorted list of unique seasons
    seasons = sorted(season_values.unique())

    # Fail early with a readable message instead of an opaque "list index out of
    # range" part-way through the folds.
    if n_splits > 0 and len(seasons) < train_size + n_splits:
        raise IndexError(
            f"rolling_window_splits needs at least {train_size + n_splits} distinct "
            f"seasons (train_size={train_size}, n_splits={n_splits}) but "
            f"'{season_col}' has only {len(seasons)}"
        )

    # for each CV split...
    for i in range(n_splits):
        train_seasons = seasons[i : i + train_size]
        val_season = seasons[i + train_size]

        # Boolean masks -> positional indices (independent of df's index labels).
        train_idx = np.flatnonzero(season_values.isin(train_seasons).to_numpy())
        val_idx = np.flatnonzero((season_values == val_season).to_numpy())
        yield train_idx, val_idx

# Features / target for every training season.
# NOTE(review): only the target is removed here, so season_start_year is still one of
# the model's input features even though the setup cell says it will be dropped -- confirm.
y_train = train_df["team_a_win"]
X_train = train_df.drop(columns="team_a_win")

# Baseline score of the untuned model over the rolling-window folds.
# No `scoring` is passed, so the estimator's own score method is used.
scores = cross_val_score(
    model,
    X_train,
    y_train,
    cv=rolling_window_splits(train_df, train_size=10, n_splits=5),
    n_jobs=-1,
)
print(scores)
print(f"Mean CV score: {scores.mean():.3f} +/- {scores.std():.3f}")

# loop of CV splits
# for train_idx, val_idx in rolling_window_splits(games_df, train_size=10, n_splits=5):
#     train_df = games_df.loc[train_idx].drop(columns=["season_start_year"])
#     val_df = games_df.loc[val_idx].drop(columns=["season_start_year"])
    
#     X_train, y_train = train_df.drop("team_a_win", axis=1), train_df["team_a_win"]
#     X_val, y_val = val_df.drop("team_a_win", axis=1), val_df["team_a_win"]
    
#     pipeline.fit(X_train, y_train)
    

[0.55421687 0.49411765 0.55172414 0.45238095 0.5       ]
Mean CV score: 0.510 +/- 0.038


Hyperparameter tuning: start with a randomized search.

In [16]:

# Sampling distributions for the randomized search.
# scipy's uniform(loc, scale) samples from [loc, loc + scale];
# randint(low, high) samples integers from low up to (but excluding) high.
param_dist = dict(
    n_estimators=randint(100, 800),
    max_depth=randint(3, 10),
    learning_rate=uniform(0.01, 0.3),      # 0.01 to 0.31
    subsample=uniform(0.6, 0.4),           # 0.6 to 1.0
    colsample_bytree=uniform(0.6, 0.4),    # 0.6 to 1.0
    min_child_weight=randint(1, 10),
    gamma=uniform(0, 0.5),                 # 0 to 0.5
    reg_alpha=uniform(0, 1),               # L1 regularization, 0 to 1
    reg_lambda=uniform(0.5, 1.0),          # L2 regularization, 0.5 to 1.5
)

# Fresh estimator for tuning; the search overrides the sampled hyperparameters.
xgb = XGBClassifier(eval_metric='logloss')

search = RandomizedSearchCV(
    xgb,
    param_dist,
    n_iter=100,             # number of random configurations to evaluate
    scoring='accuracy',     # alternatives: 'neg_log_loss' / 'roc_auc'
    cv=rolling_window_splits(train_df, train_size=10, n_splits=5),
    random_state=42,
    n_jobs=-1,
    verbose=1,
)

# Run the search on the training seasons and report the winning configuration.
search.fit(X_train, y_train)
print(search.best_params_)




Fitting 5 folds for each of 100 candidates, totalling 500 fits
{'colsample_bytree': np.float64(0.9497270419417668), 'gamma': np.float64(0.25761802743210543), 'learning_rate': np.float64(0.3019331047810313), 'max_depth': 7, 'min_child_weight': 3, 'n_estimators': 502, 'reg_alpha': np.float64(0.8217906368444492), 'reg_lambda': np.float64(0.8450826278223867), 'subsample': np.float64(0.7390476857930735)}


Grid search around the best parameters found by the random search.

In [17]:

# random_search_best_params = {'colsample_bytree': np.float64(0.9497270419417668), 
#                              'gamma': np.float64(0.25761802743210543), 
#                              'learning_rate': np.float64(0.3019331047810313), 
#                              'max_depth': 7, 
#                              'min_child_weight': 3, 
#                              'n_estimators': 502, 
#                              'reg_alpha': np.float64(0.8217906368444492), 
#                              'reg_lambda': np.float64(0.8450826278223867), 
#                              'subsample': np.float64(0.7390476857930735)}


def generate_search_grid(params):
    """Build a small grid-search neighbourhood around a set of best parameters.

    For every entry of ``params`` the grid holds the value itself plus one
    candidate on either side:

    * integers (Python or numpy): ``value -/+ max(1, value // 10)``;
    * floats (Python or numpy): ``value -/+ 10%``, with the lower candidate
      floored at 0.001. The upper candidate is capped at 1 only when the value
      itself is at most 1, so a value above 1 no longer gets an "upper"
      candidate that lies below it;
    * anything else (strings, booleans, ``None``, ...): pinned to the single
      given value, so it is carried into the grid search instead of being
      silently dropped and replaced by the estimator's default.

    Duplicate candidates (e.g. when the cap or floor coincides with the value)
    are removed so the grid does not repeat identical fits.

    Parameters
    ----------
    params : dict
        Mapping of parameter name to its best value, e.g. ``search.best_params_``.

    Returns
    -------
    dict
        Mapping of parameter name to a list of candidate values, suitable as
        ``param_grid`` for ``GridSearchCV``.

    Notes
    -----
    The integer lower candidate is not clamped, so a best value of 1 produces a
    candidate of 0; check that this is valid for the parameter in question.
    """
    grid = {}
    for key, value in params.items():
        if isinstance(value, (bool, np.bool_)):
            # bool is a subclass of int; treat it as categorical rather than numeric.
            candidates = [value]
        elif isinstance(value, (int, np.integer)):
            step = max(1, value // 10)
            candidates = [value - step, value, value + step]
        elif isinstance(value, (float, np.floating)):
            lower = max(.001, value - 0.1 * value)
            upper = value + 0.1 * value
            if value <= 1:
                # Fraction-like parameters (e.g. subsample) must not exceed 1.
                upper = min(1, upper)
            candidates = [lower, value, upper]
        else:
            candidates = [value]
        # Order-preserving de-duplication.
        grid[key] = list(dict.fromkeys(candidates))
    return grid

# Candidate values in a small neighbourhood of the random-search optimum.
param_grid = generate_search_grid(search.best_params_)
# print(param_grid)

# Exhaustive search over that neighbourhood, using the same rolling-window folds.
grid = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    cv=rolling_window_splits(train_df, train_size=10, n_splits=5),
    n_jobs=-1,
)
grid.fit(X_train, y_train)

print(f"Best CV score: {grid.best_score_:.3f}")
print(f"Best params: {grid.best_params_}")

Best CV score: 0.567
Best params: {'colsample_bytree': np.float64(0.9497270419417668), 'gamma': np.float64(0.283379830175316), 'learning_rate': np.float64(0.3019331047810313), 'max_depth': 8, 'min_child_weight': 4, 'n_estimators': 452, 'reg_alpha': np.float64(0.9039697005288941), 'reg_lambda': np.float64(0.8450826278223867), 'subsample': np.float64(0.8129524543723808)}


Train the final model on the full training set using the best parameters from the grid search, then evaluate it on the held-out test season.

In [None]:
# Take the winning estimator from the grid search and fit it on all training seasons.
# NOTE(review): `grid` was created without overriding `refit`, so best_estimator_
# should already be fitted on X_train; this explicit fit repeats that work -- confirm
# whether it is still wanted.
post_cv_best_model = grid.best_estimator_.fit(X_train, y_train)

# Score on the held-out test season.
test_score = post_cv_best_model.score(X_test, y_test)
print(f"Test score: {test_score:.3f}")

# Persist the fitted model to disk.
import joblib
joblib.dump(post_cv_best_model, "output/boosted_tree.pkl")


Test score: 0.508


['output/logistig_reg_model.pkl']