Setup: import dependencies, then load the playoff data and build the model features.

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
import pandas as pd

# Feature groups consumed by the preprocessing pipeline further down.
# Numeric columns that get standardized:
normalized_features = [
    'rs_rating_diff',
    'po_rating_diff',
    'series_diff',
    'series_game_number',
]
# Columns passed through without scaling. season_start_year is deliberately left out:
# the pipeline runs after the season-based splitting and should not see that column.
# The target is already dropped from the feature frame by then.
other_features = ['team_a_home']

# polynomial features
# poly_feature_names = ['1', 'num__rs_rating_diff', 'num__po_rating_diff', 'num__series_diff',
#  'num__series_game_number', 'cat__team_a_home', 'num__rs_rating_diff^2',
#  'num__rs_rating_diff num__po_rating_diff',
#  'num__rs_rating_diff num__series_diff',
#  'num__rs_rating_diff num__series_game_number',
#  'num__rs_rating_diff cat__team_a_home', 'num__po_rating_diff^2',
#  'num__po_rating_diff num__series_diff',
#  'num__po_rating_diff num__series_game_number',
#  'num__po_rating_diff cat__team_a_home', 'num__series_diff^2',
#  'num__series_diff num__series_game_number',
#  'num__series_diff cat__team_a_home', 'num__series_game_number^2',
#  'num__series_game_number cat__team_a_home', 'cat__team_a_home^2']

# Load the playoff feature table, discard incomplete rows, derive the rating-difference
# features, and remove every column the model does not use. season_start_year is kept
# for now because the seasonal CV/test splitting needs it; it is dropped later.
raw_games = pd.read_csv("output/playoff_features.csv").dropna()

columns_to_drop = [
    # identifiers / bookkeeping
    'game_id', 'game_date', 'team_a_name', 'team_b_name', 'team_a_series_wins', 'team_b_series_wins',
    # raw ratings, replaced by the *_diff features
    'team_a_rs_rating', 'team_b_rs_rating', 'team_a_po_rating', 'team_b_po_rating',
    # rating variances
    'team_a_rs_rating_var', 'team_b_rs_rating_var', 'team_a_po_rating_var', 'team_b_po_rating_var',
]

games_df = (
    raw_games
    .assign(
        rs_rating_diff=raw_games["team_a_rs_rating"] - raw_games["team_b_rs_rating"],
        po_rating_diff=raw_games["team_a_po_rating"] - raw_games["team_b_po_rating"],
    )
    .drop(columns=columns_to_drop)
)
games_df.head()

Unnamed: 0,team_a_home,series_game_number,series_diff,season_start_year,team_a_win,rs_rating_diff,po_rating_diff
0,1,1.0,0,2009,1,7.842345,5.608103
1,1,2.0,1,2009,1,7.842345,5.585902
2,0,3.0,2,2009,0,7.842345,5.643986
3,0,4.0,1,2009,1,7.842345,3.766211
4,1,5.0,2,2009,1,7.842345,3.818176


Intermediate, run-once step: fit the transformers once to recover the column names produced by the polynomial expansion.

In [20]:
# col_transformer = ColumnTransformer(
#     transformers=[
#         ('num', StandardScaler(), normalized_features),
#         ('cat', 'passthrough', other_features)
#     ]
# )

# col_transformer.fit(games_df[normalized_features + other_features])
# col_names = col_transformer.get_feature_names_out()
# # print(col_names)
# poly = PolynomialFeatures(degree=2)
# poly.fit(col_transformer.transform(games_df[normalized_features + other_features]))
# poly_feature_names = poly.get_feature_names_out(col_names)
# print(poly_feature_names)

create normalization pipeline, define our model

In [21]:
# pipeline - want to transform po ratings and rs ratings into diff variables
# want to normalize rs_rating_diff, po_rating_diff

# class ToDataFrame(BaseEstimator, TransformerMixin):
#     def __init__(self, column_names):
#         self.column_names = column_names

#     def fit(self, X, y=None):
#         return self

#     def transform(self, X):
#         return pd.DataFrame(X, columns=self.column_names)

# # used to drop squared categoricals after polynomial expansion
# class ColumnDropper(BaseEstimator, TransformerMixin):
#     def __init__(self, columns_to_drop):
#         self.columns_to_drop = columns_to_drop

#     def fit(self, X, y=None):
#         # Does nothing, since we're not learning anything from the data
#         return self

#     def transform(self, X):
#         # Drops the specified columns
        
#         return X.drop(columns=self.columns_to_drop)
    
# Preprocessing: standardize the numeric columns and pass the remaining listed
# feature(s) through unchanged.
numeric_step = ('num', StandardScaler(), normalized_features)
passthrough_step = ('cat', 'passthrough', other_features)
normalizer = ColumnTransformer(transformers=[numeric_step, passthrough_step])

# Full model: normalization -> degree-2 polynomial expansion -> logistic regression.
# Step names matter: the grid search refers to the classifier as 'classifier'.
pipeline_steps = [
    ('norm', normalizer),
    ('poly', PolynomialFeatures(degree=2)),
    # Disabled experiment: convert back to a DataFrame and drop the squared binary columns.
    # ('to_df', ToDataFrame(poly_feature_names)),
    # ('dropper', ColumnDropper(columns_to_drop=[f'{binary_feature}^2' for binary_feature in other_features])),
    # LogisticRegression is left at its defaults (L2 penalty, default solver) as the base case.
    ('classifier', LogisticRegression()),
]
pipeline = Pipeline(pipeline_steps)

Set up the train/test split (the last season is held out), build the rolling-window cross-validation splits, then apply the normalization pipeline and fit the model.
This gives the first baseline result.

In [22]:
# Hold out the most recent season as the test set; all earlier seasons form the training pool.
season_column = games_df['season_start_year']
seasons = sorted(season_column.unique())
train_seasons = seasons[:-1]  # every season except the last
test_season = seasons[-1]     # the last season

in_train_pool = season_column.isin(train_seasons)
in_test_season = season_column == test_season
train_df = games_df[in_train_pool]
test_df = games_df[in_test_season]

# Separate the target from the test features.
y_test = test_df["team_a_win"]
X_test = test_df.drop(columns="team_a_win")


# rolling-window (by season) cross-validation splitter
def rolling_window_splits(df : pd.DataFrame, season_col="season_start_year", train_size=10, n_splits=5):
    """Yield rolling-window (train, validation) splits, one validation season per split.

    Split ``i`` trains on seasons ``i .. i + train_size - 1`` (in sorted order) and
    validates on the single season that follows that window.

    The yielded arrays contain *positional* row numbers into ``df`` (0 .. len(df) - 1),
    which is what scikit-learn expects from a ``cv`` iterable. They are NOT index
    labels: use ``df.iloc[...]`` (not ``df.loc[...]``) when applying them by hand.
    Yielding labels is only correct when the index happens to be a fresh 0..n-1 range,
    which is not guaranteed after ``dropna()`` or boolean filtering.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing ``season_col``. Rows must be in the same order as the X / y
        passed to the estimator alongside these splits.
    season_col : str
        Name of the column holding the season identifier.
    train_size : int
        Number of consecutive seasons in each training window.
    n_splits : int
        Number of splits to generate.

    Yields
    ------
    (train_idx, val_idx) : tuple of numpy integer arrays
        Positional indices of the training rows and the validation rows.

    Raises
    ------
    ValueError
        If ``df`` has fewer than ``train_size + n_splits`` distinct seasons, i.e. not
        every requested split would have a validation season.
    """
    season_values = df[season_col]

    # sorted list of unique seasons
    seasons = sorted(season_values.unique())

    # Fail up front with a clear message instead of an IndexError part-way through.
    if n_splits > 0 and len(seasons) < train_size + n_splits:
        raise ValueError(
            f"Need at least train_size + n_splits = {train_size + n_splits} distinct "
            f"seasons in '{season_col}', found {len(seasons)}."
        )

    # for each CV split...
    for i in range(n_splits):
        train_seasons = seasons[i : i + train_size]
        val_season = seasons[i + train_size]

        # Boolean masks -> positions of the True entries (independent of index labels).
        train_mask = season_values.isin(train_seasons).to_numpy()
        val_mask = (season_values == val_season).to_numpy()
        yield train_mask.nonzero()[0], val_mask.nonzero()[0]

# Baseline: score the untuned pipeline on each rolling-window fold of the training pool.
y_train = train_df["team_a_win"]
X_train = train_df.drop(columns="team_a_win")

baseline_cv = rolling_window_splits(train_df, train_size=10, n_splits=5)
scores = cross_val_score(pipeline, X_train, y_train, cv=baseline_cv, n_jobs=-1)

print(scores)
print(f"Mean CV score: {scores.mean():.3f} +/- {scores.std():.3f}")

# loop of CV splits
# for train_idx, val_idx in rolling_window_splits(games_df, train_size=10, n_splits=5):
#     train_df = games_df.loc[train_idx].drop(columns=["season_start_year"])
#     val_df = games_df.loc[val_idx].drop(columns=["season_start_year"])
    
#     X_train, y_train = train_df.drop("team_a_win", axis=1), train_df["team_a_win"]
#     X_val, y_val = val_df.drop("team_a_win", axis=1), val_df["team_a_win"]
    
#     pipeline.fit(X_train, y_train)
    

[0.51807229 0.49411765 0.54022989 0.54761905 0.59756098]
Mean CV score: 0.540 +/- 0.035


Hyperparameter tuning on C (the inverse of the regularization strength lambda, so a smaller C means stronger regularization), training on the full non-test set, and getting our final model.

In [23]:


# Candidate values for the classifier's C, searched with the same
# rolling-window folds used for the baseline scores.
c_candidates = [0.01, 0.1, 1, 10, 100]
param_grid = {'classifier__C': c_candidates}

tuning_cv = rolling_window_splits(train_df, train_size=10, n_splits=5)
grid = GridSearchCV(pipeline, param_grid, cv=tuning_cv, n_jobs=-1)
grid.fit(X_train, y_train)

print(f"Best CV score: {grid.best_score_:.3f}")
print(f"Best params: {grid.best_params_}")

Best CV score: 0.544
Best params: {'classifier__C': 0.1}


train the final model with the full training set + the results from grid searching

In [None]:
import joblib

# Take the best configuration found by the grid search and fit it on the full
# training set. (GridSearchCV refits the best estimator by default, so this explicit
# fit mainly keeps the final training step visible in the notebook.)
post_cv_best_pipeline = grid.best_estimator_
post_cv_best_pipeline.fit(X_train, y_train)

# evaluate on the held-out test season
test_score = post_cv_best_pipeline.score(X_test, y_test)
print(f"Test score: {test_score:.3f}")

# Save the fitted, tuned model. The module-level `pipeline` must NOT be saved here:
# cross_val_score and GridSearchCV only fit clones of it, so `pipeline` itself is
# still unfitted and still carries the default C.
joblib.dump(post_cv_best_pipeline, "output/logistig_reg_model.pkl")


Test score: 0.508


['output/logistig_reg_model.pkl']