# March Machine Learning Mania
In this notebook, we:
- Try different models/features and assess their performance.
- Simulate multiple 2024 March Madness brackets.

## Imports

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from gc import collect
import os
import sys
import tqdm

from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, KFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.svm import SVR, SVC
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.neural_network import MLPRegressor, MLPClassifier
from sklearn.metrics import mean_squared_error, r2_score, log_loss, accuracy_score, confusion_matrix, classification_report
from xgboost import XGBRegressor, XGBClassifier

# display 100 rows and 100 columns
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)

# global random seed
SEED = 0

# set numpy seed
np.random.seed(SEED)

## Load Data

In [3]:
# root dirs
root = 'data/'
mroot = 'data/mens/'
wroot = 'data/womens/'

# load in features compact
features = pd.read_csv('data/processed/features_detailed.csv')

# team names
teams = pd.concat([pd.read_csv(mroot + 'MTeams.csv'), pd.read_csv(wroot + 'WTeams.csv')], ignore_index=True)

# create a map for team names
team_map = teams.set_index('TeamID')['TeamName']

## Rearranging the Data
When we input a new row into our model (for the 2024 bracket), the winner could be the first or the second team in the row that we input into our model. Currently, all rows have the winning team on the left, and this will cause our model to learn this locational information. We need to rearrange the order of winners and losers, to where it is essentially random to prevent the model from learning this.

In [6]:
features.columns[:100]

Index(['Season', 'DayNum', 'WTeamID', 'WScore', 'LTeamID', 'LScore', 'WLoc',
       'NumOT', 'WSeed', 'LSeed', 'WRegionSeed', 'LRegionSeed', 'WRegion',
       'LRegion', 'WPlayIn', 'LPlayIn', 'Wnum_games', 'Wwin_ratio',
       'Wavg_Score_for', 'Wavg_Score_against', 'Wstd_Score_for',
       'Wstd_Score_against', 'Wavg_FGM_for', 'Wavg_FGM_against',
       'Wstd_FGM_for', 'Wstd_FGM_against', 'Wavg_FGA_for', 'Wavg_FGA_against',
       'Wstd_FGA_for', 'Wstd_FGA_against', 'Wavg_FGM3_for',
       'Wavg_FGM3_against', 'Wstd_FGM3_for', 'Wstd_FGM3_against',
       'Wavg_FGA3_for', 'Wavg_FGA3_against', 'Wstd_FGA3_for',
       'Wstd_FGA3_against', 'Wavg_FTM_for', 'Wavg_FTM_against', 'Wstd_FTM_for',
       'Wstd_FTM_against', 'Wavg_FTA_for', 'Wavg_FTA_against', 'Wstd_FTA_for',
       'Wstd_FTA_against', 'Wavg_OR_for', 'Wavg_OR_against', 'Wstd_OR_for',
       'Wstd_OR_against', 'Wavg_DR_for', 'Wavg_DR_against', 'Wstd_DR_for',
       'Wstd_DR_against', 'Wavg_Ast_for', 'Wavg_Ast_against', 'Wstd_Ast_f

In [3]:
# drop columns that can't be used as features
features = features.drop(columns=(['DayNum', 'WRegion', 'Wseed_diff', 'LRegion', 'Lseed_diff', 'abs_seed_diff']))

In [4]:
# rename W, L to A, B
features.columns = [x.replace('W', 'A_', 1) if x[0] == 'W' else x for x in features.columns]
features.columns = [x.replace('L', 'B_', 1) if x[0] == 'L' else x for x in features.columns]

# check
features.head()

Unnamed: 0,Season,A_TeamID,A_Score,B_TeamID,B_Score,A_Loc,NumOT,A_Seed,B_Seed,A_FullSeed,B_FullSeed,A_PlayIn,B_PlayIn,A_max_win_diff,A_max_loss_diff,A_num_games,A_win_ratio,A_mean_ppg,A_mean_papg,A_std_ppg,A_std_papg,A_mean_diff,A_std_diff,A_home_win_ratio,A_away_win_ratio,A_neutral_win_ratio,A_1_pos_win_missing,A_1_pos_loss_missing,A_ot_win_missing,A_ot_loss_missing,A_1_pos_game_ratio,A_1_pos_win_ratio,A_ot_ratio,A_ot_win_ratio,A_recent_win_ratio,A_recent_mean_pts_for,A_recent_mean_pts_against,A_recent_mean_score_diff,A_recent_std_pts_for,A_recent_std_pts_against,A_recent_std_score_diff,B_max_win_diff,B_max_loss_diff,B_num_games,B_win_ratio,B_mean_ppg,B_mean_papg,B_std_ppg,B_std_papg,B_mean_diff,B_std_diff,B_home_win_ratio,B_away_win_ratio,B_neutral_win_ratio,B_1_pos_win_missing,B_1_pos_loss_missing,B_ot_win_missing,B_ot_loss_missing,B_1_pos_game_ratio,B_1_pos_win_ratio,B_ot_ratio,B_ot_win_ratio,B_recent_win_ratio,B_recent_mean_pts_for,B_recent_mean_pts_against,B_recent_mean_score_diff,B_recent_std_pts_for,B_recent_std_pts_against,B_recent_std_score_diff,B_Loc,round,A_seed_win_prob,B_seed_win_prob
0,1985,1116,63,1234,54,N,0,9,8,X09,X08,0,0,35,20,33,0.636364,65.333333,61.69697,11.332454,9.055902,3.636364,9.620253,0.909091,0.333333,0.7,0,0,1,1,0.363636,0.25,0.0,0.0,0.666667,74.166667,61.5,12.666667,15.382169,8.624899,8.286535,49,20,30,0.666667,69.733333,59.266667,12.375335,11.243712,10.466667,13.416676,0.833333,0.333333,0.666667,0,0,1,1,0.233333,0.142857,0.0,0.0,0.333333,62.5,63.833333,-1.333333,7.207249,15.625833,9.061518,N,1,0.45202,0.54798
1,1985,1120,59,1345,58,N,0,11,6,Z11,Z06,0,0,42,19,29,0.62069,70.344828,66.655172,13.52334,11.519276,3.689655,12.768266,0.666667,0.454545,0.833333,0,0,0,0,0.482759,0.428571,0.068966,0.5,0.833333,63.166667,57.0,6.166667,14.250731,6.689544,9.27811,31,43,25,0.68,69.12,65.32,10.537755,8.837016,3.8,13.685843,0.714286,0.6,1.0,0,0,1,1,0.4,0.2,0.0,0.0,0.666667,66.333333,68.833333,-2.5,12.270108,4.778424,11.172884,N,1,0.340909,0.659091
2,1985,1207,68,1250,43,N,0,1,16,W01,W16,0,0,41,2,27,0.925926,75.740741,60.074074,11.475417,10.523929,15.666667,10.490513,0.923077,0.857143,1.0,0,1,1,1,0.074074,1.0,0.0,0.0,1.0,83.666667,64.333333,19.333333,10.366613,10.726913,8.594572,18,31,29,0.37931,65.758621,70.206897,8.131251,8.504503,-4.448276,16.284512,0.333333,0.363636,0.5,0,0,0,1,0.724138,0.238095,0.034483,1.0,0.833333,69.5,70.0,-0.5,7.593857,7.071068,1.5,N,1,0.986842,0.013158
3,1985,1229,58,1425,55,N,0,9,8,Y09,Y08,0,0,31,13,27,0.740741,71.592593,65.62963,10.506689,10.519915,5.962963,8.956982,0.785714,0.6,1.0,0,0,1,1,0.37037,0.4,0.0,0.0,0.666667,72.0,64.833333,7.166667,9.006171,6.940221,8.36328,33,21,28,0.678571,68.392857,64.607143,9.272442,8.751871,3.785714,9.777817,0.642857,0.727273,0.666667,0,0,1,1,0.428571,0.5,0.0,0.0,0.5,68.833333,68.166667,0.666667,6.879922,13.083068,11.56143,N,1,0.45202,0.54798
4,1985,1242,49,1325,38,N,0,3,14,Z03,Z14,0,0,27,19,30,0.766667,76.033333,70.4,10.420842,11.20687,5.633333,8.081707,1.0,0.6,0.5,0,0,1,1,0.3,0.333333,0.0,0.0,0.833333,76.833333,72.166667,4.666667,7.46101,5.228129,5.416026,23,17,27,0.740741,67.555556,63.0,9.638866,8.801914,4.555556,7.092059,1.0,0.583333,0.6,0,0,1,1,0.333333,0.333333,0.0,0.0,0.833333,68.333333,58.166667,10.166667,7.455423,4.941322,7.762087,N,1,0.872093,0.127907


In [5]:
# choose (half) random rows to flip
np.random.seed(SEED)
flip = np.random.choice(features.index, int(features.shape[0] / 2), replace=False)

# flip A, B cols
for col in features.columns:
    if col[0] == 'A':
        features.loc[flip, col], features.loc[flip, col.replace('A_', 'B_')] = features.loc[flip, col.replace('A_', 'B_')].values, features.loc[flip, col].values

# delete vars
del flip, col
collect()

# rearrange cols
features = features.reindex(sorted(features.columns), axis=1)

# check
features.head()

Unnamed: 0,A_1_pos_game_ratio,A_1_pos_loss_missing,A_1_pos_win_missing,A_1_pos_win_ratio,A_FullSeed,A_Loc,A_PlayIn,A_Score,A_Seed,A_TeamID,A_away_win_ratio,A_home_win_ratio,A_max_loss_diff,A_max_win_diff,A_mean_diff,A_mean_papg,A_mean_ppg,A_neutral_win_ratio,A_num_games,A_ot_loss_missing,A_ot_ratio,A_ot_win_missing,A_ot_win_ratio,A_recent_mean_pts_against,A_recent_mean_pts_for,A_recent_mean_score_diff,A_recent_std_pts_against,A_recent_std_pts_for,A_recent_std_score_diff,A_recent_win_ratio,A_seed_win_prob,A_std_diff,A_std_papg,A_std_ppg,A_win_ratio,B_1_pos_game_ratio,B_1_pos_loss_missing,B_1_pos_win_missing,B_1_pos_win_ratio,B_FullSeed,B_Loc,B_PlayIn,B_Score,B_Seed,B_TeamID,B_away_win_ratio,B_home_win_ratio,B_max_loss_diff,B_max_win_diff,B_mean_diff,B_mean_papg,B_mean_ppg,B_neutral_win_ratio,B_num_games,B_ot_loss_missing,B_ot_ratio,B_ot_win_missing,B_ot_win_ratio,B_recent_mean_pts_against,B_recent_mean_pts_for,B_recent_mean_score_diff,B_recent_std_pts_against,B_recent_std_pts_for,B_recent_std_score_diff,B_recent_win_ratio,B_seed_win_prob,B_std_diff,B_std_papg,B_std_ppg,B_win_ratio,NumOT,Season,round
0,0.363636,0,0,0.25,X09,N,0,63,9,1116,0.333333,0.909091,20,35,3.636364,61.69697,65.333333,0.7,33,1,0.0,1,0.0,61.5,74.166667,12.666667,8.624899,15.382169,8.286535,0.666667,0.45202,9.620253,9.055902,11.332454,0.636364,0.233333,0,0,0.142857,X08,N,0,54,8,1234,0.333333,0.833333,20,49,10.466667,59.266667,69.733333,0.666667,30,1,0.0,1,0.0,63.833333,62.5,-1.333333,15.625833,7.207249,9.061518,0.333333,0.54798,13.416676,11.243712,12.375335,0.666667,0,1985,1
1,0.4,0,0,0.2,Z06,N,0,58,6,1345,0.6,0.714286,43,31,3.8,65.32,69.12,1.0,25,1,0.0,1,0.0,68.833333,66.333333,-2.5,4.778424,12.270108,11.172884,0.666667,0.659091,13.685843,8.837016,10.537755,0.68,0.482759,0,0,0.428571,Z11,N,0,59,11,1120,0.454545,0.666667,19,42,3.689655,66.655172,70.344828,0.833333,29,0,0.068966,0,0.5,57.0,63.166667,6.166667,6.689544,14.250731,9.27811,0.833333,0.340909,12.768266,11.519276,13.52334,0.62069,0,1985,1
2,0.724138,0,0,0.238095,W16,N,0,43,16,1250,0.363636,0.333333,31,18,-4.448276,70.206897,65.758621,0.5,29,1,0.034483,0,1.0,70.0,69.5,-0.5,7.071068,7.593857,1.5,0.833333,0.013158,16.284512,8.504503,8.131251,0.37931,0.074074,1,0,1.0,W01,N,0,68,1,1207,0.857143,0.923077,2,41,15.666667,60.074074,75.740741,1.0,27,1,0.0,1,0.0,64.333333,83.666667,19.333333,10.726913,10.366613,8.594572,1.0,0.986842,10.490513,10.523929,11.475417,0.925926,0,1985,1
3,0.37037,0,0,0.4,Y09,N,0,58,9,1229,0.6,0.785714,13,31,5.962963,65.62963,71.592593,1.0,27,1,0.0,1,0.0,64.833333,72.0,7.166667,6.940221,9.006171,8.36328,0.666667,0.45202,8.956982,10.519915,10.506689,0.740741,0.428571,0,0,0.5,Y08,N,0,55,8,1425,0.727273,0.642857,21,33,3.785714,64.607143,68.392857,0.666667,28,1,0.0,1,0.0,68.166667,68.833333,0.666667,13.083068,6.879922,11.56143,0.5,0.54798,9.777817,8.751871,9.272442,0.678571,0,1985,1
4,0.333333,0,0,0.333333,Z14,N,0,38,14,1325,0.583333,1.0,17,23,4.555556,63.0,67.555556,0.6,27,1,0.0,1,0.0,58.166667,68.333333,10.166667,4.941322,7.455423,7.762087,0.833333,0.127907,7.092059,8.801914,9.638866,0.740741,0.3,0,0,0.333333,Z03,N,0,49,3,1242,0.6,1.0,19,27,5.633333,70.4,76.033333,0.5,30,1,0.0,1,0.0,72.166667,76.833333,4.666667,5.228129,7.46101,5.416026,0.833333,0.872093,8.081707,11.20687,10.420842,0.766667,0,1985,1


In [6]:
# one-hot encode A_Loc col, drop loc cols
features = pd.get_dummies(features, columns=['A_Loc'], dtype=int)
features = features.drop(columns=['A_Loc_N', 'B_Loc'])

# one-hot encode 'round' col, drop one col
features = pd.get_dummies(features, columns=['round'], dtype=int)
features = features.drop(columns=['round_1'])

## Create Labels

In [7]:
# create regression label, drop cols
features['score_diff'] = features['A_Score'] - features['B_Score']
features = features.drop(columns=['A_Score', 'B_Score'])

# create binary label
features['win'] = features['score_diff'].apply(lambda x: 1 if x > 0 else 0)

# check
features.head()

Unnamed: 0,A_1_pos_game_ratio,A_1_pos_loss_missing,A_1_pos_win_missing,A_1_pos_win_ratio,A_FullSeed,A_PlayIn,A_Seed,A_TeamID,A_away_win_ratio,A_home_win_ratio,A_max_loss_diff,A_max_win_diff,A_mean_diff,A_mean_papg,A_mean_ppg,A_neutral_win_ratio,A_num_games,A_ot_loss_missing,A_ot_ratio,A_ot_win_missing,A_ot_win_ratio,A_recent_mean_pts_against,A_recent_mean_pts_for,A_recent_mean_score_diff,A_recent_std_pts_against,A_recent_std_pts_for,A_recent_std_score_diff,A_recent_win_ratio,A_seed_win_prob,A_std_diff,A_std_papg,A_std_ppg,A_win_ratio,B_1_pos_game_ratio,B_1_pos_loss_missing,B_1_pos_win_missing,B_1_pos_win_ratio,B_FullSeed,B_PlayIn,B_Seed,B_TeamID,B_away_win_ratio,B_home_win_ratio,B_max_loss_diff,B_max_win_diff,B_mean_diff,B_mean_papg,B_mean_ppg,B_neutral_win_ratio,B_num_games,B_ot_loss_missing,B_ot_ratio,B_ot_win_missing,B_ot_win_ratio,B_recent_mean_pts_against,B_recent_mean_pts_for,B_recent_mean_score_diff,B_recent_std_pts_against,B_recent_std_pts_for,B_recent_std_score_diff,B_recent_win_ratio,B_seed_win_prob,B_std_diff,B_std_papg,B_std_ppg,B_win_ratio,NumOT,Season,A_Loc_A,A_Loc_H,round_2,round_3,round_4,round_5,round_6,score_diff,win
0,0.363636,0,0,0.25,X09,0,9,1116,0.333333,0.909091,20,35,3.636364,61.69697,65.333333,0.7,33,1,0.0,1,0.0,61.5,74.166667,12.666667,8.624899,15.382169,8.286535,0.666667,0.45202,9.620253,9.055902,11.332454,0.636364,0.233333,0,0,0.142857,X08,0,8,1234,0.333333,0.833333,20,49,10.466667,59.266667,69.733333,0.666667,30,1,0.0,1,0.0,63.833333,62.5,-1.333333,15.625833,7.207249,9.061518,0.333333,0.54798,13.416676,11.243712,12.375335,0.666667,0,1985,0,0,0,0,0,0,0,9,1
1,0.4,0,0,0.2,Z06,0,6,1345,0.6,0.714286,43,31,3.8,65.32,69.12,1.0,25,1,0.0,1,0.0,68.833333,66.333333,-2.5,4.778424,12.270108,11.172884,0.666667,0.659091,13.685843,8.837016,10.537755,0.68,0.482759,0,0,0.428571,Z11,0,11,1120,0.454545,0.666667,19,42,3.689655,66.655172,70.344828,0.833333,29,0,0.068966,0,0.5,57.0,63.166667,6.166667,6.689544,14.250731,9.27811,0.833333,0.340909,12.768266,11.519276,13.52334,0.62069,0,1985,0,0,0,0,0,0,0,-1,0
2,0.724138,0,0,0.238095,W16,0,16,1250,0.363636,0.333333,31,18,-4.448276,70.206897,65.758621,0.5,29,1,0.034483,0,1.0,70.0,69.5,-0.5,7.071068,7.593857,1.5,0.833333,0.013158,16.284512,8.504503,8.131251,0.37931,0.074074,1,0,1.0,W01,0,1,1207,0.857143,0.923077,2,41,15.666667,60.074074,75.740741,1.0,27,1,0.0,1,0.0,64.333333,83.666667,19.333333,10.726913,10.366613,8.594572,1.0,0.986842,10.490513,10.523929,11.475417,0.925926,0,1985,0,0,0,0,0,0,0,-25,0
3,0.37037,0,0,0.4,Y09,0,9,1229,0.6,0.785714,13,31,5.962963,65.62963,71.592593,1.0,27,1,0.0,1,0.0,64.833333,72.0,7.166667,6.940221,9.006171,8.36328,0.666667,0.45202,8.956982,10.519915,10.506689,0.740741,0.428571,0,0,0.5,Y08,0,8,1425,0.727273,0.642857,21,33,3.785714,64.607143,68.392857,0.666667,28,1,0.0,1,0.0,68.166667,68.833333,0.666667,13.083068,6.879922,11.56143,0.5,0.54798,9.777817,8.751871,9.272442,0.678571,0,1985,0,0,0,0,0,0,0,3,1
4,0.333333,0,0,0.333333,Z14,0,14,1325,0.583333,1.0,17,23,4.555556,63.0,67.555556,0.6,27,1,0.0,1,0.0,58.166667,68.333333,10.166667,4.941322,7.455423,7.762087,0.833333,0.127907,7.092059,8.801914,9.638866,0.740741,0.3,0,0,0.333333,Z03,0,3,1242,0.6,1.0,19,27,5.633333,70.4,76.033333,0.5,30,1,0.0,1,0.0,72.166667,76.833333,4.666667,5.228129,7.46101,5.416026,0.833333,0.872093,8.081707,11.20687,10.420842,0.766667,0,1985,0,0,0,0,0,0,0,-11,0


In [8]:
# create adjusted score diff col (score diff is inversely scaled by NumOT periods), drop NumOT
features['score_diff_adj'] = features['score_diff'] / (2 ** features['NumOT'])

# check
features[features['NumOT'] > 0][['score_diff', 'NumOT', 'score_diff_adj']].sort_values(by='NumOT', ascending=False).head()

Unnamed: 0,score_diff,NumOT,score_diff_adj
654,-8,3,-1.0
1425,4,2,1.0
1676,7,2,1.75
1118,-4,2,-1.0
1166,1,2,0.25


__score_diff__ and __win__ can both be used as labels. They are both calculated with respect to team A.

In [9]:
# drop cols now that we have adjusted score diff
features = features.drop(columns=['score_diff', 'NumOT'])

## Differenced Features
In combination with the team A/B data, we will also try training models using the differences between both teams' aggregated stats.

In [10]:
# # create diff cols (features between 2 teams)
# cols_to_diff = ['1_pos_game_ratio', '1_pos_win_ratio', 'away_win_ratio', 'home_win_ratio', 'max_loss_diff', 'max_win_diff', 'mean_diff', 'mean_papg', 'mean_ppg', 
#                 'neutral_win_ratio', 'num_games', 'ot_ratio', 'ot_win_ratio', 'recent_mean_pts_against', 'recent_mean_pts_for', 'recent_mean_score_diff', 'recent_std_pts_against', 
#                 'recent_std_pts_for', 'recent_std_score_diff', 'recent_win_ratio', 'std_diff', 'std_papg', 'std_ppg', 'win_ratio', 'seed_win_prob']

# # features to be kept but shouldn't be differenced
# cols_to_keep = ['A_TeamID', 'A_FullSeed', 'A_1_pos_loss_missing', 'A_1_pos_win_missing', 'A_ot_loss_missing', 'A_ot_win_missing', 'A_PlayIn', 'B_TeamID', 'B_FullSeed', 
#                 'B_1_pos_loss_missing', 'B_1_pos_win_missing', 'B_ot_loss_missing', 'B_ot_win_missing', 'B_PlayIn', 'A_Loc_A', 'A_Loc_H', 'round_2', 'round_3', 'round_4', 'round_5', 
#                 'round_6', 'score_diff_adj', 'win']

# # create diff df
# features_diff = features[cols_to_keep].copy()

# # create diff cols
# for col in cols_to_diff:
#     features_diff[col + '_diff'] = features['A_' + col] - features['B_' + col]

# # check
# features_diff.head()

## Split Data

In [11]:
# split on gender, drop cols
mfeatures = features[features['A_TeamID'] < 3000]
wfeatures = features[features['A_TeamID'] >= 3000]
# del features

# same for diff data
# mfeatures_diff = features_diff[features_diff['A_TeamID'] < 3000]
# wfeatures_diff = features_diff[features_diff['A_TeamID'] >= 3000]

## Chalk Bracket
Here, we will simply predict the better seed. If seeds are equal (in rounds 5 and 6), we will predict the team with the better win_ratio.

From the 38 NCAA Men's tournaments that we have access to, the better seed wins __70.62%__ of the time. In the women's, the better seed wins __76.51%__ of the time. In conjunction to predicting the better seed (when seeds are different), we will predict a full __chalk__ bracket by taking the team with a higher regular season win ratio when the seeds are the same.

In [12]:
def get_dummy_preds(data):
    """
    Function to get dummy predictions based on seed and win percentage
    """

    # create a container
    dummy_preds = []

    # loop through the dataframe based on conditions
    for idx, row in data.iterrows():
        if data.loc[idx, "A_Seed"] < data.loc[idx, "B_Seed"]:
            dummy_preds.append(1)
        elif data.loc[idx, "A_Seed"] > data.loc[idx, "B_Seed"]:
            dummy_preds.append(0)
        else:
            if data.loc[idx, "A_win_ratio"] > data.loc[idx, "B_win_ratio"]:
                dummy_preds.append(1)
            else:
                dummy_preds.append(0)
    
    return np.array(dummy_preds)

### Men's

In [13]:
# get dummy preds
mchalk_preds = get_dummy_preds(mfeatures)

# compare preds to win col
mchalk_acc = accuracy_score(mfeatures['win'], mchalk_preds)

print(f"Accuracy of dummy predictions: {mchalk_acc*100:.2f}%.")

# delete vars
del mchalk_preds, mchalk_acc

Accuracy of dummy predictions: 71.04%.


### Women's

In [14]:
# get dummy preds
wchalk_preds = get_dummy_preds(wfeatures)

# compare preds to win col
wchalk_acc = accuracy_score(wfeatures['win'], wchalk_preds)

print(f"Accuracy of dummy predictions: {wchalk_acc*100:.2f}%")

# delete vars
del wchalk_preds, wchalk_acc

Accuracy of dummy predictions: 77.65%


In [15]:
# drop cols that may not be useful features
mfeatures = mfeatures.drop(columns=(['Season', 'A_Seed', 'B_Seed']))
wfeatures = wfeatures.drop(columns=(['Season', 'A_Seed', 'B_Seed']))

## Regression/Classification
This problem can be treated as a regression problem (label is __score differential__). It can also be treated as binary classification (label is __win/loss__).

We will try the original aggregated stats for both team A and B as features (mfeatures/wfeatures), and also the differences between the two teams' stats (mcomp_diff/wcomp_diff).

In [16]:
# non feature cols
non_feat_cols = ['A_TeamID', 'A_FullSeed', 'B_TeamID', 'B_FullSeed', 'win', 'score_diff_adj']

# look at num feats for the 2 datasets
print(f'Features of both team A and B: {mfeatures.shape[1] - len(non_feat_cols)}')
# print(f'Features of the DIFFERENCE between team A and B: {mfeatures_diff.shape[1] - len(non_feat_cols)}')

Features of both team A and B: 67


### Data Preprocessing

In [17]:
# create feature order list so our models are trained on the same feature order
sorted_cols = sorted(mfeatures.columns)
sorted_cols.remove('A_Loc_A')
sorted_cols.remove('A_Loc_H')
sorted_cols.remove('score_diff_adj')
sorted_cols.remove('win')
sorted_cols.extend(['A_Loc_A', 'A_Loc_H', 'score_diff_adj', 'win'])

# same for diff
# diff_cols = sorted(mfeatures_diff.columns)
# diff_cols.remove('A_Loc_A')
# diff_cols.remove('A_Loc_H')
# diff_cols.remove('score_diff_adj')
# diff_cols.remove('win')
# diff_cols.extend(['A_Loc_A', 'A_Loc_H', 'score_diff_adj', 'win'])

In [18]:
# function that takes in estimator, data, models_df, tournament, test_size
def run_model(estimator, features, regression, models_df, tournament, folds=5):
    """
    Run a model on data and save results to models_df.

    Parameters
    ----------
    estimator : sklearn estimator
        Estimator to use for modeling.
    data : pd.DataFrame
        Data to model.
    regression : bool
        Whether to model score differential (regression) or win/loss (classification).
    models_df : pd.DataFrame
        DataFrame to save results to.
    tournament : str
        Gender - 'M' or 'F'.
    folds : int
        Number of cross-validation folds to use.

    Returns
    -------
    models_df.tail() : pd.DataFrame
        Last 5 rows of dataframe with results from model.
    """
    
    # create copy to avoid modification
    data = features.copy()

    # define which col order to use
    if data.shape[1] == 73:
        data = data[sorted_cols]
    elif data.shape[1] == 48:
        data = data[diff_cols]
    else:
        print(f'Number of features: {data.shape[1]}')
        raise ValueError('Invalid number of features.')

    # drop unused cols
    data = data.drop(columns=['A_TeamID', 'B_TeamID', 'A_FullSeed', 'B_FullSeed'])

    # define cross-validation
    kf = KFold(n_splits=folds, shuffle=True, random_state=SEED)

    # create scaler
    scaler = MinMaxScaler()

    # predicting score differential
    if regression:
        # define X and y
        X = data.drop(columns=['score_diff_adj', 'win'])
        y = data['score_diff_adj']

        # initialize lists to store metrics
        rmse_scores_train = []
        rmse_scores_test = []
        r2_scores_train = []
        r2_scores_test = []
        acc_scores_train = []
        acc_scores_test = []
        r1_accs = []
        r2_accs = []
        r3_accs = []
        r4_accs = []
        r5_accs = []
        r6_accs = []

        for train_index, test_index in kf.split(X):
            # split data
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            # scale data
            X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
            X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns, index=X_test.index)

            # fit model
            estimator.fit(X_train_scaled, y_train)

            # predict
            train_preds = estimator.predict(X_train_scaled)
            test_preds = pd.Series(estimator.predict(X_test_scaled), index=y_test.index)

            # evaluate
            rmse_scores_train.append(mean_squared_error(y_train, train_preds, squared=False))
            rmse_scores_test.append(mean_squared_error(y_test, test_preds, squared=False))
            r2_scores_train.append(r2_score(y_train, train_preds))
            r2_scores_test.append(r2_score(y_test, test_preds))
            acc_scores_train.append(accuracy_score(np.sign(y_train), np.sign(train_preds)))
            acc_scores_test.append(accuracy_score(np.sign(y_test), np.sign(test_preds)))

            # get indices of each round
            r1_idx = y_test.index[(data.loc[y_test.index, 'round_2'] == 0) & (data.loc[y_test.index, 'round_3'] == 0) & (data.loc[y_test.index, 'round_4'] == 0) & \
                                  (data.loc[y_test.index, 'round_5'] == 0) & (data.loc[y_test.index, 'round_6'] == 0)]
            r2_idx = y_test.index[data.loc[y_test.index, 'round_2'] == 1]
            r3_idx = y_test.index[data.loc[y_test.index, 'round_3'] == 1]
            r4_idx = y_test.index[data.loc[y_test.index, 'round_4'] == 1]
            r5_idx = y_test.index[data.loc[y_test.index, 'round_5'] == 1]
            r6_idx = y_test.index[data.loc[y_test.index, 'round_6'] == 1]

            # get accuracy of each round
            r1_accs.append(accuracy_score(np.sign(y_test.loc[r1_idx]), np.sign(test_preds.loc[r1_idx])))
            r2_accs.append(accuracy_score(np.sign(y_test.loc[r2_idx]), np.sign(test_preds.loc[r2_idx])))
            r3_accs.append(accuracy_score(np.sign(y_test.loc[r3_idx]), np.sign(test_preds.loc[r3_idx])))
            r4_accs.append(accuracy_score(np.sign(y_test.loc[r4_idx]), np.sign(test_preds.loc[r4_idx])))
            r5_accs.append(accuracy_score(np.sign(y_test.loc[r5_idx]), np.sign(test_preds.loc[r5_idx])))
            r6_accs.append(accuracy_score(np.sign(y_test.loc[r6_idx]), np.sign(test_preds.loc[r6_idx])))

        # average the metrics across folds
        train_rmse = np.mean(rmse_scores_train)
        test_rmse = np.mean(rmse_scores_test)
        train_r2 = np.mean(r2_scores_train)
        test_r2 = np.mean(r2_scores_test)
        train_acc = np.mean(acc_scores_train)
        test_acc = np.mean(acc_scores_test)
        r1_acc = np.mean(r1_accs)
        r2_acc = np.mean(r2_accs)
        r3_acc = np.mean(r3_accs)
        r4_acc = np.mean(r4_accs)
        r5_acc = np.mean(r5_accs)
        r6_acc = np.mean(r6_accs)
        
        # Save results to models_df
        models_df.loc[len(models_df.index)] = [tournament, 'A_adj_score_diff', estimator, X.shape[1], X.columns.to_list(), folds, train_r2, test_r2, train_rmse, test_rmse, train_acc, test_acc, r1_acc, r2_acc, r3_acc, r4_acc, r5_acc, r6_acc]

        # delete variables
        del X, y, kf, scaler, train_index, test_index, X_train, X_test, y_train, y_test, train_preds, test_preds, rmse_scores_train, rmse_scores_test, r2_scores_train, r2_scores_test, acc_scores_train, acc_scores_test, r1_idx, r2_idx, r3_idx, \
        r4_idx, r5_idx, r6_idx, r1_accs, r2_accs, r3_accs, r4_accs, r5_accs, r6_accs, train_rmse, test_rmse, train_r2, test_r2, train_acc, test_acc, r1_acc, r2_acc, r3_acc, r4_acc, r5_acc, r6_acc
        collect()
        
        return models_df.tail()
    
    # predicting win/loss
    else:
        # define X and y
        X = data.drop(columns=['score_diff_adj', 'win'])
        y = data['win']

        # initialize lists to store metrics
        log_loss_train = []
        log_loss_test = []
        acc_scores_train = []
        acc_scores_test = []
        r1_accs = []
        r2_accs = []
        r3_accs = []
        r4_accs = []
        r5_accs = []
        r6_accs = []

        for train_index, test_index in kf.split(X):
            # split data
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            # scale data
            X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
            X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns, index=X_test.index)

            # fit model
            estimator.fit(X_train_scaled, y_train)

            # predict
            train_preds = estimator.predict(X_train_scaled)
            test_preds = pd.Series(estimator.predict(X_test_scaled), index=y_test.index)


            # evaluate
            log_loss_train.append(log_loss(y_train, train_preds))
            log_loss_test.append(log_loss(y_test, test_preds))
            acc_scores_train.append(accuracy_score(np.sign(y_train), np.sign(train_preds)))
            acc_scores_test.append(accuracy_score(np.sign(y_test), np.sign(test_preds)))

            # get indices of each round
            r1_idx = y_test.index[(data.loc[y_test.index, 'round_2'] == 0) & (data.loc[y_test.index, 'round_3'] == 0) & (data.loc[y_test.index, 'round_4'] == 0) & \
                                  (data.loc[y_test.index, 'round_5'] == 0) & (data.loc[y_test.index, 'round_6'] == 0)]
            r2_idx = y_test.index[data.loc[y_test.index, 'round_2'] == 1]
            r3_idx = y_test.index[data.loc[y_test.index, 'round_3'] == 1]
            r4_idx = y_test.index[data.loc[y_test.index, 'round_4'] == 1]
            r5_idx = y_test.index[data.loc[y_test.index, 'round_5'] == 1]
            r6_idx = y_test.index[data.loc[y_test.index, 'round_6'] == 1]

            # get accuracy of each round
            r1_accs.append(accuracy_score(np.sign(y_test.loc[r1_idx]), np.sign(test_preds.loc[r1_idx])))
            r2_accs.append(accuracy_score(np.sign(y_test.loc[r2_idx]), np.sign(test_preds.loc[r2_idx])))
            r3_accs.append(accuracy_score(np.sign(y_test.loc[r3_idx]), np.sign(test_preds.loc[r3_idx])))
            r4_accs.append(accuracy_score(np.sign(y_test.loc[r4_idx]), np.sign(test_preds.loc[r4_idx])))
            r5_accs.append(accuracy_score(np.sign(y_test.loc[r5_idx]), np.sign(test_preds.loc[r5_idx])))
            r6_accs.append(accuracy_score(np.sign(y_test.loc[r6_idx]), np.sign(test_preds.loc[r6_idx])))

        # average the metrics across folds
        train_log_loss = np.mean(log_loss_train)
        test_log_loss = np.mean(log_loss_test)
        train_acc = np.mean(acc_scores_train)
        test_acc = np.mean(acc_scores_test)
        r1_acc = np.mean(r1_accs)
        r2_acc = np.mean(r2_accs)
        r3_acc = np.mean(r3_accs)
        r4_acc = np.mean(r4_accs)
        r5_acc = np.mean(r5_accs)
        r6_acc = np.mean(r6_accs)
        
        # save results to models_df
        models_df.loc[len(models_df.index)] = [tournament, 'A_Win', estimator, X.shape[1], X.columns.to_list(), 5, train_log_loss, test_log_loss, train_acc, test_acc, r1_acc, r2_acc, r3_acc, r4_acc, r5_acc, r6_acc]

        # delete variables
        del X, y, kf, scaler, train_index, test_index, X_train, X_test, y_train, y_test, train_preds, test_preds, log_loss_train, log_loss_test, acc_scores_train, acc_scores_test, r1_idx, r2_idx, r3_idx, \
        r4_idx, r5_idx, r6_idx, r1_accs, r2_accs, r3_accs, r4_accs, r5_accs, r6_accs, train_log_loss, test_log_loss, train_acc, test_acc, r1_acc, r2_acc, r3_acc, r4_acc, r5_acc, r6_acc
        collect()
        
        return models_df.tail()

### Regression (Predicting <ins>Score Differential</ins>)

#### Men's

In [25]:
# create a df to hold regression models
# reg_df = pd.DataFrame(columns=['Tournament', 'Label', 'Model', 'Num_Features', 'Features', 'Num_CV_Folds', 'Train_R2', 'Val_R2', 'Train_RMSE', 'Val_RMSE', 'Train_Acc', 'Val_Acc', 'Val_r1_acc', 'Val_r2_acc', 'Val_r3_acc', 'Val_r4_acc', 'Val_r5_acc', 'Val_r6_acc'])

# load reg df
reg_df = pd.read_csv('models/regression_models.csv')

In [None]:
# define regression models
models = [LinearRegression(n_jobs=-1), RandomForestRegressor(n_jobs=-1), XGBRegressor(n_jobs=-1), SVR(), KNeighborsRegressor(n_jobs=-1)]
datasets = [mfeatures] # , mfeatures_diff

# run regression models
for model in models:
    for dataset in datasets:
        # run model
        run_model(estimator=model, features=dataset, regression=True, models_df=reg_df, tournament='M')

In [26]:
# inspect
reg_df[reg_df['Tournament'] == 'M'].sort_values(by='Val_Acc', ascending=False).head()

Unnamed: 0,Tournament,Label,Model,Num_Features,Features,Num_CV_Folds,Train_R2,Val_R2,Train_RMSE,Val_RMSE,Train_Acc,Val_Acc,Val_r1_acc,Val_r2_acc,Val_r3_acc,Val_r4_acc,Val_r5_acc,Val_r6_acc
80,M,A_adj_score_diff,LinearRegression(n_jobs=-1),67,"['A_1_pos_game_ratio', 'A_1_pos_loss_missing',...",5,0.429856,0.239511,11.045665,11.701572,0.724928,0.70456,0.742861,0.690977,0.684622,0.560981,0.612601,0.636825
0,M,A_adj_score_diff,LinearRegression(n_jobs=-1),70,"['A_1_pos_game_ratio', 'A_1_pos_loss_missing',...",5,0.431068,0.232711,11.033918,11.749827,0.725868,0.702054,0.741293,0.677911,0.693672,0.568981,0.599267,0.670159
60,M,A_adj_score_diff,LinearRegression(n_jobs=-1),69,"['A_1_pos_game_ratio', 'A_1_pos_loss_missing',...",5,0.430107,0.235875,11.043223,11.729381,0.724301,0.702054,0.740778,0.681194,0.690814,0.554085,0.612601,0.670159
20,M,A_adj_score_diff,LinearRegression(n_jobs=-1),68,"['A_1_pos_game_ratio', 'A_1_pos_loss_missing',...",5,0.430615,0.232794,11.03822,11.750188,0.724405,0.699541,0.741371,0.679361,0.683224,0.552981,0.568498,0.670159
40,M,A_adj_score_diff,LinearRegression(n_jobs=-1),67,"['A_1_pos_game_ratio', 'A_1_pos_loss_missing',...",5,0.426528,0.235529,11.078017,11.735523,0.723674,0.698284,0.739631,0.674414,0.689038,0.563176,0.570549,0.647937


#### Women's

In [None]:
# define regression models
models = [LinearRegression(n_jobs=-1), RandomForestRegressor(n_jobs=-1), XGBRegressor(n_jobs=-1), SVR(), KNeighborsRegressor(n_jobs=-1)]
datasets = [wfeatures] # , wfeatures_diff

# run regression models
for model in models:
    for dataset in datasets:
        # run model
        run_model(estimator=model, features=dataset, regression=True, models_df=reg_df, tournament='W')

In [27]:
# inspect
reg_df[reg_df['Tournament'] == 'W'].sort_values(by='Val_Acc', ascending=False).head()

Unnamed: 0,Tournament,Label,Model,Num_Features,Features,Num_CV_Folds,Train_R2,Val_R2,Train_RMSE,Val_RMSE,Train_Acc,Val_Acc,Val_r1_acc,Val_r2_acc,Val_r3_acc,Val_r4_acc,Val_r5_acc,Val_r6_acc
77,W,A_adj_score_diff,SVR(),43,"['1_pos_game_ratio_diff', '1_pos_win_ratio_dif...",5,0.559517,0.536657,13.834787,14.154019,0.797937,0.792381,0.820084,0.780558,0.752188,0.798012,0.586703,0.857143
70,W,A_adj_score_diff,LinearRegression(n_jobs=-1),69,"['A_1_pos_game_ratio', 'A_1_pos_loss_missing',...",5,0.670684,0.634342,11.962705,12.573288,0.800476,0.790476,0.824023,0.785752,0.745282,0.738041,0.611703,0.8
16,W,A_adj_score_diff,SVR(),70,"['A_1_pos_game_ratio', 'A_1_pos_loss_missing',...",5,0.568059,0.542477,13.699965,14.065637,0.797937,0.790476,0.816518,0.786284,0.75435,0.787485,0.546319,0.857143
88,W,A_adj_score_diff,SVR(),67,"['A_1_pos_game_ratio', 'A_1_pos_loss_missing',...",5,0.548524,0.52407,14.006432,14.345835,0.79873,0.789206,0.820107,0.778728,0.745654,0.777485,0.571319,0.857143
50,W,A_adj_score_diff,LinearRegression(n_jobs=-1),67,"['A_1_pos_game_ratio', 'A_1_pos_loss_missing',...",5,0.665771,0.629615,12.051561,12.653069,0.801746,0.789206,0.822682,0.780642,0.740131,0.769094,0.604652,0.761905


In [None]:
# save reg_df
reg_df.to_csv('models/regression_models.csv', index=False)

### Classification (Predicting <ins>Win/Loss</ins>)

In [None]:
# create a df to hold classification models
# class_df = pd.DataFrame(columns=['Tournament', 'Label', 'Model', 'Num_Features', 'Features', 'Num_CV_Folds', 'Train_LogLoss', 'Val_LogLoss', 'Train_Acc', 'Val_Acc', 'Val_r1_acc', 'Val_r2_acc', 'Val_r3_acc', 'Val_r4_acc', 'Val_r5_acc', 'Val_r6_acc'])

# load class df
class_df = pd.read_csv('models/classification_models.csv')

#### Men's

In [None]:
# define regression models
models = [LogisticRegression(n_jobs=-1, random_state=SEED), RandomForestClassifier(n_jobs=-1), XGBClassifier(n_jobs=-1), SVC(), KNeighborsClassifier(n_jobs=-1)]
datasets = [mfeatures] # , mfeatures_diff

# run regression models
for model in models:
    for dataset in datasets:
        # run model
        run_model(estimator=model, features=dataset, regression=False, models_df=class_df, tournament='M')

In [None]:
# inspect
class_df[class_df['Tournament'] == 'M'].sort_values(by='Val_LogLoss').head()

#### Women's

In [None]:
# define regression models
models = [LogisticRegression(n_jobs=-1, random_state=SEED), RandomForestClassifier(n_jobs=-1), XGBClassifier(n_jobs=-1), SVC(), KNeighborsClassifier(n_jobs=-1)]
datasets = [wfeatures] # , wfeatures_diff

# run regression models
for model in models:
    for dataset in datasets:
        # run model
        run_model(estimator=model, features=dataset, regression=False, models_df=class_df, tournament='W')

In [None]:
# inspect
class_df[class_df['Tournament'] == 'W'].sort_values(by='Val_LogLoss').head()

In [None]:
# save class_df
class_df.to_csv('models/classification_models.csv', index=False)

- After manual model inspection, Linear Regression performed the best when we replaced the seed with seed_win_prob and used the team A/B data.

## Model Tuning
Now, we'll look at the best performing models for both the men's and women's bracket so that we can make 2024 predictions.

## Simultate N Brackets

In [19]:
# root dirs
root = 'data/'
mroot = 'data/mens/'
wroot = 'data/womens/'

# load data, get slots for 2024, drop play-ins
slots = pd.read_csv(mroot + 'MNCAATourneySlots.csv')
slots = slots[slots['Season'] == 2024]
slots = slots[slots['Slot'].str.contains('R')].reset_index(drop=True)

# load seed data
seeds_2024 = pd.read_csv(root + '2024_tourney_seeds.csv')

# load in 2024 data
df_2024 = pd.read_csv(root + 'processed/2024_features.csv').drop(columns=['Region', 'Season'])

# drop teams not in 2024 tourney
df_2024 = df_2024[df_2024['TeamID'].isin(seeds_2024['TeamID'])]

# drop play in char
df_2024['FullSeed'] = df_2024['FullSeed'].apply(lambda x: x[:3])

# split into mens and womens
mdf_2024 = df_2024[df_2024['TeamID'] < 3000].reset_index(drop=True)
wdf_2024 = df_2024[df_2024['TeamID'] >= 3000].reset_index(drop=True)

# delete vars
del root, mroot, wroot, seeds_2024, df_2024

In [20]:
# men's historical seed differential winning percentages
mens_seed_win = [(-15, 0.013157894736842105),(-14, 0.04276315789473684),(-13, 0.07236842105263158),(-12, 0.10013769889840882),(-11, 0.12790697674418605),(-10, 0.16451214758997013),(-9, 0.2011173184357542),(-8, 0.25),(-7, 0.30364372469635625),(-6, 0.3222764078027236), 
                 (-5, 0.3409090909090909),(-4, 0.3565744192715802),(-3, 0.3722397476340694),(-2, 0.41212997482713576),(-1, 0.45202020202020204),(0, 0.5),(1, 0.547979797979798),(2, 0.5878700251728642),(3, 0.6277602523659306),(4, 0.6434255807284198),(5, 0.6590909090909092), 
                 (6, 0.6777235921972764),(7, 0.6963562753036437),(8, 0.75),(9, 0.7988826815642458),(10, 0.8354878524100299),(11, 0.872093023255814),(12, 0.8998623011015912),(13, 0.9276315789473684),(14, 0.9572368421052632),(15, 0.9868421052631579)]

# women's historical seed differential winning percentages
womens_seed_win = [(-15, 0.01),(-14, 0.017182890855457225),(-13, 0.024365781710914452),(-12, 0.03154867256637168),(-11, 0.0387315634218289),(-10, 0.04591445427728613),(-9, 0.05309734513274336),(-8, 0.17006802721088435),(-7, 0.17197452229299362),(-6, 0.22679185884764624), 
                   (-5, 0.28160919540229884),(-4, 0.30250672536072387),(-3, 0.32340425531914896),(-2, 0.3844744048872972),(-1, 0.44554455445544555),(0, 0.5),(1, 0.5544554455445545),(2, 0.6155255951127028),(3, 0.676595744680851),(4, 0.6974932746392761),(5, 0.7183908045977012), 
                   (6, 0.7732081411523537),(7, 0.8280254777070064),(8, 0.8299319727891157),(9, 0.9469026548672567),(10, 0.9540855457227139),(11, 0.9612684365781711),(12, 0.9684513274336283),(13, 0.9756342182890856),(14, 0.9828171091445428),(15, 0.99)]

In [33]:
def generate_bracket(data, estimator, tournament, num_brackets, slots_df=slots):
    """
    Generate a single bracket for the 2024 NCAA tournament.

    Parameters
    ----------
    data : pd.DataFrame
        Regular season data for the 2024 teams competing in the tournament.
    estimator : sklearn estimator
        Pre-trained estimator to use for modeling.
    tournament : str
        'M' or 'W'.
    num_brackets : int
        Number of brackets to generate.
    slots : pd.DataFrame
        Slots for the 2024 tournament.

    Returns
    -------
    all_brackets : pd.DataFrame
        DataFrame with the predicted outcomes of the tournament.
    
    """

    # get a copies of data to avoid modifying the original
    features = data.copy()

    # define scaler
    scaler = MinMaxScaler()

    # create empty df for all brackets
    all_brackets = pd.DataFrame()

    # loop for each bracket
    for n in range(1, num_brackets+1):
        # create bracket-specific slots table
        slots = slots_df.copy()

        # create empty results for round
        result_df = pd.DataFrame(columns=["Slot", "Team"])

        # 6 rounds in a single bracket
        for i in range(1, 7):
            # get slots for round
            slots_round = slots[slots['Slot'].str.contains(f'R{i}')].reset_index(drop=True)

            # holds data for each matchup
            round_matchups = []

            # loop through the slots
            for idx, row in slots_round.iterrows():
                # get team A and team B
                A = features[features['FullSeed'] == row['StrongSeed']].reset_index(drop=True)
                B = features[features['FullSeed'] == row['WeakSeed']].reset_index(drop=True)

                # rename cols
                A = A.add_prefix('A_')
                B = B.add_prefix('B_')

                # create matchup dataframe
                combined = pd.concat([A, B], axis=1)

                # append combined row to the list
                round_matchups.append(combined)

            # concatenate all matchup rows into a single DataFrame
            round_df = pd.concat(round_matchups, axis=0).reset_index(drop=True)
            
            # calculate seed diff
            round_df['A_seed_diff'] = round_df['B_Seed'] - round_df['A_Seed']

            # get seed win probability
            def get_seed_win_prob(x, tournament=tournament): 
                seed_list = mens_seed_win if tournament == 'M' else womens_seed_win

                seed_win_list = [y[1] for y in seed_list if y[0] == x]
                if seed_win_list:
                    return seed_win_list[0]
                else:
                    print(f"No seed win probability found for seed difference {x}.")
                    return None

            # add cols
            round_df['A_seed_win_prob'] = round_df['A_seed_diff'].apply(get_seed_win_prob)
            round_df['B_seed_win_prob'] = 1 - round_df['A_seed_win_prob']

            # add some cols that were in training data
            round_df['A_Loc_A'] = 0
            round_df['A_Loc_H'] = 0
            round_df[['round_2', 'round_3', 'round_4', 'round_5', 'round_6']] = 0
            if i == 1:
                pass
            else:
                round_df[f'round_{i}'] = 1

            # drop cols
            round_df = round_df.drop(columns=['A_Seed', 'B_Seed', 'A_seed_diff'])

            # add placeholder cols for sorting
            round_df['score_diff_adj'] = 0
            round_df['win'] = 0
            round_df = round_df[sorted_cols]

            # define X and reorder cols
            X = round_df.drop(columns=['A_TeamID', 'A_FullSeed', 'B_TeamID', 'B_FullSeed', 'score_diff_adj', 'win'])

            # scale data
            X = scaler.fit_transform(X)
            
            # predict the outcomes of the round
            preds = estimator.predict_proba(X)

            # if n == 1:
            #     print(f'\n\nround {i}:')
            #     print(f'preds before: {preds}')

            # generate random values, update preds
            random_values = np.random.rand(len(preds))
            preds = (random_values > preds[:, 0]).astype(int)

            # if n == 1:
            #     print(f'random values: {random_values}')
            #     print(f'preds after: {preds}')

            # replace preds with full seed of winning team
            preds = np.where(preds > 0, round_df['A_FullSeed'], round_df['B_FullSeed'])

            for slot, winner_seed in zip(slots_round['Slot'], preds):
                # save results to result_df
                result_df.loc[len(result_df.index)] = [slot, winner_seed]

            # edit slots df for next round
            if i != 6:
                next_round_slots = slots[slots['Slot'].str.contains(f'R{i+1}')]

                for idx, row in next_round_slots.iterrows():
                    # get the teams playing in that slot for the next round
                    team1 = result_df[result_df['Slot'] == row['StrongSeed']]['Team'].values[0]
                    team2 = result_df[result_df['Slot'] == row['WeakSeed']]['Team'].values[0]

                    # update the slots df
                    slots.loc[slots['Slot'] == row['Slot'], 'StrongSeed'] = team1
                    slots.loc[slots['Slot'] == row['Slot'], 'WeakSeed'] = team2

            # drop teams that have been eliminated
            # features = features[features['FullSeed'].isin(result_df['Team'])].reset_index(drop=True)

        # add bracket col
        result_df['Bracket'] = n

        # append to all_brackets
        all_brackets = pd.concat([all_brackets, result_df], axis=0)

    # add tournament col
    all_brackets['Tournament'] = tournament

    return all_brackets


### Men's

In [22]:
# define scaler
scaler = MinMaxScaler()

In [23]:
# train best men's model on entire dataset
mmodel_reg = LinearRegression(n_jobs=-1)

# sort features
mfeatures = mfeatures[sorted_cols]

# define X and y
X = mfeatures.drop(columns=['A_TeamID', 'A_FullSeed', 'B_TeamID', 'B_FullSeed', 'score_diff_adj', 'win'])
y = mfeatures['score_diff_adj']

# scale data
X_scaled = scaler.fit_transform(X)

# fit model
mmodel_reg.fit(X_scaled, y)

# get train metrics
train_preds = mmodel_reg.predict(X_scaled)
r2 = r2_score(y, train_preds)
rmse = mean_squared_error(y, train_preds, squared=False)
acc = accuracy_score(np.sign(y), np.sign(train_preds))

print(f'Model R2: {r2:.4f}')
print(f'Model RMSE: {rmse:.4f}')
print(f'Model Accuracy: {acc:.4f}')

# delete vars
del X, y, X_scaled, train_preds, r2, rmse, acc

Model R2: 0.4284
Model RMSE: 11.1122
Model Accuracy: 0.7192


In [24]:
# classification
mmodel_class = LogisticRegression(n_jobs=-1)

# sort features
mfeatures = mfeatures[sorted_cols]

# define X and y
X = mfeatures.drop(columns=['A_TeamID', 'A_FullSeed', 'B_TeamID', 'B_FullSeed', 'score_diff_adj', 'win'])
y = mfeatures['win']

# scale data
X_scaled = scaler.fit_transform(X)

# fit model
mmodel_class.fit(X_scaled, y)

# get train metrics
train_preds = mmodel_class.predict(X_scaled)
loss = log_loss(y, train_preds)
acc = accuracy_score(y, train_preds)

print(f'Model Log Loss: {loss:.4f}')
print(f'Model Accuracy: {acc:.4f}')

# delete vars
del X, y, X_scaled, train_preds, loss, acc

Model Log Loss: 9.5694
Model Accuracy: 0.7229


In [34]:
# generate men's bracket
# result_m_reg = generate_bracket(data=mdf_2024, estimator=mmodel_reg, tournament='M')
m_brackets = generate_bracket(data=mdf_2024, estimator=mmodel_class, tournament='M', num_brackets=1)

# delete vars
# del mmodel_reg, mmodel_class

In [36]:
m_brackets.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62
Slot,R1W1,R1W2,R1W3,R1W4,R1W5,R1W6,R1W7,R1W8,R1X1,R1X2,R1X3,R1X4,R1X5,R1X6,R1X7,R1X8,R1Y1,R1Y2,R1Y3,R1Y4,R1Y5,R1Y6,R1Y7,R1Y8,R1Z1,R1Z2,R1Z3,R1Z4,R1Z5,R1Z6,R1Z7,R1Z8,R2W1,R2W2,R2W3,R2W4,R2X1,R2X2,R2X3,R2X4,R2Y1,R2Y2,R2Y3,R2Y4,R2Z1,R2Z2,R2Z3,R2Z4,R3W1,R3W2,R3X1,R3X2,R3Y1,R3Y2,R3Z1,R3Z2,R4W1,R4X1,R4Y1,R4Z1,R5WX,R5YZ,R6CH
Team,W01,W02,W03,W04,W12,W06,W10,W09,X01,X15,X14,X04,X12,X11,X10,X08,Y01,Y02,Y14,Y04,Y12,Y11,Y10,Y09,Z01,Z02,Z14,Z13,Z12,Z11,Z07,Z09,W01,W02,W06,W04,X01,X10,X11,X04,Y01,Y02,Y11,Y04,Z01,Z02,Z11,Z12,W01,W06,X01,X11,Y04,Y02,Z12,Z02,W01,X01,Y02,Z12,W01,Z12,W01
Bracket,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
Tournament,M,M,M,M,M,M,M,M,M,M,M,M,M,M,M,M,M,M,M,M,M,M,M,M,M,M,M,M,M,M,M,M,M,M,M,M,M,M,M,M,M,M,M,M,M,M,M,M,M,M,M,M,M,M,M,M,M,M,M,M,M,M,M


### Women's

In [37]:
# train best men's model on entire dataset
wmodel_reg = LinearRegression(n_jobs=-1)

# define X and y
X = wfeatures.drop(columns=['A_TeamID', 'A_FullSeed', 'B_TeamID', 'B_FullSeed', 'score_diff_adj', 'win'])
y = wfeatures['score_diff_adj']

# scale data
X_scaled = scaler.fit_transform(X)

# fit model
wmodel_reg.fit(X_scaled, y)

# get train metrics
train_preds = wmodel_reg.predict(X_scaled)
r2 = r2_score(y, train_preds)
rmse = mean_squared_error(y, train_preds, squared=False)
acc = accuracy_score(np.sign(y), np.sign(train_preds))

print(f'Model R2: {r2:.4f}')
print(f'Model RMSE: {rmse:.4f}')
print(f'Model Accuracy: {acc:.4f}')

# delete vars
del X, y, X_scaled, train_preds, r2, rmse, acc

Model R2: 0.6467
Model RMSE: 12.3932
Model Accuracy: 0.7956


In [38]:
# classification
wmodel_class = LogisticRegression(random_state=SEED)

# define X and y
X = wfeatures.drop(columns=['A_TeamID', 'A_FullSeed', 'B_TeamID', 'B_FullSeed', 'score_diff_adj', 'win'])
y = wfeatures['win']

# scale data
X_scaled = scaler.fit_transform(X)

# fit model
wmodel_class.fit(X_scaled, y)

# get train metrics
train_preds = wmodel_class.predict(X_scaled)
loss = log_loss(y, train_preds)
acc = accuracy_score(y, train_preds)

print(f'Model Log Loss: {loss:.4f}')
print(f'Model Accuracy: {acc:.4f}')

# delete vars
del X, y, X_scaled, train_preds, loss, acc

Model Log Loss: 6.8640
Model Accuracy: 0.8013


In [39]:
# generate women's bracket
# result_w_reg = generate_bracket(data=wdf_2024, estimator=wmodel_reg, tournament='W')
w_brackets = generate_bracket(data=wdf_2024, estimator=wmodel_class, tournament='W', num_brackets=1)

# delete vars
# del wmodel_reg, wmodel_class

In [40]:
w_brackets.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62
Slot,R1W1,R1W2,R1W3,R1W4,R1W5,R1W6,R1W7,R1W8,R1X1,R1X2,R1X3,R1X4,R1X5,R1X6,R1X7,R1X8,R1Y1,R1Y2,R1Y3,R1Y4,R1Y5,R1Y6,R1Y7,R1Y8,R1Z1,R1Z2,R1Z3,R1Z4,R1Z5,R1Z6,R1Z7,R1Z8,R2W1,R2W2,R2W3,R2W4,R2X1,R2X2,R2X3,R2X4,R2Y1,R2Y2,R2Y3,R2Y4,R2Z1,R2Z2,R2Z3,R2Z4,R3W1,R3W2,R3X1,R3X2,R3Y1,R3Y2,R3Z1,R3Z2,R4W1,R4X1,R4Y1,R4Z1,R5WX,R5YZ,R6CH
Team,W01,W02,W03,W04,W05,W11,W10,W09,X01,X02,X03,X04,X12,X06,X10,X09,Y01,Y02,Y03,Y04,Y12,Y11,Y10,Y09,Z01,Z02,Z03,Z04,Z05,Z06,Z10,Z09,W01,W02,W03,W05,X01,X02,X06,X04,Y01,Y02,Y03,Y04,Z01,Z10,Z03,Z05,W01,W02,X04,X02,Y01,Y02,Z01,Z03,W01,X02,Y01,Z03,W01,Y01,Y01
Bracket,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
Tournament,W,W,W,W,W,W,W,W,W,W,W,W,W,W,W,W,W,W,W,W,W,W,W,W,W,W,W,W,W,W,W,W,W,W,W,W,W,W,W,W,W,W,W,W,W,W,W,W,W,W,W,W,W,W,W,W,W,W,W,W,W,W,W


## Submission

In [None]:
# num brackets
n_brackets = 100000

# combine results
submission = pd.concat([result_m_class, result_w_class])
submission = submission.reset_index(drop=True)
submission.index.names = ['RowId']

# reorder
submission = submission[['Tournament', 'Bracket', 'Slot', 'Team']]

In [None]:
# save
submission.to_csv('submission.csv')
submission.head()