# Predictive Model
In this notebook, we:
- Try different models and assess their performance.
- Predict on the 2024 March Madness bracket.
- Simulate multiple brackets.

## Imports

In [239]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys
import tqdm

from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, KFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.svm import SVR, SVC
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.metrics import mean_squared_error, r2_score, log_loss, accuracy_score, confusion_matrix, classification_report
from xgboost import XGBRegressor, XGBClassifier

# widen the pandas display limits to 150 rows and 150 columns
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, 150)

# one seed shared by the stochastic steps in this notebook
SEED = 0

# seed numpy's global random number generator
np.random.seed(SEED)

## Load Data

In [240]:
# load the compact feature set into `fcomp`; every later cell builds on this frame
fcomp = pd.read_csv('data/processed/features_compact.csv')

# loading of the detailed feature set is currently disabled
# fdet = pd.read_csv('data/processed/features_detailed.csv')

## Rearranging the Data
When we input a new row into our model (for the 2024 bracket), the winner could be either the first or the second team in that row. Currently, every row has the winning team on the left, so the model would learn this positional information. To prevent this, we randomly swap the order of the two teams in half of the rows, so that a team's position in the row carries no information about the outcome.

In [241]:
# express Season as years since the first tournament, which may capture changes in game play over time
# (rows with a team ID below 3000 are offset from 1985, all other rows from 1998)
first_season = np.where(fcomp['WTeamID'] < 3000, 1985, 1998)
fcomp['Season'] = fcomp['Season'] - first_season

# columns that can't be used as features
cols_def_drop = [
    'DayNum',
    'WRegion', 'Wseed_diff',
    'LRegion', 'Lseed_diff',
    'abs_seed_diff',
]

# remove them from the frame
fcomp = fcomp.drop(columns=cols_def_drop)

In [242]:
# relabel the winner ('W...') and loser ('L...') columns as team A ('A_...') and team B ('B_...')
side_prefix = {'W': 'A_', 'L': 'B_'}
fcomp.columns = [
    side_prefix[name[0]] + name[1:] if name[0] in side_prefix else name
    for name in fcomp.columns
]

# check
fcomp.head()

Unnamed: 0,Season,A_TeamID,A_Score,B_TeamID,B_Score,A_Loc,NumOT,A_Seed,B_Seed,A_FullSeed,B_FullSeed,A_PlayIn,B_PlayIn,A_max_win_diff,A_max_loss_diff,A_num_games,A_win_ratio,A_mean_ppg,A_mean_papg,A_std_ppg,A_std_papg,A_mean_diff,A_std_diff,A_home_win_ratio,A_away_win_ratio,A_neutral_win_ratio,A_1_pos_win_missing,A_1_pos_loss_missing,A_ot_win_missing,A_ot_loss_missing,A_1_pos_game_ratio,A_1_pos_win_ratio,A_ot_ratio,A_ot_win_ratio,A_recent_win_ratio,A_recent_mean_pts_for,A_recent_mean_pts_against,A_recent_mean_score_diff,A_recent_std_pts_for,A_recent_std_pts_against,A_recent_std_score_diff,B_max_win_diff,B_max_loss_diff,B_num_games,B_win_ratio,B_mean_ppg,B_mean_papg,B_std_ppg,B_std_papg,B_mean_diff,B_std_diff,B_home_win_ratio,B_away_win_ratio,B_neutral_win_ratio,B_1_pos_win_missing,B_1_pos_loss_missing,B_ot_win_missing,B_ot_loss_missing,B_1_pos_game_ratio,B_1_pos_win_ratio,B_ot_ratio,B_ot_win_ratio,B_recent_win_ratio,B_recent_mean_pts_for,B_recent_mean_pts_against,B_recent_mean_score_diff,B_recent_std_pts_for,B_recent_std_pts_against,B_recent_std_score_diff,B_Loc,round,A_seed_win_prob,B_seed_win_prob
0,0,1116,63,1234,54,N,0,9,8,X09,X08,0,0,35,20,33,0.636364,65.333333,61.69697,11.332454,9.055902,3.636364,9.620253,0.909091,0.333333,0.7,0,0,1,1,0.363636,0.25,0.0,0.0,0.666667,74.166667,61.5,12.666667,15.382169,8.624899,8.286535,49,20,30,0.666667,69.733333,59.266667,12.375335,11.243712,10.466667,13.416676,0.833333,0.333333,0.666667,0,0,1,1,0.233333,0.142857,0.0,0.0,0.333333,62.5,63.833333,-1.333333,7.207249,15.625833,9.061518,N,1,0.45202,0.54798
1,0,1120,59,1345,58,N,0,11,6,Z11,Z06,0,0,42,19,29,0.62069,70.344828,66.655172,13.52334,11.519276,3.689655,12.768266,0.666667,0.454545,0.833333,0,0,0,0,0.482759,0.428571,0.068966,0.5,0.833333,63.166667,57.0,6.166667,14.250731,6.689544,9.27811,31,43,25,0.68,69.12,65.32,10.537755,8.837016,3.8,13.685843,0.714286,0.6,1.0,0,0,1,1,0.4,0.2,0.0,0.0,0.666667,66.333333,68.833333,-2.5,12.270108,4.778424,11.172884,N,1,0.340909,0.659091
2,0,1207,68,1250,43,N,0,1,16,W01,W16,0,0,41,2,27,0.925926,75.740741,60.074074,11.475417,10.523929,15.666667,10.490513,0.923077,0.857143,1.0,0,1,1,1,0.074074,1.0,0.0,0.0,1.0,83.666667,64.333333,19.333333,10.366613,10.726913,8.594572,18,31,29,0.37931,65.758621,70.206897,8.131251,8.504503,-4.448276,16.284512,0.333333,0.363636,0.5,0,0,0,1,0.724138,0.238095,0.034483,1.0,0.833333,69.5,70.0,-0.5,7.593857,7.071068,1.5,N,1,0.986842,0.013158
3,0,1229,58,1425,55,N,0,9,8,Y09,Y08,0,0,31,13,27,0.740741,71.592593,65.62963,10.506689,10.519915,5.962963,8.956982,0.785714,0.6,1.0,0,0,1,1,0.37037,0.4,0.0,0.0,0.666667,72.0,64.833333,7.166667,9.006171,6.940221,8.36328,33,21,28,0.678571,68.392857,64.607143,9.272442,8.751871,3.785714,9.777817,0.642857,0.727273,0.666667,0,0,1,1,0.428571,0.5,0.0,0.0,0.5,68.833333,68.166667,0.666667,6.879922,13.083068,11.56143,N,1,0.45202,0.54798
4,0,1242,49,1325,38,N,0,3,14,Z03,Z14,0,0,27,19,30,0.766667,76.033333,70.4,10.420842,11.20687,5.633333,8.081707,1.0,0.6,0.5,0,0,1,1,0.3,0.333333,0.0,0.0,0.833333,76.833333,72.166667,4.666667,7.46101,5.228129,5.416026,23,17,27,0.740741,67.555556,63.0,9.638866,8.801914,4.555556,7.092059,1.0,0.583333,0.6,0,0,1,1,0.333333,0.333333,0.0,0.0,0.833333,68.333333,58.166667,10.166667,7.455423,4.941322,7.762087,N,1,0.872093,0.127907


In [243]:
# pick a random half of the rows; their team A / team B columns get swapped
np.random.seed(SEED)
n_flip = int(fcomp.shape[0] / 2)
flip = np.random.choice(fcomp.index, n_flip, replace=False)

# swap every A_ column with its B_ counterpart on the selected rows
for a_col in [name for name in fcomp.columns if name[0] == 'A']:
    b_col = a_col.replace('A_', 'B_')
    # read both sides before writing either one, so the swap uses the pre-swap values
    b_vals = fcomp.loc[flip, b_col].values
    a_vals = fcomp.loc[flip, a_col].values
    fcomp.loc[flip, a_col] = b_vals
    fcomp.loc[flip, b_col] = a_vals

# alphabetically ordered view of the columns
df = fcomp.reindex(sorted(fcomp.columns), axis=1)

# check
df.head()

Unnamed: 0,A_1_pos_game_ratio,A_1_pos_loss_missing,A_1_pos_win_missing,A_1_pos_win_ratio,A_FullSeed,A_Loc,A_PlayIn,A_Score,A_Seed,A_TeamID,A_away_win_ratio,A_home_win_ratio,A_max_loss_diff,A_max_win_diff,A_mean_diff,A_mean_papg,A_mean_ppg,A_neutral_win_ratio,A_num_games,A_ot_loss_missing,A_ot_ratio,A_ot_win_missing,A_ot_win_ratio,A_recent_mean_pts_against,A_recent_mean_pts_for,A_recent_mean_score_diff,A_recent_std_pts_against,A_recent_std_pts_for,A_recent_std_score_diff,A_recent_win_ratio,A_seed_win_prob,A_std_diff,A_std_papg,A_std_ppg,A_win_ratio,B_1_pos_game_ratio,B_1_pos_loss_missing,B_1_pos_win_missing,B_1_pos_win_ratio,B_FullSeed,B_Loc,B_PlayIn,B_Score,B_Seed,B_TeamID,B_away_win_ratio,B_home_win_ratio,B_max_loss_diff,B_max_win_diff,B_mean_diff,B_mean_papg,B_mean_ppg,B_neutral_win_ratio,B_num_games,B_ot_loss_missing,B_ot_ratio,B_ot_win_missing,B_ot_win_ratio,B_recent_mean_pts_against,B_recent_mean_pts_for,B_recent_mean_score_diff,B_recent_std_pts_against,B_recent_std_pts_for,B_recent_std_score_diff,B_recent_win_ratio,B_seed_win_prob,B_std_diff,B_std_papg,B_std_ppg,B_win_ratio,NumOT,Season,round
0,0.363636,0,0,0.25,X09,N,0,63,9,1116,0.333333,0.909091,20,35,3.636364,61.69697,65.333333,0.7,33,1,0.0,1,0.0,61.5,74.166667,12.666667,8.624899,15.382169,8.286535,0.666667,0.45202,9.620253,9.055902,11.332454,0.636364,0.233333,0,0,0.142857,X08,N,0,54,8,1234,0.333333,0.833333,20,49,10.466667,59.266667,69.733333,0.666667,30,1,0.0,1,0.0,63.833333,62.5,-1.333333,15.625833,7.207249,9.061518,0.333333,0.54798,13.416676,11.243712,12.375335,0.666667,0,0,1
1,0.4,0,0,0.2,Z06,N,0,58,6,1345,0.6,0.714286,43,31,3.8,65.32,69.12,1.0,25,1,0.0,1,0.0,68.833333,66.333333,-2.5,4.778424,12.270108,11.172884,0.666667,0.659091,13.685843,8.837016,10.537755,0.68,0.482759,0,0,0.428571,Z11,N,0,59,11,1120,0.454545,0.666667,19,42,3.689655,66.655172,70.344828,0.833333,29,0,0.068966,0,0.5,57.0,63.166667,6.166667,6.689544,14.250731,9.27811,0.833333,0.340909,12.768266,11.519276,13.52334,0.62069,0,0,1
2,0.724138,0,0,0.238095,W16,N,0,43,16,1250,0.363636,0.333333,31,18,-4.448276,70.206897,65.758621,0.5,29,1,0.034483,0,1.0,70.0,69.5,-0.5,7.071068,7.593857,1.5,0.833333,0.013158,16.284512,8.504503,8.131251,0.37931,0.074074,1,0,1.0,W01,N,0,68,1,1207,0.857143,0.923077,2,41,15.666667,60.074074,75.740741,1.0,27,1,0.0,1,0.0,64.333333,83.666667,19.333333,10.726913,10.366613,8.594572,1.0,0.986842,10.490513,10.523929,11.475417,0.925926,0,0,1
3,0.37037,0,0,0.4,Y09,N,0,58,9,1229,0.6,0.785714,13,31,5.962963,65.62963,71.592593,1.0,27,1,0.0,1,0.0,64.833333,72.0,7.166667,6.940221,9.006171,8.36328,0.666667,0.45202,8.956982,10.519915,10.506689,0.740741,0.428571,0,0,0.5,Y08,N,0,55,8,1425,0.727273,0.642857,21,33,3.785714,64.607143,68.392857,0.666667,28,1,0.0,1,0.0,68.166667,68.833333,0.666667,13.083068,6.879922,11.56143,0.5,0.54798,9.777817,8.751871,9.272442,0.678571,0,0,1
4,0.333333,0,0,0.333333,Z14,N,0,38,14,1325,0.583333,1.0,17,23,4.555556,63.0,67.555556,0.6,27,1,0.0,1,0.0,58.166667,68.333333,10.166667,4.941322,7.455423,7.762087,0.833333,0.127907,7.092059,8.801914,9.638866,0.740741,0.3,0,0,0.333333,Z03,N,0,49,3,1242,0.6,1.0,19,27,5.633333,70.4,76.033333,0.5,30,1,0.0,1,0.0,72.166667,76.833333,4.666667,5.228129,7.46101,5.416026,0.833333,0.872093,8.081707,11.20687,10.420842,0.766667,0,0,1


In [244]:
# one-hot encode the A_Loc and 'round' columns in a single pass
fcomp = pd.get_dummies(fcomp, columns=['A_Loc', 'round'], dtype=int)

# drop one indicator per encoded column (A_Loc_N, round_1) together with B_Loc
fcomp = fcomp.drop(columns=['A_Loc_N', 'B_Loc', 'round_1'])

## Create Labels

In [245]:
# regression label: score margin from team A's point of view; the raw scores are then removed
fcomp['score_diff'] = fcomp['A_Score'].sub(fcomp['B_Score'])
fcomp = fcomp.drop(columns=['A_Score', 'B_Score'])

# binary label: 1 when team A's margin is positive, otherwise 0
fcomp['win'] = fcomp['score_diff'].gt(0).astype('int64')

# check
fcomp.head()

Unnamed: 0,Season,A_TeamID,B_TeamID,NumOT,A_Seed,B_Seed,A_FullSeed,B_FullSeed,A_PlayIn,B_PlayIn,A_max_win_diff,A_max_loss_diff,A_num_games,A_win_ratio,A_mean_ppg,A_mean_papg,A_std_ppg,A_std_papg,A_mean_diff,A_std_diff,A_home_win_ratio,A_away_win_ratio,A_neutral_win_ratio,A_1_pos_win_missing,A_1_pos_loss_missing,A_ot_win_missing,A_ot_loss_missing,A_1_pos_game_ratio,A_1_pos_win_ratio,A_ot_ratio,A_ot_win_ratio,A_recent_win_ratio,A_recent_mean_pts_for,A_recent_mean_pts_against,A_recent_mean_score_diff,A_recent_std_pts_for,A_recent_std_pts_against,A_recent_std_score_diff,B_max_win_diff,B_max_loss_diff,B_num_games,B_win_ratio,B_mean_ppg,B_mean_papg,B_std_ppg,B_std_papg,B_mean_diff,B_std_diff,B_home_win_ratio,B_away_win_ratio,B_neutral_win_ratio,B_1_pos_win_missing,B_1_pos_loss_missing,B_ot_win_missing,B_ot_loss_missing,B_1_pos_game_ratio,B_1_pos_win_ratio,B_ot_ratio,B_ot_win_ratio,B_recent_win_ratio,B_recent_mean_pts_for,B_recent_mean_pts_against,B_recent_mean_score_diff,B_recent_std_pts_for,B_recent_std_pts_against,B_recent_std_score_diff,A_seed_win_prob,B_seed_win_prob,A_Loc_A,A_Loc_H,round_2,round_3,round_4,round_5,round_6,score_diff,win
0,0,1116,1234,0,9,8,X09,X08,0,0,35,20,33,0.636364,65.333333,61.69697,11.332454,9.055902,3.636364,9.620253,0.909091,0.333333,0.7,0,0,1,1,0.363636,0.25,0.0,0.0,0.666667,74.166667,61.5,12.666667,15.382169,8.624899,8.286535,49,20,30,0.666667,69.733333,59.266667,12.375335,11.243712,10.466667,13.416676,0.833333,0.333333,0.666667,0,0,1,1,0.233333,0.142857,0.0,0.0,0.333333,62.5,63.833333,-1.333333,7.207249,15.625833,9.061518,0.45202,0.54798,0,0,0,0,0,0,0,9,1
1,0,1345,1120,0,6,11,Z06,Z11,0,0,31,43,25,0.68,69.12,65.32,10.537755,8.837016,3.8,13.685843,0.714286,0.6,1.0,0,0,1,1,0.4,0.2,0.0,0.0,0.666667,66.333333,68.833333,-2.5,12.270108,4.778424,11.172884,42,19,29,0.62069,70.344828,66.655172,13.52334,11.519276,3.689655,12.768266,0.666667,0.454545,0.833333,0,0,0,0,0.482759,0.428571,0.068966,0.5,0.833333,63.166667,57.0,6.166667,14.250731,6.689544,9.27811,0.659091,0.340909,0,0,0,0,0,0,0,-1,0
2,0,1250,1207,0,16,1,W16,W01,0,0,18,31,29,0.37931,65.758621,70.206897,8.131251,8.504503,-4.448276,16.284512,0.333333,0.363636,0.5,0,0,0,1,0.724138,0.238095,0.034483,1.0,0.833333,69.5,70.0,-0.5,7.593857,7.071068,1.5,41,2,27,0.925926,75.740741,60.074074,11.475417,10.523929,15.666667,10.490513,0.923077,0.857143,1.0,0,1,1,1,0.074074,1.0,0.0,0.0,1.0,83.666667,64.333333,19.333333,10.366613,10.726913,8.594572,0.013158,0.986842,0,0,0,0,0,0,0,-25,0
3,0,1229,1425,0,9,8,Y09,Y08,0,0,31,13,27,0.740741,71.592593,65.62963,10.506689,10.519915,5.962963,8.956982,0.785714,0.6,1.0,0,0,1,1,0.37037,0.4,0.0,0.0,0.666667,72.0,64.833333,7.166667,9.006171,6.940221,8.36328,33,21,28,0.678571,68.392857,64.607143,9.272442,8.751871,3.785714,9.777817,0.642857,0.727273,0.666667,0,0,1,1,0.428571,0.5,0.0,0.0,0.5,68.833333,68.166667,0.666667,6.879922,13.083068,11.56143,0.45202,0.54798,0,0,0,0,0,0,0,3,1
4,0,1325,1242,0,14,3,Z14,Z03,0,0,23,17,27,0.740741,67.555556,63.0,9.638866,8.801914,4.555556,7.092059,1.0,0.583333,0.6,0,0,1,1,0.333333,0.333333,0.0,0.0,0.833333,68.333333,58.166667,10.166667,7.455423,4.941322,7.762087,27,19,30,0.766667,76.033333,70.4,10.420842,11.20687,5.633333,8.081707,1.0,0.6,0.5,0,0,1,1,0.3,0.333333,0.0,0.0,0.833333,76.833333,72.166667,4.666667,7.46101,5.228129,5.416026,0.127907,0.872093,0,0,0,0,0,0,0,-11,0


In [246]:
# adjusted score diff: the margin is halved for every overtime period (divided by 2 ** NumOT)
overtime_scale = 2 ** fcomp['NumOT']
fcomp['score_diff_adj'] = fcomp['score_diff'] / overtime_scale

# check: overtime games only, most overtime periods first
ot_games = fcomp[fcomp['NumOT'] > 0]
ot_games[['score_diff', 'NumOT', 'score_diff_adj']].sort_values(by='NumOT', ascending=False).head()

Unnamed: 0,score_diff,NumOT,score_diff_adj
654,-8,3,-1.0
1425,4,2,1.0
1676,7,2,1.75
1118,-4,2,-1.0
1166,1,2,0.25


__score_diff_adj__ (the overtime-adjusted score differential) and __win__ can both be used as labels. They are both calculated with respect to team A.

In [247]:
# the raw margin and the overtime count are dropped; score_diff_adj stays as the regression label
fcomp = fcomp.drop(columns=['score_diff', 'NumOT'])

# split on gender: team IDs below 3000 go to the men's frame, 3000 and up to the women's frame
mfcomp = fcomp.loc[fcomp['A_TeamID'].lt(3000)]
wfcomp = fcomp.loc[fcomp['A_TeamID'].ge(3000)]

In [248]:
# per-team stats that are turned into a single "team A minus team B" feature
cols_to_diff = [
    '1_pos_game_ratio', '1_pos_win_ratio', 'away_win_ratio', 'home_win_ratio',
    'max_loss_diff', 'max_win_diff', 'mean_diff', 'mean_papg', 'mean_ppg',
    'neutral_win_ratio', 'num_games', 'ot_ratio', 'ot_win_ratio',
    'recent_mean_pts_against', 'recent_mean_pts_for', 'recent_mean_score_diff',
    'recent_std_pts_against', 'recent_std_pts_for', 'recent_std_score_diff',
    'recent_win_ratio', 'std_diff', 'std_papg', 'std_ppg', 'win_ratio',
    'Seed', 'seed_win_prob',
]

# columns carried over unchanged (identifiers, indicator flags, game context and the two labels)
cols_to_keep = [
    'Season',
    'A_TeamID', 'A_FullSeed', 'A_1_pos_loss_missing', 'A_1_pos_win_missing',
    'A_ot_loss_missing', 'A_ot_win_missing', 'A_PlayIn',
    'B_TeamID', 'B_FullSeed', 'B_1_pos_loss_missing', 'B_1_pos_win_missing',
    'B_ot_loss_missing', 'B_ot_win_missing', 'B_PlayIn',
    'A_Loc_A', 'A_Loc_H',
    'round_2', 'round_3', 'round_4', 'round_5', 'round_6',
    'score_diff_adj', 'win',
]

# build every '<stat>_diff' column, then append them (in cols_to_diff order) to the kept columns
diff_features = {
    f'{stat}_diff': fcomp[f'A_{stat}'] - fcomp[f'B_{stat}']
    for stat in cols_to_diff
}
fcomp_diff = fcomp[cols_to_keep].assign(**diff_features)

# check
fcomp_diff.head()

Unnamed: 0,Season,A_TeamID,A_FullSeed,A_1_pos_loss_missing,A_1_pos_win_missing,A_ot_loss_missing,A_ot_win_missing,A_PlayIn,B_TeamID,B_FullSeed,B_1_pos_loss_missing,B_1_pos_win_missing,B_ot_loss_missing,B_ot_win_missing,B_PlayIn,A_Loc_A,A_Loc_H,round_2,round_3,round_4,round_5,round_6,score_diff_adj,win,1_pos_game_ratio_diff,1_pos_win_ratio_diff,away_win_ratio_diff,home_win_ratio_diff,max_loss_diff_diff,max_win_diff_diff,mean_diff_diff,mean_papg_diff,mean_ppg_diff,neutral_win_ratio_diff,num_games_diff,ot_ratio_diff,ot_win_ratio_diff,recent_mean_pts_against_diff,recent_mean_pts_for_diff,recent_mean_score_diff_diff,recent_std_pts_against_diff,recent_std_pts_for_diff,recent_std_score_diff_diff,recent_win_ratio_diff,std_diff_diff,std_papg_diff,std_ppg_diff,win_ratio_diff,Seed_diff,seed_win_prob_diff
0,0,1116,X09,0,0,1,1,0,1234,X08,0,0,1,1,0,0,0,0,0,0,0,0,9.0,1,0.130303,0.107143,0.0,0.075758,0,-14,-6.830303,2.430303,-4.4,0.033333,3,0.0,0.0,-2.333333,11.666667,14.0,-7.000934,8.174919,-0.774983,0.333333,-3.796423,-2.187809,-1.042881,-0.030303,1,-0.09596
1,0,1345,Z06,0,0,1,1,0,1120,Z11,0,0,0,0,0,0,0,0,0,0,0,0,-1.0,0,-0.082759,-0.228571,0.145455,0.047619,24,-11,0.110345,-1.335172,-1.224828,0.166667,-4,-0.068966,-0.5,11.833333,3.166667,-8.666667,-1.91112,-1.980623,1.894773,-0.166667,0.917577,-2.682259,-2.985585,0.05931,-5,0.318182
2,0,1250,W16,0,0,1,0,0,1207,W01,1,0,1,1,0,0,0,0,0,0,0,0,-25.0,0,0.650064,-0.761905,-0.493506,-0.589744,29,-23,-20.114943,10.132822,-9.98212,-0.5,2,0.034483,1.0,5.666667,-14.166667,-19.833333,-3.655845,-2.772756,-7.094572,-0.166667,5.793999,-2.019426,-3.344166,-0.546616,15,-0.973684
3,0,1229,Y09,0,0,1,1,0,1425,Y08,0,0,1,1,0,0,0,0,0,0,0,0,3.0,1,-0.058201,-0.1,-0.127273,0.142857,-8,-2,2.177249,1.022487,3.199735,0.333333,-1,0.0,0.0,-3.333333,3.166667,6.5,-6.142847,2.126248,-3.198151,0.166667,-0.820835,1.768045,1.234247,0.062169,1,-0.09596
4,0,1325,Z14,0,0,1,1,0,1242,Z03,0,0,1,1,0,0,0,0,0,0,0,0,-11.0,0,0.033333,0.0,-0.016667,0.0,-2,-4,-1.077778,-7.4,-8.477778,0.1,-3,0.0,0.0,-14.0,-8.5,5.5,-0.286807,-0.005587,2.346062,0.0,-0.989648,-2.404957,-0.781976,-0.025926,11,-0.744186


In [249]:
# split the difference features on gender, using the same team-ID threshold (3000) as above
mfcomp_diff = fcomp_diff.loc[fcomp_diff['A_TeamID'].lt(3000)]
wfcomp_diff = fcomp_diff.loc[fcomp_diff['A_TeamID'].ge(3000)]

## Chalk Bracket
Here, we will simply predict the better seed. If seeds are equal (in rounds 5 and 6), we will predict the team with the better win_ratio.

From the 38 NCAA Men's tournaments that we have access to, the better seed wins __70.62%__ of the time. In the women's, the better seed wins __76.51%__ of the time. These are the baseline accuracies that we are attempting to surpass.

In [250]:
def get_dummy_preds(data):
    """
    Get dummy ("chalk") predictions based on seed and win percentage.

    Team A is predicted to win when its seed number is lower than team B's, and to
    lose when it is higher. When neither seed is lower, the team with the strictly
    higher win ratio is predicted; if the win ratios are equal too, team B is predicted.

    The comparison is vectorised, so it does not depend on the index of `data`
    (a non-unique index is fine).

    Parameters
    ----------
    data : pd.DataFrame
        Must contain the columns 'A_Seed', 'B_Seed', 'A_win_ratio' and 'B_win_ratio'.

    Returns
    -------
    np.ndarray
        One prediction per row, in row order: 1 if team A is predicted to win, else 0.
    """

    a_seed = data["A_Seed"].to_numpy()
    b_seed = data["B_Seed"].to_numpy()

    # tie-break used only where neither seed is lower than the other
    a_better_record = (data["A_win_ratio"].to_numpy() > data["B_win_ratio"].to_numpy()).astype(int)

    return np.where(a_seed < b_seed, 1, np.where(a_seed > b_seed, 0, a_better_record))

### Men's

In [251]:
# chalk predictions for every men's game
mchalk_preds = get_dummy_preds(mfcomp)

# share of games where the chalk pick matches the win label
mchalk_acc = accuracy_score(y_true=mfcomp['win'], y_pred=mchalk_preds)

print(f"Accuracy of dummy predictions: {mchalk_acc*100:.2f}%")

Accuracy of dummy predictions: 71.04%


Dummy predictions beat the baseline by __0.42__ percentage points.

### Women's

In [252]:
# chalk predictions for every women's game
wchalk_preds = get_dummy_preds(wfcomp)

# share of games where the chalk pick matches the win label
wchalk_acc = accuracy_score(y_true=wfcomp['win'], y_pred=wchalk_preds)

print(f"Accuracy of dummy predictions: {wchalk_acc*100:.2f}%")

Accuracy of dummy predictions: 77.65%


Dummy predictions beat the baseline by __1.14__ percentage points.

## Linear/Logistic Regression
This problem can be treated as a regression problem (label is __score differential__). It can also be treated as binary classification (label is __win/loss__).

We will try the original aggregated stats for both team A and B as features (mfcomp/wfcomp), and also the differences between the two teams' stats (mfcomp_diff/wfcomp_diff, created above).

In [253]:
# look at num feats for the 2 datasets
# 6 columns are not features: run_model drops A_TeamID, B_TeamID, A_FullSeed and B_FullSeed
# before fitting, and score_diff_adj / win are the labels
print(f'Features of both team A and B: {mfcomp.shape[1] - 6}')
print(f'Features of the DIFFERENCE between team A and B: {mfcomp_diff.shape[1] - 6}')

Features of both team A and B: 70
Features of the DIFFERENCE between team A and B: 44


In [None]:
# fixed column order so our models are trained on the same feature order:
# alphabetical, with the location dummies and the two labels moved to the end
trailing_cols = ['A_Loc_A', 'A_Loc_H', 'score_diff_adj', 'win']
sorted_cols = sorted(mfcomp.columns)
for trailing_col in trailing_cols:
    sorted_cols.remove(trailing_col)
sorted_cols += trailing_cols

In [278]:
# preview the men's frame with its columns in alphabetical order
mfcomp.loc[:, sorted(mfcomp.columns)].head()

Unnamed: 0,A_1_pos_game_ratio,A_1_pos_loss_missing,A_1_pos_win_missing,A_1_pos_win_ratio,A_FullSeed,A_Loc_A,A_Loc_H,A_PlayIn,A_Seed,A_TeamID,A_away_win_ratio,A_home_win_ratio,A_max_loss_diff,A_max_win_diff,A_mean_diff,A_mean_papg,A_mean_ppg,A_neutral_win_ratio,A_num_games,A_ot_loss_missing,A_ot_ratio,A_ot_win_missing,A_ot_win_ratio,A_recent_mean_pts_against,A_recent_mean_pts_for,A_recent_mean_score_diff,A_recent_std_pts_against,A_recent_std_pts_for,A_recent_std_score_diff,A_recent_win_ratio,A_seed_win_prob,A_std_diff,A_std_papg,A_std_ppg,A_win_ratio,B_1_pos_game_ratio,B_1_pos_loss_missing,B_1_pos_win_missing,B_1_pos_win_ratio,B_FullSeed,B_PlayIn,B_Seed,B_TeamID,B_away_win_ratio,B_home_win_ratio,B_max_loss_diff,B_max_win_diff,B_mean_diff,B_mean_papg,B_mean_ppg,B_neutral_win_ratio,B_num_games,B_ot_loss_missing,B_ot_ratio,B_ot_win_missing,B_ot_win_ratio,B_recent_mean_pts_against,B_recent_mean_pts_for,B_recent_mean_score_diff,B_recent_std_pts_against,B_recent_std_pts_for,B_recent_std_score_diff,B_recent_win_ratio,B_seed_win_prob,B_std_diff,B_std_papg,B_std_ppg,B_win_ratio,Season,round_2,round_3,round_4,round_5,round_6,score_diff_adj,win
0,0.363636,0,0,0.25,X09,0,0,0,9,1116,0.333333,0.909091,20,35,3.636364,61.69697,65.333333,0.7,33,1,0.0,1,0.0,61.5,74.166667,12.666667,8.624899,15.382169,8.286535,0.666667,0.45202,9.620253,9.055902,11.332454,0.636364,0.233333,0,0,0.142857,X08,0,8,1234,0.333333,0.833333,20,49,10.466667,59.266667,69.733333,0.666667,30,1,0.0,1,0.0,63.833333,62.5,-1.333333,15.625833,7.207249,9.061518,0.333333,0.54798,13.416676,11.243712,12.375335,0.666667,0,0,0,0,0,0,9.0,1
1,0.4,0,0,0.2,Z06,0,0,0,6,1345,0.6,0.714286,43,31,3.8,65.32,69.12,1.0,25,1,0.0,1,0.0,68.833333,66.333333,-2.5,4.778424,12.270108,11.172884,0.666667,0.659091,13.685843,8.837016,10.537755,0.68,0.482759,0,0,0.428571,Z11,0,11,1120,0.454545,0.666667,19,42,3.689655,66.655172,70.344828,0.833333,29,0,0.068966,0,0.5,57.0,63.166667,6.166667,6.689544,14.250731,9.27811,0.833333,0.340909,12.768266,11.519276,13.52334,0.62069,0,0,0,0,0,0,-1.0,0
2,0.724138,0,0,0.238095,W16,0,0,0,16,1250,0.363636,0.333333,31,18,-4.448276,70.206897,65.758621,0.5,29,1,0.034483,0,1.0,70.0,69.5,-0.5,7.071068,7.593857,1.5,0.833333,0.013158,16.284512,8.504503,8.131251,0.37931,0.074074,1,0,1.0,W01,0,1,1207,0.857143,0.923077,2,41,15.666667,60.074074,75.740741,1.0,27,1,0.0,1,0.0,64.333333,83.666667,19.333333,10.726913,10.366613,8.594572,1.0,0.986842,10.490513,10.523929,11.475417,0.925926,0,0,0,0,0,0,-25.0,0
3,0.37037,0,0,0.4,Y09,0,0,0,9,1229,0.6,0.785714,13,31,5.962963,65.62963,71.592593,1.0,27,1,0.0,1,0.0,64.833333,72.0,7.166667,6.940221,9.006171,8.36328,0.666667,0.45202,8.956982,10.519915,10.506689,0.740741,0.428571,0,0,0.5,Y08,0,8,1425,0.727273,0.642857,21,33,3.785714,64.607143,68.392857,0.666667,28,1,0.0,1,0.0,68.166667,68.833333,0.666667,13.083068,6.879922,11.56143,0.5,0.54798,9.777817,8.751871,9.272442,0.678571,0,0,0,0,0,0,3.0,1
4,0.333333,0,0,0.333333,Z14,0,0,0,14,1325,0.583333,1.0,17,23,4.555556,63.0,67.555556,0.6,27,1,0.0,1,0.0,58.166667,68.333333,10.166667,4.941322,7.455423,7.762087,0.833333,0.127907,7.092059,8.801914,9.638866,0.740741,0.3,0,0,0.333333,Z03,0,3,1242,0.6,1.0,19,27,5.633333,70.4,76.033333,0.5,30,1,0.0,1,0.0,72.166667,76.833333,4.666667,5.228129,7.46101,5.416026,0.833333,0.872093,8.081707,11.20687,10.420842,0.766667,0,0,0,0,0,0,-11.0,0


In [266]:
# cross-validation helpers and the run_model entry point
def _round_accuracies(round_flags, y_true, y_pred):
    """Accuracy for rounds 1-6 of one validation fold (NaN for a round with no games in the fold)."""
    round_flags = np.asarray(round_flags)
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # round 1 has no indicator column of its own: it is the rows where every round_2..round_6 flag is 0
    masks = [(round_flags == 0).all(axis=1)]
    masks.extend(round_flags[:, i] == 1 for i in range(round_flags.shape[1]))

    # all three inputs are aligned with the validation fold, so one boolean mask selects matching rows
    return [accuracy_score(y_true[mask], y_pred[mask]) if mask.any() else np.nan for mask in masks]


def _fold_log_loss(estimator, X, y_true, hard_preds):
    """Log loss from predicted probabilities; falls back to hard predictions without predict_proba."""
    if hasattr(estimator, 'predict_proba'):
        return log_loss(y_true, estimator.predict_proba(X), labels=estimator.classes_)
    return log_loss(y_true, hard_preds)


def run_model(estimator, features, regression, models_df, tournament, folds=5):
    """
    Cross-validate a model on data and save the averaged results to models_df.

    For every fold the features are min-max scaled (scaler fitted on the training
    part only), the estimator is refitted, and train/validation metrics are computed.
    The metrics are averaged over the folds and appended to `models_df` as one row.

    Parameters
    ----------
    estimator : sklearn estimator
        Estimator to use for modeling. It is refitted in place on every fold.
    features : pd.DataFrame
        Data to model. Must contain 'A_TeamID', 'B_TeamID', 'A_FullSeed', 'B_FullSeed'
        (dropped before fitting), the labels 'score_diff_adj' and 'win', and the
        indicator columns 'round_2' ... 'round_6'.
    regression : bool
        Whether to model score differential (regression) or win/loss (classification).
    models_df : pd.DataFrame
        DataFrame to save results to (modified in place). The appended row is
        [tournament, label, estimator, n_features, feature names, folds], followed by
        train/val R2, train/val RMSE (regression) or train/val log loss (classification),
        then train/val accuracy and the validation accuracy of rounds 1-6.
    tournament : str
        Tournament (gender) identifier stored in the results row, e.g. 'M'.
    folds : int
        Number of cross-validation folds to use.

    Returns
    -------
    models_df.tail() : pd.DataFrame
        Last 5 rows of dataframe with results from model.

    Notes
    -----
    - Per-round accuracy is computed on the validation fold only; a round with no
      games in a fold is skipped for that fold when averaging.
    - Log loss is computed from `predict_proba` when the estimator provides it.
    """

    round_cols = ['round_2', 'round_3', 'round_4', 'round_5', 'round_6']

    # identifier columns are not features
    data = features.drop(columns=['A_TeamID', 'B_TeamID', 'A_FullSeed', 'B_FullSeed'])

    # define X and y
    X = data.drop(columns=['score_diff_adj', 'win'])
    y = data['score_diff_adj'] if regression else data['win']

    # define k-fold cross-validation
    kf = KFold(n_splits=folds, shuffle=True, random_state=SEED)

    # one list of metrics per fold, in the order they are written to models_df
    fold_metrics = []

    for train_index, test_index in kf.split(X):
        # split data (the unscaled validation frame is kept for the round flags)
        X_train_raw, X_test_raw = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # scale data
        scaler = MinMaxScaler()
        X_train = scaler.fit_transform(X_train_raw)
        X_test = scaler.transform(X_test_raw)

        # fit model
        estimator.fit(X_train, y_train)

        # predict
        train_preds = estimator.predict(X_train)
        test_preds = estimator.predict(X_test)

        # fit-quality metrics
        if regression:
            fit_scores = [
                r2_score(y_train, train_preds),
                r2_score(y_test, test_preds),
                np.sqrt(mean_squared_error(y_train, train_preds)),
                np.sqrt(mean_squared_error(y_test, test_preds)),
            ]
        else:
            fit_scores = [
                _fold_log_loss(estimator, X_train, y_train, train_preds),
                _fold_log_loss(estimator, X_test, y_test, test_preds),
            ]

        # np.sign turns score differentials into a win/loss sign and leaves 0/1 labels unchanged
        true_train, pred_train = np.sign(y_train), np.sign(train_preds)
        true_test, pred_test = np.sign(y_test), np.sign(test_preds)

        acc_scores = [
            accuracy_score(true_train, pred_train),
            accuracy_score(true_test, pred_test),
        ]

        # accuracy of each round within the validation fold
        round_accs = _round_accuracies(X_test_raw[round_cols], true_test, pred_test)

        fold_metrics.append(fit_scores + acc_scores + round_accs)

    # average the metrics across folds (NaN = round absent from that fold)
    mean_metrics = np.nanmean(np.array(fold_metrics, dtype=float), axis=0).tolist()

    # save results to models_df
    label = 'adj_score_diff' if regression else 'A_Win'
    models_df.loc[len(models_df.index)] = (
        [tournament, label, estimator, X.shape[1], X.columns.to_list(), folds] + mean_metrics
    )

    return models_df.tail()

### Regression (Predicting <ins>Score Differential</ins>)

#### Men's

In [267]:
# results table for the regression experiments (starts empty)
reg_df = pd.DataFrame(
    columns=[
        'Tournament', 'Label', 'Model', 'Num_Features', 'Features', 'Num_CV_Folds',
        'Train_R2', 'Val_R2', 'Train_RMSE', 'Val_RMSE', 'Train_Acc', 'Val_Acc',
        'Val_r1_acc', 'Val_r2_acc', 'Val_r3_acc', 'Val_r4_acc', 'Val_r5_acc', 'Val_r6_acc',
    ]
)

# to resume from previously saved results instead, uncomment:
# reg_df = pd.read_csv('models/regression_models.csv')

In [268]:
# untuned baseline regressors (library defaults, parallelised where supported)
models = [
    LinearRegression(n_jobs=-1),
    RandomForestRegressor(n_jobs=-1),
    XGBRegressor(n_jobs=-1),
    SVR(),
    KNeighborsRegressor(n_jobs=-1),
]

# the two men's feature sets to compare
datasets = [mfcomp, mfcomp_diff]

# evaluate every (model, dataset) pair; reg_df is handed to run_model as models_df
# NOTE(review): the classification cells call run_model with data=... while this
# cell uses features=... -- confirm which keyword run_model's signature accepts.
for model in models:
    for dataset in datasets:
        run_model(estimator=model, features=dataset, regression=True, models_df=reg_df, tournament='M')

In [269]:
# inspect the best men's regression runs, lowest validation RMSE first.
# reg_df's metric columns are named Val_* (there is no 'Test_RMSE' column),
# so the sort key must be 'Val_RMSE'.
reg_df[reg_df['Tournament'] == 'M'].sort_values(by='Val_RMSE').head()

Unnamed: 0,Tournament,Label,Model,Num_Features,Features,Num_CV_Folds,Train_R2,Test_R2,Train_RMSE,Test_RMSE,Train_Acc,Test_Acc
1,M,adj_score_diff,LinearRegression(n_jobs=-1),44,"[Season, A_1_pos_loss_missing, A_1_pos_win_mis...",5,0.422245,0.237746,11.119313,11.707705,0.720749,0.695368
0,M,adj_score_diff,LinearRegression(n_jobs=-1),70,"[Season, A_Seed, B_Seed, A_PlayIn, B_PlayIn, A...",5,0.431099,0.232183,11.033611,11.754277,0.725972,0.702473
3,M,adj_score_diff,"(DecisionTreeRegressor(max_features=1.0, rando...",44,"[Season, A_1_pos_loss_missing, A_1_pos_win_mis...",5,0.91336,0.219695,4.306035,11.842166,0.932615,0.676981
2,M,adj_score_diff,"(DecisionTreeRegressor(max_features=1.0, rando...",70,"[Season, A_Seed, B_Seed, A_PlayIn, B_PlayIn, A...",5,0.913924,0.212484,4.291858,11.884496,0.93293,0.670301
6,M,adj_score_diff,SVR(),70,"[Season, A_Seed, B_Seed, A_PlayIn, B_PlayIn, A...",5,0.38646,0.160203,11.458109,12.239306,0.72545,0.675297


With untuned regression models, best performance on men's test data is __71.16% acc__ using a Linear Regression. This is __0.12%__ better than the dummy preds.

#### Women's

In [270]:
# untuned baseline regressors (library defaults, parallelised where supported)
models = [
    LinearRegression(n_jobs=-1),
    RandomForestRegressor(n_jobs=-1),
    XGBRegressor(n_jobs=-1),
    SVR(),
    KNeighborsRegressor(n_jobs=-1),
]

# the two women's feature sets to compare
datasets = [wfcomp, wfcomp_diff]

# evaluate every (model, dataset) pair; reg_df is handed to run_model as models_df
# NOTE(review): the classification cells call run_model with data=... while this
# cell uses features=... -- confirm which keyword run_model's signature accepts.
for model in models:
    for dataset in datasets:
        run_model(estimator=model, features=dataset, regression=True, models_df=reg_df, tournament='W')

In [271]:
# inspect the best women's regression runs, lowest validation RMSE first.
# reg_df's metric columns are named Val_* (there is no 'Test_RMSE' column),
# so the sort key must be 'Val_RMSE'.
reg_df[reg_df['Tournament'] == 'W'].sort_values(by='Val_RMSE').head()

Unnamed: 0,Tournament,Label,Model,Num_Features,Features,Num_CV_Folds,Train_R2,Test_R2,Train_RMSE,Test_RMSE,Train_Acc,Test_Acc
11,W,adj_score_diff,LinearRegression(n_jobs=-1),44,"[Season, A_1_pos_loss_missing, A_1_pos_win_mis...",5,0.665216,0.641316,12.061677,12.45376,0.795556,0.787302
10,W,adj_score_diff,LinearRegression(n_jobs=-1),70,"[Season, A_Seed, B_Seed, A_PlayIn, B_PlayIn, A...",5,0.671902,0.635712,11.940627,12.550715,0.798095,0.786667
13,W,adj_score_diff,"(DecisionTreeRegressor(max_features=1.0, rando...",44,"[Season, A_1_pos_loss_missing, A_1_pos_win_mis...",5,0.945833,0.614321,4.851408,12.90353,0.944286,0.772063
12,W,adj_score_diff,"(DecisionTreeRegressor(max_features=1.0, rando...",70,"[Season, A_Seed, B_Seed, A_PlayIn, B_PlayIn, A...",5,0.946841,0.608586,4.806392,13.006047,0.945714,0.775238
14,W,adj_score_diff,"XGBRegressor(base_score=0.5, booster='gbtree',...",70,"[Season, A_Seed, B_Seed, A_PlayIn, B_PlayIn, A...",5,0.999718,0.553044,0.345817,13.899102,1.0,0.765079


With untuned regression models, best performance on women's test data is __78.98% acc__ using a Linear Regression. This is __1.33%__ better than the dummy preds.

In [70]:
# save reg_df: persist the regression results table to CSV (index=False, so the
# row index is not written to the file)
reg_df.to_csv('models/regression_models.csv', index=False)

### Classification (Predicting <ins>Win/Loss</ins>)

In [27]:
# results table for the classification experiments (starts empty)
class_df = pd.DataFrame(
    columns=[
        'Tournament', 'Label', 'Model', 'Num_Features', 'Features', 'Num_CV_Folds',
        'Train_LogLoss', 'Val_LogLoss', 'Train_Acc', 'Val_Acc',
        'Val_r1_acc', 'Val_r2_acc', 'Val_r3_acc', 'Val_r4_acc', 'Val_r5_acc', 'Val_r6_acc',
    ]
)

# to resume from previously saved results instead, uncomment:
# class_df = pd.read_csv('models/classification_models.csv')

#### Men's

In [65]:
# untuned baseline classifiers (only the logistic regression is seeded)
models = [
    LogisticRegression(n_jobs=-1, random_state=SEED),
    RandomForestClassifier(n_jobs=-1),
    XGBClassifier(n_jobs=-1),
    SVC(),
    KNeighborsClassifier(n_jobs=-1),
]

# the two men's feature sets to compare
datasets = [mfcomp, mfcomp_diff]

# evaluate every (model, dataset) pair; run_model appends each result row to class_df
for model in models:
    for dataset in datasets:
        run_model(estimator=model, data=dataset, regression=False, models_df=class_df, tournament='M')

In [66]:
# inspect the best men's classification runs, lowest validation log loss first.
# class_df's metric columns are named Val_* (there is no 'Test_LogLoss' column),
# so the sort key must be 'Val_LogLoss'.
class_df[class_df['Tournament'] == 'M'].sort_values(by='Val_LogLoss').head()

Unnamed: 0,Tournament,Label,Model,Num_Features,Features,Num_CV_Folds,Train_LogLoss,Test_LogLoss,Train_Acc,Test_Acc
0,M,A_Win,"LogisticRegression(n_jobs=-1, random_state=0)",62,"['A_1_pos_game_ratio', 'A_1_pos_loss_missing',...",5,10.0587,10.617469,0.72093,0.705427
1,M,A_Win,"LogisticRegression(n_jobs=-1, random_state=0)",37,"['A_1_pos_loss_missing', 'A_1_pos_win_missing'...",5,10.2719,10.764347,0.715015,0.701353
7,M,A_Win,SVC(),37,"['A_1_pos_loss_missing', 'A_1_pos_win_missing'...",5,9.371213,10.926235,0.740004,0.696861
6,M,A_Win,SVC(),62,"['A_1_pos_game_ratio', 'A_1_pos_loss_missing',...",5,8.816072,10.955808,0.755406,0.696041
3,M,A_Win,RandomForestClassifier(n_jobs=-1),37,"['A_1_pos_loss_missing', 'A_1_pos_win_missing'...",5,2.220446e-16,11.073322,1.0,0.69278


With untuned classification models, best performance on men's test data is __70.54% acc__ using a Logistic Regression. This is __0.5%__ WORSE than the dummy preds.

#### Women's

In [67]:
# untuned baseline classifiers (only the logistic regression is seeded)
models = [
    LogisticRegression(n_jobs=-1, random_state=SEED),
    RandomForestClassifier(n_jobs=-1),
    XGBClassifier(n_jobs=-1),
    SVC(),
    KNeighborsClassifier(n_jobs=-1),
]

# the two women's feature sets to compare
datasets = [wfcomp, wfcomp_diff]

# evaluate every (model, dataset) pair; run_model appends each result row to class_df
for model in models:
    for dataset in datasets:
        run_model(estimator=model, data=dataset, regression=False, models_df=class_df, tournament='W')

In [68]:
# inspect the best women's classification runs, lowest validation log loss first.
# class_df's metric columns are named Val_* (there is no 'Test_LogLoss' column),
# so the sort key must be 'Val_LogLoss'.
class_df[class_df['Tournament'] == 'W'].sort_values(by='Val_LogLoss').head()

Unnamed: 0,Tournament,Label,Model,Num_Features,Features,Num_CV_Folds,Train_LogLoss,Test_LogLoss,Train_Acc,Test_Acc
51,W,A_Win,"LogisticRegression(n_jobs=-1, random_state=0)",43,"[Season, A_1_pos_loss_missing, A_1_pos_win_mis...",5,7.054258,7.483349,0.804286,0.792381
31,W,A_Win,"LogisticRegression(n_jobs=-1, random_state=0)",43,"[Season, A_1_pos_loss_missing, A_1_pos_win_mis...",5,7.054258,7.483349,0.804286,0.792381
11,W,A_Win,"LogisticRegression(n_jobs=-1, random_state=0)",37,"['A_1_pos_loss_missing', 'A_1_pos_win_missing'...",5,7.496748,7.718376,0.792009,0.78586
36,W,A_Win,SVC(),68,"[Season, A_PlayIn, B_PlayIn, A_max_win_diff, A...",5,6.241845,7.757967,0.826825,0.784762
56,W,A_Win,SVC(),70,"[Season, A_Seed, B_Seed, A_PlayIn, B_PlayIn, A...",5,6.224682,7.826622,0.827302,0.782857


With untuned classification models, best performance on women's test data is __79.24% acc__ using a Logistic Regression. This is __1.59%__ better than the dummy preds.

In [69]:
# save class_df: persist the classification results table to CSV (index=False,
# so the row index is not written to the file)
class_df.to_csv('models/classification_models.csv', index=False)

## Simulate N Brackets

In [236]:
# root dirs
root = 'data/'
mroot = 'data/mens/'
wroot = 'data/womens/'

# load data, get slots for 2024, drop play-ins (only slots containing 'R' are kept)
slots = pd.read_csv(mroot + 'MNCAATourneySlots.csv')
slots = slots[slots['Season'] == 2024]
slots = slots[slots['Slot'].str.contains('R')].reset_index(drop=True)

# load seed data
seeds_2024 = pd.read_csv(root + '2024_tourney_seeds.csv')

# load in 2024 data
df_2024 = pd.read_csv(root + 'processed/2024_features.csv').drop(columns=['Region', 'Season'])

# drop teams not in 2024 tourney; .copy() makes the filtered frame independent so
# the FullSeed assignment below does not trigger pandas' SettingWithCopyWarning
df_2024 = df_2024[df_2024['TeamID'].isin(seeds_2024['TeamID'])].copy()

# drop play in char: keep only the first three characters of the seed
df_2024['FullSeed'] = df_2024['FullSeed'].str[:3]

# split into mens (TeamID < 3000) and womens (TeamID >= 3000)
mdf_2024 = df_2024[df_2024['TeamID'] < 3000].reset_index(drop=True)
wdf_2024 = df_2024[df_2024['TeamID'] >= 3000].reset_index(drop=True)

In [238]:
def generate_bracket(data, estimator, slots_df=slots, scaler=None):
    """
    Generate a single bracket for the 2024 NCAA tournament.

    The six rounds are played in order. For each round the matchups are read
    from the slots table, one feature row per game is built (team A = the
    slot's StrongSeed, team B = its WeakSeed), the estimator predicts every
    game of the round, and the winners are written into the next round's slots.

    Parameters
    ----------
    data : pd.DataFrame
        Regular season data for the 2024 teams competing in the tournament.
        Needs 'TeamID' and 'FullSeed' columns; every other column is used as a
        feature (prefixed with 'A_' / 'B_').
    estimator : sklearn estimator
        Pre-trained estimator to use for modeling. A prediction > 0 is read as
        a win for team A, anything else as a win for team B.
    slots_df : pd.DataFrame
        Slots for the 2024 tournament, with 'Slot', 'StrongSeed' and 'WeakSeed'
        columns. A copy is edited internally; the argument is left untouched.
    scaler : fitted transformer, optional
        Scaler that was fit on the training features. When given, it is applied
        with ``scaler.transform`` so that tournament rows are scaled exactly
        like the rows the estimator was trained on. When omitted, a fresh
        ``MinMaxScaler`` is fit on each round's rows (the previous behaviour);
        note that this rescales every round independently, and the one-row
        final round collapses to all zeros.

    Returns
    -------
    result_df : pd.DataFrame
        One row per game with columns 'Slot', 'Team' (FullSeed of the predicted
        winner) and 'Bracket' (always 1).

    """

    # work on copies to avoid modifying the caller's frames
    features = data.copy()
    slots = slots_df.copy()

    # (slot, winning seed) per game, in playing order; the dict is a lookup of the
    # same information used to seed later rounds
    records = []
    winner_by_slot = {}

    # 6 rounds in a single bracket
    for i in range(1, 7):
        # get slots for round
        slots_round = slots[slots['Slot'].str.contains(f'R{i}')].reset_index(drop=True)

        # one combined A_/B_ feature row per matchup
        round_matchups = []
        for _, row in slots_round.iterrows():
            A = features[features['FullSeed'] == row['StrongSeed']].reset_index(drop=True)
            B = features[features['FullSeed'] == row['WeakSeed']].reset_index(drop=True)
            round_matchups.append(pd.concat([A.add_prefix('A_'), B.add_prefix('B_')], axis=1))

        # concatenate all matchup rows into a single DataFrame
        round_df = pd.concat(round_matchups, axis=0).reset_index(drop=True)

        # add some cols that were in training data.
        # Season mirrors the training encoding: years since 1985 for men's teams
        # (TeamID < 3000) and years since 1998 for women's teams.
        round_df['Season'] = 2024 - np.where(round_df['A_TeamID'] < 3000, 1985, 1998)
        round_df['A_Loc_A'] = 0
        round_df['A_Loc_H'] = 0
        round_df[['round_2', 'round_3', 'round_4', 'round_5', 'round_6']] = 0
        if i > 1:
            # round 1 is the all-zeros baseline of the round indicators
            round_df[f'round_{i}'] = 1

        # define X and reorder cols (alphabetical)
        X = round_df.drop(columns=['A_TeamID', 'A_FullSeed', 'B_TeamID', 'B_FullSeed'])
        X = X[sorted(X.columns)]

        # scale data: prefer the scaler fit on the training set, otherwise fall
        # back to fitting on this round's rows
        if scaler is None:
            X = MinMaxScaler().fit_transform(X)
        else:
            X = scaler.transform(X)

        # predict the outcomes of the round
        preds = estimator.predict(X)

        # replace preds with full seed of winning team
        winners = np.where(preds > 0, round_df['A_FullSeed'], round_df['B_FullSeed'])

        for slot, winner_seed in zip(slots_round['Slot'], winners):
            records.append((slot, winner_seed))
            winner_by_slot.setdefault(slot, winner_seed)

        # edit slots df for next round: its StrongSeed/WeakSeed entries name the
        # slots just played, so swap them for the seeds that won those slots
        if i != 6:
            next_round_slots = slots[slots['Slot'].str.contains(f'R{i+1}')]

            for _, row in next_round_slots.iterrows():
                team1 = winner_by_slot[row['StrongSeed']]
                team2 = winner_by_slot[row['WeakSeed']]

                slots.loc[slots['Slot'] == row['Slot'], 'StrongSeed'] = team1
                slots.loc[slots['Slot'] == row['Slot'], 'WeakSeed'] = team2

        # drop teams that have not won any game so far
        advanced = {seed for _, seed in records}
        features = features[features['FullSeed'].isin(advanced)].reset_index(drop=True)

    # assemble the results once instead of growing a DataFrame row by row
    result_df = pd.DataFrame(records, columns=["Slot", "Team"])

    # add bracket id col (a single bracket is generated, so it is always 1)
    result_df['Bracket'] = 1

    return result_df


In [None]:
# best men's model (logistic regression), refit on the entire men's dataset
mmodel = LogisticRegression(n_jobs=-1, random_state=SEED)

# features: everything except the columns dropped here, in alphabetical order
X = mfcomp.drop(
    columns=[
        'A_TeamID', 'A_FullSeed', 'B_TeamID', 'B_FullSeed',
        'A_seed_win_prob', 'B_seed_win_prob', 'score_diff_adj', 'win',
    ]
)
X = X.loc[:, sorted(X.columns)]

# target
y = mfcomp['win']

# min-max scale the features
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

# fit model
mmodel.fit(X, y)

In [None]:
# train best women's model on entire dataset
wmodel = LogisticRegression(n_jobs=-1, random_state=SEED)

# define X and y
X = wfcomp.drop(columns=['A_TeamID', 'A_FullSeed', 'B_TeamID', 'B_FullSeed', 'A_seed_win_prob', 'B_seed_win_prob', 'score_diff_adj', 'win'])
X = X[sorted(X.columns)]
y = wfcomp['win']

# scale data
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

# fit the women's model. Fitting mmodel here would overwrite the men's model with
# women's data and leave wmodel unfitted.
wmodel.fit(X, y)

In [None]:
# num brackets
# NOTE(review): n_brackets is not used in this cell -- only one bracket per
# tournament is generated below.
n_brackets = 100000

# men's: generate_bracket returns Slot/Team/Bracket, so tag the tournament here
result_m = generate_bracket(data=mdf_2024, estimator=mmodel).assign(Tournament='M')

# women's
result_w = generate_bracket(data=wdf_2024, estimator=wmodel).assign(Tournament='W')

# combine results; the fresh 0..n-1 index is named RowId
submission = pd.concat([result_m, result_w])
submission = submission.reset_index(drop=True)
submission.index.names = ['RowId']

# reorder (selecting 'Tournament' requires the column added above)
submission = submission[['Tournament', 'Bracket', 'Slot', 'Team']]

In [None]:
# save: the index is written as well (to_csv default), which provides the
# RowId column named in the previous cell
submission.to_csv('submission.csv')
# preview the first rows as the cell output
submission.head()