# Predictive Modeling (Compact)
In this notebook, we:
- Try different models on the [compact features](./eda_compact.ipynb) and assess their performance.

In [1]:
from helper import *

# read the compact feature table from disk
fcomp = pd.read_csv(filepath_or_buffer='data/cleaned/features_compact.csv')

# preview a single randomly chosen row
fcomp.sample(n=1)

Unnamed: 0,Season,DayNum,WTeamID,WScore,WRegion,WSeed_num,WPlayIn,WLoc,LTeamID,LScore,LRegion,LSeed_num,LPlayIn,NumOT,num_games_x,win_pct_x,mean_pts_x,std_pts_x,mean_pts_against_x,std_pts_against_x,mean_score_diff_x,std_score_diff_x,home_game_pct_x,away_game_pct_x,neutral_game_pct_x,home_win_pct_x,away_win_pct_x,neutral_win_pct_x,close_games_pct_x,close_games_win_pct_x,num_games_y,win_pct_y,mean_pts_y,std_pts_y,mean_pts_against_y,std_pts_against_y,mean_score_diff_y,std_score_diff_y,home_game_pct_y,away_game_pct_y,neutral_game_pct_y,home_win_pct_y,away_win_pct_y,neutral_win_pct_y,close_games_pct_y,close_games_win_pct_y,LLoc,Wseed_diff,Lseed_diff,round,WTeamName,LTeamName,Wnum_championships,Lnum_championships
1822,2013,146,1257,85,Y,1,0,N,1181,63,Y,2,0,0,34,0.852941,73.558824,10.968802,57.970588,13.192021,15.588235,13.555984,0.470588,0.323529,0.205882,0.9375,0.727273,0.857143,0.176471,0.5,32,0.84375,78.28125,9.102408,65.40625,12.14392,12.875,14.322889,0.5,0.28125,0.21875,1.0,0.555556,0.857143,0.09375,0.666667,N,-1,1,4,Louisville,Duke,2,5


# Rearranging the Data
First, we will rename 'W' and 'L' to 'x' and 'y'.

In [2]:
# Turn the leading 'L' / 'W' marker of a column name into a trailing '_y' / '_x'
# suffix. The passes run in the same order as before: 'L' first, then 'W'.
for prefix, suffix in (('L', '_y'), ('W', '_x')):
    fcomp.columns = [
        name[1:] + suffix if name[0] == prefix else name
        for name in fcomp.columns
    ]

# put the columns in alphabetical order so each _x column sits next to its _y partner
fcomp = fcomp.reindex(columns=sorted(fcomp.columns))

# preview a single randomly chosen row
fcomp.sample(n=1)

Unnamed: 0,DayNum,Loc_x,Loc_y,NumOT,PlayIn_x,PlayIn_y,Region_x,Region_y,Score_x,Score_y,Season,Seed_num_x,Seed_num_y,TeamID_x,TeamID_y,TeamName_x,TeamName_y,away_game_pct_x,away_game_pct_y,away_win_pct_x,away_win_pct_y,close_games_pct_x,close_games_pct_y,close_games_win_pct_x,close_games_win_pct_y,home_game_pct_x,home_game_pct_y,home_win_pct_x,home_win_pct_y,mean_pts_against_x,mean_pts_against_y,mean_pts_x,mean_pts_y,mean_score_diff_x,mean_score_diff_y,neutral_game_pct_x,neutral_game_pct_y,neutral_win_pct_x,neutral_win_pct_y,num_championships_x,num_championships_y,num_games_x,num_games_y,round,seed_diff_x,seed_diff_y,std_pts_against_x,std_pts_against_y,std_pts_x,std_pts_y,std_score_diff_x,std_score_diff_y,win_pct_x,win_pct_y
3766,144,N,N,0,0,0,Z,Z,72,67,2018,6,2,3333,3124,Oregon St,Baylor,0.366667,0.34375,0.636364,0.909091,0.166667,0.0,0.8,0.0,0.533333,0.5,0.875,1.0,58.066667,55.90625,73.566667,86.59375,15.5,30.6875,0.1,0.15625,0.666667,1.0,0,3,30,32,3,4,-4,11.70303,11.069748,15.392975,10.527566,21.969807,16.969494,0.766667,0.96875


Currently, the winner is always on the left side of the row (team 'x'). To prevent models from learning this positional encoding, we will duplicate rows similar to how we duplicated the regular season data. This will also double our pool of training examples.

In [3]:
# Build a mirrored copy of fcomp in which every `<name>_x` column trades places
# with its `<name>_y` partner.
# Only the trailing suffix is rewritten (the previous `str.replace('_x', '_y')`
# would also have altered an '_x' appearing in the middle of a column name).
swap_names = {}
for col_x in [c for c in fcomp.columns if c.endswith('_x')]:
    col_y = col_x[:-2] + '_y'
    if col_y not in fcomp.columns:
        # an unpaired column cannot be mirrored; fail loudly instead of leaving it un-flipped
        raise KeyError(f"column {col_x!r} has no matching {col_y!r} to swap with")
    swap_names[col_x] = col_y
    swap_names[col_y] = col_x

# swapping the labels is equivalent to swapping the values once the columns are re-sorted
flipped = fcomp.rename(columns=swap_names)

# rearrange cols
flipped = flipped.reindex(sorted(flipped.columns), axis=1)

# check: the first row of each frame should show the x/y values mirrored
display(fcomp.head(1))
display(flipped.head(1))

Unnamed: 0,DayNum,Loc_x,Loc_y,NumOT,PlayIn_x,PlayIn_y,Region_x,Region_y,Score_x,Score_y,Season,Seed_num_x,Seed_num_y,TeamID_x,TeamID_y,TeamName_x,TeamName_y,away_game_pct_x,away_game_pct_y,away_win_pct_x,away_win_pct_y,close_games_pct_x,close_games_pct_y,close_games_win_pct_x,close_games_win_pct_y,home_game_pct_x,home_game_pct_y,home_win_pct_x,home_win_pct_y,mean_pts_against_x,mean_pts_against_y,mean_pts_x,mean_pts_y,mean_score_diff_x,mean_score_diff_y,neutral_game_pct_x,neutral_game_pct_y,neutral_win_pct_x,neutral_win_pct_y,num_championships_x,num_championships_y,num_games_x,num_games_y,round,seed_diff_x,seed_diff_y,std_pts_against_x,std_pts_against_y,std_pts_x,std_pts_y,std_score_diff_x,std_score_diff_y,win_pct_x,win_pct_y
0,136,N,N,0,0,0,X,X,63,54,1985,9,8,1116,1234,Arkansas,Iowa,0.363636,0.3,0.333333,0.333333,0.272727,0.166667,0.333333,0.2,0.333333,0.6,0.909091,0.833333,61.69697,59.266667,65.333333,69.733333,3.636364,10.466667,0.30303,0.1,0.7,0.666667,1,0,33,30,1,1,-1,9.815437,11.295203,12.151817,15.418156,11.661417,16.074682,0.636364,0.666667


Unnamed: 0,DayNum,Loc_x,Loc_y,NumOT,PlayIn_x,PlayIn_y,Region_x,Region_y,Score_x,Score_y,Season,Seed_num_x,Seed_num_y,TeamID_x,TeamID_y,TeamName_x,TeamName_y,away_game_pct_x,away_game_pct_y,away_win_pct_x,away_win_pct_y,close_games_pct_x,close_games_pct_y,close_games_win_pct_x,close_games_win_pct_y,home_game_pct_x,home_game_pct_y,home_win_pct_x,home_win_pct_y,mean_pts_against_x,mean_pts_against_y,mean_pts_x,mean_pts_y,mean_score_diff_x,mean_score_diff_y,neutral_game_pct_x,neutral_game_pct_y,neutral_win_pct_x,neutral_win_pct_y,num_championships_x,num_championships_y,num_games_x,num_games_y,round,seed_diff_x,seed_diff_y,std_pts_against_x,std_pts_against_y,std_pts_x,std_pts_y,std_score_diff_x,std_score_diff_y,win_pct_x,win_pct_y
0,136,N,N,0,0,0,X,X,54,63,1985,8,9,1234,1116,Iowa,Arkansas,0.3,0.363636,0.333333,0.333333,0.166667,0.272727,0.2,0.333333,0.6,0.333333,0.833333,0.909091,59.266667,61.69697,69.733333,65.333333,10.466667,3.636364,0.1,0.30303,0.666667,0.7,0,1,30,33,1,-1,1,11.295203,9.815437,15.418156,12.151817,16.074682,11.661417,0.666667,0.636364


Now that we have a flipped version of our features, we can combine the 2 dataframes.

In [4]:
# stack the original rows on top of their mirrored counterparts
# NOTE(review): the original index labels are kept, so each label appears twice
# afterwards; re-running this cell on its own appends `flipped` again.
fcomp = pd.concat((fcomp, flipped))

# check the resulting size and look at one random row
print(fcomp.shape)
display(fcomp.sample(n=1))

(8188, 54)


Unnamed: 0,DayNum,Loc_x,Loc_y,NumOT,PlayIn_x,PlayIn_y,Region_x,Region_y,Score_x,Score_y,Season,Seed_num_x,Seed_num_y,TeamID_x,TeamID_y,TeamName_x,TeamName_y,away_game_pct_x,away_game_pct_y,away_win_pct_x,away_win_pct_y,close_games_pct_x,close_games_pct_y,close_games_win_pct_x,close_games_win_pct_y,home_game_pct_x,home_game_pct_y,home_win_pct_x,home_win_pct_y,mean_pts_against_x,mean_pts_against_y,mean_pts_x,mean_pts_y,mean_score_diff_x,mean_score_diff_y,neutral_game_pct_x,neutral_game_pct_y,neutral_win_pct_x,neutral_win_pct_y,num_championships_x,num_championships_y,num_games_x,num_games_y,round,seed_diff_x,seed_diff_y,std_pts_against_x,std_pts_against_y,std_pts_x,std_pts_y,std_score_diff_x,std_score_diff_y,win_pct_x,win_pct_y
2391,152,N,N,0,0,0,W,X,71,72,2023,9,5,1194,1361,FL Atlantic,San Diego St,0.4375,0.3125,0.785714,0.8,0.15625,0.21875,1.0,0.857143,0.46875,0.46875,1.0,0.933333,65.28125,63.8125,78.15625,71.34375,12.875,7.53125,0.09375,0.21875,1.0,0.571429,0,0,32,32,5,4,-4,11.453227,10.999817,11.755875,9.488905,12.372576,10.758296,0.90625,0.8125


# Create Labels

In [5]:
# regression label: final margin from team x's point of view
fcomp['score_diff_x'] = fcomp['Score_x'].sub(fcomp['Score_y'])

# binary label: 1 when team x outscored team y, otherwise 0
fcomp['win_x'] = (fcomp['score_diff_x'] > 0).astype('int64')

# spot-check the label columns on one random row
label_check_cols = ['Score_x', 'Score_y', 'score_diff_x', 'win_x']
fcomp[label_check_cols].sample(n=1)

Unnamed: 0,Score_x,Score_y,score_diff_x,win_x
2347,63,87,-24,0


In [6]:
# overtime-adjusted margin: the raw margin is halved once per overtime period
overtime_scale = 2 ** fcomp['NumOT']
fcomp['score_diff_adj_x'] = fcomp['score_diff_x'].div(overtime_scale)

# check: overtime games only, most overtime periods first
overtime_cols = ['score_diff_x', 'NumOT', 'score_diff_adj_x', 'win_x']
fcomp.loc[fcomp['NumOT'] > 0, overtime_cols].sort_values('NumOT', ascending=False).head()

Unnamed: 0,score_diff_x,NumOT,score_diff_adj_x,win_x
654,8,3,1.0,1
654,-8,3,-1.0,0
1298,-6,2,-1.5,0
1676,-7,2,-1.75,0
1199,-7,2,-1.75,0


- Adjusted score differential penalizes teams less for losing in overtimes (especially multiple).
- __score_diff_adj__ and __win__ can both be used as labels. They are both calculated with respect to team x.

# Chalk Bracket
Here, we will simply predict the better seed to win each game. If seeds are equal (in rounds 5 and 6), we will predict the team with the better regular season winning percentage.

In [93]:
# keep only the rows in which team x is the winner, then split them by gender
winning_rows = fcomp.loc[fcomp['win_x'] == 1]
fcomp_mens, fcomp_womens = split_genders(winning_rows, id_col='TeamID_x')

# baseline ("chalk") predictions and their accuracy, men first
mchalk_preds = get_dummy_preds(fcomp_mens)
mchalk_acc = accuracy_score(fcomp_mens['win_x'], mchalk_preds)

# same baseline for the women's games
wchalk_preds = get_dummy_preds(fcomp_womens)
wchalk_acc = accuracy_score(fcomp_womens['win_x'], wchalk_preds)

# report both accuracies as percentages
print(f"Accuracy of dummy predictions in men's tournaments (39 brackets): {mchalk_acc*100:.2f}%.")
print(f"Accuracy of dummy predictions in women's tournaments (26 brackets): {wchalk_acc*100:.2f}%")

Accuracy of dummy predictions in men's tournaments (39 brackets): 71.01%.
Accuracy of dummy predictions in women's tournaments (26 brackets): 77.96%


- We can see that the better seed wins about 7 percentage points more often in women's tournaments (77.96% vs. 71.01%). This coincides with the [upset analysis](./eda_compact.ipynb).

# Finalize Columns
We will drop the categorical and other non-feature columns (the location and region columns are dropped rather than one-hot encoded). We will also create separate feature subsets that only contain general scoring and win data.

In [7]:
# work on an alphabetically ordered copy of the full feature table
X = fcomp.reindex(columns=sorted(fcomp.columns))

# columns that are not model inputs (identifiers, raw scores, categorical location/region, ...)
dropped_cols = [
    'DayNum', 'NumOT', 'Region_x', 'Region_y', 'Score_x', 'Score_y', 'Season',
    'TeamName_x', 'TeamName_y', 'num_championships_x', 'num_championships_y',
    'round', 'seed_diff_x', 'seed_diff_y', 'score_diff_x', 'Loc_x', 'Loc_y',
]

# split by gender while the team IDs are still present
X_mens, X_womens = split_genders(X, id_col='TeamID_x')

# remove the non-feature columns plus the team IDs ('Season' is already in dropped_cols)
non_feature_cols = dropped_cols + ['TeamID_x', 'TeamID_y']
X_mens = X_mens.drop(columns=non_feature_cols)
X_womens = X_womens.drop(columns=non_feature_cols)

# smaller subsets: additionally remove the home/away/neutral game and win percentages
extra_dropped_cols = [
    f'{venue}_{stat}_pct_{side}'
    for venue in ('away', 'home', 'neutral')
    for stat in ('game', 'win')
    for side in ('x', 'y')
]
X_mens_small = X_mens.drop(columns=extra_dropped_cols)
X_womens_small = X_womens.drop(columns=extra_dropped_cols)

# (rows, columns) of each of the four tables
X_mens.shape, X_mens_small.shape, X_womens.shape, X_womens_small.shape

((4912, 38), (4912, 26), (3276, 38), (3276, 26))

- We have 4912 rows of training data for men, and 3276 rows of training data for women.
- 2 of these columns are labels (`score_diff_adj_x` and `win_x`). Thus, we have a large 36 feature subset, as well as a simpler 24 feature subset.
- Each gender will be trained separately, as we have already found multiple statistically-significant differences in the data.

# Test Models
We define 5 regression models, 4 classification models, and 3 scalers. Only the classification models are run below (against the `win_x` label), each combined with every scaler and each of the 2 different feature subsets.

In [101]:
# fixed seed so the stochastic estimators (random forests, gradient boosting)
# give repeatable results from one run of the notebook to the next
RANDOM_STATE = 42

# candidate estimators and scalers
models_reg = [
    LinearRegression(n_jobs=-1),
    RandomForestRegressor(n_jobs=-1, random_state=RANDOM_STATE),
    KNeighborsRegressor(n_jobs=-1),
    SVR(),
    XGBRegressor(n_jobs=-1, random_state=RANDOM_STATE),
]
models_class = [
    LogisticRegression(n_jobs=-1),
    RandomForestClassifier(n_jobs=-1, random_state=RANDOM_STATE),
    SVC(),
    XGBClassifier(n_jobs=-1, random_state=RANDOM_STATE),
]
scalers = [StandardScaler(), MinMaxScaler(), RobustScaler()]

# empty, typed frame that cross_val_model receives as `models_df`;
# the schema is declared once so column names and dtypes cannot drift apart
models_df_schema = {
    'Gender': 'object', 'Target': 'object', 'Model': 'object', 'Model_Params': 'object',
    'Scaler': 'object', 'Num_Features': 'int',
    'Train_R2': 'float', 'Val_R2': 'float', 'Train_RMSE': 'float', 'Val_RMSE': 'float',
    'Train_LogLoss': 'float', 'Val_LogLoss': 'float', 'Train_Acc': 'float', 'Val_Acc': 'float',
}
models_df = pd.DataFrame(columns=list(models_df_schema)).astype(models_df_schema)
models_df

Unnamed: 0,Gender,Target,Model,Model_Params,Scaler,Num_Features,Train_R2,Val_R2,Train_RMSE,Val_RMSE,Train_LogLoss,Val_LogLoss,Train_Acc,Val_Acc


## Men's
We're trying to beat the chalk accuracy of __71.01%__.

In [102]:
# evaluate every classifier with each scaler / feature-table pairing (men's data)
mens_runs = [(scaler, features) for scaler in scalers for features in (X_mens, X_mens_small)]
for model in tqdm(models_class, desc="Running Models"):
    for scaler, features in mens_runs:
        cross_val_model(estimator=model, df=features, target_col='win_x', gender='M', scaler=scaler, models_df=models_df)

Running Models: 100%|██████████| 4/4 [02:59<00:00, 44.98s/it]


In [105]:
# five men's runs with the lowest validation log loss
# NOTE(review): in the recorded results Val_LogLoss is about 36.04 * (1 - Val_Acc),
# which is what log loss gives for hard 0/1 predictions -- confirm whether
# cross_val_model scores class labels rather than probabilities.
models_df[models_df['Gender'] == 'M'].sort_values('Val_LogLoss').head()

Unnamed: 0,Gender,Target,Model,Model_Params,Scaler,Num_Features,Train_R2,Val_R2,Train_RMSE,Val_RMSE,Train_LogLoss,Val_LogLoss,Train_Acc,Val_Acc
5,M,win_x,LogisticRegression,"{'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'auto', 'n_jobs': -1, 'penalty': 'l2', 'random_state': None, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}",RobustScaler,24,0.0,0.0,0.0,0.0,10.212691,10.258338,0.716658,0.715391
1,M,win_x,LogisticRegression,"{'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'auto', 'n_jobs': -1, 'penalty': 'l2', 'random_state': None, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}",StandardScaler,24,0.0,0.0,0.0,0.0,10.22166,10.27299,0.716409,0.714985
3,M,win_x,LogisticRegression,"{'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'auto', 'n_jobs': -1, 'penalty': 'l2', 'random_state': None, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}",MinMaxScaler,24,0.0,0.0,0.0,0.0,10.245306,10.361066,0.715753,0.712541
2,M,win_x,LogisticRegression,"{'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'auto', 'n_jobs': -1, 'penalty': 'l2', 'random_state': None, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}",MinMaxScaler,36,0.0,0.0,0.0,0.0,10.316239,10.463689,0.713785,0.709694
0,M,win_x,LogisticRegression,"{'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'auto', 'n_jobs': -1, 'penalty': 'l2', 'random_state': None, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}",StandardScaler,36,0.0,0.0,0.0,0.0,10.280363,10.478415,0.71478,0.709285


- Our best model was a __Logistic Regression__ on the __smaller feature subset__.
- Our best results were __71.54%__ accuracy, only 0.53% better than the chalk accuracy.

## Women's
We're trying to beat the chalk accuracy of __77.96%__.

In [103]:
# evaluate every classifier with each scaler / feature-table pairing (women's data)
womens_runs = [(scaler, features) for scaler in scalers for features in (X_womens, X_womens_small)]
for model in tqdm(models_class, desc="Running Models"):
    for scaler, features in womens_runs:
        cross_val_model(estimator=model, df=features, target_col='win_x', gender='W', scaler=scaler, models_df=models_df)

Running Models: 100%|██████████| 4/4 [01:09<00:00, 17.29s/it]


In [104]:
# five women's runs with the lowest validation log loss
# NOTE(review): in the recorded results Val_LogLoss is about 36.04 * (1 - Val_Acc),
# which is what log loss gives for hard 0/1 predictions -- confirm whether
# cross_val_model scores class labels rather than probabilities.
models_df[models_df['Gender'] == 'W'].sort_values('Val_LogLoss').head()

Unnamed: 0,Gender,Target,Model,Model_Params,Scaler,Num_Features,Train_R2,Val_R2,Train_RMSE,Val_RMSE,Train_LogLoss,Val_LogLoss,Train_Acc,Val_Acc
27,W,win_x,LogisticRegression,"{'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'auto', 'n_jobs': -1, 'penalty': 'l2', 'random_state': None, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}",MinMaxScaler,24,0.0,0.0,0.0,0.0,7.523154,7.580438,0.791277,0.789687
25,W,win_x,LogisticRegression,"{'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'auto', 'n_jobs': -1, 'penalty': 'l2', 'random_state': None, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}",StandardScaler,24,0.0,0.0,0.0,0.0,7.498702,7.613607,0.791955,0.788767
29,W,win_x,LogisticRegression,"{'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'auto', 'n_jobs': -1, 'penalty': 'l2', 'random_state': None, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}",RobustScaler,24,0.0,0.0,0.0,0.0,7.498702,7.624663,0.791955,0.78846
26,W,win_x,LogisticRegression,"{'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'auto', 'n_jobs': -1, 'penalty': 'l2', 'random_state': None, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}",MinMaxScaler,36,0.0,0.0,0.0,0.0,7.508478,7.646607,0.791684,0.787851
24,W,win_x,LogisticRegression,"{'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'auto', 'n_jobs': -1, 'penalty': 'l2', 'random_state': None, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}",StandardScaler,36,0.0,0.0,0.0,0.0,7.42413,7.657596,0.794024,0.787547


- The women's best model was also a __Logistic Regression__ on the __smaller feature subset__.
- Our best results were __78.97%__ accuracy, 1% better than the chalk accuracy.

In [9]:
# estimator / scaler pairing chosen for each gender (defaults apart from n_jobs)
best_model_mens, best_scaler_mens = LogisticRegression(n_jobs=-1), RobustScaler()
best_model_womens, best_scaler_womens = LogisticRegression(n_jobs=-1), MinMaxScaler()

In [107]:
# persist the model comparison table (without the index column)
models_csv_path = 'data/models/models_compact.csv'
models_df.to_csv(models_csv_path, index=False)

# Train Final Model

In [30]:
# both label columns are removed before scaling; 'win_x' is the training target
label_cols = ['score_diff_adj_x', 'win_x']

# men: fit the scaler and the model on every available row
X_mens_all = best_scaler_mens.fit_transform(X_mens_small.drop(columns=label_cols))
y_mens_all = X_mens_small['win_x']
final_model_mens = best_model_mens.fit(X_mens_all, y_mens_all)

# women: same procedure with their own scaler and model
X_womens_all = best_scaler_womens.fit_transform(X_womens_small.drop(columns=label_cols))
y_womens_all = X_womens_small['win_x']
final_model_womens = best_model_womens.fit(X_womens_all, y_womens_all)

# accuracy on the same rows the models were fitted on (training accuracy)
mens_train_preds = final_model_mens.predict(X_mens_all)
womens_train_preds = final_model_womens.predict(X_womens_all)
mens_acc = accuracy_score(y_mens_all, mens_train_preds)
womens_acc = accuracy_score(y_womens_all, womens_train_preds)

# view
mens_acc, womens_acc

(0.7178338762214984, 0.7912087912087912)

# 2025 Predictions

In [31]:
# number of unordered team pairs in an n-team field: n * (n - 1) / 2
# (63 opponents for the first team, 62 new ones for the second, 61, 60, ...).
# Uses n_teams throughout and integer division so the count stays an int.
n_teams = 64
total_matchups = n_teams * (n_teams - 1) // 2

# show
print(f"Total number of unique team_x vs team_y matchups in a 64-team tournament: {total_matchups}")

# load sample submission
sample_submission = pd.read_csv(ROOT + 'SampleSubmissionStage2.csv')

# number of rows the submission file expects
sample_submission.shape[0]

Total number of unique team_x vs team_y matchups in a 64-team tournament: 2016.0


131407

There are __2016 possible unique matchups__ in a 64-team bracket. The submission requires us to predict results for every possible team matchup (131K), even though only 63 total games (out of 2016 possible) will actually be scored. It would be inefficient to predict all of these outcomes, so we will generate only the possible tournament matchups for both men and women.

In [32]:
# break each 'ID' string on '_' into its three parts: Season, T1 and T2
id_parts = sample_submission['ID'].str.split('_', expand=True)
sample_submission[['Season', 'T1', 'T2']] = id_parts

# look for one example pair in both orders to see whether flipped duplicates exist
sample_submission.query("(T1 == '1101' & T2 == '1102') | (T2 == '1101' & T1 == '1102')")

Unnamed: 0,ID,Pred,Season,T1,T2
0,2025_1101_1102,0.5,2025,1101,1102


No duplicate (flipped) matchups in the submission file.

In [33]:
# read the men's and women's seed files, stack them and keep the 2025 season only
seed_files = [MENS_ROOT + 'MNCAATourneySeeds.csv', WOMENS_ROOT + 'WNCAATourneySeeds.csv']
seeds = pd.concat([pd.read_csv(path) for path in seed_files], ignore_index=True).query("Season == 2025")
seeds_2025 = seeds  # both names refer to the same filtered frame

# build the candidate matchups, returned as a (men, women) pair
matchups_men, matchups_women = generate_matchups(seeds_2025)

# shapes of both tables, then one random men's matchup
print(matchups_men.shape, matchups_women.shape)
matchups_men.sample(n=1)

(2278, 2) (2278, 2)


Unnamed: 0,T1,T2
3107,1361,1463


In [34]:
# load in 2025 data
features_2025 = pd.read_csv(ROOT + 'cleaned/features_2025.csv')

# drop columns that we did not use in small subset
features_2025 = features_2025.drop(columns=['home_game_pct', 'away_game_pct', 'neutral_game_pct', 'home_win_pct', 'away_win_pct', 'neutral_win_pct'])

# look up each team's 2025 seed string by TeamID
seed_map = dict(zip(seeds_2025['TeamID'], seeds_2025['Seed']))

# map (teams without a 2025 seed get NaN)
features_2025['Seed'] = features_2025['TeamID'].map(seed_map)

# drop nulls (teams that aren't in tournament)
# NOTE(review): dropna() removes a row with a null in ANY column, not only 'Seed';
# confirm no seeded team is lost here because of a missing feature value.
features_2025 = features_2025.dropna()

# split the seed value into region, seed num, and play-in flag
features_2025[['Region', 'Seed_num', 'PlayIn']] = features_2025['Seed'].apply(split_seed).apply(pd.Series)

# drop old seed and region
features_2025 = features_2025.drop(columns=['Region', 'Seed'])

# keep only the digits of the seed number and convert them to int
# (raw string avoids the invalid '\d' escape; expand=False yields a Series for the column assignment)
features_2025['Seed_num'] = features_2025['Seed_num'].str.extract(r'(\d+)', expand=False).astype(int)

# split into mens and womens
mens_2025, womens_2025 = split_genders(features_2025, id_col='TeamID')

In [38]:
# score the generated matchups with each gender's fitted model and scaler
preds_mens = get_2025_predictions(mens_2025, matchups_men, model=final_model_mens, scaler=best_scaler_mens)
preds_womens = get_2025_predictions(womens_2025, matchups_women, model=final_model_womens, scaler=best_scaler_womens)

# stack the men's predictions on top of the women's
preds_2025 = pd.concat((preds_mens, preds_womens))

# size of the combined table and one random prediction
print(preds_2025.shape)
preds_2025.sample(n=1)

(4556, 2)


Unnamed: 0,ID,Pred
218,2025_3124_3250,0.038687


# Submission

In [39]:
# sample_submission lists every matchup ID (including non-tournament teams); keep only that column
sample_submission = sample_submission[['ID']]

# lookup from matchup ID to predicted value
pred_by_id = preds_2025.set_index('ID')['Pred']

# attach predictions by ID; IDs without a prediction fall back to 0.5
submission = sample_submission.assign(
    Pred=sample_submission['ID'].map(pred_by_id).fillna(0.5)
)

# size of the submission and one random row
print(submission.shape)
submission.sample(n=1)

(131407, 2)


Unnamed: 0,ID,Pred
28966,2025_1198_1229,0.5


In [40]:
# write the submission file with a header row and without the index column
submission_path = 'submission.csv'
submission.to_csv(submission_path, index=False)