# Model
In this notebook, we:
- Define the structure of our prediction model.
- Try different models and assess their performance.
- Predict on the 2024 March Madness bracket.

## Imports

In [165]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.metrics import mean_squared_error, r2_score, log_loss, accuracy_score, confusion_matrix, classification_report
from xgboost import XGBRegressor, XGBClassifier

# display 100 rows and 100 columns
pd.set_option('display.max_rows', 150)
pd.set_option('display.max_columns', 150)

# global random seed
SEED = 0

## Load Data

In [147]:
# load in features compact
fcomp = pd.read_csv('data/processed/features_compact.csv')

# load in features detailed
# fdet = pd.read_csv('data/processed/features_detailed.csv')

## Linear/Logistic Regression
This problem can be treated as a regression problem (label is __score differential__). It can also be treated as binary classification (label is __win/loss__).

In [148]:
# one-hot encode A_Loc col, drop loc cols
fcomp = pd.get_dummies(fcomp, columns=['A_Loc'], dtype=int)
fcomp = fcomp.drop(columns=['A_Loc_N', 'B_Loc'])

In [149]:
# create adjusted score diff col (score diff is inversely scaled by NumOT periods)
fcomp['score_diff_adj'] = fcomp['score_diff'] / (2 ** fcomp['NumOT'])

# check
fcomp[fcomp['NumOT'] > 0][['score_diff', 'NumOT', 'score_diff_adj']].sort_values(by='NumOT', ascending=False).head()

Unnamed: 0,score_diff,NumOT,score_diff_adj
654,-8,3,-1.0
1548,-2,2,-0.5
1330,-12,2,-3.0
2319,-2,2,-0.5
949,-5,2,-1.25


In [150]:
# # subtract 1985 from season col (to represent years since 1985). this may capture changes in game play over time
# fcomp['Season'] = fcomp['Season'] - 1985

# columns to drop
cols_def_drop = ['A_FullSeed', 'A_Region', 'A_Score', 'B_FullSeed', 'B_Region', 'B_Score', 'DayNum', 'NumOT', 'score_diff']
cols_maybe_drop = ['Season']

# drop cols
fcomp = fcomp.drop(columns=(cols_def_drop + cols_maybe_drop))

In [151]:
# split on gender, drop team IDs
mfcomp = fcomp[fcomp['A_TeamID'] < 3000].drop(columns=['A_TeamID', 'B_TeamID'])
wfcomp = fcomp[fcomp['A_TeamID'] >= 3000].drop(columns=['A_TeamID', 'B_TeamID'])

We will try the original aggregated stats for both team A and B as features (mfcomp/wfcomp), and also the differences between the two teams' stats (mcomp_diff/wcomp_diff seen below).

In [152]:
# create diff cols (features between 2 teams)
cols_to_diff = ['1_pos_game_ratio', '1_pos_win_ratio', 'Seed', 'away_win_ratio', 'home_win_ratio', 'max_loss_diff', 'max_win_diff', 'mean_diff', 'mean_papg', 'mean_ppg', 
                'neutral_win_ratio', 'num_games', 'ot_ratio', 'ot_win_ratio', 'recent_mean_pts_against', 'recent_mean_pts_for', 'recent_mean_score_diff', 'recent_std_pts_against', 
                'recent_std_pts_for', 'recent_std_score_diff', 'recent_win_ratio', 'std_diff', 'std_papg', 'std_ppg', 'win_ratio']

cols_to_keep = ['A_TeamID', 'A_1_pos_loss_missing', 'A_1_pos_win_missing', 'A_ot_loss_missing', 'A_ot_win_missing', 'A_PlayIn', 'B_1_pos_loss_missing', 'B_1_pos_win_missing', 
                'B_ot_loss_missing', 'B_ot_win_missing', 'B_PlayIn', 'A_Loc_A', 'A_Loc_H', 'score_diff_adj', 'win']

# create diff df
fcomp_diff = fcomp[cols_to_keep].copy()

# create diff cols
for col in cols_to_diff:
    fcomp_diff[col + '_diff'] = fcomp['A_' + col] - fcomp['B_' + col]

# check
fcomp_diff.head()

Unnamed: 0,A_TeamID,A_1_pos_loss_missing,A_1_pos_win_missing,A_ot_loss_missing,A_ot_win_missing,A_PlayIn,B_1_pos_loss_missing,B_1_pos_win_missing,B_ot_loss_missing,B_ot_win_missing,B_PlayIn,A_Loc_A,A_Loc_H,score_diff_adj,win,1_pos_game_ratio_diff,1_pos_win_ratio_diff,Seed_diff,away_win_ratio_diff,home_win_ratio_diff,max_loss_diff_diff,max_win_diff_diff,mean_diff_diff,mean_papg_diff,mean_ppg_diff,neutral_win_ratio_diff,num_games_diff,ot_ratio_diff,ot_win_ratio_diff,recent_mean_pts_against_diff,recent_mean_pts_for_diff,recent_mean_score_diff_diff,recent_std_pts_against_diff,recent_std_pts_for_diff,recent_std_score_diff_diff,recent_win_ratio_diff,std_diff_diff,std_papg_diff,std_ppg_diff,win_ratio_diff
0,1116,0,0,1,1,0,0,0,1,1,0,0,0,9.0,1,0.130303,0.107143,1,0.0,0.075758,0,-14,-6.830303,2.430303,-4.4,0.033333,3,0.0,0.0,-2.333333,11.666667,14.0,-7.000934,8.174919,-0.774983,0.333333,-3.796423,-2.187809,-1.042881,-0.030303
1,1120,0,0,0,0,0,0,0,1,1,0,0,0,1.0,1,0.082759,0.228571,5,-0.145455,-0.047619,-24,11,-0.110345,1.335172,1.224828,-0.166667,4,0.068966,0.5,-11.833333,-3.166667,8.666667,1.91112,1.980623,-1.894773,0.166667,-0.917577,2.682259,2.985585,-0.05931
2,1250,0,0,1,0,0,1,0,1,1,0,0,0,-25.0,0,0.650064,-0.761905,15,-0.493506,-0.589744,29,-23,-20.114943,10.132822,-9.98212,-0.5,2,0.034483,1.0,5.666667,-14.166667,-19.833333,-3.655845,-2.772756,-7.094572,-0.166667,5.793999,-2.019426,-3.344166,-0.546616
3,1229,0,0,1,1,0,0,0,1,1,0,0,0,3.0,1,-0.058201,-0.1,1,-0.127273,0.142857,-8,-2,2.177249,1.022487,3.199735,0.333333,-1,0.0,0.0,-3.333333,3.166667,6.5,-6.142847,2.126248,-3.198151,0.166667,-0.820835,1.768045,1.234247,0.062169
4,1242,0,0,1,1,0,0,0,1,1,0,0,0,11.0,1,-0.033333,0.0,-11,0.016667,0.0,2,4,1.077778,7.4,8.477778,-0.1,3,0.0,0.0,14.0,8.5,-5.5,0.286807,0.005587,-2.346062,0.0,0.989648,2.404957,0.781976,0.025926


In [153]:
# split on gender, drop team IDs
mfcomp_diff = fcomp_diff[fcomp_diff['A_TeamID'] < 3000].drop(columns=['A_TeamID'])
wfcomp_diff = fcomp_diff[fcomp_diff['A_TeamID'] >= 3000].drop(columns=['A_TeamID'])

In [164]:
# look at num feats for the 2 datasets
print(f'Features of both team A and B: {mfcomp.shape[1] - 2}')
print(f'Features of the DIFFERENCE between team A and B: {mfcomp_diff.shape[1] - 2}')

Features of both team A and B: 62
Features of the DIFFERENCE between team A and B: 37


In [154]:
# function that takes in estimator, data, models_df, tournament, test_size
def run_model(estimator, data, regression, models_df, tournament, diff_cols, test_size):
    """
    Run a model on data and save results to models_df.

    Parameters
    ----------
    estimator : sklearn estimator
        Estimator to use for modeling.
    data : pd.DataFrame
        Data to model.
    regression : bool
        Whether to model score differential (regression) or win/loss (classification).
    models_df : pd.DataFrame
        DataFrame to save results to.
    tournament : str
        Gender - 'M' or 'F'.
    diff_cols : bool
        Whether to use diff features.
    test_size : float
        Proportion of data to use for testing.

    Returns
    -------
    models_df.tail() : pd.DataFrame
        Last 5 rows of dataframe with results from model.
    """

    # predicting score differential
    if regression:
        # split data
        X = data.drop(columns=['score_diff_adj', 'win'])
        y = data['score_diff_adj']
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=SEED)
        
        # scale data
        scaler = MinMaxScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        # fit model
        estimator.fit(X_train, y_train)
        
        # predict
        train_preds = estimator.predict(X_train)
        y_pred = estimator.predict(X_test)
        
        # evaluate
        train_rmse = mean_squared_error(y_train, train_preds, squared=False)
        test_rmse = mean_squared_error(y_test, y_pred, squared=False)
        train_r2 = r2_score(y_test, y_pred)
        test_r2 = r2_score(y_test, y_pred)
        train_acc = accuracy_score(np.sign(y_train), np.sign(train_preds))
        test_acc = accuracy_score(np.sign(y_test), np.sign(y_pred))
        
        # save results to models_df
        models_df.loc[len(models_df.index)] = [tournament, estimator, diff_cols, test_size, train_r2, test_r2, train_rmse, test_rmse, train_acc, test_acc]
        
        return models_df.tail()
    
    # predicting win/loss
    else:
        # split data
        X = data.drop(columns=['score_diff_adj', 'win'])
        y = data['win']
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=SEED)
        
        # scale data
        scaler = MinMaxScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        # fit model
        estimator.fit(X_train, y_train)
        
        # predict
        train_preds = estimator.predict(X_train)
        y_pred = estimator.predict(X_test)
        
        # evaluate
        train_log_loss = log_loss(y_train, train_preds)
        test_log_loss = log_loss(y_test, y_pred)
        train_acc = accuracy_score(y_train, train_preds)
        test_acc = accuracy_score(y_test, y_pred)
        
        # save results to models_df
        models_df.loc[len(models_df.index)] = [tournament, estimator, diff_cols, test_size, train_log_loss, test_log_loss, train_acc, test_acc]
        
        return models_df.tail()

### Men's

#### Regression (Predicting <ins>Score Differential</ins>)

In [155]:
# create a df to hold regression models
reg_df = pd.DataFrame(columns=['Tournament', 'Model', 'Diff_Features', 'Test_Size', 'Train_R2', 'Test_R2', 'Train_RMSE', 'Test_RMSE', 'Train_Acc', 'Test_Acc'])

In [178]:
# run model
run_model(estimator=XGBRegressor(), data=mfcomp, regression=True, models_df=reg_df, tournament='M', diff_cols=False, test_size=0.2)

Unnamed: 0,Tournament,Model,Diff_Features,Test_Size,Train_R2,Test_R2,Train_RMSE,Test_RMSE,Train_Acc,Test_Acc
12,M,"XGBRegressor(base_score=0.5, booster='gbtree',...",False,0.6,0.249225,0.249225,0.079757,12.732147,1.0,0.668253
13,M,"XGBRegressor(base_score=0.5, booster='gbtree',...",False,0.5,0.244266,0.244266,0.215661,12.718735,1.0,0.671289
14,M,"XGBRegressor(base_score=0.5, booster='gbtree',...",False,0.4,0.2366,0.2366,0.328722,12.855532,0.99932,0.66157
15,M,"XGBRegressor(base_score=0.5, booster='gbtree',...",False,0.3,0.230406,0.230406,0.587856,12.795877,0.995918,0.648098
16,M,"XGBRegressor(base_score=0.5, booster='gbtree',...",False,0.2,0.214883,0.214883,0.859754,12.647062,0.993878,0.645621


The cell above was run many times with different inputs to save space in the notebook.

In [179]:
# save reg_df
reg_df.to_csv('models/regression_models_train_and_test.csv', index=False)

### Classification (Predicting <ins>Win/Loss</ins>)

In [None]:
# create a df to hold classification models
class_df = pd.DataFrame(columns=['Tournament', 'Model', 'RMSE', 'R2', 'Accuracy', 'Diff_Features', 'Test_Size'])