# Model
In this notebook, we:
- Define the structure of our prediction model.
- Try different models and assess their performance.
- Predict on the 2024 March Madness bracket.

## Imports

In [57]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys

from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, KFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.svm import SVR, SVC
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.metrics import mean_squared_error, r2_score, log_loss, accuracy_score, confusion_matrix, classification_report
from xgboost import XGBRegressor, XGBClassifier

# display 100 rows and 100 columns
pd.set_option('display.max_rows', 150)
pd.set_option('display.max_columns', 150)

# global random seed
SEED = 0

## Load Data

In [58]:
# load in features compact
fcomp = pd.read_csv('data/processed/features_compact.csv')

# load in features detailed
# fdet = pd.read_csv('data/processed/features_detailed.csv')

## Linear/Logistic Regression
This problem can be treated as a regression problem (label is __score differential__). It can also be treated as binary classification (label is __win/loss__).

In [59]:
# one-hot encode A_Loc col, drop loc cols
fcomp = pd.get_dummies(fcomp, columns=['A_Loc'], dtype=int)
fcomp = fcomp.drop(columns=['A_Loc_N', 'B_Loc'])

In [60]:
# create adjusted score diff col (score diff is inversely scaled by NumOT periods)
fcomp['score_diff_adj'] = fcomp['score_diff'] / (2 ** fcomp['NumOT'])

# check
fcomp[fcomp['NumOT'] > 0][['score_diff', 'NumOT', 'score_diff_adj']].sort_values(by='NumOT', ascending=False).head()

Unnamed: 0,score_diff,NumOT,score_diff_adj
654,-8,3,-1.0
1548,-2,2,-0.5
1330,-12,2,-3.0
2319,-2,2,-0.5
949,-5,2,-1.25


In [None]:
# save team scores to use later
team_scores = fcomp[['A_Score', 'B_Score']].copy()

In [106]:
fcomp[(fcomp['A_Seed'] == 16) & (fcomp['B_Seed'] == 16)]

Unnamed: 0,A_1_pos_game_ratio,A_1_pos_loss_missing,A_1_pos_win_missing,A_1_pos_win_ratio,A_PlayIn,A_Seed,A_TeamID,A_away_win_ratio,A_home_win_ratio,A_max_loss_diff,A_max_win_diff,A_mean_diff,A_mean_papg,A_mean_ppg,A_neutral_win_ratio,A_num_games,A_ot_loss_missing,A_ot_ratio,A_ot_win_missing,A_ot_win_ratio,A_recent_mean_pts_against,A_recent_mean_pts_for,A_recent_mean_score_diff,A_recent_std_pts_against,A_recent_std_pts_for,A_recent_std_score_diff,A_recent_win_ratio,A_std_diff,A_std_papg,A_std_ppg,A_win_ratio,B_1_pos_game_ratio,B_1_pos_loss_missing,B_1_pos_win_missing,B_1_pos_win_ratio,B_PlayIn,B_Seed,B_TeamID,B_away_win_ratio,B_home_win_ratio,B_max_loss_diff,B_max_win_diff,B_mean_diff,B_mean_papg,B_mean_ppg,B_neutral_win_ratio,B_num_games,B_ot_loss_missing,B_ot_ratio,B_ot_win_missing,B_ot_win_ratio,B_recent_mean_pts_against,B_recent_mean_pts_for,B_recent_mean_score_diff,B_recent_std_pts_against,B_recent_std_pts_for,B_recent_std_score_diff,B_recent_win_ratio,B_std_diff,B_std_papg,B_std_ppg,B_win_ratio,win,A_Loc_A,A_Loc_H,score_diff_adj
1008,0.6,0,0,0.333333,1,16,1322,0.470588,0.727273,68,27,-2.233333,74.466667,72.233333,1.0,30,1,0.0,1,0.0,73.166667,78.833333,5.666667,4.932883,8.717798,6.020797,0.833333,19.519304,11.056136,10.835136,0.6,0.423077,0,0,0.090909,1,16,1457,0.384615,1.0,23,31,4.384615,60.653846,65.038462,0.5,26,1,0.076923,0,1.0,61.666667,69.0,7.333333,8.770215,6.204837,8.401389,0.833333,11.024425,8.038423,7.053944,0.615385,1,0,0,4.0
1072,0.529412,0,0,0.055556,1,16,1373,0.214286,0.705882,25,20,1.352941,68.235294,69.588235,0.333333,34,0,0.147059,0,0.4,67.333333,74.5,7.166667,5.811865,5.115336,5.005552,0.666667,10.761904,10.330948,8.96362,0.470588,0.482759,0,0,0.357143,1,16,1108,0.470588,1.0,41,24,3.137931,76.655172,79.793103,1.0,29,1,0.068966,0,1.0,74.666667,90.333333,15.666667,9.003703,12.242004,8.936815,1.0,13.200515,10.020736,9.671112,0.689655,1,0,0,4.0
1136,0.655172,0,0,0.210526,1,16,1421,0.125,0.833333,52,14,-7.241379,78.448276,71.206897,1.0,29,1,0.137931,0,1.0,72.833333,70.333333,-2.5,5.986095,7.916228,5.416026,0.5,24.46355,12.333069,8.942216,0.448276,0.5,0,0,0.266667,1,16,1411,0.333333,0.785714,23,43,1.966667,70.833333,72.8,0.75,30,1,0.033333,0,1.0,68.0,73.333333,5.333333,2.5,4.301163,5.057997,0.833333,11.009877,12.137088,10.894732,0.6,1,0,0,4.0
1200,0.428571,0,0,0.333333,1,16,1250,0.357143,0.916667,26,28,2.178571,63.678571,65.857143,1.0,28,0,0.142857,0,0.5,60.166667,60.666667,0.5,11.923366,8.158159,6.23164,0.666667,11.06872,13.08131,12.335812,0.642857,0.5,0,0,0.133333,1,16,1197,0.2,0.666667,34,25,-1.866667,73.966667,72.1,1.0,30,0,0.033333,1,0.0,67.5,70.833333,3.333333,9.658042,10.611838,4.977728,0.666667,17.098926,11.527648,9.292539,0.466667,0,0,0,-15.0
1264,0.62069,0,0,0.111111,1,16,1324,0.1875,0.5,31,26,-3.137931,73.482759,70.344828,1.0,29,1,0.034483,0,1.0,66.666667,73.0,6.333333,12.698425,14.991664,9.385272,0.833333,16.89469,10.224181,11.522374,0.37931,0.482759,0,0,0.214286,1,16,1105,0.428571,0.7,32,26,1.275862,67.793103,69.068966,0.6,29,1,0.068966,0,1.0,60.166667,68.833333,8.666667,5.492419,4.31406,3.872983,0.666667,13.405888,12.101819,8.622278,0.551724,1,0,0,10.0
1328,0.483871,0,0,0.066667,1,16,1214,0.384615,0.5,40,43,3.0,63.806452,66.806452,1.0,31,1,0.0,1,0.0,59.333333,63.0,3.666667,3.800585,6.027714,5.840472,0.666667,14.528192,10.008598,10.959288,0.516129,0.5625,0,0,0.222222,1,16,1284,0.529412,0.666667,17,20,0.84375,64.0,64.84375,0.333333,32,1,0.09375,0,1.0,65.833333,69.5,3.666667,15.483863,15.864005,4.536886,0.833333,9.774896,12.61377,11.663607,0.5625,0,0,0,-22.0
1392,0.516129,0,0,0.3125,1,16,1197,0.4,0.8,39,33,-1.387097,70.354839,68.967742,0.666667,31,0,0.096774,0,0.666667,63.0,66.5,3.5,3.559026,6.5,4.358899,0.833333,16.825848,9.65933,11.244269,0.580645,0.454545,0,0,0.333333,1,16,1310,0.692308,0.6,22,33,2.333333,73.727273,76.060606,0.8,33,0,0.090909,0,0.333333,73.166667,83.5,10.333333,10.925505,7.148426,8.115828,1.0,10.897987,10.33311,10.280679,0.666667,0,0,0,-8.0
1456,0.676471,0,0,0.26087,1,16,1164,0.222222,0.5,49,23,-6.970588,66.117647,59.147059,0.833333,34,0,0.117647,0,0.5,64.5,68.0,3.5,6.78233,10.614456,7.158911,0.833333,24.613011,12.773915,10.614547,0.411765,0.46875,0,0,0.066667,1,16,1291,0.588235,0.533333,24,37,2.5625,66.5,69.0625,0.0,32,0,0.03125,1,0.0,64.5,75.0,10.5,5.307228,5.894913,2.84312,0.833333,11.296736,6.505415,9.144964,0.5625,0,0,0,-9.0
1520,0.46875,0,0,0.133333,1,16,1287,0.294118,0.909091,38,21,0.65625,68.6875,69.34375,0.5,32,1,0.03125,0,1.0,69.833333,71.0,1.166667,7.291548,11.697578,7.371115,0.5,12.441774,7.488943,9.590441,0.53125,0.344828,0,0,0.1,1,16,1106,0.533333,0.888889,29,44,5.896552,63.586207,69.482759,0.8,29,1,0.0,1,0.0,61.5,69.5,8.0,6.538348,10.17759,3.947573,0.833333,11.429132,8.114986,9.455479,0.689655,1,0,0,15.0
1584,0.533333,0,0,0.3125,1,16,1457,0.352941,0.833333,36,33,0.633333,61.933333,62.566667,1.0,30,0,0.066667,0,0.5,60.166667,65.166667,5.0,11.801601,8.891319,4.307616,0.666667,14.246561,11.976598,9.83178,0.566667,0.5625,0,0,0.222222,1,16,1115,0.263158,0.888889,20,34,-0.28125,64.6875,64.40625,1.0,32,0,0.125,0,0.75,54.833333,61.833333,7.0,12.069245,14.136242,5.627314,0.833333,13.660014,9.782084,11.950255,0.53125,0,0,0,-17.0


In [61]:
# # subtract 1985 from season col (to represent years since 1985). this may capture changes in game play over time
# fcomp['Season'] = fcomp['Season'] - 1985

# columns to drop
cols_def_drop = ['A_FullSeed', 'A_Region', 'A_Score', 'B_FullSeed', 'B_Region', 'B_Score', 'DayNum', 'NumOT', 'score_diff']
cols_maybe_drop = ['Season']

# drop cols
fcomp = fcomp.drop(columns=(cols_def_drop + cols_maybe_drop))

In [62]:
# split on gender, drop team IDs
mfcomp = fcomp[fcomp['A_TeamID'] < 3000].drop(columns=['A_TeamID', 'B_TeamID'])
wfcomp = fcomp[fcomp['A_TeamID'] >= 3000].drop(columns=['A_TeamID', 'B_TeamID'])

We will try the original aggregated stats for both team A and B as features (mfcomp/wfcomp), and also the differences between the two teams' stats (mcomp_diff/wcomp_diff seen below).

In [63]:
# create diff cols (features between 2 teams)
cols_to_diff = ['1_pos_game_ratio', '1_pos_win_ratio', 'Seed', 'away_win_ratio', 'home_win_ratio', 'max_loss_diff', 'max_win_diff', 'mean_diff', 'mean_papg', 'mean_ppg', 
                'neutral_win_ratio', 'num_games', 'ot_ratio', 'ot_win_ratio', 'recent_mean_pts_against', 'recent_mean_pts_for', 'recent_mean_score_diff', 'recent_std_pts_against', 
                'recent_std_pts_for', 'recent_std_score_diff', 'recent_win_ratio', 'std_diff', 'std_papg', 'std_ppg', 'win_ratio']

cols_to_keep = ['A_TeamID', 'A_1_pos_loss_missing', 'A_1_pos_win_missing', 'A_ot_loss_missing', 'A_ot_win_missing', 'A_PlayIn', 'B_1_pos_loss_missing', 'B_1_pos_win_missing', 
                'B_ot_loss_missing', 'B_ot_win_missing', 'B_PlayIn', 'A_Loc_A', 'A_Loc_H', 'score_diff_adj', 'win']

# create diff df
fcomp_diff = fcomp[cols_to_keep].copy()

# create diff cols
for col in cols_to_diff:
    fcomp_diff[col + '_diff'] = fcomp['A_' + col] - fcomp['B_' + col]

# check
fcomp_diff.head()

Unnamed: 0,A_TeamID,A_1_pos_loss_missing,A_1_pos_win_missing,A_ot_loss_missing,A_ot_win_missing,A_PlayIn,B_1_pos_loss_missing,B_1_pos_win_missing,B_ot_loss_missing,B_ot_win_missing,B_PlayIn,A_Loc_A,A_Loc_H,score_diff_adj,win,1_pos_game_ratio_diff,1_pos_win_ratio_diff,Seed_diff,away_win_ratio_diff,home_win_ratio_diff,max_loss_diff_diff,max_win_diff_diff,mean_diff_diff,mean_papg_diff,mean_ppg_diff,neutral_win_ratio_diff,num_games_diff,ot_ratio_diff,ot_win_ratio_diff,recent_mean_pts_against_diff,recent_mean_pts_for_diff,recent_mean_score_diff_diff,recent_std_pts_against_diff,recent_std_pts_for_diff,recent_std_score_diff_diff,recent_win_ratio_diff,std_diff_diff,std_papg_diff,std_ppg_diff,win_ratio_diff
0,1116,0,0,1,1,0,0,0,1,1,0,0,0,9.0,1,0.130303,0.107143,1,0.0,0.075758,0,-14,-6.830303,2.430303,-4.4,0.033333,3,0.0,0.0,-2.333333,11.666667,14.0,-7.000934,8.174919,-0.774983,0.333333,-3.796423,-2.187809,-1.042881,-0.030303
1,1120,0,0,0,0,0,0,0,1,1,0,0,0,1.0,1,0.082759,0.228571,5,-0.145455,-0.047619,-24,11,-0.110345,1.335172,1.224828,-0.166667,4,0.068966,0.5,-11.833333,-3.166667,8.666667,1.91112,1.980623,-1.894773,0.166667,-0.917577,2.682259,2.985585,-0.05931
2,1250,0,0,1,0,0,1,0,1,1,0,0,0,-25.0,0,0.650064,-0.761905,15,-0.493506,-0.589744,29,-23,-20.114943,10.132822,-9.98212,-0.5,2,0.034483,1.0,5.666667,-14.166667,-19.833333,-3.655845,-2.772756,-7.094572,-0.166667,5.793999,-2.019426,-3.344166,-0.546616
3,1229,0,0,1,1,0,0,0,1,1,0,0,0,3.0,1,-0.058201,-0.1,1,-0.127273,0.142857,-8,-2,2.177249,1.022487,3.199735,0.333333,-1,0.0,0.0,-3.333333,3.166667,6.5,-6.142847,2.126248,-3.198151,0.166667,-0.820835,1.768045,1.234247,0.062169
4,1242,0,0,1,1,0,0,0,1,1,0,0,0,11.0,1,-0.033333,0.0,-11,0.016667,0.0,2,4,1.077778,7.4,8.477778,-0.1,3,0.0,0.0,14.0,8.5,-5.5,0.286807,0.005587,-2.346062,0.0,0.989648,2.404957,0.781976,0.025926


In [64]:
# split on gender, drop team IDs
mfcomp_diff = fcomp_diff[fcomp_diff['A_TeamID'] < 3000].drop(columns=['A_TeamID'])
wfcomp_diff = fcomp_diff[fcomp_diff['A_TeamID'] >= 3000].drop(columns=['A_TeamID'])

In [65]:
# look at num feats for the 2 datasets
print(f'Features of both team A and B: {mfcomp.shape[1] - 2}')
print(f'Features of the DIFFERENCE between team A and B: {mfcomp_diff.shape[1] - 2}')

Features of both team A and B: 62
Features of the DIFFERENCE between team A and B: 37


In [91]:
# function that takes in estimator, data, models_df, tournament, test_size
def run_model(estimator, data, regression, models_df, tournament):
    """
    Run a model on data and save results to models_df.

    Parameters
    ----------
    estimator : sklearn estimator
        Estimator to use for modeling.
    data : pd.DataFrame
        Data to model.
    regression : bool
        Whether to model score differential (regression) or win/loss (classification).
    models_df : pd.DataFrame
        DataFrame to save results to.
    tournament : str
        Gender - 'M' or 'F'.

    Returns
    -------
    models_df.tail() : pd.DataFrame
        Last 5 rows of dataframe with results from model.
    """

    # predicting score differential
    if regression:
        # define X and y
        X = data.drop(columns=['score_diff_adj', 'win'])
        y = data['score_diff_adj']

        # define 5-fold cross-validation
        kf = KFold(n_splits=5, shuffle=True, random_state=SEED)

        # initialize lists to store metrics
        rmse_scores_train = []
        rmse_scores_test = []
        r2_scores_train = []
        r2_scores_test = []
        acc_scores_train = []
        acc_scores_test = []

        for train_index, test_index in kf.split(X):
            # split data
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            # scale data
            scaler = MinMaxScaler()
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.transform(X_test)

            # fit model
            estimator.fit(X_train, y_train)

            # predict
            train_preds = estimator.predict(X_train)
            test_preds = estimator.predict(X_test)

            # evaluate
            rmse_scores_train.append(mean_squared_error(y_train, train_preds, squared=False))
            rmse_scores_test.append(mean_squared_error(y_test, test_preds, squared=False))
            r2_scores_train.append(r2_score(y_train, train_preds))
            r2_scores_test.append(r2_score(y_test, test_preds))
            acc_scores_train.append(accuracy_score(np.sign(y_train), np.sign(train_preds)))
            acc_scores_test.append(accuracy_score(np.sign(y_test), np.sign(test_preds)))

        # average the metrics across folds
        train_rmse = np.mean(rmse_scores_train)
        test_rmse = np.mean(rmse_scores_test)
        train_r2 = np.mean(r2_scores_train)
        test_r2 = np.mean(r2_scores_test)
        train_acc = np.mean(acc_scores_train)
        test_acc = np.mean(acc_scores_test)

        # Save results to models_df
        models_df.loc[len(models_df.index)] = [tournament, 'adj_score_diff', estimator, X.shape[1], X.columns.to_list(), 5, train_r2, test_r2, train_rmse, test_rmse, train_acc, test_acc]
        
        return models_df.tail()
    
    # predicting win/loss
    else:
        # define X and y
        X = data.drop(columns=['score_diff_adj', 'win'])
        y = data['win']

        # define 5-fold cross-validation
        kf = KFold(n_splits=5, shuffle=True, random_state=SEED)

        # initialize lists to store metrics
        log_loss_train = []
        log_loss_test = []
        acc_scores_train = []
        acc_scores_test = []

        for train_index, test_index in kf.split(X):
            # split data
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            # scale data
            scaler = MinMaxScaler()
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.transform(X_test)

            # fit model
            estimator.fit(X_train, y_train)

            # predict
            train_preds = estimator.predict(X_train)
            test_preds = estimator.predict(X_test)

            # evaluate
            log_loss_train.append(log_loss(y_train, train_preds))
            log_loss_test.append(log_loss(y_test, test_preds))
            acc_scores_train.append(accuracy_score(np.sign(y_train), np.sign(train_preds)))
            acc_scores_test.append(accuracy_score(np.sign(y_test), np.sign(test_preds)))

        # average the metrics across folds
        train_log_loss = np.mean(log_loss_train)
        test_log_loss = np.mean(log_loss_test)
        train_acc = np.mean(acc_scores_train)
        test_acc = np.mean(acc_scores_test)
        
        # save results to models_df
        models_df.loc[len(models_df.index)] = [tournament, 'A_Win', estimator, X.shape[1], X.columns.to_list(), 5, train_log_loss, test_log_loss, train_acc, test_acc]
        
        return models_df.tail()

### Regression (Predicting <ins>Score Differential</ins>)

#### Men's

In [92]:
# create a df to hold regression models
reg_df = pd.DataFrame(columns=['Tournament', 'Label', 'Model', 'Num_Features', 'Features', 'Num_CV_Folds', 'Train_R2', 'Test_R2', 'Train_RMSE', 'Test_RMSE', 'Train_Acc', 'Test_Acc'])

# load reg df
# reg_df = pd.read_csv('data/processed/regression_models.csv')

In [93]:
# define regression models
models = [LinearRegression(n_jobs=-1), RandomForestRegressor(n_jobs=-1), XGBRegressor(n_jobs=-1), SVR(), KNeighborsRegressor(n_jobs=-1)]
datasets = [mfcomp, mfcomp_diff]

# run regression models
for model in models:
    for dataset in datasets:
        # run model
        run_model(estimator=model, data=dataset, regression=True, models_df=reg_df, tournament='M')

In [94]:
# inspect
reg_df.tail(10)

Unnamed: 0,Tournament,Label,Model,Num_Features,Features,Num_CV_Folds,Train_R2,Test_R2,Train_RMSE,Test_RMSE,Train_Acc,Test_Acc
0,M,adj_score_diff,LinearRegression(n_jobs=-1),62,"[A_1_pos_game_ratio, A_1_pos_loss_missing, A_1...",5,0.417254,0.377014,11.160668,11.509196,0.717361,0.711553
1,M,adj_score_diff,LinearRegression(n_jobs=-1),37,"[A_1_pos_loss_missing, A_1_pos_win_missing, A_...",5,0.415972,0.389867,11.173144,11.389649,0.712771,0.7038
2,M,adj_score_diff,"(DecisionTreeRegressor(max_features=1.0, rando...",62,"[A_1_pos_game_ratio, A_1_pos_loss_missing, A_1...",5,0.911757,0.374615,4.342668,11.52667,0.931966,0.701754
3,M,adj_score_diff,"(DecisionTreeRegressor(max_features=1.0, rando...",37,"[A_1_pos_loss_missing, A_1_pos_win_missing, A_...",5,0.91348,0.369573,4.300074,11.567861,0.932273,0.699313
4,M,adj_score_diff,"XGBRegressor(base_score=0.5, booster='gbtree',...",62,"[A_1_pos_game_ratio, A_1_pos_loss_missing, A_1...",5,0.997001,0.265625,0.799427,12.493124,0.993472,0.660961
5,M,adj_score_diff,"XGBRegressor(base_score=0.5, booster='gbtree',...",37,"[A_1_pos_loss_missing, A_1_pos_win_missing, A_...",5,0.994838,0.281542,1.048096,12.357205,0.988372,0.6838
6,M,adj_score_diff,SVR(),62,"[A_1_pos_game_ratio, A_1_pos_loss_missing, A_1...",5,0.372484,0.339052,11.58171,11.856539,0.720012,0.705423
7,M,adj_score_diff,SVR(),37,"[A_1_pos_loss_missing, A_1_pos_win_missing, A_...",5,0.363161,0.3338,11.667324,11.903634,0.714504,0.703799
8,M,adj_score_diff,KNeighborsRegressor(n_jobs=-1),62,"[A_1_pos_game_ratio, A_1_pos_loss_missing, A_1...",5,0.468786,0.194126,10.656043,13.093428,0.743676,0.642192
9,M,adj_score_diff,KNeighborsRegressor(n_jobs=-1),37,"[A_1_pos_loss_missing, A_1_pos_win_missing, A_...",5,0.45888,0.17523,10.755022,13.244022,0.73572,0.63607


With untuned models, best performance on men's test data is __71.2% acc__ using a Linear Regression with the non-diff features.

#### Women's

In [95]:
# define regression models
models = [LinearRegression(n_jobs=-1), RandomForestRegressor(n_jobs=-1), XGBRegressor(n_jobs=-1), SVR(), KNeighborsRegressor(n_jobs=-1)]
datasets = [wfcomp, wfcomp_diff]

# run regression models
for model in models:
    for dataset in datasets:
        # run model
        run_model(estimator=model, data=dataset, regression=True, models_df=reg_df, tournament='W')

In [96]:
# inspect
reg_df.tail(10)

Unnamed: 0,Tournament,Label,Model,Num_Features,Features,Num_CV_Folds,Train_R2,Test_R2,Train_RMSE,Test_RMSE,Train_Acc,Test_Acc
10,W,adj_score_diff,LinearRegression(n_jobs=-1),62,"[A_1_pos_game_ratio, A_1_pos_loss_missing, A_1...",5,0.657987,0.62425,12.175749,12.747932,0.795642,0.781434
11,W,adj_score_diff,LinearRegression(n_jobs=-1),37,"[A_1_pos_loss_missing, A_1_pos_win_missing, A_...",5,0.654486,0.635684,12.238462,12.550268,0.792641,0.78712
12,W,adj_score_diff,"(DecisionTreeRegressor(max_features=1.0, rando...",62,"[A_1_pos_game_ratio, A_1_pos_loss_missing, A_1...",5,0.94556,0.611157,4.857876,12.965723,0.949622,0.773216
13,W,adj_score_diff,"(DecisionTreeRegressor(max_features=1.0, rando...",37,"[A_1_pos_loss_missing, A_1_pos_win_missing, A_...",5,0.94674,0.61562,4.805069,12.890524,0.951674,0.766272
14,W,adj_score_diff,"XGBRegressor(base_score=0.5, booster='gbtree',...",62,"[A_1_pos_game_ratio, A_1_pos_loss_missing, A_1...",5,0.999672,0.565048,0.372988,13.70029,0.999842,0.756155
15,W,adj_score_diff,"XGBRegressor(base_score=0.5, booster='gbtree',...",37,"[A_1_pos_loss_missing, A_1_pos_win_missing, A_...",5,0.999413,0.552594,0.503591,13.906069,0.999526,0.746704
16,W,adj_score_diff,SVR(),62,"[A_1_pos_game_ratio, A_1_pos_loss_missing, A_1...",5,0.535718,0.510561,14.186634,14.54804,0.780796,0.765008
17,W,adj_score_diff,SVR(),37,"[A_1_pos_loss_missing, A_1_pos_win_missing, A_...",5,0.518741,0.495108,14.443841,14.7756,0.769268,0.765008
18,W,adj_score_diff,KNeighborsRegressor(n_jobs=-1),62,"[A_1_pos_game_ratio, A_1_pos_loss_missing, A_1...",5,0.618946,0.432647,12.852413,15.657415,0.781903,0.703099
19,W,adj_score_diff,KNeighborsRegressor(n_jobs=-1),37,"[A_1_pos_loss_missing, A_1_pos_win_missing, A_...",5,0.606968,0.416758,13.052565,15.868571,0.77448,0.704374


With untuned models, best performance on wommen's test data is __78.7% acc__ using a Linear Regression with the diff features.

In [97]:
# save reg_df
reg_df.to_csv('models/regression_models.csv', index=False)

### Classification (Predicting <ins>Win/Loss</ins>)

In [98]:
# create a df to hold classification models
class_df = pd.DataFrame(columns=['Tournament', 'Label', 'Model', 'Num_Features', 'Features', 'Num_CV_Folds', 'Train_LogLoss', 'Test_LogLoss', 'Train_Acc', 'Test_Acc'])

# load class df
# class_df = pd.read_csv('models/classification_models.csv')

#### Men's

In [99]:
# define regression models
models = [LogisticRegression(n_jobs=-1, random_state=SEED), RandomForestClassifier(n_jobs=-1), XGBClassifier(n_jobs=-1), SVC(), KNeighborsClassifier(n_jobs=-1)]
datasets = [mfcomp, mfcomp_diff]

# run regression models
for model in models:
    for dataset in datasets:
        # run model
        run_model(estimator=model, data=dataset, regression=False, models_df=class_df, tournament='M')

In [100]:
# inspect
class_df.tail(10)

Unnamed: 0,Tournament,Label,Model,Num_Features,Features,Num_CV_Folds,Train_LogLoss,Test_LogLoss,Train_Acc,Test_Acc
0,M,A_Win,"LogisticRegression(n_jobs=-1, random_state=0)",62,"[A_1_pos_game_ratio, A_1_pos_loss_missing, A_1...",5,10.0587,10.617469,0.72093,0.705427
1,M,A_Win,"LogisticRegression(n_jobs=-1, random_state=0)",37,"[A_1_pos_loss_missing, A_1_pos_win_missing, A_...",5,10.2719,10.764347,0.715015,0.701353
2,M,A_Win,"(DecisionTreeClassifier(max_features='sqrt', r...",62,"[A_1_pos_game_ratio, A_1_pos_loss_missing, A_1...",5,2.220446e-16,11.367316,1.0,0.684624
3,M,A_Win,"(DecisionTreeClassifier(max_features='sqrt', r...",37,"[A_1_pos_loss_missing, A_1_pos_win_missing, A_...",5,2.220446e-16,11.073322,1.0,0.69278
4,M,A_Win,"XGBClassifier(base_score=0.5, booster='gbtree'...",62,"[A_1_pos_game_ratio, A_1_pos_loss_missing, A_1...",5,2.220446e-16,11.99968,1.0,0.667079
5,M,A_Win,"XGBClassifier(base_score=0.5, booster='gbtree'...",37,"[A_1_pos_loss_missing, A_1_pos_win_missing, A_...",5,2.220446e-16,12.411427,1.0,0.655656
6,M,A_Win,SVC(),62,"[A_1_pos_game_ratio, A_1_pos_loss_missing, A_1...",5,8.816072,10.955808,0.755406,0.696041
7,M,A_Win,SVC(),37,"[A_1_pos_loss_missing, A_1_pos_win_missing, A_...",5,9.371213,10.926235,0.740004,0.696861
8,M,A_Win,KNeighborsClassifier(n_jobs=-1),62,"[A_1_pos_game_ratio, A_1_pos_loss_missing, A_1...",5,8.282981,13.117529,0.770196,0.636066
9,M,A_Win,KNeighborsClassifier(n_jobs=-1),37,"[A_1_pos_loss_missing, A_1_pos_win_missing, A_...",5,8.827066,13.55855,0.755101,0.62383


With untuned models, best performance on men's test data is __70.5% acc__ using a Logistic Regression with the non-diff features.

#### Women's

In [101]:
# define regression models
models = [LogisticRegression(n_jobs=-1, random_state=SEED), RandomForestClassifier(n_jobs=-1), XGBClassifier(n_jobs=-1), SVC(), KNeighborsClassifier(n_jobs=-1)]
datasets = [wfcomp, wfcomp_diff]

# run regression models
for model in models:
    for dataset in datasets:
        # run model
        run_model(estimator=model, data=dataset, regression=False, models_df=class_df, tournament='W')

In [102]:
# inspect
class_df.tail(10)

Unnamed: 0,Tournament,Label,Model,Num_Features,Features,Num_CV_Folds,Train_LogLoss,Test_LogLoss,Train_Acc,Test_Acc
10,W,A_Win,"LogisticRegression(n_jobs=-1, random_state=0)",62,"[A_1_pos_game_ratio, A_1_pos_loss_missing, A_1...",5,7.251991,7.969097,0.7988,0.778904
11,W,A_Win,"LogisticRegression(n_jobs=-1, random_state=0)",37,"[A_1_pos_loss_missing, A_1_pos_win_missing, A_...",5,7.496748,7.718376,0.792009,0.78586
12,W,A_Win,"(DecisionTreeClassifier(max_features='sqrt', r...",62,"[A_1_pos_game_ratio, A_1_pos_loss_missing, A_1...",5,2.220446e-16,8.538185,1.0,0.763115
13,W,A_Win,"(DecisionTreeClassifier(max_features='sqrt', r...",37,"[A_1_pos_loss_missing, A_1_pos_win_missing, A_...",5,2.220446e-16,8.515085,1.0,0.763756
14,W,A_Win,"XGBClassifier(base_score=0.5, booster='gbtree'...",62,"[A_1_pos_game_ratio, A_1_pos_loss_missing, A_1...",5,2.220446e-16,8.971118,1.0,0.751104
15,W,A_Win,"XGBClassifier(base_score=0.5, booster='gbtree'...",37,"[A_1_pos_loss_missing, A_1_pos_win_missing, A_...",5,2.220446e-16,9.038835,1.0,0.749225
16,W,A_Win,SVC(),62,"[A_1_pos_game_ratio, A_1_pos_loss_missing, A_1...",5,6.398155,8.333161,0.822489,0.768804
17,W,A_Win,SVC(),37,"[A_1_pos_loss_missing, A_1_pos_win_missing, A_...",5,6.973093,8.720253,0.806538,0.758064
18,W,A_Win,KNeighborsClassifier(n_jobs=-1),62,"[A_1_pos_game_ratio, A_1_pos_loss_missing, A_1...",5,7.217813,10.929605,0.799748,0.696768
19,W,A_Win,KNeighborsClassifier(n_jobs=-1),37,"[A_1_pos_loss_missing, A_1_pos_win_missing, A_...",5,7.246212,10.700833,0.79896,0.703115


With untuned models, best performance on wommen's test data is __78.6% acc__ using a Logistic Regression with the diff features.

In [103]:
# save reg_df
class_df.to_csv('models/classification_models.csv', index=False)