# March Machine Learning Mania
In this notebook, we:
- Attempt to predict the number of upsets in a following round based on the number of upsets that have occurred so far in a NCAA tournament.

## Imports

In [30]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from gc import collect
import os
import sys
from tqdm import tqdm

from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, KFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.svm import SVR, SVC
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.neural_network import MLPRegressor, MLPClassifier
from sklearn.metrics import mean_squared_error, r2_score, log_loss, accuracy_score, confusion_matrix, classification_report
from xgboost import XGBRegressor, XGBClassifier

# display 100 rows and 100 columns
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)

# global random seed
SEED = 0

# set numpy seed
np.random.seed(SEED)

## Load Data

In [19]:
# root dirs
root = 'data/'
mroot = 'data/mens/'
wroot = 'data/womens/'

# load in features compact
upsets = pd.read_csv(root + 'processed/upsets.csv').sort_values(['Season', 'Round'])

# check
upsets.head(12)

Unnamed: 0,Season,Round,Pct_upsets,Avg_seed_diff,Std_seed_diff,Avg_seed,Std_seed,Tournament
0,2003,1,0.25,-3.75,3.011881,8.5,4.609772,M
1,2003,2,0.375,-4.333333,3.204164,5.4375,3.445445,M
2,2003,3,0.25,-1.0,0.0,4.1875,3.186274,M
3,2003,4,0.75,-1.666667,0.57735,2.375,1.932453,M
4,2003,5,0.5,-2.0,0.0,2.25,0.829156,M
5,2003,6,1.0,-1.0,0.0,2.5,0.5,M
6,2004,1,0.125,-4.5,3.0,8.5,4.609772,M
7,2004,2,0.4375,-4.714286,3.093773,5.0625,3.071416,M
8,2004,3,0.25,-3.5,0.707107,4.5625,2.691857,M
9,2004,4,0.25,-1.0,0.0,3.5,2.5,M


## Create Labels

In [20]:
# label is pct of upsets in next round
upsets['target'] = upsets['Pct_upsets'].shift(-1)

# check
upsets.head(12)

Unnamed: 0,Season,Round,Pct_upsets,Avg_seed_diff,Std_seed_diff,Avg_seed,Std_seed,Tournament,target
0,2003,1,0.25,-3.75,3.011881,8.5,4.609772,M,0.375
1,2003,2,0.375,-4.333333,3.204164,5.4375,3.445445,M,0.25
2,2003,3,0.25,-1.0,0.0,4.1875,3.186274,M,0.75
3,2003,4,0.75,-1.666667,0.57735,2.375,1.932453,M,0.5
4,2003,5,0.5,-2.0,0.0,2.25,0.829156,M,1.0
5,2003,6,1.0,-1.0,0.0,2.5,0.5,M,0.125
6,2004,1,0.125,-4.5,3.0,8.5,4.609772,M,0.4375
7,2004,2,0.4375,-4.714286,3.093773,5.0625,3.071416,M,0.25
8,2004,3,0.25,-3.5,0.707107,4.5625,2.691857,M,0.25
9,2004,4,0.25,-1.0,0.0,3.5,2.5,M,1.0


In [21]:
# drop round 6 rows (no target)
upsets = upsets[upsets['Round'] != 6]

# check
upsets.head(10)

Unnamed: 0,Season,Round,Pct_upsets,Avg_seed_diff,Std_seed_diff,Avg_seed,Std_seed,Tournament,target
0,2003,1,0.25,-3.75,3.011881,8.5,4.609772,M,0.375
1,2003,2,0.375,-4.333333,3.204164,5.4375,3.445445,M,0.25
2,2003,3,0.25,-1.0,0.0,4.1875,3.186274,M,0.75
3,2003,4,0.75,-1.666667,0.57735,2.375,1.932453,M,0.5
4,2003,5,0.5,-2.0,0.0,2.25,0.829156,M,1.0
6,2004,1,0.125,-4.5,3.0,8.5,4.609772,M,0.4375
7,2004,2,0.4375,-4.714286,3.093773,5.0625,3.071416,M,0.25
8,2004,3,0.25,-3.5,0.707107,4.5625,2.691857,M,0.25
9,2004,4,0.25,-1.0,0.0,3.5,2.5,M,1.0
10,2004,5,1.0,-1.0,0.0,2.0,0.707107,M,0.0


## Predictions

In [24]:
# split data
mupsets = upsets[upsets['Tournament'] == 'M']
wupsets = upsets[upsets['Tournament'] == 'W']

### Data Preprocessing

In [36]:
def run_model(tournament, estimator, data, features, models_df, scaler=None, folds=5):
    """
    Run a model on data and save results to models_df.

    Parameters
    ----------
    tournament : str
        'M' or 'W'.
    estimator : sklearn estimator
        Estimator to use for modeling.
    data : pd.DataFrame
        Data to model.
    features : list
        Feature subset to use for modeling.
    models_df : pd.DataFrame
        DataFrame to save results to.
    scaler : sklearn scaler, optional
        Scaler to use for data. Default is None.
    folds : int
        Number of cross-validation folds to use.

    Returns
    -------
    None
    """
    
    # create copy to avoid modification
    data = data.copy()

    # drop unused cols
    data = data.drop(columns=['Season', 'Tournament'])

    # define cross-validation
    kf = KFold(n_splits=folds, shuffle=True, random_state=SEED)

    # define X and y
    X = data[features]
    y = data['target']

    # initialize lists to store metrics
    rmse_scores_train = []
    rmse_scores_test = []
    r2_scores_train = []
    r2_scores_test = []
    acc_scores_train = []
    acc_scores_test = []

    for train_index, test_index in kf.split(X):
        # split data
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # scale data
        if scaler:
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.transform(X_test)

        # fit model
        model = estimator.fit(X_train, y_train)

        # make predictions
        train_preds = model.predict(X_train)
        test_preds = model.predict(X_test)

        # calculate metrics
        rmse_scores_train.append(np.sqrt(mean_squared_error(y_train, train_preds)))
        rmse_scores_test.append(np.sqrt(mean_squared_error(y_test, test_preds)))
        r2_scores_train.append(r2_score(y_train, train_preds))
        r2_scores_test.append(r2_score(y_test, test_preds))

    # save results
    models_df.loc[len(models_df.index)] = [tournament, estimator, scaler, features, folds, np.mean(rmse_scores_train), np.mean(rmse_scores_test), np.mean(r2_scores_train), np.mean(r2_scores_test)]

    # delete variables
    del data, X, y, kf, X_train, X_test, y_train, y_test, model, train_preds, test_preds
    
    return

### Men's

In [37]:
# create a df to hold performance metrics
upset_preds = pd.DataFrame(columns=['Tournament', 'Model', 'Scaler', 'Features', 'Num_CV_Folds', 'Train_RMSE', 'Val_RMSE', 'Train_R2', 'Val_R2'])

# load upset_preds df
# upset_preds = pd.read_csv('models/upset_preds.csv')

In [38]:
# define regression models
models = [LinearRegression(n_jobs=-1), RandomForestRegressor(n_jobs=-1), XGBRegressor(n_jobs=-1), SVR(), KNeighborsRegressor(n_jobs=-1)]
scalers = [None, StandardScaler(), MinMaxScaler()]
feature_subsets = [['Round', 'Pct_upsets', 'Avg_seed_diff', 'Std_seed_diff', 'Avg_seed', 'Std_seed'], ['Round', 'Pct_upsets', 'Avg_seed_diff', 'Std_seed_diff']]

# run regression models
for model in tqdm(models, desc='Model', file=sys.stdout):
    for scaler in scalers:
        for features in feature_subsets:
            for fold in [3, 6, 9]:
                # run model
                run_model(tournament='M', estimator=model, data=mupsets, features=features, models_df=upset_preds, scaler=scaler, folds=fold)

Model: 100%|██████████| 5/5 [00:34<00:00,  6.94s/it]


In [39]:
# inspect
upset_preds[upset_preds['Tournament'] == 'M'].sort_values(by='Val_RMSE', ascending=False).head()

Unnamed: 0,Tournament,Model,Scaler,Features,Num_CV_Folds,Train_RMSE,Val_RMSE,Train_R2,Val_R2
70,M,SVR(),MinMaxScaler(),"[Pct_upsets, Avg_seed_diff, Std_seed_diff]",6,0.250775,0.300734,0.013023,-0.633954
64,M,SVR(),StandardScaler(),"[Pct_upsets, Avg_seed_diff, Std_seed_diff]",6,0.245539,0.298828,0.053546,-0.609231
69,M,SVR(),MinMaxScaler(),"[Pct_upsets, Avg_seed_diff, Std_seed_diff]",3,0.245441,0.291431,0.057558,-0.352699
71,M,SVR(),MinMaxScaler(),"[Pct_upsets, Avg_seed_diff, Std_seed_diff]",9,0.252381,0.291012,0.004603,-0.558221
58,M,SVR(),,"[Pct_upsets, Avg_seed_diff, Std_seed_diff]",6,0.262521,0.288065,-0.080753,-0.453554


In [40]:
upset_preds

Unnamed: 0,Tournament,Model,Scaler,Features,Num_CV_Folds,Train_RMSE,Val_RMSE,Train_R2,Val_R2
0,M,LinearRegression(n_jobs=-1),,"[Pct_upsets, Avg_seed_diff, Std_seed_diff, Avg...",3,0.243946,0.2683,0.069308,-0.133886
1,M,LinearRegression(n_jobs=-1),,"[Pct_upsets, Avg_seed_diff, Std_seed_diff, Avg...",6,0.246183,0.263354,0.052075,-0.199196
2,M,LinearRegression(n_jobs=-1),,"[Pct_upsets, Avg_seed_diff, Std_seed_diff, Avg...",9,0.247018,0.254722,0.047425,-0.149606
3,M,LinearRegression(n_jobs=-1),,"[Pct_upsets, Avg_seed_diff, Std_seed_diff]",3,0.248812,0.259653,0.031906,-0.062198
4,M,LinearRegression(n_jobs=-1),,"[Pct_upsets, Avg_seed_diff, Std_seed_diff]",6,0.24945,0.259144,0.026765,-0.156977
5,M,LinearRegression(n_jobs=-1),,"[Pct_upsets, Avg_seed_diff, Std_seed_diff]",9,0.250169,0.250284,0.022919,-0.127606
6,M,LinearRegression(n_jobs=-1),StandardScaler(),"[Pct_upsets, Avg_seed_diff, Std_seed_diff, Avg...",3,0.243946,0.2683,0.069308,-0.133886
7,M,LinearRegression(n_jobs=-1),StandardScaler(),"[Pct_upsets, Avg_seed_diff, Std_seed_diff, Avg...",6,0.246183,0.263354,0.052075,-0.199196
8,M,LinearRegression(n_jobs=-1),StandardScaler(),"[Pct_upsets, Avg_seed_diff, Std_seed_diff, Avg...",9,0.247018,0.254722,0.047425,-0.149606
9,M,LinearRegression(n_jobs=-1),StandardScaler(),"[Pct_upsets, Avg_seed_diff, Std_seed_diff]",3,0.248812,0.259653,0.031906,-0.062198


### Women's

In [None]:
# run regression models
for model in tqdm(models, desc='Model', file=sys.stdout):
    for scaler in scalers:
        for features in feature_subsets:
            for fold in [3, 6, 9]:
                # run model
                run_model(tournament='W', estimator=model, data=wupsets, features=features, models_df=upset_preds, scaler=scaler, folds=fold)

Model: 100%|██████████| 5/5 [00:35<00:00,  7.00s/it]


In [None]:
# inspect
upset_preds[upset_preds['Tournament'] == 'W'].sort_values(by='Val_RMSE', ascending=False).head()

Unnamed: 0,Tournament,Label,Model,Num_Features,Features,Num_CV_Folds,Train_R2,Val_R2,Train_RMSE,Val_RMSE,Train_Acc,Val_Acc,Val_r1_acc,Val_r2_acc,Val_r3_acc,Val_r4_acc,Val_r5_acc,Val_r6_acc
80,M,A_adj_score_diff,LinearRegression(n_jobs=-1),67,"['A_1_pos_game_ratio', 'A_1_pos_loss_missing',...",5,0.429856,0.239511,11.045665,11.701572,0.724928,0.70456,0.742861,0.690977,0.684622,0.560981,0.612601,0.636825
0,M,A_adj_score_diff,LinearRegression(n_jobs=-1),70,"['A_1_pos_game_ratio', 'A_1_pos_loss_missing',...",5,0.431068,0.232711,11.033918,11.749827,0.725868,0.702054,0.741293,0.677911,0.693672,0.568981,0.599267,0.670159
60,M,A_adj_score_diff,LinearRegression(n_jobs=-1),69,"['A_1_pos_game_ratio', 'A_1_pos_loss_missing',...",5,0.430107,0.235875,11.043223,11.729381,0.724301,0.702054,0.740778,0.681194,0.690814,0.554085,0.612601,0.670159
20,M,A_adj_score_diff,LinearRegression(n_jobs=-1),68,"['A_1_pos_game_ratio', 'A_1_pos_loss_missing',...",5,0.430615,0.232794,11.03822,11.750188,0.724405,0.699541,0.741371,0.679361,0.683224,0.552981,0.568498,0.670159
40,M,A_adj_score_diff,LinearRegression(n_jobs=-1),67,"['A_1_pos_game_ratio', 'A_1_pos_loss_missing',...",5,0.426528,0.235529,11.078017,11.735523,0.723674,0.698284,0.739631,0.674414,0.689038,0.563176,0.570549,0.647937


In [None]:
# save reg_df
reg_df.to_csv('models/regression_models.csv', index=False)

- After manual model inspection, Linear Regression performed the best when we replaced the seed with seed_win_prob and used the team A/B data.