# March Machine Learning Mania
In this notebook, we:
- Attempt to predict the number of upsets in a following round based on the number of upsets that have occurred so far in a NCAA tournament.

## Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from gc import collect
import os
import sys
from tqdm import tqdm

from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, KFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.svm import SVR, SVC
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.neural_network import MLPRegressor, MLPClassifier
from sklearn.metrics import mean_squared_error, r2_score, log_loss, accuracy_score, confusion_matrix, classification_report
from xgboost import XGBRegressor, XGBClassifier

# display 100 rows and 100 columns
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)

# global random seed
SEED = 0

# set numpy seed
np.random.seed(SEED)

## Load Data

In [2]:
# root dirs
root = 'data/'
mroot = 'data/mens/'
wroot = 'data/womens/'

# load in features compact
upsets = pd.read_csv(root + 'processed/upsets.csv').sort_values(['Season', 'Round'])

# check
upsets.head(12)

Unnamed: 0,Season,Round,Pct_upsets,Avg_seed_diff,Std_seed_diff,Avg_seed,Std_seed,Tournament
0,2003,1,0.25,-3.75,3.011881,8.5,4.609772,M
1,2003,2,0.375,-4.333333,3.204164,5.4375,3.445445,M
2,2003,3,0.25,-1.0,0.0,4.1875,3.186274,M
3,2003,4,0.75,-1.666667,0.57735,2.375,1.932453,M
4,2003,5,0.5,-2.0,0.0,2.25,0.829156,M
5,2003,6,1.0,-1.0,0.0,2.5,0.5,M
6,2004,1,0.125,-4.5,3.0,8.5,4.609772,M
7,2004,2,0.4375,-4.714286,3.093773,5.0625,3.071416,M
8,2004,3,0.25,-3.5,0.707107,4.5625,2.691857,M
9,2004,4,0.25,-1.0,0.0,3.5,2.5,M


## Create Labels

In [3]:
# label is pct of upsets in next round
upsets['target'] = upsets['Pct_upsets'].shift(-1)

# check
upsets.head(12)

Unnamed: 0,Season,Round,Pct_upsets,Avg_seed_diff,Std_seed_diff,Avg_seed,Std_seed,Tournament,target
0,2003,1,0.25,-3.75,3.011881,8.5,4.609772,M,0.375
1,2003,2,0.375,-4.333333,3.204164,5.4375,3.445445,M,0.25
2,2003,3,0.25,-1.0,0.0,4.1875,3.186274,M,0.75
3,2003,4,0.75,-1.666667,0.57735,2.375,1.932453,M,0.5
4,2003,5,0.5,-2.0,0.0,2.25,0.829156,M,1.0
5,2003,6,1.0,-1.0,0.0,2.5,0.5,M,0.125
6,2004,1,0.125,-4.5,3.0,8.5,4.609772,M,0.4375
7,2004,2,0.4375,-4.714286,3.093773,5.0625,3.071416,M,0.25
8,2004,3,0.25,-3.5,0.707107,4.5625,2.691857,M,0.25
9,2004,4,0.25,-1.0,0.0,3.5,2.5,M,1.0


In [5]:
# drop round 6 rows (no target)
upsets = upsets[upsets['Round'] != 6]

# check
print(upsets.shape)
upsets.head(10)

(165, 9)


Unnamed: 0,Season,Round,Pct_upsets,Avg_seed_diff,Std_seed_diff,Avg_seed,Std_seed,Tournament,target
0,2003,1,0.25,-3.75,3.011881,8.5,4.609772,M,0.375
1,2003,2,0.375,-4.333333,3.204164,5.4375,3.445445,M,0.25
2,2003,3,0.25,-1.0,0.0,4.1875,3.186274,M,0.75
3,2003,4,0.75,-1.666667,0.57735,2.375,1.932453,M,0.5
4,2003,5,0.5,-2.0,0.0,2.25,0.829156,M,1.0
6,2004,1,0.125,-4.5,3.0,8.5,4.609772,M,0.4375
7,2004,2,0.4375,-4.714286,3.093773,5.0625,3.071416,M,0.25
8,2004,3,0.25,-3.5,0.707107,4.5625,2.691857,M,0.25
9,2004,4,0.25,-1.0,0.0,3.5,2.5,M,1.0
10,2004,5,1.0,-1.0,0.0,2.0,0.707107,M,0.0


Only 165 rows of data to train on may be problematic.

## Predictions

In [6]:
# split data
mupsets = upsets[upsets['Tournament'] == 'M']
wupsets = upsets[upsets['Tournament'] == 'W']

### Regression

In [7]:
def run_model(tournament, estimator, data, features, models_df, scaler=None, folds=5):
    """
    Run a model on data and save results to models_df.

    Parameters
    ----------
    tournament : str
        'M' or 'W'.
    estimator : sklearn estimator
        Estimator to use for modeling.
    data : pd.DataFrame
        Data to model.
    features : list
        Feature subset to use for modeling.
    models_df : pd.DataFrame
        DataFrame to save results to.
    scaler : sklearn scaler, optional
        Scaler to use for data. Default is None.
    folds : int
        Number of cross-validation folds to use.

    Returns
    -------
    None
    """
    
    # create copy to avoid modification
    data = data.copy()

    # drop unused cols
    data = data.drop(columns=['Season', 'Tournament'])

    # define cross-validation
    kf = KFold(n_splits=folds, shuffle=True, random_state=SEED)

    # define X and y
    X = data[features]
    y = data['target']

    # initialize lists to store metrics
    rmse_scores_train = []
    rmse_scores_test = []
    r2_scores_train = []
    r2_scores_test = []
    acc_scores_train = []
    acc_scores_test = []

    for train_index, test_index in kf.split(X):
        # split data
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # scale data
        if scaler:
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.transform(X_test)

        # fit model
        model = estimator.fit(X_train, y_train)

        # make predictions
        train_preds = model.predict(X_train)
        test_preds = model.predict(X_test)

        # calculate metrics
        rmse_scores_train.append(np.sqrt(mean_squared_error(y_train, train_preds)))
        rmse_scores_test.append(np.sqrt(mean_squared_error(y_test, test_preds)))
        r2_scores_train.append(r2_score(y_train, train_preds))
        r2_scores_test.append(r2_score(y_test, test_preds))

    # save results
    models_df.loc[len(models_df.index)] = [tournament, estimator, scaler, features, folds, np.mean(rmse_scores_train), np.mean(rmse_scores_test), np.mean(r2_scores_train), np.mean(r2_scores_test)]

    # delete variables
    del data, X, y, kf, X_train, X_test, y_train, y_test, model, train_preds, test_preds
    
    return

#### Men's

In [9]:
# create a df to hold performance metrics
upset_preds = pd.DataFrame(columns=['Tournament', 'Model', 'Scaler', 'Features', 'Num_CV_Folds', 'Train_RMSE', 'Val_RMSE', 'Train_R2', 'Val_R2'])

# load upset_preds df
# upset_preds = pd.read_csv('models/upset_preds.csv')

In [10]:
# define regression models
models = [LinearRegression(n_jobs=-1), RandomForestRegressor(n_jobs=-1), XGBRegressor(n_jobs=-1), SVR(), KNeighborsRegressor(n_jobs=-1)]
scalers = [None, StandardScaler(), MinMaxScaler()]
feature_subsets = [['Round', 'Pct_upsets', 'Avg_seed_diff', 'Std_seed_diff', 'Avg_seed', 'Std_seed'], ['Round', 'Pct_upsets', 'Avg_seed_diff', 'Std_seed_diff'], 
                   ['Pct_upsets', 'Avg_seed_diff', 'Std_seed_diff']]

# run regression models
for model in tqdm(models, desc='Model', file=sys.stdout):
    for scaler in scalers:
        for features in feature_subsets:
            for fold in [3, 6, 9]:
                # run model
                run_model(tournament='M', estimator=model, data=mupsets, features=features, models_df=upset_preds, scaler=scaler, folds=fold)

Model: 100%|██████████| 5/5 [00:53<00:00, 10.65s/it]


In [13]:
# inspect
upset_preds[upset_preds['Tournament'] == 'M'].sort_values(by='Val_RMSE', ascending=True).head()

Unnamed: 0,Tournament,Model,Scaler,Features,Num_CV_Folds,Train_RMSE,Val_RMSE,Train_R2,Val_R2
29,M,"(DecisionTreeRegressor(max_features='auto', ra...",,"[Round, Pct_upsets, Avg_seed_diff, Std_seed_di...",9,0.093053,0.218553,0.864548,0.015459
38,M,"(DecisionTreeRegressor(max_features='auto', ra...",StandardScaler(),"[Round, Pct_upsets, Avg_seed_diff, Std_seed_di...",9,0.092091,0.222024,0.867361,0.004838
28,M,"(DecisionTreeRegressor(max_features='auto', ra...",,"[Round, Pct_upsets, Avg_seed_diff, Std_seed_di...",6,0.093577,0.224051,0.862694,0.110532
37,M,"(DecisionTreeRegressor(max_features='auto', ra...",StandardScaler(),"[Round, Pct_upsets, Avg_seed_diff, Std_seed_di...",6,0.09332,0.224654,0.863552,0.109184
47,M,"(DecisionTreeRegressor(max_features='auto', ra...",MinMaxScaler(),"[Round, Pct_upsets, Avg_seed_diff, Std_seed_di...",9,0.092791,0.224742,0.865307,0.016473


#### Women's

In [14]:
# run regression models
for model in tqdm(models, desc='Model', file=sys.stdout):
    for scaler in scalers:
        for features in feature_subsets:
            for fold in [3, 6, 9]:
                # run model
                run_model(tournament='W', estimator=model, data=wupsets, features=features, models_df=upset_preds, scaler=scaler, folds=fold)

Model: 100%|██████████| 5/5 [00:51<00:00, 10.34s/it]


In [15]:
# inspect
upset_preds[upset_preds['Tournament'] == 'W'].sort_values(by='Val_RMSE', ascending=True).head()

Unnamed: 0,Tournament,Model,Scaler,Features,Num_CV_Folds,Train_RMSE,Val_RMSE,Train_R2,Val_R2
167,W,"(DecisionTreeRegressor(max_features='auto', ra...",,"[Round, Pct_upsets, Avg_seed_diff, Std_seed_diff]",9,0.152308,0.218834,0.693583,0.257741
185,W,"(DecisionTreeRegressor(max_features='auto', ra...",MinMaxScaler(),"[Round, Pct_upsets, Avg_seed_diff, Std_seed_diff]",9,0.152311,0.221223,0.69327,0.225921
184,W,"(DecisionTreeRegressor(max_features='auto', ra...",MinMaxScaler(),"[Round, Pct_upsets, Avg_seed_diff, Std_seed_diff]",6,0.152558,0.222341,0.69158,0.307839
176,W,"(DecisionTreeRegressor(max_features='auto', ra...",StandardScaler(),"[Round, Pct_upsets, Avg_seed_diff, Std_seed_diff]",9,0.152765,0.222888,0.691437,0.224161
166,W,"(DecisionTreeRegressor(max_features='auto', ra...",,"[Round, Pct_upsets, Avg_seed_diff, Std_seed_diff]",6,0.152648,0.224729,0.691697,0.291686


In [16]:
# save to csv
upset_preds.to_csv('models/upset_preds_reg.csv', index=False)