In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

import os

import march_madness as mm
import random

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import PolynomialFeatures

In [2]:
analysis = mm.Analysis()

In [3]:
analysis.load_training_data('MDataFiles_Stage2/calculated features 1985-2021.csv')

In [4]:
analysis.load_validation_data('MDataFiles_Stage2/calculated features 2022.csv')

# Overview

This notebook tests all the models developed for phase 1 of my model devlopepment against the actional results of the 2022 Tournament.

The detailed process of how I developed these models can be found here. TODO: insert hyperlink

# Notable Results

* The predictions based solely on the historical win percentages of each seed-to-seed match-up performed remarkably well with a competition score of 0.60497. That score would've placed 52nd out 930 teams.
* The Logistic Regression model appeared to generalize best with fewer features. Its performance got worse as more features were added its best result was actually using the pruned feature set. The score of 0.61613 using the pruned feature set would've been good for 143rd.
* The Random Forest model (max_features controlled) performed batter as more features were added. This model performed worse on the training data but actually performed better in the 2022 data. Using all features, this model scored 0.63989 for 342nd.
* The Random Forest models that we controlled using 'max_depth' performed better across the board. The final model using all the features scored 0.60753 which would've placed 70th.

# Baseline Cases

## 50/50 Predictions

In [5]:
feature_keys = []

In [6]:
X_train, y_train = analysis.extract_training_data(feature_keys=feature_keys)

In [7]:
X_test, y_test = analysis.extract_validation_data(feature_keys=feature_keys)

In [8]:
naive_predictions = np.ones(y_test.shape) * 0.5

In [9]:
analysis.score_model_predictions(y_test,naive_predictions)

0.69315

## Seed Win Percentages

In [10]:
feature_keys = ['tourney seed']

In [11]:
X_train, y_train = analysis.extract_training_data(feature_keys=feature_keys)

In [12]:
X_test, y_test = analysis.extract_validation_data(feature_keys=feature_keys)

In [13]:
seed_predictions = analysis.generate_seed_win_predictions(X_test)

In [14]:
analysis.score_model_predictions(y_test,seed_predictions)

0.60497

## Seed Only (As Continuous Value)

In [15]:
feature_keys = ['tourney seed']

In [16]:
X_train, y_train = analysis.extract_training_data(feature_keys=feature_keys)

In [17]:
X_test, y_test = analysis.extract_validation_data(feature_keys=feature_keys)

In [18]:
logreg = LogisticRegression().fit(X_train,y_train)

In [19]:
logreg.score(X_train,y_train)

0.7129909365558912

In [20]:
logreg.score(X_test,y_test)

0.6567164179104478

In [21]:
logreg_pred = mm.bound_predictions(logreg.predict_proba(X_test)[:,1])

In [22]:
analysis.score_model_predictions(y_test,logreg_pred)

0.62572

# Model Development

## RPI (No Seed)

In [23]:
feature_keys = ['weighted win pct','owp','oowp']

In [24]:
X_train, y_train = analysis.extract_training_data(feature_keys=feature_keys)

In [25]:
X_test, y_test = analysis.extract_validation_data(feature_keys=feature_keys)

In [26]:
logreg = LogisticRegression().fit(X_train,y_train)

In [27]:
logreg.score(X_train,y_train)

0.7060854553301683

In [28]:
logreg.score(X_test,y_test)

0.6417910447761194

In [29]:
forest = RandomForestClassifier(n_estimators=100,max_features=3,random_state=100).fit(X_train,y_train)

In [30]:
forest.score(X_train,y_train)

1.0

In [31]:
forest.score(X_test,y_test)

0.6119402985074627

In [32]:
logreg_pred = mm.bound_predictions(logreg.predict_proba(X_test)[:,1])

In [33]:
forest_pred = mm.bound_predictions(forest.predict_proba(X_test)[:,1])

In [34]:
analysis.score_model_predictions(y_test,logreg_pred)

0.62033

In [35]:
analysis.score_model_predictions(y_test,forest_pred)

0.66694

## RPI + Seed

In [36]:
feature_keys = ['tourney seed','weighted win pct','owp','oowp']

In [37]:
X_train, y_train = analysis.extract_training_data(feature_keys=feature_keys)

In [38]:
X_test, y_test = analysis.extract_validation_data(feature_keys=feature_keys)

In [39]:
logreg = LogisticRegression().fit(X_train,y_train)

In [40]:
logreg.score(X_train,y_train)

0.7069486404833837

In [41]:
logreg.score(X_test,y_test)

0.6567164179104478

In [42]:
forest = RandomForestClassifier(n_estimators=100,max_features=3,random_state=100).fit(X_train,y_train)

In [43]:
forest.score(X_train,y_train)

1.0

In [44]:
forest.score(X_test,y_test)

0.582089552238806

In [45]:
logreg_pred = mm.bound_predictions(logreg.predict_proba(X_test)[:,1])

In [46]:
forest_pred = mm.bound_predictions(forest.predict_proba(X_test)[:,1])

In [47]:
analysis.score_model_predictions(y_test,logreg_pred)

0.63246

In [48]:
analysis.score_model_predictions(y_test,forest_pred)

0.67231

## Margin of Victory

In [49]:
feature_keys = ['tourney seed','weighted win pct','owp','oowp','avg win margin','std win margin','avg loss margin',
                'std loss margin','capped avg win margin','capped std win margin','capped avg loss margin',
                'capped std loss margin','close wins','close losses']

In [50]:
X_train, y_train = analysis.extract_training_data(feature_keys=feature_keys)

In [51]:
X_test, y_test = analysis.extract_validation_data(feature_keys=feature_keys)

In [52]:
logreg = LogisticRegression().fit(X_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [53]:
logreg.score(X_train,y_train)

0.7112645662494606

In [54]:
logreg.score(X_test,y_test)

0.6417910447761194

In [55]:
forest = RandomForestClassifier(n_estimators=100,max_features=3,random_state=100).fit(X_train,y_train)

In [56]:
forest.score(X_train,y_train)

1.0

In [57]:
forest.score(X_test,y_test)

0.7164179104477612

In [58]:
logreg_pred = mm.bound_predictions(logreg.predict_proba(X_test)[:,1])

In [59]:
forest_pred = mm.bound_predictions(forest.predict_proba(X_test)[:,1])

In [60]:
analysis.score_model_predictions(y_test,logreg_pred)

0.67942

In [61]:
analysis.score_model_predictions(y_test,forest_pred)

0.63465

## Quality Wins

In [62]:
feature_keys = ['tourney seed','weighted win pct','owp','oowp','avg win margin','std win margin','avg loss margin',
                'std loss margin','capped avg win margin','capped std win margin','capped avg loss margin',
                'capped std loss margin','close wins','close losses','weighted top64 wins','weighted top32 wins',
                'weighted top16 wins','weighted top8 wins','weighted top64 losses','weighted top32 losses',
                'weighted top16 losses','weighted top8 losses']

In [63]:
X_train, y_train = analysis.extract_training_data(feature_keys=feature_keys)

In [64]:
X_test, y_test = analysis.extract_validation_data(feature_keys=feature_keys)

In [65]:
logreg = LogisticRegression().fit(X_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [66]:
logreg.score(X_train,y_train)

0.7293914544669832

In [67]:
logreg.score(X_test,y_test)

0.6417910447761194

In [68]:
forest = RandomForestClassifier(n_estimators=1000,max_features=3,random_state=100).fit(X_train,y_train)

In [69]:
forest.score(X_train,y_train)

1.0

In [70]:
forest.score(X_test,y_test)

0.6268656716417911

In [71]:
logreg_pred = mm.bound_predictions(logreg.predict_proba(X_test)[:,1])

In [72]:
forest_pred = mm.bound_predictions(forest.predict_proba(X_test)[:,1])

In [73]:
analysis.score_model_predictions(y_test,logreg_pred)

0.68977

In [74]:
analysis.score_model_predictions(y_test,forest_pred)

0.64788

## Late Season Form

In [75]:
feature_keys = ['tourney seed','weighted win pct','owp','oowp','avg win margin','std win margin','avg loss margin',
                'std loss margin','capped avg win margin','capped std win margin','capped avg loss margin',
                'capped std loss margin','close wins','close losses','weighted top64 wins','weighted top32 wins',
                'weighted top16 wins','weighted top8 wins','weighted top64 losses','weighted top32 losses',
                'weighted top16 losses','weighted top8 losses','last10 win pct','last10 weighted win pct',
                'last5 win pct','last5 weighted win pct','conference tourney wins','conference champ']

In [76]:
X_train, y_train = analysis.extract_training_data(feature_keys=feature_keys)

In [77]:
X_test, y_test = analysis.extract_validation_data(feature_keys=feature_keys)

In [78]:
logreg = LogisticRegression().fit(X_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [79]:
logreg.score(X_train,y_train)

0.7285282693137678

In [80]:
logreg.score(X_test,y_test)

0.6417910447761194

In [81]:
forest = RandomForestClassifier(n_estimators=1000,max_features=3,random_state=100).fit(X_train,y_train)

In [82]:
forest.score(X_train,y_train)

1.0

In [83]:
forest.score(X_test,y_test)

0.6268656716417911

In [84]:
logreg_pred = mm.bound_predictions(logreg.predict_proba(X_test)[:,1])

In [85]:
forest_pred = mm.bound_predictions(forest.predict_proba(X_test)[:,1])

In [86]:
analysis.score_model_predictions(y_test,logreg_pred)

0.67371

In [87]:
analysis.score_model_predictions(y_test,forest_pred)

0.6399

## Pruned Feature Set

In [88]:
feature_keys = ['tourney seed','weighted win pct','owp','oowp','capped avg win margin','capped std win margin',
                'capped avg loss margin','capped std loss margin','close wins','close losses','weighted top64 wins',
                'weighted top16 wins','weighted top64 losses','weighted top16 losses','last10 weighted win pct',
                'last5 weighted win pct','conference tourney wins','conference champ']

In [89]:
X_train, y_train = analysis.extract_training_data(feature_keys=feature_keys)

In [90]:
X_test, y_test = analysis.extract_validation_data(feature_keys=feature_keys)

In [91]:
logreg = LogisticRegression().fit(X_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [92]:
logreg.score(X_train,y_train)

0.7229175658178679

In [93]:
logreg.score(X_test,y_test)

0.6716417910447762

In [94]:
max_features = round(np.sqrt(len(feature_keys)))
max_features

4

In [95]:
forest = RandomForestClassifier(n_estimators=1000,max_features=3,random_state=100).fit(X_train,y_train)

In [96]:
forest.score(X_train,y_train)

1.0

In [97]:
forest.score(X_test,y_test)

0.6567164179104478

In [98]:
logreg_pred = mm.bound_predictions(logreg.predict_proba(X_test)[:,1])

In [99]:
forest_pred = mm.bound_predictions(forest.predict_proba(X_test)[:,1])

In [100]:
analysis.score_model_predictions(y_test,logreg_pred)

0.61613

In [101]:
analysis.score_model_predictions(y_test,forest_pred)

0.61972

# Random Forest - Depth Limit Models

## RPI (No Seed)

In [102]:
feature_keys = ['weighted win pct','owp','oowp']

In [103]:
X_train, y_train = analysis.extract_training_data(feature_keys=feature_keys)

In [104]:
X_test, y_test = analysis.extract_validation_data(feature_keys=feature_keys)

In [105]:
forest = RandomForestClassifier(n_estimators=1000,max_features='sqrt',max_depth=5,random_state=100).fit(X_train,y_train)

In [106]:
forest.score(X_train,y_train)

0.736296935692706

In [107]:
forest.score(X_test,y_test)

0.6268656716417911

In [108]:
forest_pred = mm.bound_predictions(forest.predict_proba(X_test)[:,1])

In [109]:
analysis.score_model_predictions(y_test,forest_pred)

0.58884

## RPI + Seed

In [110]:
feature_keys = ['tourney seed','weighted win pct','owp','oowp']

In [111]:
X_train, y_train = analysis.extract_training_data(feature_keys=feature_keys)

In [112]:
X_test, y_test = analysis.extract_validation_data(feature_keys=feature_keys)

In [113]:
forest = RandomForestClassifier(n_estimators=1000,max_features='sqrt',max_depth=5,random_state=100).fit(X_train,y_train)

In [114]:
forest.score(X_train,y_train)

0.7457919723780752

In [115]:
forest.score(X_test,y_test)

0.6716417910447762

In [116]:
forest_pred = mm.bound_predictions(forest.predict_proba(X_test)[:,1])

In [117]:
analysis.score_model_predictions(y_test,forest_pred)

0.59257

## Margin of Victory

In [118]:
feature_keys = ['tourney seed','weighted win pct','owp','oowp','avg win margin','std win margin','avg loss margin',
                'std loss margin','capped avg win margin','capped std win margin','capped avg loss margin',
                'capped std loss margin','close wins','close losses']

In [119]:
X_train, y_train = analysis.extract_training_data(feature_keys=feature_keys)

In [120]:
X_test, y_test = analysis.extract_validation_data(feature_keys=feature_keys)

In [121]:
forest = RandomForestClassifier(n_estimators=1000,max_features='sqrt',max_depth=5,random_state=100).fit(X_train,y_train)

In [122]:
forest.score(X_train,y_train)

0.7643504531722054

In [123]:
forest.score(X_test,y_test)

0.6417910447761194

In [124]:
forest_pred = mm.bound_predictions(forest.predict_proba(X_test)[:,1])

In [125]:
analysis.score_model_predictions(y_test,forest_pred)

0.61524

## Quality Wins

In [126]:
feature_keys = ['tourney seed','weighted win pct','owp','oowp','avg win margin','std win margin','avg loss margin',
                'std loss margin','capped avg win margin','capped std win margin','capped avg loss margin',
                'capped std loss margin','close wins','close losses','weighted top64 wins','weighted top32 wins',
                'weighted top16 wins','weighted top8 wins','weighted top64 losses','weighted top32 losses',
                'weighted top16 losses','weighted top8 losses']

In [127]:
X_train, y_train = analysis.extract_training_data(feature_keys=feature_keys)

In [128]:
X_test, y_test = analysis.extract_validation_data(feature_keys=feature_keys)

In [129]:
forest = RandomForestClassifier(n_estimators=1000,max_features='sqrt',max_depth=5,random_state=100).fit(X_train,y_train)

In [130]:
forest.score(X_train,y_train)

0.7686663789382823

In [131]:
forest.score(X_test,y_test)

0.6417910447761194

In [132]:
forest_pred = mm.bound_predictions(forest.predict_proba(X_test)[:,1])

In [133]:
analysis.score_model_predictions(y_test,forest_pred)

0.6119

## Late Season Form

In [134]:
feature_keys = ['tourney seed','weighted win pct','owp','oowp','avg win margin','std win margin','avg loss margin',
                'std loss margin','capped avg win margin','capped std win margin','capped avg loss margin',
                'capped std loss margin','close wins','close losses','weighted top64 wins','weighted top32 wins',
                'weighted top16 wins','weighted top8 wins','weighted top64 losses','weighted top32 losses',
                'weighted top16 losses','weighted top8 losses','last10 win pct','last10 weighted win pct',
                'last5 win pct','last5 weighted win pct','conference tourney wins','conference champ']

In [135]:
X_train, y_train = analysis.extract_training_data(feature_keys=feature_keys)

In [136]:
X_test, y_test = analysis.extract_validation_data(feature_keys=feature_keys)

In [137]:
forest = RandomForestClassifier(n_estimators=1000,max_features='sqrt',max_depth=5,random_state=100).fit(X_train,y_train)

In [138]:
forest.score(X_train,y_train)

0.7734138972809668

In [139]:
forest.score(X_test,y_test)

0.6268656716417911

In [140]:
forest_pred = mm.bound_predictions(forest.predict_proba(X_test)[:,1])

In [141]:
analysis.score_model_predictions(y_test,forest_pred)

0.60753