In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

import os
import time

import march_madness as mm
import random

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import PolynomialFeatures

In [2]:
# Project helper object: the next cells load the training and validation
# feature tables into it, and later cells pull model matrices back out of it.
analysis = mm.Analysis()

In [3]:
# Training table: pre-computed features for 1985-2021 (per the file name).
analysis.load_training_data('MDataFiles_Stage2/calculated features 1985-2021.csv')

In [4]:
# Held-out table: pre-computed features for 2022 (per the file name); it is
# extracted below as X_test / y_test.
analysis.load_validation_data('MDataFiles_Stage2/calculated features 2022.csv')

# Overview

This notebook tests the models I developed during the second phase of my work on this project (after official submissions were closed) against the actual results of the 2022 tournament.

A more detailed description of how I developed these models can be found here. TODO: insert hyperlink

# Notable Results

* The Random Forest performed better using the additional game information and features. Overall these models seemed to generalize quite well. 
* The Logistic Regression models actually performed worse than their results in Phase 1.
* The final Random Forest model using all the features scored 0.61260, which would've placed 126th out of 930 submissions.

# Phase 2 - Detailed Game Result Models

Phase 2 of the model training uses the 'detailed game results' for each game, which provide additional box score stats beyond the final score.

These stats included the following for both teams:
* FGM - field goals made
* FGA - field goals attempted
* FGM3 - three pointers made
* FGA3 - three pointers attempted
* FTM - free throws made
* FTA - free throws attempted
* OR - offensive rebounds
* DR - defensive rebounds
* Ast - assists
* TO - turnovers committed
* Stl - steals
* Blk - blocks
* PF - personal fouls committed

## Detailed Game Results

In [5]:
# Columns fed to the models in this section: 28 season-level descriptors,
# followed by the box-score averages, which share one stat list under the
# 'Team Avg' and 'Opp Avg' prefixes (74 columns in total).
feature_keys = [
    # seed and schedule strength
    'tourney seed', 'weighted win pct', 'owp', 'oowp',
    # scoring margins (raw and capped)
    'avg win margin', 'std win margin', 'avg loss margin', 'std loss margin',
    'capped avg win margin', 'capped std win margin',
    'capped avg loss margin', 'capped std loss margin',
    # close games and results against top-N teams
    'close wins', 'close losses',
    'weighted top64 wins', 'weighted top32 wins',
    'weighted top16 wins', 'weighted top8 wins',
    'weighted top64 losses', 'weighted top32 losses',
    'weighted top16 losses', 'weighted top8 losses',
    # recent form and conference tournament
    'last10 win pct', 'last10 weighted win pct',
    'last5 win pct', 'last5 weighted win pct',
    'conference tourney wins', 'conference champ',
    # box-score averages: all 'Team Avg ...' columns first, then 'Opp Avg ...'
    *[
        f'{prefix} {stat}'
        for prefix in ('Team Avg', 'Opp Avg')
        for stat in (
            'FGM', 'FGA', 'FGM3', 'FGA3', 'FTM', 'FTA', 'OR', 'DR', 'Ast',
            'TO%', 'Stl%', 'Blk%', 'PF', 'TR', 'FGM2', 'FGA2', 'FG%', 'FG2%',
            'FG3%', 'FGA3%', 'FT%', 'Pos', 'OEff',
        )
    ],
]

In [6]:
# Training inputs and labels for the columns listed in feature_keys
# (exact array layout is defined by mm.Analysis).
X_train, y_train = analysis.extract_training_data(feature_keys=feature_keys)

In [7]:
# 2022 validation inputs and labels for the same feature_keys columns.
X_test, y_test = analysis.extract_validation_data(feature_keys=feature_keys)

In [12]:
# Default-parameter logistic regression on the unscaled features.
# NOTE(review): the warning printed under this cell shows the solver stopped at
# its iteration limit, so the scores that follow come from a non-converged fit.
# Scaling the inputs or raising max_iter would address that, but would also
# change the reported numbers.
logreg = LogisticRegression().fit(X=X_train, y=y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [13]:
# Mean accuracy of the logistic regression on the training data.
logreg.score(X_train,y_train)

0.7367285282693138

In [14]:
# Mean accuracy of the logistic regression on the 2022 validation data.
logreg.score(X_test,y_test)

0.6119402985074627

In [15]:
# Large ensemble of shallow trees: 10,000 estimators capped at depth 5, each
# split considering sqrt(n_features) candidates. random_state pins the result;
# n_jobs=2 trains on two workers.
forest = RandomForestClassifier(
    n_estimators=10000,
    max_depth=5,
    max_features='sqrt',
    random_state=100,
    n_jobs=2,
).fit(X_train, y_train)

In [16]:
# Mean accuracy of the random forest on the training data.
forest.score(X_train,y_train)

0.778593008200259

In [17]:
# Mean accuracy of the random forest on the 2022 validation data.
forest.score(X_test,y_test)

0.6567164179104478

In [18]:
# Column 1 of predict_proba is the probability of the model's second class
# (classes_[1]). bound_predictions is a project helper — presumably it keeps
# the probabilities away from 0 and 1 before scoring; confirm in march_madness.
logreg_proba = logreg.predict_proba(X_test)
logreg_pred = mm.bound_predictions(logreg_proba[:, 1])

In [19]:
# Same post-processing as for the logistic regression: take the probability of
# classes_[1] and pass it through the project's bound_predictions helper
# (presumably a clip away from 0 and 1 — confirm in march_madness).
forest_proba = forest.predict_proba(X_test)
forest_pred = mm.bound_predictions(forest_proba[:, 1])

In [20]:
# Project scoring of the bounded logistic-regression probabilities against the
# 2022 outcomes; presumably the competition log loss (lower is better) — confirm
# in mm.Analysis.score_model_predictions.
analysis.score_model_predictions(y_test,logreg_pred)

0.718729905414014

In [21]:
# Project scoring of the bounded random-forest probabilities against the 2022
# outcomes; presumably the competition log loss (lower is better) — confirm in
# mm.Analysis.score_model_predictions.
analysis.score_model_predictions(y_test,forest_pred)

0.6078644035391965

## Detailed Game Results + Interactions

Additionally, the interactions between all of these features were added to the model. This appeared to help the Random Forest but actually hurt the Logistic Regression.

In [22]:
# Base columns for the interaction models. Start from the 28 season-level
# descriptors, then append the box-score averages, which use one shared stat
# list under the 'Team Avg' and 'Opp Avg' prefixes (74 columns in total).
feature_keys = [
    'tourney seed',
    'weighted win pct',
    'owp',
    'oowp',
    'avg win margin',
    'std win margin',
    'avg loss margin',
    'std loss margin',
    'capped avg win margin',
    'capped std win margin',
    'capped avg loss margin',
    'capped std loss margin',
    'close wins',
    'close losses',
    'weighted top64 wins',
    'weighted top32 wins',
    'weighted top16 wins',
    'weighted top8 wins',
    'weighted top64 losses',
    'weighted top32 losses',
    'weighted top16 losses',
    'weighted top8 losses',
    'last10 win pct',
    'last10 weighted win pct',
    'last5 win pct',
    'last5 weighted win pct',
    'conference tourney wins',
    'conference champ',
]
# All 'Team Avg ...' columns come first, then all 'Opp Avg ...' columns.
feature_keys += [
    prefix + ' ' + stat
    for prefix in ('Team Avg', 'Opp Avg')
    for stat in ('FGM', 'FGA', 'FGM3', 'FGA3', 'FTM', 'FTA', 'OR', 'DR',
                 'Ast', 'TO%', 'Stl%', 'Blk%', 'PF', 'TR', 'FGM2', 'FGA2',
                 'FG%', 'FG2%', 'FG3%', 'FGA3%', 'FT%', 'Pos', 'OEff')
]

In [23]:
# Fresh (not yet expanded) training inputs and labels for feature_keys; the
# polynomial expansion is applied to these in a later cell.
X_train, y_train = analysis.extract_training_data(feature_keys=feature_keys)

In [24]:
# Fresh (not yet expanded) 2022 validation inputs and labels for feature_keys.
X_test, y_test = analysis.extract_validation_data(feature_keys=feature_keys)

In [25]:
# Degree-2 expansion: adds a bias column, squares, and pairwise products to the
# original features.
poly = PolynomialFeatures(degree=2)

In [26]:
# Fit the expansion on the training matrix only, then apply that same fitted
# transformer to the validation matrix (transform, not fit_transform, so the
# held-out data never re-fits a preprocessing step).
# This cell overwrites its own inputs: re-running it would expand the already
# expanded matrices, so re-run the two extract cells first.
X_train = poly.fit_transform(X_train)
X_test = poly.transform(X_test)

In [31]:
# Default-parameter logistic regression on the polynomial-expanded features.
# NOTE(review): the warning printed under this cell shows the solver stopped at
# its iteration limit, so the scores that follow come from a non-converged fit.
# Scaling the inputs or raising max_iter would address that, but would also
# change the reported numbers.
logreg = LogisticRegression().fit(X=X_train, y=y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [32]:
# Mean accuracy of the logistic regression on the (polynomial-expanded) training data.
logreg.score(X_train,y_train)

0.7388864911523522

In [33]:
# Mean accuracy of the logistic regression on the (polynomial-expanded) 2022 validation data.
logreg.score(X_test,y_test)

0.5970149253731343

In [34]:
# Same forest configuration as in the other sections, now trained on the
# polynomial-expanded matrix: 10,000 trees, depth <= 5, sqrt(n_features)
# candidates per split, fixed seed, two workers.
forest = RandomForestClassifier(
    n_estimators=10000,
    max_depth=5,
    max_features='sqrt',
    random_state=100,
    n_jobs=2,
).fit(X_train, y_train)

In [35]:
# Mean accuracy of the random forest on the (polynomial-expanded) training data.
forest.score(X_train,y_train)

0.7980146741476046

In [36]:
# Mean accuracy of the random forest on the (polynomial-expanded) 2022 validation data.
forest.score(X_test,y_test)

0.6716417910447762

In [37]:
# Probability of classes_[1] from the logistic regression, passed through the
# project's bound_predictions helper (presumably a clip away from 0 and 1 —
# confirm in march_madness).
logreg_proba = logreg.predict_proba(X_test)
logreg_pred = mm.bound_predictions(logreg_proba[:, 1])

In [38]:
# Probability of classes_[1] from the random forest, passed through the
# project's bound_predictions helper (presumably a clip away from 0 and 1 —
# confirm in march_madness).
forest_proba = forest.predict_proba(X_test)
forest_pred = mm.bound_predictions(forest_proba[:, 1])

In [39]:
# Project scoring of the bounded logistic-regression probabilities against the
# 2022 outcomes; presumably log loss (lower is better) — confirm in mm.
analysis.score_model_predictions(y_test,logreg_pred)

0.7901949394505107

In [40]:
# Project scoring of the bounded random-forest probabilities against the 2022
# outcomes; presumably log loss (lower is better) — confirm in mm.
analysis.score_model_predictions(y_test,forest_pred)

0.6159448696704682

## Net Detailed Game Results

The last features I engineered were the 'net' statistics for all these parameters. These parameters can be interpreted as "how the team did compared to what the opponent generally allowed". For example, if an opponent allows an average of 5 three-pointers per game and the team made 10, that would be a net of +5 for that parameter in that game.

So on offense, these metrics capture how well the team did compared to what its opponents allow on average.

And on defense, they capture how well the team limited its opponents compared to how much those opponents normally produce.

'Net' parameters were calculated for all stats derived from the 'detailed' information and were the average difference for all of the team's games.

In [41]:
# Columns for the 'net' models: 28 season-level descriptors, followed by the
# box-score stat list repeated under four prefixes — the two raw averages and
# their 'Net' counterparts (28 + 4 * 23 = 120 columns).
feature_keys = [
    # seed and schedule strength
    'tourney seed', 'weighted win pct', 'owp', 'oowp',
    # scoring margins (raw and capped)
    'avg win margin', 'std win margin', 'avg loss margin', 'std loss margin',
    'capped avg win margin', 'capped std win margin',
    'capped avg loss margin', 'capped std loss margin',
    # close games and results against top-N teams
    'close wins', 'close losses',
    'weighted top64 wins', 'weighted top32 wins',
    'weighted top16 wins', 'weighted top8 wins',
    'weighted top64 losses', 'weighted top32 losses',
    'weighted top16 losses', 'weighted top8 losses',
    # recent form and conference tournament
    'last10 win pct', 'last10 weighted win pct',
    'last5 win pct', 'last5 weighted win pct',
    'conference tourney wins', 'conference champ',
    # box-score columns, grouped by prefix in this order
    *[
        f'{prefix} {stat}'
        for prefix in ('Team Avg', 'Opp Avg', 'Net Team Avg', 'Net Opp Avg')
        for stat in (
            'FGM', 'FGA', 'FGM3', 'FGA3', 'FTM', 'FTA', 'OR', 'DR', 'Ast',
            'TO%', 'Stl%', 'Blk%', 'PF', 'TR', 'FGM2', 'FGA2', 'FG%', 'FG2%',
            'FG3%', 'FGA3%', 'FT%', 'Pos', 'OEff',
        )
    ],
]

In [42]:
# Training inputs and labels for the extended feature_keys (raw + 'Net' columns).
X_train, y_train = analysis.extract_training_data(feature_keys=feature_keys)

In [43]:
# 2022 validation inputs and labels for the extended feature_keys (raw + 'Net' columns).
X_test, y_test = analysis.extract_validation_data(feature_keys=feature_keys)

In [48]:
# Default-parameter logistic regression on the unscaled raw + 'Net' features.
# NOTE(review): the warning printed under this cell shows the solver stopped at
# its iteration limit, so the scores that follow come from a non-converged fit.
# Scaling the inputs or raising max_iter would address that, but would also
# change the reported numbers.
logreg = LogisticRegression().fit(X=X_train, y=y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [49]:
# Mean accuracy of the logistic regression on the training data.
logreg.score(X_train,y_train)

0.7427708243418213

In [50]:
# Mean accuracy of the logistic regression on the 2022 validation data.
logreg.score(X_test,y_test)

0.5970149253731343

In [51]:
# Same forest configuration as in the other sections, trained on the raw +
# 'Net' columns: 10,000 trees, depth <= 5, sqrt(n_features) candidates per
# split, fixed seed, two workers.
forest = RandomForestClassifier(
    n_estimators=10000,
    max_depth=5,
    max_features='sqrt',
    random_state=100,
    n_jobs=2,
).fit(X_train, y_train)

In [52]:
# Mean accuracy of the random forest on the training data.
forest.score(X_train,y_train)

0.7820457488131204

In [53]:
# Mean accuracy of the random forest on the 2022 validation data.
forest.score(X_test,y_test)

0.6567164179104478

In [54]:
# Probability of classes_[1] from the logistic regression, passed through the
# project's bound_predictions helper (presumably a clip away from 0 and 1 —
# confirm in march_madness).
logreg_proba = logreg.predict_proba(X_test)
logreg_pred = mm.bound_predictions(logreg_proba[:, 1])

In [55]:
# Probability of classes_[1] from the random forest, passed through the
# project's bound_predictions helper (presumably a clip away from 0 and 1 —
# confirm in march_madness).
forest_proba = forest.predict_proba(X_test)
forest_pred = mm.bound_predictions(forest_proba[:, 1])

In [56]:
# Project scoring of the bounded logistic-regression probabilities against the
# 2022 outcomes; presumably log loss (lower is better) — confirm in mm.
analysis.score_model_predictions(y_test,logreg_pred)

0.7167180750918998

In [57]:
# Project scoring of the bounded random-forest probabilities against the 2022
# outcomes; presumably log loss (lower is better) — confirm in mm.
analysis.score_model_predictions(y_test,forest_pred)

0.6062899829821622

## Net Detailed Game Results + Interactions

Again, the interactions between all of these features were added to the model. This again helped the Random Forest and hurt the Logistic Regression.

In [58]:
# Base columns for the 'net' interaction models. Start from the 28 season-level
# descriptors, then append the box-score stat list under each of the four
# prefixes (raw and 'Net', team and opponent), for 120 columns in total.
feature_keys = [
    'tourney seed',
    'weighted win pct',
    'owp',
    'oowp',
    'avg win margin',
    'std win margin',
    'avg loss margin',
    'std loss margin',
    'capped avg win margin',
    'capped std win margin',
    'capped avg loss margin',
    'capped std loss margin',
    'close wins',
    'close losses',
    'weighted top64 wins',
    'weighted top32 wins',
    'weighted top16 wins',
    'weighted top8 wins',
    'weighted top64 losses',
    'weighted top32 losses',
    'weighted top16 losses',
    'weighted top8 losses',
    'last10 win pct',
    'last10 weighted win pct',
    'last5 win pct',
    'last5 weighted win pct',
    'conference tourney wins',
    'conference champ',
]
# Prefix order: 'Team Avg', 'Opp Avg', 'Net Team Avg', 'Net Opp Avg'.
feature_keys += [
    prefix + ' ' + stat
    for prefix in ('Team Avg', 'Opp Avg', 'Net Team Avg', 'Net Opp Avg')
    for stat in ('FGM', 'FGA', 'FGM3', 'FGA3', 'FTM', 'FTA', 'OR', 'DR',
                 'Ast', 'TO%', 'Stl%', 'Blk%', 'PF', 'TR', 'FGM2', 'FGA2',
                 'FG%', 'FG2%', 'FG3%', 'FGA3%', 'FT%', 'Pos', 'OEff')
]

In [59]:
# Fresh (not yet expanded) training inputs and labels for the extended
# feature_keys; the polynomial expansion is applied to these in a later cell.
X_train, y_train = analysis.extract_training_data(feature_keys=feature_keys)

In [60]:
# Fresh (not yet expanded) 2022 validation inputs and labels for the extended feature_keys.
X_test, y_test = analysis.extract_validation_data(feature_keys=feature_keys)

In [61]:
# Degree-2 expansion: adds a bias column, squares, and pairwise products to the
# original features.
poly = PolynomialFeatures(degree=2)

In [62]:
# Fit the expansion on the training matrix only, then apply that same fitted
# transformer to the validation matrix (transform, not fit_transform, so the
# held-out data never re-fits a preprocessing step).
# This cell overwrites its own inputs: re-running it would expand the already
# expanded matrices, so re-run the two extract cells first.
X_train = poly.fit_transform(X_train)
X_test = poly.transform(X_test)

In [67]:
# Default-parameter logistic regression on the polynomial-expanded raw + 'Net'
# features.
# NOTE(review): the warning printed under this cell shows the solver stopped at
# its iteration limit, so the scores that follow come from a non-converged fit.
# Scaling the inputs or raising max_iter would address that, but would also
# change the reported numbers.
logreg = LogisticRegression().fit(X=X_train, y=y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [68]:
# Mean accuracy of the logistic regression on the (polynomial-expanded) training data.
logreg.score(X_train,y_train)

0.743202416918429

In [69]:
# Mean accuracy of the logistic regression on the (polynomial-expanded) 2022 validation data.
logreg.score(X_test,y_test)

0.6119402985074627

In [70]:
# Same forest configuration as in the other sections, trained on the
# polynomial-expanded raw + 'Net' matrix: 10,000 trees, depth <= 5,
# sqrt(n_features) candidates per split, fixed seed, two workers.
forest = RandomForestClassifier(
    n_estimators=10000,
    max_depth=5,
    max_features='sqrt',
    random_state=100,
    n_jobs=2,
).fit(X_train, y_train)

In [71]:
# Mean accuracy of the random forest on the (polynomial-expanded) training data.
forest.score(X_train,y_train)

0.8118256365990505

In [72]:
# Mean accuracy of the random forest on the (polynomial-expanded) 2022 validation data.
forest.score(X_test,y_test)

0.6567164179104478

In [73]:
# Probability of classes_[1] from the logistic regression, passed through the
# project's bound_predictions helper (presumably a clip away from 0 and 1 —
# confirm in march_madness).
logreg_proba = logreg.predict_proba(X_test)
logreg_pred = mm.bound_predictions(logreg_proba[:, 1])

In [74]:
# Probability of classes_[1] from the random forest, passed through the
# project's bound_predictions helper (presumably a clip away from 0 and 1 —
# confirm in march_madness).
forest_proba = forest.predict_proba(X_test)
forest_pred = mm.bound_predictions(forest_proba[:, 1])

In [75]:
# Project scoring of the bounded logistic-regression probabilities against the
# 2022 outcomes; presumably log loss (lower is better) — confirm in mm.
analysis.score_model_predictions(y_test,logreg_pred)

0.7295417529811066

In [76]:
# Project scoring of the bounded random-forest probabilities against the 2022
# outcomes; presumably log loss (lower is better) — confirm in mm. This is the
# value rounded to 0.61260 in the "Notable Results" summary.
analysis.score_model_predictions(y_test,forest_pred)

0.6125969406878569