In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

import os
import time

import march_madness as mm
import random

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import PolynomialFeatures

In [2]:
analysis = mm.Analysis()

In [3]:
analysis.load_training_data('MDataFiles_Stage2/calculated features 1985-2021.csv')

# Phase 2 - Detailed Game Results

Phase 2 of the model training is using the 'detailed game results' for each game that gives additional box score stats beyond the final score of each game.

These stats included the follwing for both teams:
* FGM - field goals made
* FGA - field goals attempted
* FGM3 - three pointers made
* FGA3 - three pointers attempted
* FTM - free throws made
* FTA - free throws attempted
* OR - offensive rebounds
* DR - defensive rebounds
* Ast - assists
* TO - turnovers committed
* Stl - steals
* Blk - blocks
* PF - personal fouls committed

## Detailed Game Results

These statistics were then used to generate additional statistics such as offensive and defensive efficiency, various rates (ex. steals per possesion), and relavent ratios (ex. ratio of 3pt attempts to all field attempts). The game averages of these statistics for each team and their opponents were calculated and added to the model.

In [4]:
feature_keys = ['tourney seed','weighted win pct','owp','oowp','avg win margin','std win margin','avg loss margin',
                'std loss margin','capped avg win margin','capped std win margin','capped avg loss margin',
                'capped std loss margin','close wins','close losses','weighted top64 wins','weighted top32 wins',
                'weighted top16 wins','weighted top8 wins','weighted top64 losses','weighted top32 losses',
                'weighted top16 losses','weighted top8 losses','last10 win pct','last10 weighted win pct',
                'last5 win pct','last5 weighted win pct','conference tourney wins','conference champ','Team Avg FGM',
                'Team Avg FGA','Team Avg FGM3','Team Avg FGA3','Team Avg FTM','Team Avg FTA',
                'Team Avg OR','Team Avg DR','Team Avg Ast','Team Avg TO%','Team Avg Stl%','Team Avg Blk%',
                'Team Avg PF','Team Avg TR','Team Avg FGM2','Team Avg FGA2','Team Avg FG%','Team Avg FG2%',
                'Team Avg FG3%','Team Avg FGA3%','Team Avg FT%','Team Avg Pos','Team Avg OEff','Opp Avg FGM',
                'Opp Avg FGA','Opp Avg FGM3','Opp Avg FGA3','Opp Avg FTM','Opp Avg FTA','Opp Avg OR','Opp Avg DR',
                'Opp Avg Ast','Opp Avg TO%','Opp Avg Stl%','Opp Avg Blk%','Opp Avg PF','Opp Avg TR','Opp Avg FGM2',
                'Opp Avg FGA2','Opp Avg FG%','Opp Avg FG2%','Opp Avg FG3%','Opp Avg FGA3%','Opp Avg FT%',
                'Opp Avg Pos','Opp Avg OEff']

In [5]:
X, y = analysis.extract_training_data(feature_keys=feature_keys)

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=100)

In [7]:
X_train.shape

(1737, 148)

In [8]:
y_train.shape

(1737,)

In [9]:
X_test.shape

(580, 148)

In [10]:
y_test.shape

(580,)

In [11]:
logreg = LogisticRegression().fit(X_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [12]:
logreg.score(X_train,y_train)

0.7322970639032815

In [13]:
logreg.score(X_test,y_test)

0.7086206896551724

In [14]:
forest = RandomForestClassifier(n_estimators=10000,max_features='sqrt',
                                max_depth=5,random_state=100,n_jobs=2).fit(X_train,y_train)

In [15]:
forest.score(X_train,y_train)

0.7944732297063903

In [16]:
forest.score(X_test,y_test)

0.7017241379310345

In [17]:
logreg_pred = mm.bound_predictions(logreg.predict_proba(X_test)[:,1])

In [18]:
forest_pred = mm.bound_predictions(forest.predict_proba(X_test)[:,1])

In [19]:
analysis.score_model_predictions(y_test,logreg_pred)

0.54878

In [20]:
analysis.score_model_predictions(y_test,forest_pred)

0.56165

## Detailed Game Results + Interactions

Additionally, the interactions between all of these features were added to the model. This appeared to help the Random Forest but actually hurt the Logistic Regression.

In [21]:
feature_keys = ['tourney seed','weighted win pct','owp','oowp','avg win margin','std win margin','avg loss margin',
                'std loss margin','capped avg win margin','capped std win margin','capped avg loss margin',
                'capped std loss margin','close wins','close losses','weighted top64 wins','weighted top32 wins',
                'weighted top16 wins','weighted top8 wins','weighted top64 losses','weighted top32 losses',
                'weighted top16 losses','weighted top8 losses','last10 win pct','last10 weighted win pct',
                'last5 win pct','last5 weighted win pct','conference tourney wins','conference champ','Team Avg FGM',
                'Team Avg FGA','Team Avg FGM3','Team Avg FGA3','Team Avg FTM','Team Avg FTA',
                'Team Avg OR','Team Avg DR','Team Avg Ast','Team Avg TO%','Team Avg Stl%','Team Avg Blk%',
                'Team Avg PF','Team Avg TR','Team Avg FGM2','Team Avg FGA2','Team Avg FG%','Team Avg FG2%',
                'Team Avg FG3%','Team Avg FGA3%','Team Avg FT%','Team Avg Pos','Team Avg OEff','Opp Avg FGM',
                'Opp Avg FGA','Opp Avg FGM3','Opp Avg FGA3','Opp Avg FTM','Opp Avg FTA','Opp Avg OR','Opp Avg DR',
                'Opp Avg Ast','Opp Avg TO%','Opp Avg Stl%','Opp Avg Blk%','Opp Avg PF','Opp Avg TR','Opp Avg FGM2',
                'Opp Avg FGA2','Opp Avg FG%','Opp Avg FG2%','Opp Avg FG3%','Opp Avg FGA3%','Opp Avg FT%',
                'Opp Avg Pos','Opp Avg OEff']

In [22]:
X, y = analysis.extract_training_data(feature_keys=feature_keys)

In [23]:
X.shape

(2317, 148)

In [24]:
poly = PolynomialFeatures(2)

In [25]:
X = poly.fit_transform(X)

In [26]:
X.shape

(2317, 11175)

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=100)

In [28]:
X_train.shape

(1737, 11175)

In [29]:
y_train.shape

(1737,)

In [30]:
X_test.shape

(580, 11175)

In [31]:
y_test.shape

(580,)

In [32]:
logreg = LogisticRegression().fit(X_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [33]:
logreg.score(X_train,y_train)

0.7489925158318941

In [34]:
logreg.score(X_test,y_test)

0.696551724137931

In [35]:
forest = RandomForestClassifier(n_estimators=10000,max_features='sqrt',
                                max_depth=5,random_state=100,n_jobs=2).fit(X_train,y_train)

In [36]:
forest.score(X_train,y_train)

0.8405296488198043

In [37]:
forest.score(X_test,y_test)

0.7086206896551724

In [38]:
logreg_pred = mm.bound_predictions(logreg.predict_proba(X_test)[:,1])

In [39]:
forest_pred = mm.bound_predictions(forest.predict_proba(X_test)[:,1])

In [40]:
analysis.score_model_predictions(y_test,logreg_pred)

0.56185

In [41]:
analysis.score_model_predictions(y_test,forest_pred)

0.55319

## Net Detailed Game Results

The last features I engineer were the 'net' statistics for all these parameters. These parameters can be interpreted as "how the team did compared to what the opponent generally allowed". For example if an opponent allows an average of 5 three-pointers per game and the team scored 10, that would be a net of 5 for that parameter in that game. 

So on offense these metrics capture, how well did the team do compared to what their opponents allow on average.

And on defense they capture, how well did you limit your opponent compared to how much they normally produce.

'Net' parameters were calculated for all stats derived from the 'detailed' information and were the average difference for all of the team's games.

In [42]:
feature_keys = ['tourney seed','weighted win pct','owp','oowp','avg win margin','std win margin','avg loss margin',
                'std loss margin','capped avg win margin','capped std win margin','capped avg loss margin',
                'capped std loss margin','close wins','close losses','weighted top64 wins','weighted top32 wins',
                'weighted top16 wins','weighted top8 wins','weighted top64 losses','weighted top32 losses',
                'weighted top16 losses','weighted top8 losses','last10 win pct','last10 weighted win pct',
                'last5 win pct','last5 weighted win pct','conference tourney wins','conference champ','Team Avg FGM',
                'Team Avg FGA','Team Avg FGM3','Team Avg FGA3','Team Avg FTM','Team Avg FTA',
                'Team Avg OR','Team Avg DR','Team Avg Ast','Team Avg TO%','Team Avg Stl%','Team Avg Blk%',
                'Team Avg PF','Team Avg TR','Team Avg FGM2','Team Avg FGA2','Team Avg FG%','Team Avg FG2%',
                'Team Avg FG3%','Team Avg FGA3%','Team Avg FT%','Team Avg Pos','Team Avg OEff','Opp Avg FGM',
                'Opp Avg FGA','Opp Avg FGM3','Opp Avg FGA3','Opp Avg FTM','Opp Avg FTA','Opp Avg OR','Opp Avg DR',
                'Opp Avg Ast','Opp Avg TO%','Opp Avg Stl%','Opp Avg Blk%','Opp Avg PF','Opp Avg TR','Opp Avg FGM2',
                'Opp Avg FGA2','Opp Avg FG%','Opp Avg FG2%','Opp Avg FG3%','Opp Avg FGA3%','Opp Avg FT%',
                'Opp Avg Pos','Opp Avg OEff','Net Team Avg FGM','Net Team Avg FGA','Net Team Avg FGM3',
                'Net Team Avg FGA3','Net Team Avg FTM','Net Team Avg FTA','Net Team Avg OR','Net Team Avg DR',
                'Net Team Avg Ast','Net Team Avg TO%','Net Team Avg Stl%','Net Team Avg Blk%','Net Team Avg PF',
                'Net Team Avg TR','Net Team Avg FGM2','Net Team Avg FGA2','Net Team Avg FG%','Net Team Avg FG2%',
                'Net Team Avg FG3%','Net Team Avg FGA3%','Net Team Avg FT%','Net Team Avg Pos','Net Team Avg OEff',
                'Net Opp Avg FGM','Net Opp Avg FGA','Net Opp Avg FGM3','Net Opp Avg FGA3','Net Opp Avg FTM',
                'Net Opp Avg FTA','Net Opp Avg OR','Net Opp Avg DR','Net Opp Avg Ast','Net Opp Avg TO%',
                'Net Opp Avg Stl%','Net Opp Avg Blk%','Net Opp Avg PF','Net Opp Avg TR','Net Opp Avg FGM2',
                'Net Opp Avg FGA2','Net Opp Avg FG%','Net Opp Avg FG2%','Net Opp Avg FG3%','Net Opp Avg FGA3%',
                'Net Opp Avg FT%','Net Opp Avg Pos','Net Opp Avg OEff']

In [43]:
X, y = analysis.extract_training_data(feature_keys=feature_keys)

In [44]:
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=100)

In [45]:
X_train.shape

(1737, 240)

In [46]:
y_train.shape

(1737,)

In [47]:
X_test.shape

(580, 240)

In [48]:
y_test.shape

(580,)

In [49]:
logreg = LogisticRegression().fit(X_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [50]:
logreg.score(X_train,y_train)

0.7478411053540587

In [51]:
logreg.score(X_test,y_test)

0.7

In [52]:
forest = RandomForestClassifier(n_estimators=10000,max_features='sqrt',
                                max_depth=5,random_state=100,n_jobs=2).fit(X_train,y_train)

In [53]:
forest.score(X_train,y_train)

0.8008059873344847

In [54]:
forest.score(X_test,y_test)

0.7017241379310345

In [55]:
logreg_pred = mm.bound_predictions(logreg.predict_proba(X_test)[:,1])

In [56]:
forest_pred = mm.bound_predictions(forest.predict_proba(X_test)[:,1])

In [57]:
analysis.score_model_predictions(y_test,logreg_pred)

0.5538

In [58]:
analysis.score_model_predictions(y_test,forest_pred)

0.56639

## Net Detailed Game Results + Interactions

Again, the interactions between all of these features were added to the model. This again, helped the Random Forest and hurt the Logistic Regression.

In [59]:
feature_keys = ['tourney seed','weighted win pct','owp','oowp','avg win margin','std win margin','avg loss margin',
                'std loss margin','capped avg win margin','capped std win margin','capped avg loss margin',
                'capped std loss margin','close wins','close losses','weighted top64 wins','weighted top32 wins',
                'weighted top16 wins','weighted top8 wins','weighted top64 losses','weighted top32 losses',
                'weighted top16 losses','weighted top8 losses','last10 win pct','last10 weighted win pct',
                'last5 win pct','last5 weighted win pct','conference tourney wins','conference champ','Team Avg FGM',
                'Team Avg FGA','Team Avg FGM3','Team Avg FGA3','Team Avg FTM','Team Avg FTA',
                'Team Avg OR','Team Avg DR','Team Avg Ast','Team Avg TO%','Team Avg Stl%','Team Avg Blk%',
                'Team Avg PF','Team Avg TR','Team Avg FGM2','Team Avg FGA2','Team Avg FG%','Team Avg FG2%',
                'Team Avg FG3%','Team Avg FGA3%','Team Avg FT%','Team Avg Pos','Team Avg OEff','Opp Avg FGM',
                'Opp Avg FGA','Opp Avg FGM3','Opp Avg FGA3','Opp Avg FTM','Opp Avg FTA','Opp Avg OR','Opp Avg DR',
                'Opp Avg Ast','Opp Avg TO%','Opp Avg Stl%','Opp Avg Blk%','Opp Avg PF','Opp Avg TR','Opp Avg FGM2',
                'Opp Avg FGA2','Opp Avg FG%','Opp Avg FG2%','Opp Avg FG3%','Opp Avg FGA3%','Opp Avg FT%',
                'Opp Avg Pos','Opp Avg OEff','Net Team Avg FGM','Net Team Avg FGA','Net Team Avg FGM3',
                'Net Team Avg FGA3','Net Team Avg FTM','Net Team Avg FTA','Net Team Avg OR','Net Team Avg DR',
                'Net Team Avg Ast','Net Team Avg TO%','Net Team Avg Stl%','Net Team Avg Blk%','Net Team Avg PF',
                'Net Team Avg TR','Net Team Avg FGM2','Net Team Avg FGA2','Net Team Avg FG%','Net Team Avg FG2%',
                'Net Team Avg FG3%','Net Team Avg FGA3%','Net Team Avg FT%','Net Team Avg Pos','Net Team Avg OEff',
                'Net Opp Avg FGM','Net Opp Avg FGA','Net Opp Avg FGM3','Net Opp Avg FGA3','Net Opp Avg FTM',
                'Net Opp Avg FTA','Net Opp Avg OR','Net Opp Avg DR','Net Opp Avg Ast','Net Opp Avg TO%',
                'Net Opp Avg Stl%','Net Opp Avg Blk%','Net Opp Avg PF','Net Opp Avg TR','Net Opp Avg FGM2',
                'Net Opp Avg FGA2','Net Opp Avg FG%','Net Opp Avg FG2%','Net Opp Avg FG3%','Net Opp Avg FGA3%',
                'Net Opp Avg FT%','Net Opp Avg Pos','Net Opp Avg OEff']

In [60]:
X, y = analysis.extract_training_data(feature_keys=feature_keys)

In [61]:
X.shape

(2317, 240)

In [62]:
poly = PolynomialFeatures(2)

In [63]:
X = poly.fit_transform(X)

In [64]:
X.shape

(2317, 29161)

In [65]:
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=100)

In [66]:
X_train.shape

(1737, 29161)

In [67]:
y_train.shape

(1737,)

In [68]:
X_test.shape

(580, 29161)

In [69]:
y_test.shape

(580,)

In [70]:
logreg = LogisticRegression().fit(X_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [71]:
logreg.score(X_train,y_train)

0.795048934945308

In [72]:
logreg.score(X_test,y_test)

0.6655172413793103

In [73]:
forest = RandomForestClassifier(n_estimators=10000,max_features='sqrt',
                                max_depth=5,random_state=100,n_jobs=2).fit(X_train,y_train)

In [74]:
forest.score(X_train,y_train)

0.8485895221646517

In [75]:
forest.score(X_test,y_test)

0.7155172413793104

In [76]:
logreg_pred = mm.bound_predictions(logreg.predict_proba(X_test)[:,1])

In [77]:
forest_pred = mm.bound_predictions(forest.predict_proba(X_test)[:,1])

In [78]:
analysis.score_model_predictions(y_test,logreg_pred)

0.65502

In [79]:
analysis.score_model_predictions(y_test,forest_pred)

0.55564

# Model Tuning Tests

Finally I tested the effect of tuning the key model parameters on the performance of the Random Forest. The two parameters tested were the number of trees in the forest and the 'mad_depth' allowed for the trees.

### Feature Set - Net Detailed Game Results + Interactions

In [80]:
feature_keys = ['tourney seed','weighted win pct','owp','oowp','avg win margin','std win margin','avg loss margin',
                'std loss margin','capped avg win margin','capped std win margin','capped avg loss margin',
                'capped std loss margin','close wins','close losses','weighted top64 wins','weighted top32 wins',
                'weighted top16 wins','weighted top8 wins','weighted top64 losses','weighted top32 losses',
                'weighted top16 losses','weighted top8 losses','last10 win pct','last10 weighted win pct',
                'last5 win pct','last5 weighted win pct','conference tourney wins','conference champ','Team Avg FGM',
                'Team Avg FGA','Team Avg FGM3','Team Avg FGA3','Team Avg FTM','Team Avg FTA',
                'Team Avg OR','Team Avg DR','Team Avg Ast','Team Avg TO%','Team Avg Stl%','Team Avg Blk%',
                'Team Avg PF','Team Avg TR','Team Avg FGM2','Team Avg FGA2','Team Avg FG%','Team Avg FG2%',
                'Team Avg FG3%','Team Avg FGA3%','Team Avg FT%','Team Avg Pos','Team Avg OEff','Opp Avg FGM',
                'Opp Avg FGA','Opp Avg FGM3','Opp Avg FGA3','Opp Avg FTM','Opp Avg FTA','Opp Avg OR','Opp Avg DR',
                'Opp Avg Ast','Opp Avg TO%','Opp Avg Stl%','Opp Avg Blk%','Opp Avg PF','Opp Avg TR','Opp Avg FGM2',
                'Opp Avg FGA2','Opp Avg FG%','Opp Avg FG2%','Opp Avg FG3%','Opp Avg FGA3%','Opp Avg FT%',
                'Opp Avg Pos','Opp Avg OEff','Net Team Avg FGM','Net Team Avg FGA','Net Team Avg FGM3',
                'Net Team Avg FGA3','Net Team Avg FTM','Net Team Avg FTA','Net Team Avg OR','Net Team Avg DR',
                'Net Team Avg Ast','Net Team Avg TO%','Net Team Avg Stl%','Net Team Avg Blk%','Net Team Avg PF',
                'Net Team Avg TR','Net Team Avg FGM2','Net Team Avg FGA2','Net Team Avg FG%','Net Team Avg FG2%',
                'Net Team Avg FG3%','Net Team Avg FGA3%','Net Team Avg FT%','Net Team Avg Pos','Net Team Avg OEff',
                'Net Opp Avg FGM','Net Opp Avg FGA','Net Opp Avg FGM3','Net Opp Avg FGA3','Net Opp Avg FTM',
                'Net Opp Avg FTA','Net Opp Avg OR','Net Opp Avg DR','Net Opp Avg Ast','Net Opp Avg TO%',
                'Net Opp Avg Stl%','Net Opp Avg Blk%','Net Opp Avg PF','Net Opp Avg TR','Net Opp Avg FGM2',
                'Net Opp Avg FGA2','Net Opp Avg FG%','Net Opp Avg FG2%','Net Opp Avg FG3%','Net Opp Avg FGA3%',
                'Net Opp Avg FT%','Net Opp Avg Pos','Net Opp Avg OEff']

In [81]:
X, y = analysis.extract_training_data(feature_keys=feature_keys)

In [82]:
poly = PolynomialFeatures(2)

In [83]:
X = poly.fit_transform(X)

In [84]:
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=100)

In [85]:
X_train.shape

(1737, 29161)

In [86]:
y_train.shape

(1737,)

In [87]:
X_test.shape

(580, 29161)

In [88]:
y_test.shape

(580,)

## Number of Trees

Test model performance as a function of the number of trees

In [89]:
number_trees = [100,200,400,800,1600,3200,6400,12800]

In [90]:
for n_estimators in number_trees:
    start_time = time.time()
    forest = RandomForestClassifier(n_estimators=n_estimators,max_features='sqrt',
                                    max_depth=5,random_state=100,n_jobs=2).fit(X_train,y_train)
    end_time = time.time()
    training_score = forest.score(X_train,y_train)
    test_score = forest.score(X_test,y_test)
    forest_pred = mm.bound_predictions(forest.predict_proba(X_test)[:,1])
    competition_score = analysis.score_model_predictions(y_test,forest_pred)
    training_time = end_time - start_time
    
    print(f'Number of Trees: {n_estimators}')
    print(f'  Training Set Score : {training_score:.3f}')
    print(f'     Test Set Score  : {test_score:.3f}')
    print(f'   Competition Score : {competition_score:.3f}')
    print(f'      Training Time  : {training_time:.2f}')

Number of Trees: 100
  Training Set Score : 0.844
     Test Set Score  : 0.712
   Competition Score : 0.553
      Training Time  : 3.13
Number of Trees: 200
  Training Set Score : 0.849
     Test Set Score  : 0.717
   Competition Score : 0.554
      Training Time  : 5.94
Number of Trees: 400
  Training Set Score : 0.847
     Test Set Score  : 0.703
   Competition Score : 0.554
      Training Time  : 11.85
Number of Trees: 800
  Training Set Score : 0.848
     Test Set Score  : 0.709
   Competition Score : 0.555
      Training Time  : 23.50
Number of Trees: 1600
  Training Set Score : 0.849
     Test Set Score  : 0.712
   Competition Score : 0.556
      Training Time  : 46.90
Number of Trees: 3200
  Training Set Score : 0.846
     Test Set Score  : 0.710
   Competition Score : 0.556
      Training Time  : 93.51
Number of Trees: 6400
  Training Set Score : 0.848
     Test Set Score  : 0.709
   Competition Score : 0.556
      Training Time  : 189.65
Number of Trees: 12800
  Training Set S

## Max Depth

Test Model Performance for fixed number of trees and variable 'max_depth'

In [91]:
max_depths = [3,5,7,9,11]

In [92]:
for max_depth in max_depths:
    forest = RandomForestClassifier(n_estimators=1600,max_features='sqrt',
                                    max_depth=max_depth,random_state=100,n_jobs=2).fit(X_train,y_train)
    training_score = forest.score(X_train,y_train)
    test_score = forest.score(X_test,y_test)
    forest_pred = mm.bound_predictions(forest.predict_proba(X_test)[:,1])
    competition_score = analysis.score_model_predictions(y_test,forest_pred)
    
    print(f'Max Depth: {max_depth}')
    print(f'  Training Set Score : {training_score:.3f}')
    print(f'     Test Set Score  : {test_score:.3f}')
    print(f'   Competition Score : {competition_score:.3f}')

Max Depth: 3
  Training Set Score : 0.746
     Test Set Score  : 0.717
   Competition Score : 0.559
Max Depth: 5
  Training Set Score : 0.849
     Test Set Score  : 0.712
   Competition Score : 0.556
Max Depth: 7
  Training Set Score : 0.933
     Test Set Score  : 0.698
   Competition Score : 0.558
Max Depth: 9
  Training Set Score : 0.968
     Test Set Score  : 0.684
   Competition Score : 0.562
Max Depth: 11
  Training Set Score : 0.987
     Test Set Score  : 0.686
   Competition Score : 0.565


# Conclusions

In a surprising result, using the additional "detailed" game statistics and more advanced efficiency and rate statistics didn't improve the performance of the models compared to the phase 1 development solution.

In general, the Logisist Regression model performed best using just the features and the Random Forest performed better when interactions were added.