In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

import os

import march_madness as mm
import random

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import PolynomialFeatures

This document is based on work for a Kaggle competition: [March Machine Learning Mania 2022](https://www.kaggle.com/competitions/mens-march-mania-2022)

The goal of this project is to predict the results of match-ups in the College Basketball March Madness Tournament.

Model performance is scored by a cost function that accumulates based on the correctness and confidence of a prediction. A confident, correct prediction has a minimal cost compared to a confident, incorrect prediction that has a high cost. The confidence of the models prediction for a game outcome is passes as a value from 0 to 1. 

Given I had a short period of time before the submission deadline (less than three weeks) and limited time to work on the project, I focused my strategy on developing features in a logical progression from simple and high-level towards more complex as I had time. An added benefit of this approach was I was able to test the performance of the model each step along the way as it "learned" more information about the teams.  

In [2]:
analysis = mm.Analysis()

In [3]:
analysis.load_seasons()

In [4]:
analysis.calc_seasons_features(detailed_stats_features=False)

1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean, casting='unsafe',
  ret = ret.dtype.type(ret / rcount)


2006
2007
2008


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean, casting='unsafe',
  ret = ret.dtype.type(ret / rcount)


2009
2010
2011
2012
2013


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean, casting='unsafe',
  ret = ret.dtype.type(ret / rcount)


2014


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean, casting='unsafe',
  ret = ret.dtype.type(ret / rcount)


2015


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean, casting='unsafe',
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean, casting='unsafe',
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean, casting='unsafe',
  ret = ret.dtype.type(ret / rcount)


2016
2017
2018
2019
2021


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean, casting='unsafe',
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean, casting='unsafe',
  ret = ret.dtype.type(ret / rcount)


2022


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean, casting='unsafe',
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean, casting='unsafe',
  ret = ret.dtype.type(ret / rcount)


# Baseline Cases

Before getting started with models, I wanted to establish two baseline cases.

The first is for the coin flip case where both teams are give a 50/50 chance of winning or a 0.5 probability. Hopefully our model performance will be much better than this.

The second baseline gives a prediction based solely on the historical rate that one tournament seed defeated another. In the case where two teams with the same seed played each other or there were less than five historical match-ups between a pair of seeds, the neutral probability of 0.5 was used for the match-up.

## 50/50 Predictions

In [5]:
feature_keys = []

In [6]:
X, y = analysis.seasons_generate_tourney_model_data(feature_keys=feature_keys)

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=100)

In [8]:
naive_predictions = np.ones(y_test.shape) * 0.5

In [9]:
analysis.score_model_predictions(y_test,naive_predictions)

0.6931471805599453

## Seed Win Percentages

In [10]:
feature_keys = ['tourney seed']

In [11]:
X, y = analysis.seasons_generate_tourney_model_data(feature_keys=feature_keys)

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=100)

In [13]:
seed_predictions = analysis.generate_seed_win_predictions(X_test)

In [14]:
analysis.score_model_predictions(y_test,seed_predictions)

0.5110751475017764

The results for the predictions based solely on the historical seed match-ups were actually quite good however there is undoubtably over-fitting occuring since the test set games were included when these percentages were calculated.

So instead of using historically calculated seed win-percentages, I started with a logistic regression model using only the seed information for the two teams. I took two approaches to see if there was a significant difference, the first where I treated the seeding as a continuous variable and the second where I treated it as a categorical variable.

## Seed Only (As Continuous Value)

In [15]:
feature_keys = ['tourney seed']

In [16]:
X, y = analysis.seasons_generate_tourney_model_data(feature_keys=feature_keys)

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=100)

In [18]:
X_train.shape

(885, 2)

In [19]:
y_train.shape

(885,)

In [20]:
X_test.shape

(296, 2)

In [21]:
y_test.shape

(296,)

In [22]:
logreg = LogisticRegression().fit(X_train,y_train)

In [23]:
logreg.score(X_train,y_train)

0.6983050847457627

In [24]:
logreg.score(X_test,y_test)

0.722972972972973

In [25]:
logreg_pred = mm.bound_predictions(logreg.predict_proba(X_test)[:,1])

In [26]:
analysis.score_model_predictions(y_test,logreg_pred)

0.5431638347473852

## Seed Only (As Categorical Value)

In [27]:
feature_keys = ['tourney seed']

In [28]:
X, y = analysis.seasons_generate_tourney_model_data(feature_keys=feature_keys)

In [29]:
X = X.astype(str)

In [30]:
X = pd.get_dummies(X)

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=100)

In [32]:
X_train.shape

(885, 32)

In [33]:
y_train.shape

(885,)

In [34]:
X_test.shape

(296, 32)

In [35]:
y_test.shape

(296,)

In [36]:
logreg = LogisticRegression().fit(X_train,y_train)

In [37]:
logreg.score(X_train,y_train)

0.6994350282485876

In [38]:
logreg.score(X_test,y_test)

0.7162162162162162

In [39]:
logreg_pred = mm.bound_predictions(logreg.predict_proba(X_test)[:,1])

In [40]:
analysis.score_model_predictions(y_test,logreg_pred)

0.538334648470593

The results were essentially the same between the two cases, indicating there isn't a huge difference either way. For the rest of the way, I treat the seed as a continuous variable.

# Model Development

I started feature engineering by recreating the [RPI Rating system](https://en.wikipedia.org/wiki/Rating_percentage_index) that was a common measure used to compare, select, and seed teams in the NCAA tournament through 2018.

RPI is made up of three components:
* Team weighted win percentage (Road wins count as 1.4 vs 0.6 for home wins)
* Opponents Win Percentage (OWP)
* Opponents-Opponents Win Percentage (OOWP)

In addition to being a common and accepted comparison system, calculating variables for RPI was a good place to start because it captures information on the teams' win/loss record, as well as, their Strength of Schedule (OWP & OOWP)

The rest of the way, I test my feature set on both a logistic regression model and a random forest classifier. I fully expected the random forest to overfit, especially on small feature sets, but I hoped I would see it begin to generalize as more features and information was added.

I chose these two models to primarily work with because:
* The logistic regression is simple, easy to interpret, and at the very least will serve as a performance baseline for more advanced models.
* Random Forests have general high performance and robustness, and in particular, are able to utilize a large number of features and determine which are the most relevant to the problem.

## RPI (No Seed)

In [41]:
feature_keys = ['weighted win pct','owp','oowp']

In [42]:
X, y = analysis.seasons_generate_tourney_model_data(feature_keys=feature_keys)

In [43]:
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=100)

In [44]:
X_train.shape

(885, 6)

In [45]:
y_train.shape

(885,)

In [46]:
X_test.shape

(296, 6)

In [47]:
y_test.shape

(296,)

In [48]:
logreg = LogisticRegression().fit(X_train,y_train)

In [49]:
logreg.score(X_train,y_train)

0.6971751412429379

In [50]:
logreg.score(X_test,y_test)

0.6722972972972973

In [51]:
forest = RandomForestClassifier(n_estimators=100,max_features=3,random_state=100).fit(X_train,y_train)

In [52]:
forest.score(X_train,y_train)

1.0

In [53]:
forest.score(X_test,y_test)

0.6587837837837838

In [54]:
logreg_pred = mm.bound_predictions(logreg.predict_proba(X_test)[:,1])

In [55]:
forest_pred = mm.bound_predictions(forest.predict_proba(X_test)[:,1])

In [56]:
analysis.score_model_predictions(y_test,logreg_pred)

0.5945163233730747

In [57]:
analysis.score_model_predictions(y_test,forest_pred)

0.595994154403052

Interestingly using the RPI variables had slightly worse performance than seed alone. The percentage of correct values was nearly the same, however, the score based on the confidence of predictions actually suffered. I added back the seed to see how it helps the model performance.

## RPI + Seed

In [58]:
feature_keys = ['tourney seed','weighted win pct','owp','oowp']

In [59]:
X, y = analysis.seasons_generate_tourney_model_data(feature_keys=feature_keys)

In [60]:
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=100)

In [61]:
X_train.shape

(885, 8)

In [62]:
y_train.shape

(885,)

In [63]:
X_test.shape

(296, 8)

In [64]:
y_test.shape

(296,)

In [65]:
logreg = LogisticRegression().fit(X_train,y_train)

In [66]:
logreg.score(X_train,y_train)

0.6983050847457627

In [67]:
logreg.score(X_test,y_test)

0.7162162162162162

In [68]:
forest = RandomForestClassifier(n_estimators=100,max_features=3,random_state=100).fit(X_train,y_train)

In [69]:
forest.score(X_train,y_train)

1.0

In [70]:
forest.score(X_test,y_test)

0.6655405405405406

In [71]:
logreg_pred = mm.bound_predictions(logreg.predict_proba(X_test)[:,1])

In [72]:
forest_pred = mm.bound_predictions(forest.predict_proba(X_test)[:,1])

In [73]:
analysis.score_model_predictions(y_test,logreg_pred)

0.5395195192734197

In [74]:
analysis.score_model_predictions(y_test,forest_pred)

0.5746330340842978

Adding the seed back into the RPI calculation brought the prediction score back to nearly equivalent to the seed alone. Interestingly the RPI features didn't improve the performance of the model possibly because a lot of this information is baked into the process of seeding the teams.

## Margin of Victory

The next step was to add in features capturing the magnitude of the teams' wins and losses. Features inclueded the avg margin of wins and losses, as well as, the number of "close" wins and losses defined as 3pts or less. These features didn't have a significant impact on the performance of the logistic regression model but it was the first time the random forest had comparable performance even though it is still over-fitting.

In [75]:
feature_keys = ['tourney seed','weighted win pct','owp','oowp','avg win margin','std win margin','avg loss margin',
                'std loss margin','capped avg win margin','capped std win margin','capped avg loss margin',
                'capped std loss margin','close wins','close losses']

In [76]:
X, y = analysis.seasons_generate_tourney_model_data(feature_keys=feature_keys)

In [77]:
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=100)

In [78]:
X_train.shape

(885, 28)

In [79]:
y_train.shape

(885,)

In [80]:
X_test.shape

(296, 28)

In [81]:
y_test.shape

(296,)

In [82]:
logreg = LogisticRegression().fit(X_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [83]:
logreg.score(X_train,y_train)

0.7220338983050848

In [84]:
logreg.score(X_test,y_test)

0.6790540540540541

In [85]:
forest = RandomForestClassifier(n_estimators=100,max_features=3,random_state=100).fit(X_train,y_train)

In [86]:
forest.score(X_train,y_train)

1.0

In [87]:
forest.score(X_test,y_test)

0.6993243243243243

In [88]:
logreg_pred = mm.bound_predictions(logreg.predict_proba(X_test)[:,1])

In [89]:
forest_pred = mm.bound_predictions(forest.predict_proba(X_test)[:,1])

In [90]:
analysis.score_model_predictions(y_test,logreg_pred)

0.5600032864659814

In [91]:
analysis.score_model_predictions(y_test,forest_pred)

0.5386938057937892

## Quality Wins

Next we want to add information about wins and losses a team has against top seeded teams and other teams in the tournament. These wins, especially those against the top 16 and top 8 teams would be characterized as "quality" wins.

In [92]:
feature_keys = ['tourney seed','weighted win pct','owp','oowp','avg win margin','std win margin','avg loss margin',
                'std loss margin','capped avg win margin','capped std win margin','capped avg loss margin',
                'capped std loss margin','close wins','close losses','weighted top64 wins','weighted top32 wins',
                'weighted top16 wins','weighted top8 wins','weighted top64 losses','weighted top32 losses',
                'weighted top16 losses','weighted top8 losses']

In [93]:
X, y = analysis.seasons_generate_tourney_model_data(feature_keys=feature_keys)

In [94]:
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=100)

In [95]:
X_train.shape

(885, 44)

In [96]:
y_train.shape

(885,)

In [97]:
X_test.shape

(296, 44)

In [98]:
y_test.shape

(296,)

In [99]:
logreg = LogisticRegression().fit(X_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [100]:
logreg.score(X_train,y_train)

0.7231638418079096

In [101]:
logreg.score(X_test,y_test)

0.6824324324324325

In [102]:
forest = RandomForestClassifier(n_estimators=1000,max_features=3,random_state=100).fit(X_train,y_train)

In [103]:
forest.score(X_train,y_train)

1.0

In [104]:
forest.score(X_test,y_test)

0.6993243243243243

In [105]:
logreg_pred = mm.bound_predictions(logreg.predict_proba(X_test)[:,1])

In [106]:
forest_pred = mm.bound_predictions(forest.predict_proba(X_test)[:,1])

In [107]:
analysis.score_model_predictions(y_test,logreg_pred)

0.5623558796007845

In [108]:
analysis.score_model_predictions(y_test,forest_pred)

0.549516037327178

## Late Season Form

Finally, features on the the late season form of the teams is added to capture how "hot" each team is going into the tournament. This includeds their win percentage over the last 5 and 10 games, as well as, how they performed in their conference tournament.

In [109]:
feature_keys = ['tourney seed','weighted win pct','owp','oowp','avg win margin','std win margin','avg loss margin',
                'std loss margin','capped avg win margin','capped std win margin','capped avg loss margin',
                'capped std loss margin','close wins','close losses','weighted top64 wins','weighted top32 wins',
                'weighted top16 wins','weighted top8 wins','weighted top64 losses','weighted top32 losses',
                'weighted top16 losses','weighted top8 losses','last10 win pct','last10 weighted win pct',
                'last5 win pct','last5 weighted win pct','conference tourney wins','conference champ']

In [110]:
X, y = analysis.seasons_generate_tourney_model_data(feature_keys=feature_keys)

In [111]:
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=100)

In [112]:
X_train.shape

(885, 56)

In [113]:
y_train.shape

(885,)

In [114]:
X_test.shape

(296, 56)

In [115]:
y_test.shape

(296,)

In [116]:
logreg = LogisticRegression(max_iter=1000).fit(X_train,y_train)

In [117]:
logreg.score(X_train,y_train)

0.7209039548022599

In [118]:
logreg.score(X_test,y_test)

0.6824324324324325

In [119]:
forest = RandomForestClassifier(n_estimators=1000,max_features=3,random_state=100).fit(X_train,y_train)

In [120]:
forest.score(X_train,y_train)

1.0

In [121]:
forest.score(X_test,y_test)

0.7195945945945946

In [122]:
logreg_pred = mm.bound_predictions(logreg.predict_proba(X_test)[:,1])

In [123]:
forest_pred = mm.bound_predictions(forest.predict_proba(X_test)[:,1])

In [124]:
analysis.score_model_predictions(y_test,logreg_pred)

0.5671150235071052

In [125]:
analysis.score_model_predictions(y_test,forest_pred)

0.5471107494791293

Interestingly enought, the model performance was actually slightly worse with the additional features. I'm not sure how much to read into this but I feel like I have extracted most of the value from the final score lines alone.

Since this is where I got to before running into the deadline to submit the prediction before the tournament began, I first tested to see if removing some of the more redundent features helped the logistic regression model, and second, tested a range of regularization parameters to see which yielded the best performance for my submission.

## Pruned Feature Set

In [126]:
feature_keys = ['tourney seed','weighted win pct','owp','oowp','capped avg win margin','capped std win margin',
                'capped avg loss margin','capped std loss margin','close wins','close losses','weighted top64 wins',
                'weighted top16 wins','weighted top64 losses','weighted top16 losses','last10 weighted win pct',
                'last5 weighted win pct','conference tourney wins','conference champ']

In [127]:
X, y = analysis.seasons_generate_tourney_model_data(feature_keys=feature_keys)

In [128]:
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=100)

In [129]:
X_train.shape

(885, 36)

In [130]:
y_train.shape

(885,)

In [131]:
X_test.shape

(296, 36)

In [132]:
y_test.shape

(296,)

In [133]:
logreg = LogisticRegression().fit(X_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [134]:
logreg.score(X_train,y_train)

0.711864406779661

In [135]:
logreg.score(X_test,y_test)

0.7195945945945946

In [136]:
max_features = round(np.sqrt(len(feature_keys)))
max_features

4

In [137]:
forest = RandomForestClassifier(n_estimators=1000,max_features=3,random_state=100).fit(X_train,y_train)

In [138]:
forest.score(X_train,y_train)

1.0

In [139]:
forest.score(X_test,y_test)

0.7195945945945946

In [140]:
logreg_pred = mm.bound_predictions(logreg.predict_proba(X_test)[:,1])

In [141]:
forest_pred = mm.bound_predictions(forest.predict_proba(X_test)[:,1])

In [142]:
analysis.score_model_predictions(y_test,logreg_pred)

0.546533695668436

In [143]:
analysis.score_model_predictions(y_test,forest_pred)

0.5508740893156185

## Logistic Regression Model Tuning

From pruning the features, I didn't see a significant improvement in the logistic regression performance. Admitidly, I didn't have time to investigate which feature where having the largest effect on the data which would've been helpful for that selection. Since there doesn't seem to be over-fitting occuring with the logistic regression model, I decided to include the entire feature set. The random forest model is still over-fitting even when controlling the 'max_features' parameter to 3. Therefore, it looks like the logistic regression model is the way to go at this point.

In [144]:
feature_keys = ['tourney seed','weighted win pct','owp','oowp','avg win margin','std win margin','avg loss margin',
                'std loss margin','capped avg win margin','capped std win margin','capped avg loss margin',
                'capped std loss margin','close wins','close losses','weighted top64 wins','weighted top32 wins',
                'weighted top16 wins','weighted top8 wins','weighted top64 losses','weighted top32 losses',
                'weighted top16 losses','weighted top8 losses','last10 win pct','last10 weighted win pct',
                'last5 win pct','last5 weighted win pct','conference tourney wins','conference champ']

In [145]:
X, y = analysis.seasons_generate_tourney_model_data(feature_keys=feature_keys)

In [146]:
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=100)

In [147]:
X_train.shape

(885, 56)

In [148]:
y_train.shape

(885,)

In [149]:
X_test.shape

(296, 56)

In [150]:
y_test.shape

(296,)

In [151]:
C_values = [10**x for x in range(-4,5)]
C_values

[0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]

In [152]:
for C in C_values:
    logreg = LogisticRegression(C=C).fit(X_train,y_train)
    
    print(f'Regularization Value: {C}')
    print(f'Training Score: {logreg.score(X_train,y_train):.3f}')
    print(f'Test Score: {logreg.score(X_train,y_train):.3f}')
    print(f'Predictions Score: {analysis.score_model_predictions(y_test,logreg_pred):.3f}')
    print()

Regularization Value: 0.0001
Training Score: 0.704
Test Score: 0.704
Predictions Score: 0.547

Regularization Value: 0.001
Training Score: 0.707
Test Score: 0.707
Predictions Score: 0.547

Regularization Value: 0.01
Training Score: 0.716
Test Score: 0.716
Predictions Score: 0.547

Regularization Value: 0.1
Training Score: 0.715
Test Score: 0.715
Predictions Score: 0.547



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Regularization Value: 1
Training Score: 0.715
Test Score: 0.715
Predictions Score: 0.547

Regularization Value: 10
Training Score: 0.715
Test Score: 0.715
Predictions Score: 0.547

Regularization Value: 100
Training Score: 0.719
Test Score: 0.719
Predictions Score: 0.547

Regularization Value: 1000
Training Score: 0.721
Test Score: 0.721
Predictions Score: 0.547

Regularization Value: 10000
Training Score: 0.720
Test Score: 0.720
Predictions Score: 0.547



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In this case, a higher regularization value was better but didn't show model improvement above the default C=1 so I decided to leave the default parameter as-is.

# Development Phase 1 - Conclusions

For this phase, the models were trained using features created from game summary data (who won and the score). The features were derived from regular season data because when we go to make predictions on new tournaments, that's the only data we will have. However, after the features were created using each seasons' regular season games, only match-ups in the historical NCAA tournaments were used to train how these features can be used to predict tournament outcomes.

Information generated from the game summary data included:
* Overall team season performance (win %, weighted win %)
* Team strength of schedule
* Team win and loss margins
* How many 'quality' wins a team had
* The late season form of a team entering the tournament


One thing that was clear was the importance of the team's seed to game prediction, even over some of the more intuitively descriptive features of a teams quality. A likely explanation for this is that a lot of the information about the historical performance of a team over the season is baked into the their tournament seed, and by extension, their overall National ranking and standings in aggregate rating systems that were used to by the tournament selection committe to make the seeding decisions.


Although we were able to achieve modest performance improvements compared to just the seed or random guesses, particularly in regard to refining the confidence of our predictions, it wasn't overwhelmingly good. It's pretty clear that to achieve the type of improved predictive performance we would hope to get using machine learning, additional and more detailed information about the teams' performance is needed. This information will hopefully improve our model by not just utilizing game outcome information, but rather information about how the teams achieved those outcomes, which could be a significant factor in how two teams match-up in the context of the game, rather than just their wins and losses over the season.

# Depth Limit Tests

## RPI (No Seed)

In [153]:
feature_keys = ['weighted win pct','owp','oowp']

In [154]:
X, y = analysis.seasons_generate_tourney_model_data(feature_keys=feature_keys)

In [155]:
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=100)

In [156]:
X_train.shape

(885, 6)

In [157]:
y_train.shape

(885,)

In [158]:
X_test.shape

(296, 6)

In [159]:
y_test.shape

(296,)

In [160]:
forest = RandomForestClassifier(n_estimators=1000,max_features='sqrt',max_depth=5,random_state=100).fit(X_train,y_train)

In [161]:
forest.score(X_train,y_train)

0.7728813559322034

In [162]:
forest.score(X_test,y_test)

0.6824324324324325

In [163]:
forest_pred = mm.bound_predictions(forest.predict_proba(X_test)[:,1])

In [164]:
analysis.score_model_predictions(y_test,forest_pred)

0.561875917108726

## RPI + Seed

In [165]:
feature_keys = ['tourney seed','weighted win pct','owp','oowp']

In [166]:
X, y = analysis.seasons_generate_tourney_model_data(feature_keys=feature_keys)

In [167]:
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=100)

In [168]:
X_train.shape

(885, 8)

In [169]:
y_train.shape

(885,)

In [170]:
X_test.shape

(296, 8)

In [171]:
y_test.shape

(296,)

In [172]:
forest = RandomForestClassifier(n_estimators=1000,max_features='sqrt',max_depth=5,random_state=100).fit(X_train,y_train)

In [173]:
forest.score(X_train,y_train)

0.7830508474576271

In [174]:
forest.score(X_test,y_test)

0.6824324324324325

In [175]:
forest_pred = mm.bound_predictions(forest.predict_proba(X_test)[:,1])

In [176]:
analysis.score_model_predictions(y_test,forest_pred)

0.5403769996676242

## Margin of Victory

In [177]:
feature_keys = ['tourney seed','weighted win pct','owp','oowp','avg win margin','std win margin','avg loss margin',
                'std loss margin','capped avg win margin','capped std win margin','capped avg loss margin',
                'capped std loss margin','close wins','close losses']

In [178]:
X, y = analysis.seasons_generate_tourney_model_data(feature_keys=feature_keys)

In [179]:
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=100)

In [180]:
X_train.shape

(885, 28)

In [181]:
y_train.shape

(885,)

In [182]:
X_test.shape

(296, 28)

In [183]:
y_test.shape

(296,)

In [184]:
forest = RandomForestClassifier(n_estimators=1000,max_features='sqrt',max_depth=5,random_state=100).fit(X_train,y_train)

In [185]:
forest.score(X_train,y_train)

0.8146892655367232

In [186]:
forest.score(X_test,y_test)

0.6993243243243243

In [187]:
forest_pred = mm.bound_predictions(forest.predict_proba(X_test)[:,1])

In [188]:
analysis.score_model_predictions(y_test,forest_pred)

0.5390944304831864

## Quality Wins

In [189]:
feature_keys = ['tourney seed','weighted win pct','owp','oowp','avg win margin','std win margin','avg loss margin',
                'std loss margin','capped avg win margin','capped std win margin','capped avg loss margin',
                'capped std loss margin','close wins','close losses','weighted top64 wins','weighted top32 wins',
                'weighted top16 wins','weighted top8 wins','weighted top64 losses','weighted top32 losses',
                'weighted top16 losses','weighted top8 losses']

In [190]:
X, y = analysis.seasons_generate_tourney_model_data(feature_keys=feature_keys)

In [191]:
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=100)

In [192]:
X_train.shape

(885, 44)

In [193]:
y_train.shape

(885,)

In [194]:
X_test.shape

(296, 44)

In [195]:
y_test.shape

(296,)

In [196]:
forest = RandomForestClassifier(n_estimators=1000,max_features='sqrt',max_depth=5,random_state=100).fit(X_train,y_train)

In [197]:
forest.score(X_train,y_train)

0.8146892655367232

In [198]:
forest.score(X_test,y_test)

0.6959459459459459

In [199]:
forest_pred = mm.bound_predictions(forest.predict_proba(X_test)[:,1])

In [200]:
analysis.score_model_predictions(y_test,forest_pred)

0.5471213663536674

## Late Season Form

In [201]:
feature_keys = ['tourney seed','weighted win pct','owp','oowp','avg win margin','std win margin','avg loss margin',
                'std loss margin','capped avg win margin','capped std win margin','capped avg loss margin',
                'capped std loss margin','close wins','close losses','weighted top64 wins','weighted top32 wins',
                'weighted top16 wins','weighted top8 wins','weighted top64 losses','weighted top32 losses',
                'weighted top16 losses','weighted top8 losses','last10 win pct','last10 weighted win pct',
                'last5 win pct','last5 weighted win pct','conference tourney wins','conference champ']

In [202]:
X, y = analysis.seasons_generate_tourney_model_data(feature_keys=feature_keys)

In [203]:
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=100)

In [204]:
X_train.shape

(885, 56)

In [205]:
y_train.shape

(885,)

In [206]:
X_test.shape

(296, 56)

In [207]:
y_test.shape

(296,)

In [208]:
forest = RandomForestClassifier(n_estimators=1000,max_features='sqrt',max_depth=5,random_state=100).fit(X_train,y_train)

In [209]:
forest.score(X_train,y_train)

0.8169491525423729

In [210]:
forest.score(X_test,y_test)

0.6925675675675675

In [211]:
forest_pred = mm.bound_predictions(forest.predict_proba(X_test)[:,1])

In [212]:
analysis.score_model_predictions(y_test,forest_pred)

0.5476625589555301

# Max Depth Conclusions

The competition score of the final feature set ended up being the same, however using max_depth to control model complexity was clearly more effective in reducing over-fitting. The training test score was brought down towards roughly 0.8 rather than being locked at a perfect 1.0. Additionally the model performance of the random forest was better early with less features and information than with the previous strategy which limited the max_features parameter.