In [12]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report

I first extract the processed PCA and non-PCA season statistics data and load them as dataframes

In [5]:
stats = pd.read_csv("../data/processed/NFL_stats.csv")
pca_stats = pd.read_csv("../data/processed/PCA_NFL_stats.csv")

stats = stats.drop(["Year", "Team"], axis = 1)
pca_stats = pca_stats.drop(["Year", "Team"], axis = 1)

I then split the data into training and testing sets using stratification to ensure proportional representation of Superbowl winners. This is important because, by definition, winners (regarded as the positive class in this dataset) will constitute only 1/32 of the total data.

In [3]:
split = StratifiedShuffleSplit(n_splits = 1, test_size = 0.2, random_state = 69)

for train_index, test_index in split.split(stats, stats["Superbowl Status"]):
    stats_train = stats.loc[train_index]
    stats_test = stats.loc[test_index]

sb_train = stats_train["Superbowl Status"]
sb_test = stats_test["Superbowl Status"]

stats_train = stats_train.drop(["Superbowl Status"], axis = 1)
stats_test = stats_test.drop(["Superbowl Status"], axis = 1)

stats_test

for train_index, test_index in split.split(pca_stats, pca_stats["Superbowl Status"]):
    pca_stats_train = pca_stats.loc[train_index]
    pca_stats_test = pca_stats.loc[test_index]

pca_sb_train = pca_stats_train["Superbowl Status"]
pca_sb_test = pca_stats_test["Superbowl Status"]

pca_stats_train = pca_stats_train.drop(["Superbowl Status"], axis = 1)
pca_stats_test = pca_stats_test.drop(["Superbowl Status"], axis = 1)

pca_stats_test

Unnamed: 0,pca0
113,-1.145308
106,0.698567
306,-0.787280
583,-0.671622
131,-0.898767
...,...
146,-0.920795
277,1.570849
175,-1.107329
281,0.369291


I then use SMOTE to oversample the minority class (teams that won a SuperBowl).

In [6]:
smote = SMOTE(random_state=69)
stats_train, sb_train = smote.fit_resample(stats_train, sb_train)
pca_stats_train, pca_sb_train = smote.fit_resample(pca_stats_train, pca_sb_train)

I then use Randomized Search to optimize two randomized forest classifiers for both PCA and non-PCA data

In [7]:
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [8]:
rf = RandomForestClassifier()

rf_grid = RandomizedSearchCV(rf, param_distributions= random_grid, scoring = "recall")
rf_grid.fit(stats_train, sb_train)

15 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "/anaconda/envs/azureml_py310_sdkv2/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/anaconda/envs/azureml_py310_sdkv2/lib/python3.10/site-packages/sklearn/base.py", line 1144, in wrapper
    estimator._validate_params()
  File "/anaconda/envs/azureml_py310_sdkv2/lib/python3.10/site-packages/sklearn/base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "/anaconda/envs/azureml_py310_sdkv2/lib/python3.10/site-packages/sklearn/utils/_param_v

In [10]:
rf_grid.best_params_

{'n_estimators': 1000,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': 30,
 'bootstrap': True}

In [16]:
param_grid = {
    'n_estimators': [900,1000,1100],
    'min_samples_split':[2],
    'min_samples_leaf':[1],
    'max_features':['sqrt'],
    'max_depth':[20,30,40],
    'bootstrap': [True]
}

rf_grid = GridSearchCV(rf, param_grid, scoring = "recall")
rf_grid.fit(stats_train, sb_train)


In [17]:
preds = rf_grid.predict(stats_test)

In [18]:
confusion_matrix(sb_test, preds)

array([[117,   1],
       [  4,   0]])

In [19]:
print(classification_report(sb_test, preds))

              precision    recall  f1-score   support

           0       0.97      0.99      0.98       118
           1       0.00      0.00      0.00         4

    accuracy                           0.96       122
   macro avg       0.48      0.50      0.49       122
weighted avg       0.94      0.96      0.95       122



In [20]:
pca_random_grid = RandomizedSearchCV(rf, random_grid, scoring = 'recall')

pca_random_grid.fit(pca_stats_train, pca_sb_train)

30 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
30 fits failed with the following error:
Traceback (most recent call last):
  File "/anaconda/envs/azureml_py310_sdkv2/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/anaconda/envs/azureml_py310_sdkv2/lib/python3.10/site-packages/sklearn/base.py", line 1144, in wrapper
    estimator._validate_params()
  File "/anaconda/envs/azureml_py310_sdkv2/lib/python3.10/site-packages/sklearn/base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "/anaconda/envs/azureml_py310_sdkv2/lib/python3.10/site-packages/sklearn/utils/_param_v

In [21]:
pca_random_grid.best_params_

{'n_estimators': 1400,
 'min_samples_split': 5,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': 10,
 'bootstrap': True}

In [37]:
param_grid = {
    'n_estimators': [1200,1400,1600],
    'min_samples_split':[5],
    'min_samples_leaf':[1],
    'max_features':['sqrt'],
    'max_depth':[5,10,15],
    'bootstrap': [True]
}

pca_rf_grid = GridSearchCV(rf, param_grid, scoring = "recall")
pca_rf_grid.fit(pca_stats_train, pca_sb_train)

In [39]:
pca_rf_grid.best_params_

{'bootstrap': True,
 'max_depth': 5,
 'max_features': 'sqrt',
 'min_samples_leaf': 1,
 'min_samples_split': 5,
 'n_estimators': 1200}

In [34]:
preds = pca_rf_grid.predict(pca_stats_test)

In [35]:
confusion_matrix(pca_sb_test,preds )

array([[79, 39],
       [ 2,  2]])

In [36]:
print(classification_report(pca_sb_test, preds))

              precision    recall  f1-score   support

           0       0.98      0.67      0.79       118
           1       0.05      0.50      0.09         4

    accuracy                           0.66       122
   macro avg       0.51      0.58      0.44       122
weighted avg       0.94      0.66      0.77       122

