In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import confusion_matrix, classification_report

I first extract the processed PCA and non-PCA season statistics data and load them as dataframes

In [2]:
stats = pd.read_csv("../data/processed/NFL_stats.csv")
pca_stats = pd.read_csv("../data/processed/PCA_NFL_stats.csv")

stats = stats.drop(["Year", "Team"], axis = 1)
pca_stats = pca_stats.drop(["Year", "Team"], axis = 1)

I then split the data into training and testing sets using stratification to ensure proportional representation of Superbowl winners. This is important because, by definition, winners (regarded as the positive class in this dataset) will constitute only 1/32 of the total data.

In [3]:
split = StratifiedShuffleSplit(n_splits = 1, test_size = 0.2, random_state = 69)

for train_index, test_index in split.split(stats, stats["Superbowl Status"]):
    stats_train = stats.loc[train_index]
    stats_test = stats.loc[test_index]

sb_train = stats_train["Superbowl Status"]
sb_test = stats_test["Superbowl Status"]

stats_train = stats_train.drop(["Superbowl Status"], axis = 1)
stats_test = stats_test.drop(["Superbowl Status"], axis = 1)

stats_test

for train_index, test_index in split.split(pca_stats, pca_stats["Superbowl Status"]):
    pca_stats_train = pca_stats.loc[train_index]
    pca_stats_test = pca_stats.loc[test_index]

pca_sb_train = pca_stats_train["Superbowl Status"]
pca_sb_test = pca_stats_test["Superbowl Status"]

pca_stats_train = pca_stats_train.drop(["Superbowl Status"], axis = 1)
pca_stats_test = pca_stats_test.drop(["Superbowl Status"], axis = 1)

pca_stats_test

Unnamed: 0,pca0
113,-1.145308
106,0.698567
306,-0.787280
583,-0.671622
131,-0.898767
...,...
146,-0.920795
277,1.570849
175,-1.107329
281,0.369291


I then use SMOTE to oversample the minority class (teams that won a SuperBowl).

In [4]:
smote = SMOTE(random_state=69)
stats_train, sb_train = smote.fit_resample(stats_train, sb_train)
pca_stats_train, pca_sb_train = smote.fit_resample(pca_stats_train, pca_sb_train)

I then use Randomized Search to optimize two randomized forest classifiers for both PCA and non-PCA data

In [1]:
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

NameError: name 'np' is not defined