# Logistic Regression

#### I use logistic regression to predict the winner of the Super Bowl from regular-season statistics.

##### Imports

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report

ModuleNotFoundError: No module named 'imblearn'

I first load the processed PCA and non-PCA season statistics as DataFrames and drop the `Year` and `Team` columns so that they are not used as features.

In [None]:
# Read each processed table and discard the "Year" and "Team" columns in one
# chained step, so every remaining column is either a model input or the label.
stats = pd.read_csv("../data/processed/NFL_stats.csv").drop(columns = ["Year", "Team"])
pca_stats = pd.read_csv("../data/processed/PCA_NFL_stats.csv").drop(columns = ["Year", "Team"])

I then split the data into training and testing sets, using stratification to ensure proportional representation of Super Bowl winners in both sets. This is important because only one team wins each season, so winners (the positive class in this dataset) make up only about 1/32 of the data; an unstratified split could easily leave the test set with almost no winners.

In [None]:
def split_features_and_target(frame, splitter, target = "Superbowl Status"):
    """Split `frame` into train/test feature tables and label series.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table holding the feature columns plus the `target` column.
    splitter : object with a ``split(X, y)`` method
        Splitter yielding ``(train_positions, test_positions)`` pairs; only
        the first pair is used.
    target : str
        Name of the label column used for stratification and removed from
        the returned feature tables.

    Returns
    -------
    tuple
        ``(features_train, features_test, target_train, target_test)``.
    """
    train_pos, test_pos = next(splitter.split(frame, frame[target]))

    # The splitter returns *positions*, not index labels, so select rows with
    # .iloc; .loc only gives the same rows while the index is a default 0..n-1 range.
    train, test = frame.iloc[train_pos], frame.iloc[test_pos]

    return (
        train.drop(columns = [target]),
        test.drop(columns = [target]),
        train[target],
        test[target],
    )


# One stratified 80/20 split; the fixed seed makes both calls below reproducible.
split = StratifiedShuffleSplit(n_splits = 1, test_size = 0.2, random_state = 69)

stats_train, stats_test, sb_train, sb_test = split_features_and_target(stats, split)
pca_stats_train, pca_stats_test, pca_sb_train, pca_sb_test = split_features_and_target(pca_stats, split)

# Only the last expression of a cell is rendered, so a single table is shown here.
pca_stats_test

I then use SMOTE to oversample the minority class (teams that won the Super Bowl). Only the training sets are resampled, so the test sets keep their original class balance.

In [None]:
# A single seeded SMOTE sampler is reused for both training sets.
smote = SMOTE(random_state = 69)

# Each call replaces the training features/labels with their resampled versions;
# the test sets are not touched.
stats_train, sb_train = smote.fit_resample(X = stats_train, y = sb_train)
pca_stats_train, pca_sb_train = smote.fit_resample(X = pca_stats_train, y = pca_sb_train)

I then train logistic regression classifiers on both the non-PCA and the PCA data, tuning the penalty, regularization strength, and solver with GridSearchCV.

In [None]:
LogReg = LogisticRegression()

# The grid is split in two because not every solver supports every penalty:
# 'newton-cg' and 'lbfgs' handle only L2, while 'liblinear' handles both L1 and
# L2. Crossing all penalties with all solvers would schedule fits that can only
# fail. C spans 1e-3 .. 1e3, one value per decade.
params = [
    {
        'penalty' : ['l2'],
        'C'       : np.logspace(-3,3,7),
        'solver'  : ['newton-cg', 'lbfgs', 'liblinear']
    },
    {
        'penalty' : ['l1'],
        'C'       : np.logspace(-3,3,7),
        'solver'  : ['liblinear']
    },
]

# GridSearchCV clones the estimator for every fit, so sharing LogReg between
# the two searches is safe. fit() returns the fitted search object itself.
grid_log = GridSearchCV(LogReg, params).fit(stats_train, sb_train)
pca_grid_log = GridSearchCV(LogReg, params).fit(pca_stats_train, pca_sb_train)
