# Tune Hyperparameters of Baseline Models (Logistic Regression, GBDT) 

Determine optimal hyperparameters for predicting NFL play calls using Logistic Regression and Gradient Boosted Decision Trees (GBDT). A random search (Keras Tuner's `RandomSearchOracle`, 100 trials) tunes the model type together with the parameters of each model type. The data is kept non-sequential so that the results can be compared with the performance of sequential neural networks.

## Load Libraries

In [1]:
import json
import os
import time

import pandas as pd
import numpy as np

import keras_tuner as kt

import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn import metrics
from sklearn.model_selection import PredefinedSplit

from functools import partial

pd.options.mode.chained_assignment = None

No supported GPU was found.


## Define Functions For Model Specs and Data Processing

In [2]:
def specify(specs):
    """
    Resolve the run settings from the user specifications.
    Chooses the data and results locations for an HPC or a local run
    and reads the feature and lag settings.
    ---------------------------------------------
    Inputs: specs: (dict) parsed specifications .json; every entry is
            read through its 'value' key
    Returns: data_dir: location of the play-by-play data
             results_dir: location for storing search results
             cont_feats: the 'CONT_FEATS' entry (list of continuous features)
             max_lag: the 'MAX_PLAY_LAG' entry (max play lags)
    """
    on_hpc = specs['HPC']['value']

    if not on_hpc:
        # local run: both locations come straight from the specifications
        data_dir = specs['LOCAL_DATA_DIR']['value']
        results_dir = specs['LOCAL_RESULTS_DIR']['value']
    else:
        # hpc run: data and results both live under the working directory
        cwd = os.getcwd()
        data_dir = cwd + '/processed_pbp.csv'
        results_dir = cwd + '/search_results'

    return data_dir, results_dir, specs['CONT_FEATS']['value'], specs['MAX_PLAY_LAG']['value']
# ********************************************************************
def add_lagged_play_calls(input_df, max_lag):
    """
    Given a pbp data frame, add feature columns 'play_lag1' ... 
    'play_lag<max_lag>' holding the previous 'pass' values of the 
    same 'posteam'. Rows left with a missing value are dropped, which 
    removes the first max_lag plays of each team.
    The caller's data frame is not modified.
    ---------------------------------------------
    Inputs: input_df: Pandas df shape (total_plays, n_features); must 
            contain the columns 'posteam' and 'pass'
            max_lag: (int) The maximum number of lagged play call
            values to add as columns to the pbp data frame
    Returns: A new pbp data frame with the lag columns added
    """
    # work on a copy so the lag columns are not written into the caller's frame
    lagged_df = input_df.copy()

    # the grouping does not change between lags, so build it once
    team_calls = lagged_df.groupby(['posteam'])['pass']

    for lag in range(1, max_lag + 1):
        lagged_df['play_lag' + str(lag)] = team_calls.shift(lag)

    # NOTE: dropna() drops a row when ANY column is missing, not only the
    # lag columns, so rows with gaps in other features are removed as well
    return lagged_df.dropna()
# ********************************************************************
def process_data(input_df, continous_feats, num_feats, prior_selected_feats = None, feat_select = False):
    """
    Converts a pbp data frame into X and y matrices 
    for training/validation. Continuous features are standardized 
    and then normalized. The caller's data frame is not modified.
    ---------------------------------------------
    Inputs: input_df: Pandas df shape (total_plays, n_features); must 
            contain the response column 'pass'
            continous_feats:(ls) List of continuous features
            num_feats:(int) Number of features to select 
            (only used when feat_select is True)
            prior_selected_feats: np array, features selected from training 
            (required when feat_select is False)
            feat_select: (bool) whether to select features or use previously selected
    Returns: X: np array shape (num_samples, selected_features), float32
             y: np array shape (num_samples), float32
             selected_feats: names of the selected features 
             (returned only when feat_select is True)
    Raises: ValueError if feat_select is False and no 
            prior_selected_feats are given
    """
    if not feat_select and prior_selected_feats is None:
        raise ValueError("prior_selected_feats must be provided when feat_select is False")

    # work on a copy so the scaling below is not written into the caller's frame
    df = input_df.copy()

    # scale the continuous features of the input
    # NOTE(review): both transformers are re-fit on whatever frame is passed in,
    # so a test frame is scaled with its own statistics rather than the
    # training statistics -- confirm this is intended
    df.loc[:,continous_feats] = StandardScaler().fit_transform(df.loc[:,continous_feats])
    df.loc[:,continous_feats] = Normalizer().fit_transform(df.loc[:,continous_feats])

    # select features if specified 
    if feat_select:
        selector = SelectKBest(score_func = partial(mutual_info_classif, random_state=19), k = num_feats)

        # candidates are every column from position 4 onwards except the response
        selector.fit(df.iloc[:,4:].drop(['pass'], axis = 1), df.loc[:,'pass'])
        selected_feats = selector.get_feature_names_out()
    else:
        selected_feats = prior_selected_feats

    # convert to numpy feature and response arrays 
    X = df.loc[:,selected_feats].to_numpy().astype(np.float32)
    y = df.loc[:,'pass'].to_numpy().astype(np.float32)

    if feat_select:
        return X, y, selected_feats
    return X, y
# ********************************************************************


## Get specifications and Load Data Set

In [3]:
# read the user specifications (json) from disk
with open('/Users/joe/documents/Masters_Project/NFL-Play-Call-Prediction-with-LSTM-Neural-Networks/src/specifications.json') as f:
    specifications = json.loads(f.read())

# unpack directories, continuous features and max play lag
(data_dir, results_dir, cont_feats, max_lag) = specify(specifications)

# read the play-by-play csv and attach the lagged play calls
pbp = pd.read_csv(data_dir).pipe(add_lagged_play_calls, max_lag)

## Process Data and Select Features

In [4]:
# train and final test sets
# the first N_TRAIN_ROWS rows are the training weeks; the test set starts on the
# very next row, so no play is dropped between the two sets
# (previously the test slice began at 21435 and row 21434 was in neither set)
# NOTE(review): assumes row 21434 is the first week-13 play -- confirm against the data
N_TRAIN_ROWS = 21434

# copies keep the in-place scaling in process_data from touching pbp
train_df = pbp.iloc[:N_TRAIN_ROWS,:].copy()      # weeks 1-12
test_df = pbp.iloc[N_TRAIN_ROWS:,:].copy()       # weeks 13-17 

# process data: features are selected on the training set and reused for the test set
X_train, y_train, sel_feats = process_data(train_df, cont_feats, 60, feat_select = True)
X_test, y_test = process_data(test_df, cont_feats, 60, prior_selected_feats = sel_feats)

## Custom Cross Validation Splitting

In [167]:
# fold labels for PredefinedSplit: -1 marks a row that is never used for
# validation, 0 marks a row belonging to the single validation fold
train_ind = np.full(14524, -1, dtype = int)          # weeks 1-8
val_ind = np.zeros(6910, dtype = int)                # weeks 9-12

# training rows come first, validation rows follow
val_fold = np.concatenate((train_ind, val_ind))

# one fixed training/validation split
ps = PredefinedSplit(test_fold = val_fold)

# show the row indices that land on each side of the split
for train_index, test_index in ps.split():
    print("TRAIN:", train_index, "TEST:", test_index)


TRAIN: [    0     1     2 ... 14521 14522 14523] TEST: [14524 14525 14526 ... 21431 21432 21433]


## Define Custom Function for Tuning Baseline Models

In [168]:
def build_baseline(hp):
    """
    Builds one baseline classifier (GBDT or Log Regr) from a 
    set of hyperparameters. The model type is itself a tuned 
    choice; the remaining hyperparameters are registered inside 
    a conditional scope tied to that choice.
    --------------------------------------
    Input: hp: hyperparameter object supplied by the tuner; used 
        here through Choice, Int, Float and conditional_scope
    Returns: An unfitted GradientBoostingClassifier or 
        LogisticRegression
    """
    chosen = hp.Choice("model_type", ["GBDT", "LOG_REG"])

    # logistic regression: only the elastic-net mixing ratio is tuned
    if chosen != "GBDT":
        with hp.conditional_scope("model_type", ["LOG_REG"]):
            mixing = hp.Float("l1_ratio", min_value = 0, max_value = 1)
            return LogisticRegression(penalty = "elasticnet", l1_ratio = mixing, solver = "saga")

    # gradient boosting: tune ensemble size, row subsampling and tree shape
    with hp.conditional_scope("model_type", ["GBDT"]):
        gbdt_params = {
            "loss": "exponential",
            "n_estimators": hp.Int("n_estimators", min_value = 100, max_value = 500),
            "subsample": hp.Float("subsample", min_value = 0.5, max_value = 1.0),
            "min_samples_split": hp.Int("min_samples_split", min_value = 2, max_value = 15),
            "min_samples_leaf": hp.Int("min_samples_leaf", min_value = 1, max_value = 10),
            "max_depth": hp.Int("max_depth", min_value = 1, max_value = 5),
            "max_features": 'sqrt',
        }
        return GradientBoostingClassifier(**gbdt_params)



## Hyperparameter Tuning

In [179]:
# random search over the build_baseline space, maximising the 'score' objective
search_oracle = kt.oracles.RandomSearchOracle(
    objective = kt.Objective('score', 'max'),
    max_trials = 100)

# additional metrics passed to the tuner alongside the accuracy scorer
trial_metrics = [metrics.roc_auc_score, metrics.recall_score, metrics.precision_score, metrics.log_loss]

# build tuner: accuracy is the scorer, ps supplies the train/validation split
tuner = kt.tuners.SklearnTuner(
    oracle = search_oracle,
    hypermodel = build_baseline,
    scoring = metrics.make_scorer(metrics.accuracy_score),
    metrics = trial_metrics,
    cv = ps,
    directory = results_dir,
    overwrite = True,
    project_name = 'baseline models')

In [180]:
# run the hyperparameter search on the training data; the tuner was built with
# cv = ps, so that split decides which rows are used for validation
tuner.search(X_train, y_train)

Trial 100 Complete [00h 00m 00s]
score: 0.7219971056439942

Best score So Far: 0.7319826338639652
Total elapsed time: 00h 04m 16s
INFO:tensorflow:Oracle triggered exit


## Retrain the Model Using Best Hyperparameters and Evaluate on Test Data

In [None]:
def evaluate_model(model, X_test, y_test):
    """
    Calculate various classification metrics for a 
    baseline model, save results to dataframe
    ----------------------------------------------
    inputs: model: trained model object with a predict method; 
            predict_proba or decision_function is used for the 
            AUC when available
            X_test: test feature matrix
            y_test: observed target values (binary)
    output: pandas df with one row and the columns 
            Model, AUC, PRE, REC, ACC, F1; the metric columns 
            are numeric
    """
    # get the model type 
    typ = type(model).__name__

    # hard class predictions for the threshold-based metrics
    y_pred = model.predict(X_test)

    # AUC measures how well a continuous score ranks the samples, so feed it
    # the positive-class probability (or decision value) instead of 0/1 labels
    if hasattr(model, 'predict_proba'):
        # column 1 is the second entry of model.classes_, i.e. the positive class
        # for 0/1 targets
        y_score = model.predict_proba(X_test)[:, 1]
    elif hasattr(model, 'decision_function'):
        y_score = model.decision_function(X_test)
    else:
        # no score available: fall back to the hard predictions
        y_score = y_pred

    # score predictions
    results = {
        'Model': typ,
        'AUC': metrics.roc_auc_score(y_test, y_score),
        'PRE': metrics.precision_score(y_test, y_pred),
        'REC': metrics.recall_score(y_test, y_pred),
        'ACC': metrics.accuracy_score(y_test, y_pred),
        'F1': metrics.f1_score(y_test, y_pred),
    }

    # save to data frame; built from a record so the metrics stay floats
    # (packing them into one numpy array with the model name made them strings)
    return pd.DataFrame([results], columns = ['Model','AUC', 'PRE', 'REC', 'ACC', 'F1'])

In [212]:
# get the best hyperparameter sets from the search (at most 60 are requested)
best_hps = tuner.get_best_hyperparameters(60)

# evaluate each model on test set and collect one result row per model
result_frames = []
for hp in best_hps:
    # build model with hyperparams
    mod = build_baseline(hp)

    # fit model on the full training set (training + validation weeks)
    mod.fit(X_train, y_train)

    # evaluate
    temp_df = evaluate_model(mod, X_test, y_test)

    # add to results 
    result_frames.append(temp_df)

# store best params: concatenate once instead of growing a frame inside the loop;
# an empty frame is kept when the search returned no hyperparameters
param_results = pd.concat(result_frames, axis = 0) if result_frames else pd.DataFrame()


In [214]:
# write the test-set metrics of the evaluated models to the results directory
param_results.to_csv(results_dir + '/baseline_param_results.csv')

In [216]:
# print the tuner's summary of its best trials (hyperparameters and score)
tuner.results_summary()

Results summary
Results in /Users/joe/documents/mas_proj_results/baseline models
Showing 10 best trials
Objective(name='score', direction='max')
Trial summary
Hyperparameters:
model_type: GBDT
n_estimators: 408
subsample: 0.898362662734536
min_samples_split: 9
min_samples_leaf: 10
max_depth: 2
Score: 0.7319826338639652
Trial summary
Hyperparameters:
model_type: GBDT
n_estimators: 114
subsample: 0.998048484928794
min_samples_split: 9
min_samples_leaf: 6
max_depth: 4
Score: 0.7318379160636759
Trial summary
Hyperparameters:
model_type: GBDT
n_estimators: 114
subsample: 0.8285771468563583
min_samples_split: 6
min_samples_leaf: 3
max_depth: 4
Score: 0.7308248914616498
Trial summary
Hyperparameters:
model_type: GBDT
n_estimators: 499
subsample: 0.6051937252215472
min_samples_split: 4
min_samples_leaf: 4
max_depth: 2
Score: 0.7306801736613604
Trial summary
Hyperparameters:
model_type: GBDT
n_estimators: 311
subsample: 0.9180318268779729
min_samples_split: 14
min_samples_leaf: 4
max_depth: 3
S