In [57]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import PowerTransformer
from sklearn.tree import DecisionTreeRegressor
from sklearn import tree
from tqdm import tqdm

In [2]:
pd.set_option('display.max_columns', None)

# Import Preprocessed Data

In [45]:
data = pd.read_csv('../data/preprocess_data.csv')
data = data.sample(frac=1).reset_index(drop=True)

idx = data['Yards']>=0
data = data[idx]

print(data.shape)
data.head()

(27678, 62)


Unnamed: 0,GameId,PlayId,NflIdRusher,DisplayName,Season,Yards,X,Y,YardLine,HomeScoreBeforePlay,VisitorScoreBeforePlay,DefendersInTheBox_IMP,PlayerWeight,PlayerHeight_ADJ,Temperature_IMP,Humidity_IMP,WindSpeed_IMP,Team_away,Team_home,Team_nan,Quarter_1.0,Quarter_2.0,Quarter_3.0,Quarter_4.0,Quarter_5.0,Quarter_nan,Down_1.0,Down_2.0,Down_3.0,Down_4.0,Down_nan,OffenseFormation_ACE,OffenseFormation_EMPTY,OffenseFormation_I_FORM,OffenseFormation_JUMBO,OffenseFormation_PISTOL,OffenseFormation_SHOTGUN,OffenseFormation_SINGLEBACK,OffenseFormation_WILDCAT,OffenseFormation_nan,PlayDirection_left,PlayDirection_right,PlayDirection_nan,StadiumType_ADJ_Indoor,StadiumType_ADJ_Outdoor,StadiumType_ADJ_Retractable Roof,StadiumType_ADJ_Retractable Roof - Closed,StadiumType_ADJ_Retractable Roof - Opened,StadiumType_ADJ_nan,GameWeather_ADJ_Clear,GameWeather_ADJ_Cloudy,GameWeather_ADJ_Indoor,GameWeather_ADJ_Light Rain,GameWeather_ADJ_Mostly Clear,GameWeather_ADJ_Mostly Cloudy,GameWeather_ADJ_Mostly Sunny,GameWeather_ADJ_Partly Cloudy,GameWeather_ADJ_Partly Sunny,GameWeather_ADJ_Rain,GameWeather_ADJ_Snow,GameWeather_ADJ_Sunny,GameWeather_ADJ_nan
0,2019092208,20190922082499,2557997,Christian McCaffrey,2019,5,60.34,28.71,46,17,14,7,205,71,68.0,0.0,0.0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,2017120308,20171203083759,2495328,Bilal Powell,2017,0,17.86,24.38,4,30,31,6,204,70,48.0,64.0,0.0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
2,2018112506,20181125063733,2560949,Josh Adams,2018,3,90.19,28.08,25,22,22,6,225,74,56.0,58.0,10.0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
3,2019112403,20191124030376,2557976,Joe Mixon,2019,1,86.71,25.13,30,0,0,6,220,73,46.0,62.0,12.5,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
4,2017112601,20171126010661,2550189,Isaiah Crowell,2017,9,15.64,24.52,11,7,3,7,225,71,46.0,52.0,3.0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0


# Split Data (Training, Testing, CFV)

Testing: 2019 season  
Training/CFV: 2017 & 2018 seasons

In [46]:
data.groupby('Season')['PlayId'].count()/data.shape[0]

Season
2017    0.379652
2018    0.364983
2019    0.255365
Name: PlayId, dtype: float64

In [47]:
idx = data['Season'] != 2019

train_data = data[idx].reset_index(drop=True)
test_data = data[~idx].reset_index(drop=True)

In [48]:
train_data.head()

Unnamed: 0,GameId,PlayId,NflIdRusher,DisplayName,Season,Yards,X,Y,YardLine,HomeScoreBeforePlay,VisitorScoreBeforePlay,DefendersInTheBox_IMP,PlayerWeight,PlayerHeight_ADJ,Temperature_IMP,Humidity_IMP,WindSpeed_IMP,Team_away,Team_home,Team_nan,Quarter_1.0,Quarter_2.0,Quarter_3.0,Quarter_4.0,Quarter_5.0,Quarter_nan,Down_1.0,Down_2.0,Down_3.0,Down_4.0,Down_nan,OffenseFormation_ACE,OffenseFormation_EMPTY,OffenseFormation_I_FORM,OffenseFormation_JUMBO,OffenseFormation_PISTOL,OffenseFormation_SHOTGUN,OffenseFormation_SINGLEBACK,OffenseFormation_WILDCAT,OffenseFormation_nan,PlayDirection_left,PlayDirection_right,PlayDirection_nan,StadiumType_ADJ_Indoor,StadiumType_ADJ_Outdoor,StadiumType_ADJ_Retractable Roof,StadiumType_ADJ_Retractable Roof - Closed,StadiumType_ADJ_Retractable Roof - Opened,StadiumType_ADJ_nan,GameWeather_ADJ_Clear,GameWeather_ADJ_Cloudy,GameWeather_ADJ_Indoor,GameWeather_ADJ_Light Rain,GameWeather_ADJ_Mostly Clear,GameWeather_ADJ_Mostly Cloudy,GameWeather_ADJ_Mostly Sunny,GameWeather_ADJ_Partly Cloudy,GameWeather_ADJ_Partly Sunny,GameWeather_ADJ_Rain,GameWeather_ADJ_Snow,GameWeather_ADJ_Sunny,GameWeather_ADJ_nan
0,2017120308,20171203083759,2495328,Bilal Powell,2017,0,17.86,24.38,4,30,31,6,204,70,48.0,64.0,0.0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
1,2018112506,20181125063733,2560949,Josh Adams,2018,3,90.19,28.08,25,22,22,6,225,74,56.0,58.0,10.0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
2,2017112601,20171126010661,2550189,Isaiah Crowell,2017,9,15.64,24.52,11,7,3,7,225,71,46.0,52.0,3.0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
3,2017101506,20171015060094,2540011,Chris Thompson,2017,0,40.95,25.9,36,0,0,6,195,68,68.0,85.0,4.0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
4,2017112604,20171126040663,2550512,Damien Williams,2017,8,91.68,23.02,23,14,0,6,224,71,47.0,37.0,17.0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0


In [85]:
drop_cols = ['GameId', 'PlayId', 'NflIdRusher', 'DisplayName', 'Season', 'Yards']
quant_cols = ['X', 'Y', 'YardLine', 'HomeScoreBeforePlay', 'VisitorScoreBeforePlay', 'DefendersInTheBox_IMP',
             'PlayerWeight', 'PlayerHeight_ADJ', 'Temperature_IMP', 'Humidity_IMP', 'WindSpeed_IMP']
X_train = train_data.drop(drop_cols, axis=1)
X_test = test_data.drop(drop_cols, axis=1)

# Try power tranformation
#X_train = train_data[quant_cols]
#pt = PowerTransformer()
#X_train = pt.fit_transform(X_train)
#X_test = test_data[quant_cols]
#X_test = pt.transform(X_test)

y_train = train_data['Yards']
y_test = test_data['Yards']

# Build Models

In [7]:
METRICS = ['r2', 'explained_variance','neg_root_mean_squared_error', 'neg_mean_absolute_percentage_error']

### Decision Tree

In [86]:
#Baseline model: Use default parameters
dtr = DecisionTreeRegressor(max_depth=25, min_samples_split=5, min_samples_leaf=5)

dtr_cfv_results = cross_validate(dtr, X_train, y_train, 
                                 scoring=METRICS, 
                                 cv=5, n_jobs=-1,
                                 return_train_score=True, return_estimator=True)
dtr_cfv_results

{'fit_time': array([0.22829199, 0.21102929, 0.21094489, 0.2077868 , 0.21007609]),
 'score_time': array([0.00545096, 0.00501585, 0.00525737, 0.00470018, 0.00494003]),
 'estimator': [DecisionTreeRegressor(max_depth=25, min_samples_leaf=5, min_samples_split=5),
  DecisionTreeRegressor(max_depth=25, min_samples_leaf=5, min_samples_split=5),
  DecisionTreeRegressor(max_depth=25, min_samples_leaf=5, min_samples_split=5),
  DecisionTreeRegressor(max_depth=25, min_samples_leaf=5, min_samples_split=5),
  DecisionTreeRegressor(max_depth=25, min_samples_leaf=5, min_samples_split=5)],
 'test_r2': array([-0.35288941, -0.38000703, -0.39679577, -0.44888174, -0.40660146]),
 'train_r2': array([0.37982538, 0.40996716, 0.42691312, 0.42033648, 0.42961789]),
 'test_explained_variance': array([-0.35238355, -0.379765  , -0.39679382, -0.44841972, -0.40644312]),
 'train_explained_variance': array([0.37982538, 0.40996716, 0.42691312, 0.42033648, 0.42961789]),
 'test_neg_root_mean_squared_error': array([-7.30577

In [87]:
estimator = dtr_cfv_results['estimator'][0]
sort_idx = np.argsort(estimator.feature_importances_)
estimator.feature_names_in_[sort_idx]

array(['GameWeather_ADJ_nan', 'StadiumType_ADJ_nan', 'PlayDirection_nan',
       'OffenseFormation_nan', 'OffenseFormation_WILDCAT',
       'OffenseFormation_EMPTY', 'OffenseFormation_ACE', 'Down_nan',
       'StadiumType_ADJ_Retractable Roof - Closed', 'Quarter_nan',
       'Team_nan', 'StadiumType_ADJ_Retractable Roof - Opened',
       'GameWeather_ADJ_Partly Sunny', 'GameWeather_ADJ_Mostly Clear',
       'GameWeather_ADJ_Snow', 'GameWeather_ADJ_Light Rain',
       'OffenseFormation_PISTOL', 'GameWeather_ADJ_Mostly Sunny',
       'Down_4.0', 'OffenseFormation_JUMBO', 'StadiumType_ADJ_Indoor',
       'GameWeather_ADJ_Indoor', 'PlayDirection_left', 'Quarter_5.0',
       'GameWeather_ADJ_Clear', 'StadiumType_ADJ_Retractable Roof',
       'Quarter_1.0', 'Team_home', 'GameWeather_ADJ_Partly Cloudy',
       'Team_away', 'GameWeather_ADJ_Rain', 'GameWeather_ADJ_Sunny',
       'PlayDirection_right', 'Down_3.0', 'GameWeather_ADJ_Mostly Cloudy',
       'Quarter_2.0', 'Quarter_4.0', 'Down_2.0',

### Regression

In [73]:
#Baseline model: Use default parameters
lr = LinearRegression(normalize=True)

lr_cfv_results = cross_validate(lr, X_train, y_train, 
                                 scoring=METRICS, 
                                 cv=5, return_train_score=True, return_estimator=True)
lr_cfv_results

If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)


If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)


If you wish to scale the data, use Pipeline wi

{'fit_time': array([0.01104689, 0.01197195, 0.01069307, 0.00872016, 0.00862098]),
 'score_time': array([0.00255585, 0.00208521, 0.00178123, 0.00151706, 0.00156212]),
 'estimator': [LinearRegression(normalize=True),
  LinearRegression(normalize=True),
  LinearRegression(normalize=True),
  LinearRegression(normalize=True),
  LinearRegression(normalize=True)],
 'test_r2': array([0.01452236, 0.01335511, 0.00392329, 0.0111623 , 0.01209602]),
 'train_r2': array([0.01171333, 0.01195487, 0.01431506, 0.01254799, 0.01225314]),
 'test_explained_variance': array([0.01479565, 0.01344173, 0.00451849, 0.01117273, 0.0123098 ]),
 'train_explained_variance': array([0.01171333, 0.01195487, 0.01431506, 0.01254799, 0.01225314]),
 'test_neg_root_mean_squared_error': array([-6.23532015, -6.41633312, -6.56461129, -6.06452288, -6.28397927]),
 'train_neg_root_mean_squared_error': array([-6.33003575, -6.28476699, -6.24679691, -6.37137538, -6.31806302]),
 'test_neg_mean_absolute_percentage_error': array([-2.40327

In [74]:
#Baseline model: Use default parameters
lrr = Ridge(normalize=True)

lrr_cfv_results = cross_validate(lrr, X_train, y_train, 
                                 scoring=METRICS, 
                                 cv=5, return_train_score=True, return_estimator=True)
lrr_cfv_results

If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), Ridge())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)

Set parameter alpha to: original_alpha * n_samples. 
If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), Ridge())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)

Set parameter alp

{'fit_time': array([0.00728488, 0.00844908, 0.00714397, 0.00698018, 0.00672412]),
 'score_time': array([0.00174832, 0.00158691, 0.00170064, 0.00145102, 0.00125003]),
 'estimator': [Ridge(normalize=True),
  Ridge(normalize=True),
  Ridge(normalize=True),
  Ridge(normalize=True),
  Ridge(normalize=True)],
 'test_r2': array([0.01063004, 0.00964503, 0.00515209, 0.00904214, 0.00946957]),
 'train_r2': array([0.00895564, 0.00920951, 0.01098057, 0.00966718, 0.00933362]),
 'test_explained_variance': array([0.0108542 , 0.00973823, 0.00565257, 0.0090447 , 0.00973669]),
 'train_explained_variance': array([0.00895564, 0.00920951, 0.01098057, 0.00966718, 0.00933362]),
 'test_neg_root_mean_squared_error': array([-6.24762179, -6.42838548, -6.56056086, -6.07102084, -6.29232704]),
 'train_neg_root_mean_squared_error': array([-6.3388612 , -6.29349227, -6.25735419, -6.38066262, -6.3273934 ]),
 'test_neg_mean_absolute_percentage_error': array([-2.42797967e+15, -2.34256678e+15, -2.26388973e+15, -2.21099183e

### Evaluate

### Optimize

### Output models

# Extract Important Features

# Summarize Results

# TODOs
1. Import preprocessed data-DONE
2. Split data into training and testing
3. CFV
4. Build decision tree model with default hyperparameters
5. Build regression model with default hyperparameters
6. Evaluate performance
7. Optimize models and re-evaluate performance  
    a. STRETCH: Using Bayesian Optimization
8. Extract most import features  
    a. STRETCH: Implement SHAP
9. Output model objects to the `objects` directory
10. Summarize findings