In [1]:
# import Pandas, NumPy and SciKit Learn modules
import pandas as pd
import numpy as np
import math
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, auc
from sklearn.dummy import DummyClassifier
from sklearn.metrics import precision_score, make_scorer

In [2]:
# load nfl data (games between 2009 to 2019)
nfl_df = pd.read_excel('data/nfl_data_2019_season_thru_week17.xlsx')

In [3]:
print(nfl_df.info())
print()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1183 entries, 0 to 1182
Columns: 1023 entries, game_date to pg4_str_diff_2
dtypes: datetime64[ns](5), float64(720), int64(288), object(10)
memory usage: 9.2+ MB
None



In [4]:
# add game schedule
schedule_df = pd.read_excel('data/schedule.xlsx')
print(schedule_df.tail())
df = (pd.merge(nfl_df, schedule_df[['game_date', 'team', 'opp', 'year', 'week']], 
                   left_on = ['game_date', 'team', 'opp'],
                    right_on = ['game_date', 'team', 'opp']))
print()
print(schedule_df.info())

      game_date opp team       week  year
5869 2020-01-12  KC  HOU   Division  2019
5870 2020-01-12  GB  SEA   Division  2019
5871 2020-01-19  KC  TEN  ConfChamp  2019
5872 2020-01-19  SF   GB  ConfChamp  2019
5873 2020-02-02  KC   SF  SuperBowl  2019

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5874 entries, 0 to 5873
Data columns (total 5 columns):
game_date    5874 non-null datetime64[ns]
opp          5874 non-null object
team         5874 non-null object
week         5874 non-null object
year         5874 non-null int64
dtypes: datetime64[ns](1), int64(1), object(3)
memory usage: 229.6+ KB
None


In [5]:
# check out the dataset
print(df.sort_values('game_date', ascending = True).tail(10))

     game_date team  opp  home_game  team_score  opp_score  complete_pass  \
17  2019-12-22  ATL  JAX          1          24         12             32   
636 2019-12-22  IND  CAR          1          38          6             14   
934 2019-12-22  PHI  DAL          1          17          9             31   
699 2019-12-22   KC  CHI          0          26          3             23   
698 2019-12-22   KC  CHI          0          26          3             23   
454 2019-12-22  DET  DEN          0          17         27             12   
455 2019-12-22  DET  DEN          0          17         27             12   
18  2019-12-22  ATL  JAX          1          24         12             32   
533 2019-12-23   GB  MIN          0          23         10             26   
534 2019-12-23   GB  MIN          0          23         10             26   

     pass_attempt  interception  sack  ...  pg4_avg_opp_fumble_lost_1  \
17             45             2     1  ...                          1   
636   

In [6]:
# custom scoring function
# set factor equal to one for precision and increase to further penalize false positives
# False positives are bad for nfl survivor
prec_plus_factor = 1.05
def calc_precision(y, y_hat):
    true_positives = ((y == True) & (y_hat == True)).sum()
    false_positives = ((y == False) & (y_hat == True)).sum()
    if math.isnan(float(true_positives)) or math.isnan(float(false_positives)):
        return np.nan
    return float(true_positives) / (true_positives + false_positives)

def calc_adjusted_precision(y, y_hat):
    true_positives = ((y == True) & (y_hat == True)).sum()
    false_positives = ((y == False) & (y_hat == True)).sum()
    if math.isnan(float(true_positives)) or math.isnan(float(false_positives)):
        return np.nan
    return float(true_positives) / (true_positives + false_positives * prec_plus_factor)

y = np.array([0, 1, 1, 1, 0])
x = np.array([0, 0, 1, 1, 1])
print('precision {:.4f}'.format(calc_precision(y, x)))
print('adjusted precision {:.4f}'.format(calc_adjusted_precision(y, x)))



def calc_adjusted_precision_est(est, X, y):
    y_hat = est.predict(X).reshape(y.shape)
    true_positives = ((y == True) & (y_hat == True)).sum()
    false_positives = ((y == False) & (y_hat == True)).sum()
    return 1. / (float(true_positives) / (true_positives + false_positives * prec_plus_factor))


precision_plus_scorer = make_scorer(calc_adjusted_precision, greater_is_better = True, needs_proba = False)


precision 0.6667
adjusted precision 0.6557


In [16]:
# set up hyperparameters and features 

# number of games to lookback (moving averages)
games_lookback = 6

# features
# 'is_favorite',
x_columns = ['home_game', 'NFC', 'North', 'South', 'West', 'rank_diff', 'directed_spread', 
             'adj_str_diff_{}'.format(games_lookback), 'str_diff_{}'.format(games_lookback),
             'adj_avg_str_diff_{}'.format(games_lookback), 'avg_str_diff_{}'.format(games_lookback),
            'team_wins_{}'.format(games_lookback), 'opp_wins_{}'.format(games_lookback), 
             #'completion_percentage', 'interception_percentage', 'yards_gained',
             'avg_completion_percentage_{}'.format(games_lookback), 
             'avg_interception_percentage_{}'.format(games_lookback),
             'avg_yards_gained_{}'.format(games_lookback),
             'team_opp_wins_{}'.format(games_lookback)]
  
y_columns = ['win']



In [17]:
#start_week = df[df['game_date'] == df['game_date'].min()][['week']].iloc[0].astype(int).values[0]
current_year = 2019
current_week = 14
upcoming_year = current_year + 1 * (current_week == 17)
df['week'] = pd.to_numeric(df['week'], errors = 'coerce')
upcoming_week = (current_week + 1) * (current_week < 17) + 1 * (current_week == 17)
split_date = df[(df['week'] == current_week) & (df['year'] == current_year)]['game_date'].max()
val_split_date = df[(df['week'] == upcoming_week) & (df['year'] == upcoming_year)]['game_date'].max()
print(df[['game_date', 'year', 'week']].info())
print()
#print(split_date)
#print(start_week)
print('Current week and year: {} and {}'.format(current_week, current_year))
print('Upcoming week and year: {} and {}'.format(upcoming_week, upcoming_year))
print('split_date: {:%Y-%m-%d}'.format(split_date))
print('val_split_date: {:%Y-%m-%d}'.format(val_split_date))

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1183 entries, 0 to 1182
Data columns (total 3 columns):
game_date    1183 non-null datetime64[ns]
year         1183 non-null int64
week         1183 non-null int64
dtypes: datetime64[ns](1), int64(2)
memory usage: 37.0 KB
None

Current week and year: 14 and 2019
Upcoming week and year: 15 and 2019
split_date: 2019-12-09
val_split_date: 2019-12-16


In [18]:
train_dataset = df[df['game_date'] <= split_date]
test_dataset = df[(df['year'] == upcoming_year) & (df['week'] == upcoming_week)]
print(train_dataset.info())
print()
print(test_dataset.info())
print(test_dataset[['game_date', 'team', 'opp']].head())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1135 entries, 0 to 1182
Columns: 1025 entries, game_date to week
dtypes: datetime64[ns](5), float64(720), int64(290), object(10)
memory usage: 8.9+ MB
None

<class 'pandas.core.frame.DataFrame'>
Int64Index: 24 entries, 19 to 1158
Columns: 1025 entries, game_date to week
dtypes: datetime64[ns](5), float64(720), int64(290), object(10)
memory usage: 192.4+ KB
None
     game_date team  opp
19  2019-12-15  ATL   SF
20  2019-12-15  ATL   SF
144 2019-12-15  BUF  PIT
145 2019-12-15  BUF  PIT
456 2019-12-15  DET   TB


In [19]:
x_cols = [item for item in x_columns if item in train_dataset.columns]
X_train, X_val, y_train, y_val = (train_test_split(train_dataset[x_cols], train_dataset[y_columns], 
                                                     test_size = 0.1,
                                                    random_state = 0))

print('# of features: {}'.format(X_train.shape[1]))
print('{} training samples cases | {} test samples'.format(X_train.shape[0], X_val.shape[0]))
dc = DummyClassifier(random_state = 0).fit(X_train, y_train.values.ravel())
pr = dc.predict_proba(X_train)
fpr, tpr, _ = roc_curve(y_train, pr[:, 1].reshape(len(pr)))
roc_auc = auc(fpr, tpr)
print('Dummy ROCAUC is {:.4f}'.format(roc_auc))

# fine tune random forest parameters
grid_values = {'max_features': [5, 10], 'max_depth': [4, 5], 'n_estimators': [50, 75]}
rf = RandomForestClassifier(random_state = 0)
grid = (GridSearchCV(rf, param_grid = grid_values, \
                    scoring = calc_adjusted_precision_est, cv = 4).fit(X_train, y_train.values.ravel()))
print('RF best score is : {:.4f}'.format(grid.best_score_))
print('RF best parameters: {}\n'.format(grid.best_params_))

rf = (RandomForestClassifier(random_state = 0, max_features = grid.best_params_['max_features'], 
                             max_depth = grid.best_params_['max_depth'], 
                             n_estimators = grid.best_params_['n_estimators'])
      .fit(X_train, y_train.values.ravel()))
pr = rf.predict_proba(X_train)
fpr, tpr, _ = roc_curve(y_train, pr[:, 1].reshape(len(pr)))
train_roc_auc = auc(fpr, tpr)
pr = rf.predict_proba(X_val)
print('max pr: {:.4f}'.format(np.max(pr[:, 1])))
fpr, tpr, _ = roc_curve(y_val, pr[:, 1].reshape(len(pr)))
roc_auc = auc(fpr, tpr)
print('RF Train ROCAUC {:.4f} and RF Test ROCAUC {:.4f}'.format(train_roc_auc, roc_auc))
importance_df = pd.DataFrame({'Features': X_train.columns, 'Importances': rf.feature_importances_}, 
                             columns = ['Features', 'Importances'], index = range(len(X_train.columns)))

print()

print('precision')
print(rf.predict(X_train).shape)
precision = calc_precision(y_train, rf.predict(X_train).reshape(y_train.shape))
print('Train Precision: {:4f}'.format(precision['win']))
print()
precision = calc_precision(y_val, rf.predict(X_val).reshape(y_val.shape))
print('Validation Precision: {:4f}'.format(precision['win']))
precision = calc_adjusted_precision(y_val, rf.predict(X_val).reshape(y_val.shape))
print('Adjusted Validation Precision: {:4f}'.format(precision['win']))

print()

print(importance_df.sort_values('Importances', ascending = False).head(20)[['Features', 'Importances']])
print()
print(nfl_df[['team', 'opp', 'directed_spread', 'is_favorite', 'vegas_spread', 'win', 'point_diff']].tail())

# of features: 13
1021 training samples cases | 114 test samples
Dummy ROCAUC is 0.5000
RF best score is : 1.5657
RF best parameters: {'max_depth': 4, 'max_features': 10, 'n_estimators': 75}

max pr: 0.8819
RF Train ROCAUC 0.8261 and RF Test ROCAUC 0.6937

precision
(1021,)
Train Precision: 0.730250

Validation Precision: 0.698113
Adjusted Validation Precision: 0.687732

                         Features  Importances
2                 directed_spread     0.520731
3                  adj_str_diff_6     0.086549
5              adj_avg_str_diff_6     0.069603
9     avg_completion_percentage_6     0.057819
6                  avg_str_diff_6     0.057595
11             avg_yards_gained_6     0.050465
1                       rank_diff     0.041198
4                      str_diff_6     0.041012
10  avg_interception_percentage_6     0.039662
0                       home_game     0.017816
12                team_opp_wins_6     0.007382
7                     team_wins_6     0.006051
8              

In [20]:
# Check precision on test set
precision = (calc_precision(test_dataset[y_columns], rf.predict(test_dataset[x_cols]).
                                 reshape(test_dataset[y_columns].shape)))
print('Test Precision: {:4f}'.format(precision['win']))

Test Precision: 0.666667
