In [43]:
print(X['Home'].unique())

['DEN' 'STL' 'DAL' 'DET' 'JAX' 'NOR' 'BUF' 'NYJ' 'CHI' 'IND' 'PIT' 'SFO'
 'CAR' 'CLE' 'SDG' 'WAS' 'NWE' 'TAM' 'KAN' 'HOU' 'ATL' 'NYG' 'SEA' 'PHI'
 'ARI' 'BAL' 'OAK' 'GNB' 'CIN' 'MIN' 'MIA' 'TEN' 'LAR' 'LAC']


Currently tests two baseline models for the 2017 season.  Will update with more data as it is acquired.

First baseline is an average of the pointspread predictions from various casinos and betting outlets. This does not require training, though we could train a ridge regression model and expect a similar result (as we'd expect the betting outlets to be relatively equally predictive on average as well as fairly correlated, so their weights should be very close under L2-regularized linear regression).

In [2]:
import numpy as np
import pandas as pd

In [3]:
#Import data
odds = pd.read_csv("base_data/nfl_odds_2017.csv")
game_stats = pd.read_csv("base_data/nflstats2017.csv")
game_stats.rename(index = str, columns = {'HPS': 'APS', 'HPSY': 'APSY', 'HPS.1':'HPS', 'HPSY.1':'HPSY'}, inplace = True)

In [5]:
#Formatting to create key and filter
odds["datestring"] = pd.to_datetime(odds['date']).dt.strftime('%Y%m%d')
odds["Spread_key"] = odds['datestring'] + odds['Home']
odds['Spread_val'] = odds['HomeScore'] - odds['AwayScore']
spreads = odds.filter(regex = "Spread_")

#Compute square loss over 2017 season
X = spreads.values[:,:-2].astype('float32')
y = spreads['Spread_val'].values.astype('float32')
scores = np.nanmean(X, axis = 1)
residual = (scores - y)**2
loss = np.nanmean(residual)/len(y)
print(loss)

1.30069278331


  # Remove the CWD from sys.path while we load stuff.


Second baseline is a regression tree model that will use basic categorical features (Home/Away team, start time, home/away team stats) to predict the outcome.  For current purposes, will train on first 8 weeks of the season and test on the rest (this will change with more data).  This requires a lot more preprocessing, as we need to aggregate average statistics at that point in the season for each team.

In [6]:
import numpy as np
import pandas as pd

season_data = []
spread_data = []

for y in range(2013,2018):
    temp = pd.read_csv('processed_data/' + str(y) + 'processed.csv')
    spread_data.append(temp['Spread'])
    season_data.append(temp.drop(['Spread','Key','Datetime'], axis = 1))

X = pd.concat(season_data)
X_cat = pd.get_dummies(X,['Home', 'Away'])

In [20]:
y_train = pd.concat(spread_data[0:3])
y_val = spread_data[3]
y_test = spread_data[4]

train_size = len(y_train)
val_size = len(y_val)
test_size = len(y_test)

X_train = X_cat[0:train_size]
X_val = X_cat[train_size:(train_size + val_size)]
X_test = X_cat[(train_size + val_size): ]

In [33]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge

pipe = Pipeline(steps = [('scaler', StandardScaler()),
                        ('l2',Ridge()) ])

lam_reg = [x + 750 for x in range(0,100)]

grid = GridSearchCV(pipe,
                    param_grid={'l2__alpha': lam_reg},
                    cv=2)


grid.fit(X_train,y_train)



GridSearchCV(cv=2, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('l2', Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'l2__alpha': [750, 751, 752, 753, 754, 755, 756, 757, 758, 759, 760, 761, 762, 763, 764, 765, 766, 767, 768, 769, 770, 771, 772, 773, 774, 775, 776, 777, 778, 779, 780, 781, 782, 783, 784, 785, 786, 787, 788, 789, 790, 791, 792, 793, 794, 795, 796, 797, 798, 799, 800, 801, 802, 803, 804,...830, 831, 832, 833, 834, 835, 836, 837, 838, 839, 840, 841, 842, 843, 844, 845, 846, 847, 848, 849]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [34]:
print(sum((grid.predict(X_val) - y_val)**2)/val_size)
print(grid.best_estimator_)    

149.400499748
Pipeline(memory=None,
     steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('l2', Ridge(alpha=786, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001))])


20.6623335649


In [42]:
sum((grid.predict(X_test).transpose() - y_test)**2)/test_size

200.98722768943364

In [44]:
from sklearn.tree import DecisionTreeRegressor

tree_grid = GridSearchCV(DecisionTreeRegressor,
                    param_grid={'l2__max_dept': [None, 10, 100, 3, 25]},
                    cv=2)


In [None]:
print(sum((tree_grid.predict(X_val) - y_val)**2)/val_size)
print(tree_grid.best_estimator_)    