In [159]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn import svm
from sklearn import metrics
from sklearn.metrics import f1_score
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from sklearn import linear_model
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import AdaBoostRegressor
from sklearn.preprocessing import RobustScaler

In [2]:
df = pd.read_csv('git_sample.csv')

In [3]:
Inputs = ['Favored Team Cover','Favored Final Spread','Favored Team Rank','Spread','Fav Court', 'Fav Tempo', 'Fav ADJO', 
              'Fav ADJD', 'Fav O - PPP', 'Fav O - EFG%','Fav D - PPP', 'Fav D - EFG%', 'Underdog Team Rank' , 
              'Underdog Tempo', 'Underdog ADJO', 'Underdog ADJD', 'Underdog O - PPP', 'Underdog O - EFG%', 
              'Underdog D - PPP', 'Underdog D - EFG%']

X = df[['Fav ADJO', 'Favored Team Predicted Win', 'Fav ADJD','Favored Team Rank', 'Fav O - PPP', 'Underdog ADJO', 
        'Underdog ADJD', 'Underdog Team Rank', 'Fav Court_H', 'Fav Court_N']]
y = df['Favored Final Spread'].values

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [90]:
# get a general idea of how close this model performs
# input predicted y and y_test
# Output the percent of the guesses that are within 1,2,3,4 or less points of final spread

def model_performance(y,prediction):
    pred = list(prediction)
    y_list = list(y)
    count_1,count_2,count_3,count_4 = 0,0,0,0
    for i in range(len(pred)):
        if abs((pred[i]) - y_list[i]) <= 1:
            count_1 += 1
        if abs((pred[i]) - y_list[i]) <= 2:
            count_2 += 1
        if abs((pred[i]) - y_list[i]) <= 3:
            count_3 += 1            
        if abs((pred[i]) - y_list[i]) <= 4:
            count_4 += 1            
    score_1 = round((count_1/len(pred) * 100),4)
    score_2 = round((count_2/len(pred) * 100),4)
    score_3 = round((count_3/len(pred) * 100),4)
    score_4 = round((count_4/len(pred) * 100),4)
    return score_1,score_2,score_3,score_4

In [5]:
# Multiple Linear Regression
# Basic Model with limited variables,
# Variables with high p values were removed > 0.1

mlr_model = sm.OLS(y, X)
results = mlr_model.fit()
results.summary()

0,1,2,3
Dep. Variable:,y,R-squared (uncentered):,0.321
Model:,OLS,Adj. R-squared (uncentered):,0.319
Method:,Least Squares,F-statistic:,157.0
Date:,"Wed, 13 Jan 2021",Prob (F-statistic):,3.2799999999999998e-270
Time:,16:28:24,Log-Likelihood:,-12558.0
No. Observations:,3323,AIC:,25140.0
Df Residuals:,3313,BIC:,25200.0
Df Model:,10,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Fav ADJO,-0.1552,0.037,-4.151,0.000,-0.228,-0.082
Favored Team Predicted Win,0.4584,0.038,12.045,0.000,0.384,0.533
Fav ADJD,0.1348,0.057,2.385,0.017,0.024,0.246
Favored Team Rank,0.0168,0.010,1.722,0.085,-0.002,0.036
Fav O - PPP,0.0749,0.044,1.711,0.087,-0.011,0.161
Underdog ADJO,0.1157,0.041,2.849,0.004,0.036,0.195
Underdog ADJD,-0.1783,0.036,-5.013,0.000,-0.248,-0.109
Underdog Team Rank,-0.0321,0.009,-3.749,0.000,-0.049,-0.015
Fav Court_H,5.6892,0.446,12.767,0.000,4.815,6.563

0,1,2,3
Omnibus:,28.091,Durbin-Watson:,2.041
Prob(Omnibus):,0.0,Jarque-Bera (JB):,41.342
Skew:,0.069,Prob(JB):,1.05e-09
Kurtosis:,3.529,Cond. No.,862.0


In [106]:
# Support Vector Machine Regressor 
clf = svm.SVR()
clf.fit(X_train, y_train)
svmr2 = clf.score(X,y)
y_pred = np.array(clf.predict(X_test))
svm_rmse = mean_squared_error(y_test, y_pred, squared=False)
svm_accuracy = model_performance(y_pred,y_test)[2]


svm_r2 = 'SVM Regressor R^2: {}'
svm_reg = 'SVM Regressor RMSE: {}'
svm_performance = 'SVM Regressor Accuracy Within 3 Points of Actual: {}%'
print(svm_reg.format(svm_rmse))
print(svm_r2.format(svmr2))
print(svm_performance.format(svm_accuracy))

SVM Regressor RMSE: 10.818551166314018
SVM Regressor R^2: 0.06721086836689438
SVM Regressor Accuracy Within 3 Points of Actual: 22.9689%


In [125]:
mlr_model = sm.OLS(y_train, X_train).fit()
y_pred = mlr_model.predict(X_test)
mlr_rmse = mean_squared_error(y_test, y_pred, squared=False)
mlr_accuracy = model_performance(y_pred,y_test)[2]


mlr_reg = 'MLR Regression RMSE: {}'
mlr_performance = 'MLR Regressor Accuracy Within 3 Points of Actual: {}%'
print(mlr_reg.format(mlr_rmse))
print(mlr_performance.format(mlr_accuracy))

MLR Regression RMSE: 10.142152483171179
MLR Regressor Accuracy Within 3 Points of Actual: 24.1725%


In [126]:
lasso = linear_model.Lasso(alpha=0.1)
lasso.fit(X_train, y_train)
lassor2 = lasso.score(X,y)
lasso_pred = np.array(lasso.predict(X_test))
lassoRmse = mean_squared_error(y_test, y_pred, squared=False)
lassoAccuracy = model_performance(lasso_pred, y_test)[2]

                 
lasso_r2 = 'Lasso Regressor R^2: {}'
lasso_rmse = 'Lasso Regressor RMSE: {}'
lasso_performance = 'Lasso Regressor Accuracy Within 3 Points of Actual Spread: {}%'
print(lasso_rmse.format(lassoRmse))
print(lasso_r2.format(lassor2))
print(lasso_performance.format(lassoAccuracy))

Lasso Regressor RMSE: 10.142152483171179
Lasso Regressor R^2: 0.17313023811951034
Lasso Regressor Accuracy Within 3 Points of Actual Spread: 23.4704%


In [132]:
reg = GradientBoostingRegressor()
reg.fit(X_train, y_train)
regr2 = reg.score(X,y)
reg_pred = np.array(reg.predict(X_test))
regRmse = mean_squared_error(y_test, y_pred, squared=False)
regAccuracy = model_performance(reg_pred, y_test)[2]


reg_r2 = 'Gradient Boosted Regressor R^2: {}'
reg_rmse = 'Gradient Boosted Regressor RMSE: {}'
reg_performance = 'Gradient Boosted Regressor Accuracy Within 3 Points of Actual Spread: {}%'
print(reg_rmse.format(regRmse))
print(reg_r2.format(regr2))
print(reg_performance.format(regAccuracy))

Gradient Boosted Regressor RMSE: 10.142152483171179
Gradient Boosted Regressor R^2: 0.2974058417995673
Gradient Boosted Regressor Accuracy Within 3 Points of Actual Spread: 24.3731%


##

In [133]:
# Tuning GBR hyperparameters
parameters = {'learning_rate': [0.01,0.02,0.03,0.04,0.2],
              'subsample'    : [0.9, 0.5, 0.2, 0.1],
              'n_estimators' : [100,200,500,1000],
              'max_depth'    : [2,4,6,8,10]
             }

grid_GBR = GridSearchCV(estimator=reg, param_grid = parameters, cv = 2, n_jobs=-1)
grid_GBR.fit(X, y)

In [134]:

print('Best estimator: ', grid_GBR.best_estimator_)
print('Best parameters: ', grid_GBR.best_params_)

Best estimator:  GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.01, loss='ls', max_depth=2,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=500,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=None, subsample=0.2, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)
Best parameters:  {'learning_rate': 0.01, 'max_depth': 2, 'n_estimators': 500, 'subsample': 0.2}


In [147]:
regr = RandomForestRegressor(random_state=0)
regr.fit(X_train, y_train)
regrr2 = regr.score(X,y)
regr_pred = np.array(regr.predict(X_test))
regrRmse = mean_squared_error(y_test, y_pred, squared=False)
regrAccuracy = model_performance(regr_pred, y_test)[2]

regr_r2 = 'Random Forest Regressor R^2: {}'
regr_rmse = 'Random Forest Regressor RMSE: {}'
regr_performance = 'Random Forest Regressor Accuracy Within 3 Points of Actual Spread: {}%'
print(regr_rmse.format(regrRmse))
print(regr_r2.format(regrr2))
print(regr_performance.format(regrAccuracy))

Random Forest Regressor RMSE: 10.142152483171179
Random Forest Regressor R^2: 0.6736032087268671
Random Forest Regressor Accuracy Within 3 Points of Actual Spread: 24.5737%


In [148]:
rfr_parameters = {'max_depth': [5,10,15,20,30,50,70], 'n_estimators' : [10,30,50,100,150,200,300],}
grid_regr = GridSearchCV(estimator=regr, param_grid = rfr_parameters, cv=4,scoring = 'r2')
grid_regr.fit(X, y)
print('Best estimator: ', grid_regr.best_estimator_)
print('Best parameters: ', grid_regr.best_params_)

Best estimator:  RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=5, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=300, n_jobs=None, oob_score=False,
                      random_state=0, verbose=0, warm_start=False)
Best parameters:  {'max_depth': 5, 'n_estimators': 300}


In [161]:
ada = AdaBoostRegressor(random_state=0)
ada.fit(X_train, y_train)
adarr2 = ada.score(X,y)
ada_pred = np.array(ada.predict(X_test))
adaRmse = mean_squared_error(y_test, y_pred, squared=False)
adaAccuracy = model_performance(ada_pred, y_test)[2]

ada_r2 = 'Ada Boost Regressor R^2: {}'
ada_rmse = 'Ada Boost Regressor RMSE: {}'
ada_performance = 'Ada Boost Regressor Accuracy Within 3 Points of Actual Spread: {}%'
print(ada_rmse.format(adaRmse))
print(ada_r2.format(adarr2))
print(ada_performance.format(adaAccuracy))

Ada Boost Regressor RMSE: 10.142152483171179
Ada Boost Regressor R^2: 0.18305746863852235
Ada Boost Regressor Accuracy Within 3 Points of Actual Spread: 23.5707%
