##Notes: the approach taken in this baseline model comes from the 2020 article
##by Lianne and Justin linked below; thanks to them for sharing. They used
##ridge regression with alpha = 0.001.

##https://www.justintodata.com/improve-sports-betting-odds-guide-in-python/



In [18]:
import pandas as pd
import numpy as np

from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, confusion_matrix
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, f1_score


In [47]:
##we will try the following models on the base-line data ... just win/loss and which teams

##note: KNN or a clustering method might help group the teams in a smart way ... but not now.
#models

##regression
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor

#classifiers (non-tree)
from sklearn.linear_model import RidgeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC


#tree-based classifiers
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from xgboost import XGBRegressor

##regression models
# Ridge with alpha=0.001 mirrors the setting quoted from the baseline article
# in the notes at the top of this notebook.
lr = Ridge(alpha=0.001)
rfr = RandomForestRegressor(max_depth=3, random_state=0)
xgbr = XGBRegressor()

regr_models = [lr, rfr, xgbr]

##classifier models
lrc = RidgeClassifier()
gnb = GaussianNB()
lgr = LogisticRegression(random_state=0)
svc = SVC()

#tree-based classifiers
rfc = RandomForestClassifier(max_depth=3, random_state=0)
bc = BaggingClassifier()
gbc = GradientBoostingClassifier()
# The classifiers below are fit on the 0/1 'won' column, i.e. a binary target.
# `num_class` is a single-integer setting for XGBoost's multi-class objectives,
# so the previous `num_class=[0,1]` (a list) was not a valid value and is dropped.
# `use_label_encoder=False` is kept as in the original.
# NOTE(review): newer xgboost releases may ignore/warn on it - confirm against the installed version.
xgbc = XGBClassifier(use_label_encoder=False)

class_models = [lrc, gnb, lgr, svc, rfc, bc, gbc, xgbc]

In [5]:

# Load the processed game table and discard the 'Unnamed: 0' column
# (presumably a row index saved by an earlier to_csv - verify against the file).
data = pd.read_csv(
    "/Users/joejohns/data_bootcamp/GitHub/final_project_nhl_prediction/Data/Processed_Data/Approach_1 and_2_win_loss_and_cumul_1seas_Pisch_data/data_bet_stats_mp.csv"
).drop(columns=['Unnamed: 0'])

In [6]:
# Store the win flag as plain integers.
data['won'] = data['won'].apply(int)
# Playoff games are set aside (probably unused); `data` keeps only the rows
# whose playoffGame flag is 0.
data_playoffs = data[data['playoffGame'] == 1].copy()
data = data[data['playoffGame'] == 0].copy()

#sorted(data.columns)

In [7]:
# Distinct season codes present in the data, in ascending order.
all_seasons = sorted(set(data['season'].tolist()))
all_seasons

[20082009,
 20092010,
 20102011,
 20112012,
 20122013,
 20132014,
 20142015,
 20152016,
 20162017,
 20172018,
 20182019,
 20192020]

In [8]:


def make_HA_data(X, season, list_var_names=None):
    """Build the per-game team-indicator matrix and targets for one season.

    The rows of ``X`` for ``season`` are split into the 'home' and 'away'
    rows (column ``HoA``). Each side is one-hot encoded on ``nhl_name`` and
    the two encodings are subtracted, giving one row per game.

    Sign convention (unchanged from the original code, whose local names
    were swapped): the away team's column is +1 and the home team's column
    is -1. The targets are expressed from the home team's perspective.

    Home and away rows are paired purely by position after the index reset.
    NOTE(review): this assumes both sides list the games in the same order -
    confirm against the input data.

    Parameters
    ----------
    X : pandas.DataFrame
        Game table with at least the columns ``season``, ``HoA``,
        ``nhl_name``, ``goalsFor``, ``goalsAgainst``, ``date``,
        ``full_date``, ``game_id``, ``team_id``, ``Open`` and ``won``.
    season : value compared against ``X['season']``
        Season to extract.
    list_var_names : optional
        Currently unused; kept so existing callers keep working.

    Returns
    -------
    (df_model, y) : tuple of pandas.DataFrame
        ``df_model`` holds the team indicator columns followed by exactly
        five helper columns: ``date``, ``full_date``, ``game_id``,
        ``home_id``, ``away_id``. ``y`` holds ``date``, ``full_date``,
        ``game_id``, ``Open``, ``goal_difference`` and ``won`` taken from
        the home rows.
    """
    season_rows = X.loc[X['season'] == season, :]
    home = season_rows.loc[season_rows['HoA'] == 'home', :].copy()
    away = season_rows.loc[season_rows['HoA'] == 'away', :].copy()

    # Everything on the target side is measured from the home team's view.
    home['goal_difference'] = home['goalsFor'] - home['goalsAgainst']
    home.reset_index(drop=True, inplace=True)
    away.reset_index(drop=True, inplace=True)

    home_dummies = pd.get_dummies(home['nhl_name'], dtype=np.int64)
    away_dummies = pd.get_dummies(away['nhl_name'], dtype=np.int64)

    # Give both encodings the same team columns before subtracting. Without
    # this, a team that appears on only one side makes the two column sets
    # differ, and a plain subtraction turns every non-shared column into NaN.
    teams = away_dummies.columns.union(home_dummies.columns)
    away_dummies = away_dummies.reindex(columns=teams, fill_value=0)
    home_dummies = home_dummies.reindex(columns=teams, fill_value=0)
    df_model = away_dummies.sub(home_dummies)  # away = +1, home = -1

    # Helper columns; downstream code relies on these being the last five.
    df_model['date'] = home['date']
    df_model['full_date'] = home['full_date']
    df_model['game_id'] = home['game_id']
    df_model['home_id'] = home['team_id']
    df_model['away_id'] = away['team_id']

    # Targets from the home team's perspective; 'Open' is kept for betting.
    y = home.loc[:, ['date', 'full_date', 'game_id', 'Open', 'goal_difference', 'won']].copy()
    return (df_model, y)


In [9]:
# Per-season feature tables (X_dic) and target tables (y_dic), keyed by season.
# make_HA_data returns both as a pair, so it is called once per season and
# unpacked instead of being run twice.
X_dic = {}
y_dic = {}
for sea in all_seasons:
    X_dic[sea], y_dic[sea] = make_HA_data(data, sea)


In [15]:
#this is for  regressors predicting wins - losses, can use this to turn output into win prediction 

def make_win(x):
    """Map a numeric score to a win label: 1 if ``x > 0``, otherwise 0.

    Intended for regression outputs (goal difference), and it leaves values
    that are already 0/1 unchanged.

    Raises
    ------
    ValueError
        If ``x`` is NaN. The original fell through both comparisons and
        silently returned None in that case.
    """
    if np.isnan(x):
        raise ValueError("make_win received NaN; cannot derive a win/loss label")
    return 1 if x > 0 else 0


# Element-wise version for arrays / Series. Declaring the output type up front
# lets it handle empty input, which np.vectorize cannot do when it has to infer
# the type from the first element.
v_make_win = np.vectorize(make_win, otypes=[int])

#usage: v_make_win(y_pred)

In [38]:
##naive method: train on first half of season, 600 games, test on second half of season
##with no further training

def naive_test_train_regr_models(model, cut_off = 600, regr = True):
    """Fit ``model`` on the first ``cut_off`` games of each season and score the rest.

    For every season in the module-level ``all_seasons`` except 20122013
    (shortened season), the features come from ``X_dic`` and the target from
    ``y_dic``. The model is fitted on the leading ``cut_off`` rows and used
    to predict the remaining rows with no further training. Predictions and
    true values are both passed through ``v_make_win`` so that regressors
    and classifiers are scored on the same 0/1 win labels.

    Per-season accuracy and F1 are printed, followed by the average accuracy
    over the evaluated seasons. Nothing is returned.

    Parameters
    ----------
    model : estimator with ``fit(X, y)`` and ``predict(X)``
        Re-fitted in place for every season.
    cut_off : int
        Number of leading rows of each season used for training.
    regr : bool
        If true the target is ``goal_difference``; otherwise it is ``won``.
    """
    # Helper columns that make_HA_data appends after the team indicators.
    # They are dropped by name rather than by position (the old `iloc[:, :-5]`).
    meta_cols = ['date', 'full_date', 'game_id', 'home_id', 'away_id']
    target_col = 'goal_difference' if regr else 'won'

    seasons = [sea for sea in all_seasons if sea != 20122013]  # 2012 is shortened season
    total_acc = 0
    counter = 0
    print("results for ", str(model))
    print(" ")
    for sea in seasons:
        X = X_dic[sea].drop(columns=meta_cols)
        y = y_dic[sea][target_col]

        # With no rows past the cut-off there is nothing to test on; skip the
        # season instead of letting predict() fail on an empty frame.
        if len(X) <= cut_off:
            print("seaoson: ", sea)
            print("skipped: only", len(X), "games, none left to test after cut_off =", cut_off)
            continue

        # Naive split by row position: first cut_off rows train, the rest test.
        X_train, X_test = X.iloc[:cut_off], X.iloc[cut_off:]
        y_train, y_test = y.iloc[:cut_off], y.iloc[cut_off:]

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)  # regression: predicted home - away goals

        # Convert to win labels (<= 0 -> 0, > 0 -> 1); 0/1 inputs stay as they are.
        y_pred_win = v_make_win(y_pred)
        y_test_win = v_make_win(y_test)

        accuracy = accuracy_score(y_test_win, y_pred_win)
        f1 = f1_score(y_test_win, y_pred_win)

        counter += 1
        total_acc += accuracy

        print("seaoson: ", sea)
        print("acc: ", accuracy, " f1: ", f1)

    # Avoid dividing by zero when every season was skipped.
    if counter == 0:
        print("no season had more than", cut_off, "games; nothing was evaluated")
        print(" ")
        return

    avg_acc = total_acc/counter
    print('avg acuracy: ', avg_acc)
    print(" ")
        #evaluate_regression(y_test, y_pred)
        #evaluate_binary_classification(y_test_win, y_pred_win
       
        
        
        

In [40]:
##try for ridge regression

# Ridge baseline: fit on the first 700 games of each season, score the rest.
# (original note: ok looks like 20162017 is unusually good for some reason)
naive_test_train_regr_models(lr, cut_off=700, regr=True)

results for  Ridge(alpha=0.001)
 
seaoson:  20082009
acc:  0.5351043643263758  f1:  0.6067415730337079
seaoson:  20092010
acc:  0.5587121212121212  f1:  0.616144975288303
seaoson:  20102011
acc:  0.5075471698113208  f1:  0.5583756345177664
seaoson:  20112012
acc:  0.5056603773584906  f1:  0.5787781350482315
seaoson:  20132014
acc:  0.5660377358490566  f1:  0.6166666666666667
seaoson:  20142015
acc:  0.5358490566037736  f1:  0.5844594594594594
seaoson:  20152016
acc:  0.5283018867924528  f1:  0.5819397993311037
seaoson:  20162017
acc:  0.5622641509433962  f1:  0.6233766233766233
seaoson:  20172018
acc:  0.5849387040280211  f1:  0.6403641881638847
seaoson:  20182019
acc:  0.5481611208406305  f1:  0.6160714285714286
seaoson:  20192020
acc:  0.5497382198952879  f1:  0.6055045871559633
avg acuracy:  0.543846809787357
 


In [41]:
##avg is around 54% for ridge regression

In [48]:
##now try all regressors

# Run the same naive split for every regressor, scoring on goal difference.
for model in regr_models:
    naive_test_train_regr_models(model, cut_off=700, regr=True)

results for  Ridge(alpha=0.001)
 
seaoson:  20082009
acc:  0.5351043643263758  f1:  0.6067415730337079
seaoson:  20092010
acc:  0.5587121212121212  f1:  0.616144975288303
seaoson:  20102011
acc:  0.5075471698113208  f1:  0.5583756345177664
seaoson:  20112012
acc:  0.5056603773584906  f1:  0.5787781350482315
seaoson:  20132014
acc:  0.5660377358490566  f1:  0.6166666666666667
seaoson:  20142015
acc:  0.5358490566037736  f1:  0.5844594594594594
seaoson:  20152016
acc:  0.5283018867924528  f1:  0.5819397993311037
seaoson:  20162017
acc:  0.5622641509433962  f1:  0.6233766233766233
seaoson:  20172018
acc:  0.5849387040280211  f1:  0.6403641881638847
seaoson:  20182019
acc:  0.5481611208406305  f1:  0.6160714285714286
seaoson:  20192020
acc:  0.5497382198952879  f1:  0.6055045871559633
avg acuracy:  0.543846809787357
 
results for  RandomForestRegressor(max_depth=3, random_state=0)
 
seaoson:  20082009
acc:  0.5009487666034156  f1:  0.6516556291390728
seaoson:  20092010
acc:  0.520833333333

In [44]:
##now try all classifiers
# Run the same naive split for every classifier, using the 0/1 'won' target.
for model in class_models:
    naive_test_train_regr_models(model, cut_off=700, regr=False)

results for  RidgeClassifier()
 
seaoson:  20082009
acc:  0.5199240986717267  f1:  0.5990491283676704
seaoson:  20092010
acc:  0.5681818181818182  f1:  0.6357827476038338
seaoson:  20102011
acc:  0.5320754716981132  f1:  0.5782312925170067
seaoson:  20112012
acc:  0.5377358490566038  f1:  0.6213292117465223
seaoson:  20132014
acc:  0.5773584905660377  f1:  0.6303630363036303
seaoson:  20142015
acc:  0.5584905660377358  f1:  0.6151315789473685
seaoson:  20152016
acc:  0.5509433962264151  f1:  0.6046511627906976
seaoson:  20162017
acc:  0.5528301886792453  f1:  0.6348228043143298
seaoson:  20172018
acc:  0.5831873905429071  f1:  0.6609686609686609
seaoson:  20182019
acc:  0.5691768826619965  f1:  0.6283987915407855
seaoson:  20192020
acc:  0.5418848167539267  f1:  0.5823389021479713
avg acuracy:  0.5537989971887751
 
results for  GaussianNB()
 
seaoson:  20082009
acc:  0.4990512333965844  f1:  0.5352112676056339
seaoson:  20092010
acc:  0.5037878787878788  f1:  0.5544217687074829
seaoson



seaoson:  20092010
acc:  0.5492424242424242  f1:  0.6072607260726074
seaoson:  20102011
acc:  0.5018867924528302  f1:  0.5432525951557093
seaoson:  20112012
acc:  0.5188679245283019  f1:  0.5880452342487884
seaoson:  20132014
acc:  0.560377358490566  f1:  0.5989672977624785




seaoson:  20142015
acc:  0.5811320754716981  f1:  0.6185567010309279
seaoson:  20152016
acc:  0.5283018867924528  f1:  0.5748299319727891
seaoson:  20162017
acc:  0.5415094339622641  f1:  0.6048780487804878
seaoson:  20172018
acc:  0.5936952714535902  f1:  0.6647398843930636
seaoson:  20182019
acc:  0.5148861646234676  f1:  0.5568
seaoson:  20192020
acc:  0.5340314136125655  f1:  0.5572139303482587
avg acuracy:  0.5405229434098835
 




In [None]:
##conclusions: some of the average scores are around 55% and some of the top 
##scores on a season are as high as 58, 59%


##next steps: 
##1. tune the models
##2. look into partial_fit across the season, for the models that support it