In [10]:
import pandas as pd
import numpy as np

from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, confusion_matrix
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, f1_score


import matplotlib.pyplot as plt
import seaborn as sns


##couple evaluation functions 
def evaluate_binary_classification(model_name, y_test, y_pred, y_proba=None):
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    #try:
    if y_proba != None:
        rocauc_score = roc_auc_score(y_test, y_proba)
    else:
        rocauc_score = "no roc"
    #except: 
    #    pass     
    cm = confusion_matrix(y_test, y_pred)
    sns.heatmap(cm, annot=True)
    plt.tight_layout()
    plt.title(f'{model_name}', y=1.1)
    plt.ylabel('Actual label')
    plt.xlabel('Predicted label')
    plt.show()
    print("accuracy: ", accuracy)
    print("precision: ", precision)
    print("recall: ", recall)
    print("f1 score: ", f1)
    print("rocauc: ", rocauc_score)
    print(cm)
    #return accuracy, precision, recall, f1, rocauc_score

def evaluate_regression(y_test, y_pred):
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print("mae", mae)
    print("mse", mse)
    print('r2', r2)
    
##display null values


def perc_null(X):
    
    total = X.isnull().sum().sort_values(ascending=False)
    data_types = X.dtypes
    percent = (X.isnull().sum()/X.isnull().count()).sort_values(ascending=False)

    missing_data = pd.concat([total, data_types, percent], axis=1, keys=['Total','Type' ,'Percent'])
    return missing_data


In [1]:
##clean up df




feat_drop = [ 
'startRinkSide',
'HoA',   #this is mp  ###many of these are repeated from mp_data
'HoA_bet',
'VH',
'home_or_away',
'team',
'name',
'Team',
'Unnamed: 0',
'playerTeam',
'position',
'blocked',  ## Same as bSAAgainst
'pim', ## same as penaltyminFor
'goals',  ##goalsFor
'shots',
'giveaways',
 'hits',
]    



feat_rename = {'Date':'date', 'mp_date':'full_date', 'HoA_gm_stats':'HoA',  }

In [7]:

data = pd.read_csv("/Users/joejohns/data_bootcamp/GitHub/final_project_nhl_prediction/Data/Shaped_Data/data_bet_stats_mp.csv")
data.drop(columns=[ 'Unnamed: 0'], inplace=True)

In [67]:
data['won'] = data['won'].apply(int)
data_playoffs = data.loc[data['playoffGame'] == 1, :].copy()  #set aside playoff games ... probably won't use them.
data=  data.loc[data['playoffGame'] == 0, :].copy() 

sorted(data.columns)

['HoA',
 'Open',
 'blockedShotAttemptsAgainst',
 'blockedShotAttemptsFor',
 'corsiPercentage',
 'dZoneGiveawaysAgainst',
 'dZoneGiveawaysFor',
 'date',
 'faceOffWinPercentage',
 'faceOffsWonAgainst',
 'faceOffsWonFor',
 'fenwickPercentage',
 'flurryAdjustedxGoalsAgainst',
 'flurryAdjustedxGoalsFor',
 'flurryScoreVenueAdjustedxGoalsAgainst',
 'flurryScoreVenueAdjustedxGoalsFor',
 'freezeAgainst',
 'freezeFor',
 'full_date',
 'game_id',
 'giveawaysAgainst',
 'giveawaysFor',
 'goalsAgainst',
 'goalsFor',
 'head_coach',
 'highDangerGoalsAgainst',
 'highDangerGoalsFor',
 'highDangerShotsAgainst',
 'highDangerShotsFor',
 'highDangerxGoalsAgainst',
 'highDangerxGoalsFor',
 'hitsAgainst',
 'hitsFor',
 'iceTime',
 'lowDangerGoalsAgainst',
 'lowDangerGoalsFor',
 'lowDangerShotsAgainst',
 'lowDangerShotsFor',
 'lowDangerxGoalsAgainst',
 'lowDangerxGoalsFor',
 'mediumDangerGoalsAgainst',
 'mediumDangerGoalsFor',
 'mediumDangerShotsAgainst',
 'mediumDangerShotsFor',
 'mediumDangerxGoalsAgainst',
 '

In [21]:
def get_dates(X, season):  #season 20162017 int; returns the list of all dates in chronological order from that season 20161004, 20161005, ...
    X = X.loc[X['season'] == season, :].copy()
    dates_1 = list(set(X.loc[(X['date'] >= 900) & (X['date'] <= 1231) , :]['full_date']))  #2016 part
    dates_2 = list(set(X.loc[(X['date'] >= 100) & (X['date'] <= 800) , :]['full_date']))  #2017 part
    dates = dates_1 + dates_2 #all dates in order
    return dates

#sorted(data.columns)

In [98]:
dates = get_dates(data, 20162017)

In [49]:


def make_HA_data(X, season, list_var_names = None ):
    X = X.loc[X['season'] == season, :].copy()
    X_H = X.loc[X['HoA'] == 'home',:].copy()
    X_A = X.loc[X['HoA'] == 'away',:].copy()
    X_H['goal_difference'] = X_H['goalsFor'] - X_H['goalsAgainst']  ##note every thing is based in home data
    X_H.reset_index(drop = True, inplace = True)
    X_A.reset_index(drop = True, inplace = True)
    df_visitor = pd.get_dummies(X_H['nhl_name'], dtype=np.int64)
    df_home = pd.get_dummies(X_A['nhl_name'], dtype=np.int64)
    df_model = df_home.sub(df_visitor) 
    df_model['date'] = X_H['date']
    df_model['full_date'] = X_H['full_date']
    
    df_model['game_id'] = X_H['game_id']
    df_model['home_id'] = X_H['team_id']
    df_model['away_id'] = X_A['team_id'] 
    y = X_H.loc[:,['date', 'full_date','game_id', 'Open','goal_difference', 'won']].copy()   ##these are from home team perspective; 'Open' is for betting 
    return (df_model, y)


In [None]:
def regr_model_results(model,model_name, window_size, prediction_size)
    results_dic ={}
    results_dic['date'] = []
    results_dic['mae'] = []
    results_dic['mse'] = []
    results_dic['r2'] = []
    for i in range(step, len(dates), step): ##step =10, so 17 rounds
        model.fit(X.loc[X['mp_date'].isin(dates[i-step:i]), :], y.loc[y['mp_date'].isin(dates[i-step:i]),'goal_difference' ] )
        model.fit(X.loc[X['mp_date'].isin(dates[max(i-win,0):i]), :],y.loc[y['mp_date'].isin(dates[max(i-win,0):i]),'goal_difference' ])
        y_pred = lr1.predict(X.loc[X['mp_date'].isin(dates[i:i+pred]), :])
        y2_pred = lr2.predict(X.loc[X['mp_date'].isin(dates[i:i+pred]), :])
        y1_pred_win = v_make_win(y1_pred)
        y2_pred_win = v_make_win(y2_pred)                    
        y_test = y.loc[y['mp_date'].isin(dates[i:i+pred]),'goal_difference' ]
        y_test_win = y.loc[y['mp_date'].isin(dates[i:i+pred]),'won' ]
    
        mae = mean_absolute_error(y_test, y_pred)
        mse = mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test_win, y_pred)
        
        
    results_dic['date'] = []
    results_dic['mae'] = []
    results_dic['mse'] = []
    results_dic['r2'] = []   
    results_dic['model_name'].append(model_name)
    results_dic['date'].append(dates[i])
    
    
    #results_dic['predictions'].append(y1_pred)
    #results_dic['actual'].append(y_test)
    mae = mean_absolute_error(y_test, y1_pred)
    mse = mean_squared_error(y_test, y1_pred)
    r2 = r2_score(y_test_win, y1_pred_win)
    accuracy = accuracy_score(y_test_win, y1_pred_win)
    precision = precision_score(y_test_win, y1_pred_win, zero_division = 0)
    recall = recall_score(y_test_win, y1_pred_win)
    f1 = f1_score(y_test_win, y1_pred_win)
                          
                          
    results_dic['mae'].append(mae)
    results_dic['mse'].append(mse)
    results_dic['r2'].append(r2)
          
    #preds_dic['model_version'].append("RidgeReg(C=0.001)_ONE_day")
    #preds_dic['date'].append(dates[i])
    

In [None]:
def top3_max_val_params(model, X, dates, drop_firs=False):
    


In [None]:
#fir regressors predicting wins - losses, can use this to turn output into win prediction 

def make_win(x):
    if x <= 0:
        return 0
    if x >0:
        return 1

v_make_win = np.vectorize(make_win)

#v_make_win(y_pred)

In [40]:
##note KNN or other clusters might be helpful group the teams in smart way ... but not now.



#models

##regression
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor

#classifiers (non-tree)
from sklearn.linear_model import RidgeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC


#tree-based classifiers
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from xgboost import XGBRegressor

##regression models
lr = Ridge(alpha=0.001) 
rfr = RandomForestRegressor(max_depth=3, random_state=0)
xgbr = XGBRegressor()

##classifier models
lrc = RidgeClassifier()
gnb = GaussianNB()
lgr = LogisticRegression(random_state = 0)
svc = SVC()

#tree-based classifiers
rfc =  RandomForestClassifier(max_depth=3, random_state=0)
bc = BaggingClassifier()
gbc = GradientBoostingClassifier()
xgbc = XGBClassifier()



##hyper_parameters from here 
##https://machinelearningmastery.com/hyperparameters-for-classification-machine-learning-algorithms/
##for xgboost from here 
##https://machinelearningmastery.com/extreme-gradient-boosting-ensemble-in-python/

#xgb

trees = [10, 50, 100, 500, 1000, 5000]  #100  #num of trees
max_depth = range(1,11)  ##3-5
rates = [0.0001, 0.001, 0.01, 0.1, 1.0]  #0.1
subsample in arange(0.1, 1.1, 0.1):  #0.4, 0.5  ##this is 0.1, 0.2 ... 1.0 # % of features to sample


#svc 
kernels in [‘linear’, ‘poly’, ‘rbf’, ‘sigmoid’] #if you use poly, then adjust degree
C in [100, 10, 1.0, 0.1, 0.001]

#gb

learning_rate in [0.001, 0.01, 0.1]
n_estimators [10, 100, 1000]
subsample in [0.5, 0.7, 1.0]
max_depth in [3, 7, 9]


#rfc
max_features [1 to 20]  #key
max_features in [‘sqrt’, ‘log2’]
n_estimators in [10, 100, 1000]

#bc
n_estimators in [10, 100, 1000]

svm_dic = {'kernels':[‘linear’, ‘poly’, ‘rbf’, ‘sigmoid’]}
lrc_dic = {'alpha': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]}
lgr_hp_dic = {'solver': [‘newton-cg’, ‘lbfgs’, ‘liblinear’, ‘sag’, ‘saga’], 'penalty' : [‘none’, ‘l1’, ‘l2’, ‘elasticnet’],
'C' :[100, 10, 1.0, 0.1, 0.01]}

In [34]:
##part 1
##use models with default
##use data set with H/A +1, -1
##do full window for now

##next:
##check if just excluding first 10 days helps (chaotic)
##check if different windows help

##next
## can try tuning (for loops by hand, or ... use grid_search (use ML mastery code))
##-I think tuning will be faster ... just do by hand ... loop over the possible things 
##-ONE for loop over i = (a,b,c,d)... for each model i[0]

##Orrr can try adding features ... here we have to worry about:
##-adding basic features eg pp, and correct fo%
##-scaling numericals
##-dummy vars for categoricals (are there any?) besides H/A
##-num_windows and which lengths for moving avgs
##-filtering the features for increasing complexity inteligently
##-There is a dicotemy: 
##(a)use H/A + numerics or  ... here I think it can be made more like time-series
##(b) just use mumerics (moving avg) ... here I think the order of the games is not important (note Leung did this, and random train)



In [97]:

X = make_HA_data(data, 20162017)[0]
y = make_HA_data(data, 20162017)[1]

In [194]:
models_list = [('lrc',lrc), ('gnb', gnb), ('lgr', lgr), ('svc',svc), ('rfc', rfc), ('bc', bc),  ('gbc', gbc), ] # ('xgbc',  xgbc)]

##start with small list 



In [69]:
 y.loc[y['full_date'].isin(dates[10: 30]), 'won']

157    1
158    0
159    1
160    1
161    1
      ..
294    1
295    0
296    0
297    1
298    1
Name: won, Length: 142, dtype: int64

In [82]:

svc.fit(X.loc[X['full_date'].isin(dates[10: 30]), :], y.loc[y['full_date'].isin(dates[10: 30]), 'won'])
y_pred = svc.predict(X.iloc[600:, :])
y_test = y.iloc[600:, 5]
precision_score(y_test, y_pred)

0.5428571428571428

In [162]:
#y_pred

In [77]:
svc.fit(X.iloc[0:600, :], y.iloc[0:600, 5])
y_pred = svc.predict(X.iloc[600:, :])
y_test.value_counts()

1    342
0    288
Name: won, dtype: int64

In [186]:
def regr_model_results(model, model_name, X, dates, step, window_size, prediction_size, drop_first_k_days = 0): #X = data 
    results_dic['date'] = []
    results_dic['mae'] = []
    results_dic['mse'] = []
    results_dic['r2'] = []   
    
    
    #drop first k days from dates and X
    dates = dates[drop_first_k_days :]
    X = X.loc[X['full_date'].isin(dates), :].copy()  

    for i in range(step, len(dates), step): ##eg step =10, so 17 rounds
        model.fit(X.loc[X['full_date'].isin(dates[max(i-window_size ,0):i]), :],y.loc[y['full_date'].isin(dates[max(i-window_size,0):i]),'goal_difference' ])
        y_pred = model.predict(X.loc[X['full_date'].isin(dates[i:i+prediction_size]), :])
        y_test = y.loc[y['full_date'].isin(dates[i:i+prediction_size]),'goal_difference' ]
    
        accuracy = accuracy_score(y_test, y_pred)
        #recision = precision_score(y_test, y_pred, zero_division = 0)
        #recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average = None)
            
        
        results_dic['model_name'].append(model_name)
        results_dic['date'].append(dates[i])
    
        results_dic['mae'].append(mae)
        results_dic['mse'].append(mse)
        results_dic['r2'].append(r2)
        
    return results_dic #!
        
        

In [206]:
np.mean(np.array([1,3,4]))

2.6666666666666665

In [216]:
def class_model_results(model, model_name, X, dates, step, window_size, prediction_size, drop_first_k_days = 0): #X = data 
    results_dic ={}
    results_dic['model_name'] = []
    results_dic['date'] = []
    results_dic['accuracy'] = []
    results_dic['f1_score'] = []

    #results_dic['precision'] = []
  #  results_dic['recall'] = []
    
    #drop first k days from dates and X
    dates = dates[drop_first_k_days :]
    X = X.loc[X['full_date'].isin(dates), :].copy()  

    for i in range(step, len(dates), step): ##eg step =10, so 17 rounds
        model.fit(X.loc[X['full_date'].isin(dates[max(i-window_size ,0):i]), :],y.loc[y['full_date'].isin(dates[max(i-window_size,0):i]),'won' ])
        y_pred = model.predict(X.loc[X['full_date'].isin(dates[i:i+prediction_size]), :])
        y_test = y.loc[y['full_date'].isin(dates[i:i+prediction_size]),'won' ]
    
        accuracy = accuracy_score(y_test, y_pred)
        #recision = precision_score(y_test, y_pred, zero_division = 0)
        #recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred) #, average = None)
            
        results_dic['model_name'].append(model_name)  #append same model name every iter so same length as others
        results_dic['date'].append(dates[i])
                          
        results_dic['accuracy'].append(accuracy)
        results_dic['f1_score'].append(f1)
        #results_dic['precision'].append(precision)
        #results_dic['recall'].append(recall)
    results_dic['model_name'].append('model_name'+'_avg')  #append same model name every iter so same length as others
    results_dic['date'].append('average')
    results_dic['accuracy'].append(round(np.mean(np.array(results_dic['accuracy'])), 2) ) 
    results_dic['f1_score'].append(round(np.mean(np.array(results_dic['f1_score'])), 2) ) 
    return results_dic #!
        
        

In [142]:
len(dates)

172

In [143]:
dates[120:125]

[20170104, 20170105, 20170106, 20170107, 20170108]

In [165]:
#you X.loc[X['full_date'].isin(dates[120:125]), :]

In [215]:
##debigging (dates had playoff games in it )

step = 20
prediction_size = 5
for i in range(step, len(dates), step): 
    #y_pred = svc.predict(X.loc[X['full_date'].isin(dates[25:25+5]), :])
    y_pred = xgbc.predict(X.loc[X['full_date'].isin(dates[i:i+prediction_size]), :])
    print(i, i+prediction_size, y_pred[0:1])

ValueError: y contains previously unseen labels: [4 5 6 7 8 9]

In [219]:
all_results_dic = {}
xgb_param = xgbc.get_xgb_params()
xgb_param['num_class'] = 2

for mod_nm_pair in [('xgbc', xgbc)]:  #models_list:
    model_name = mod_nm_pair[0]
    model = mod_nm_pair[1]
    results_dic = class_model_results(model=model, model_name=model_name, X = X, dates = dates, step = 20, window_size =75, prediction_size =5, drop_first_k_days = 0)
    all_results_dic[model_name] = results_dic 

XGBoostError: value 0 for Parameter num_class should be greater equal to 1
num_class: Number of output class in the multi-class classification.

In [209]:

res_df_dic ={}

for name in all_results_dic.keys():
    res_df_dic[name] = pd.DataFrame(all_results_dic[name])
   


In [210]:
name1 = list(res_df_dic.keys())[0]
df_all_results = pd.DataFrame(res_df_dic[name1])

for name in list(res_df_dic.keys())[1:]:
    df_all_results = pd.concat([df_all_results, res_df_dic[name]], axis = 0)

In [213]:
df_all_results.iloc[50:,:]

Unnamed: 0,model_name,date,accuracy,f1_score
5,bc,20170104,0.513514,0.571429
6,bc,20170124,0.5,0.5
7,bc,20170217,0.472222,0.344828
8,bc,average,0.52,0.54
0,gbc,20161114,0.645161,0.731707
1,gbc,20161204,0.617647,0.711111
2,gbc,20161227,0.547619,0.55814
3,gbc,20170304,0.454545,0.608696
4,gbc,20170324,0.605263,0.680851
5,gbc,20170104,0.594595,0.615385


In [135]:
res_df_dic['svc'].iloc[:,2:].apply(np.mean)

accuracy    0.520851
dtype: float64

In [112]:
dates[-1]

20170228

##why is accuracy so bad last date? lgr and svc have some decent results around 0.6 (over all 0.52)


#check how lr does ...as classifier and regression ... see if it's still at 58, 59 % ... and if not
#what needs to change to get it back? only new thing is window, step, predict_size params ...
#could also do naive train on first half text on second as I did at first ...




##xgbc needs to have num_class =2 ... can either switch to xgbr and use 

#params = { "objective": "multi:softmax", 'num_class': 2} ## ! 3}
#model = xgb.XGBRegressor(**params)

#or you can stick with xgbc and use cv instead of fit

xgb_param = model.get_xgb_params()
    extra = {'num_class': 3}
    xgb_param.update(extra)
    cvresult = xgb.cv(xgb_param, xgtrain, ...)

In [None]:
##part 1
##use models with default
##use data set with H/A +1, -1
##do full window for now

##next:
##check if just excluding first 10 days helps (chaotic)
##check if different windows help

##next
## can try tuning (for loops by hand, or ... use grid_search (use ML mastery code))
##-I think tuning will be faster ... just do by hand ... loop over the possible things 
##-ONE for loop over i = (a,b,c,d)... for each model i[0]

#probably do features (below) next ... it's time to bring in some serious features now that 
#the basic infrastructure for quick evaluation is set up ....
#then I will probably write some optimizer loops over windows, models, and feature sets

##Orrr can try adding features ... here we have to worry about:
##-adding basic features eg pp, and correct fo%
##-scaling numericals
##-dummy vars for categoricals (are there any?) besides H/A
##-num_windows and which lengths for moving avgs
##-filtering the features for increasing complexity inteligently
##-There is a dicotemy: 
##(a)use H/A + numerics or  ... here I think it can be made more like time-series
##(b) just use mumerics (moving avg) ... here I think the order of the games is not important (note Leung did this, and random train)
