In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import statsmodels.api as sm
import math
import random
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.stats.anova import anova_lm
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import precision_score, roc_curve, auc, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn import preprocessing

In [7]:
# Read the per-team, per-game statistics CSV into a DataFrame and preview
# the first five rows.
df = pd.read_csv(filepath_or_buffer="nhl_game_data/game_teams_stats.csv")
df.head(n=5)

Unnamed: 0,game_id,team_id,HoA,won,settled_in,head_coach,goals,shots,hits,pim,powerPlayOpportunities,powerPlayGoals,faceOffWinPercentage,giveaways,takeaways
0,2011030221,1,away,False,OT,Peter DeBoer,3,26,31,12,3,1,44.9,6,7
1,2011030221,4,home,True,OT,Peter Laviolette,4,36,27,6,6,1,55.1,13,4
2,2011030222,1,away,True,REG,Peter DeBoer,4,35,32,12,4,0,50.9,8,7
3,2011030222,4,home,False,REG,Peter Laviolette,1,20,24,32,5,0,49.1,9,6
4,2011030223,4,away,False,OT,Peter Laviolette,3,28,28,4,5,1,50.8,2,1


In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,game_id,team_id,goals,shots,hits,pim,powerPlayOpportunities,powerPlayGoals,faceOffWinPercentage,giveaways,takeaways
count,22868.0,22868.0,22868.0,22868.0,22868.0,22868.0,22868.0,22868.0,22868.0,22868.0,22868.0
mean,2014134000.0,17.429115,2.743047,30.396405,23.607574,10.175791,3.179334,0.592356,50.0,8.648155,6.919801
std,2609562.0,11.958998,1.640201,6.83259,8.595527,8.038863,1.562107,0.759193,7.203451,4.861654,3.840531
min,2010020000.0,1.0,0.0,10.0,2.0,0.0,0.0,0.0,20.8,0.0,0.0
25%,2012020000.0,8.0,2.0,26.0,17.0,6.0,2.0,0.0,45.2,5.0,4.0
50%,2014021000.0,16.0,3.0,30.0,23.0,8.0,3.0,0.0,50.0,8.0,6.0
75%,2016021000.0,24.0,4.0,35.0,29.0,12.0,4.0,1.0,54.8,12.0,9.0
max,2018030000.0,54.0,10.0,64.0,80.0,183.0,12.0,6.0,79.2,36.0,31.0


In [9]:
# Per-column tally of missing values; the bare name on the last line makes
# the notebook display the resulting Series.
null_counts = df.isna().sum()
null_counts

game_id                   0
team_id                   0
HoA                       0
won                       0
settled_in                0
head_coach                0
goals                     0
shots                     0
hits                      0
pim                       0
powerPlayOpportunities    0
powerPlayGoals            0
faceOffWinPercentage      0
giveaways                 0
takeaways                 0
dtype: int64

In [10]:
# Encode the boolean outcome as 0/1 in a new column appended at the end.
df['won_true'] = df['won'].astype(int)

# Remove identifier and categorical columns that are not used as predictors
# (a single call instead of one drop per column).
df.drop(
    columns=['game_id', 'team_id', 'HoA', 'won', 'settled_in', 'head_coach'],
    inplace=True,
)

# Express the face-off win percentage as a fraction before scaling.
# The mean normalization below is scale-invariant, so this step only
# affects the final values at the level of floating-point rounding.
df['faceOffWinPercentage'] = df['faceOffWinPercentage'] / 100

# Mean normalization: (x - mean) / (max - min) for every predictor except
# `goals`, which is left on its original scale.
for col in [
    'shots',
    'hits',
    'pim',
    'powerPlayOpportunities',
    'powerPlayGoals',
    'giveaways',
    'takeaways',
    'faceOffWinPercentage',
]:
    df[col] = (df[col] - df[col].mean()) / (df[col].max() - df[col].min())

df.head()

Unnamed: 0,goals,shots,hits,pim,powerPlayOpportunities,powerPlayGoals,faceOffWinPercentage,giveaways,takeaways,won_true
0,3,-0.081415,0.094775,0.009968,-0.014944,0.067941,-0.087329,-0.07356,0.002587,0
1,4,0.10377,0.043493,-0.022819,0.235056,0.067941,0.087329,0.120885,-0.094187,1
2,4,0.085252,0.107595,0.009968,0.068389,-0.098726,0.015411,-0.018004,0.002587,1
3,1,-0.192526,0.005031,0.119258,0.151722,-0.098726,-0.015411,0.009773,-0.029671,0
4,3,-0.044378,0.056313,-0.033747,0.151722,0.067941,0.013699,-0.184671,-0.190961,0


In [11]:
# Split the prepared frame into the response vector and the feature matrix.
# Columns are selected by name rather than by position, so the split keeps
# working if columns are added, removed or reordered upstream.
# y: the 0/1 outcome column.
y = df['won_true']
# X: every remaining column, i.e. all predictors.
X = df.drop(columns=['won_true'])
print(X.head())
print(y.head())

   goals     shots      hits       pim  powerPlayOpportunities  \
0      3 -0.081415  0.094775  0.009968               -0.014944   
1      4  0.103770  0.043493 -0.022819                0.235056   
2      4  0.085252  0.107595  0.009968                0.068389   
3      1 -0.192526  0.005031  0.119258                0.151722   
4      3 -0.044378  0.056313 -0.033747                0.151722   

   powerPlayGoals  faceOffWinPercentage  giveaways  takeaways  
0        0.067941             -0.087329  -0.073560   0.002587  
1        0.067941              0.087329   0.120885  -0.094187  
2       -0.098726              0.015411  -0.018004   0.002587  
3       -0.098726             -0.015411   0.009773  -0.029671  
4        0.067941              0.013699  -0.184671  -0.190961  
0    0
1    1
2    1
3    0
4    0
Name: won_true, dtype: int32


In [12]:
# Fit a logistic regression on the whole data set (no train/test split yet),
# using a very large C and a tight tolerance with the newton-cg solver.
# `fit` returns the estimator itself, so construction and fitting are chained.
lr = LogisticRegression(solver='newton-cg', tol=.0000001, C=100000).fit(X, y)

# Mean accuracy on the same data the model was fitted on.
# The fitted coefficients are available as lr.coef_.
lr.score(X, y)

0.7778555186286514

In [13]:
# Fit the same model with statsmodels to get per-coefficient p-values,
# which are used to judge whether any feature is insignificant.
logit_full = smf.logit(
    formula='won_true ~ goals + shots + hits + pim + powerPlayOpportunities'
            ' + powerPlayGoals + faceOffWinPercentage + giveaways + takeaways',
    data=df,
)
results = logit_full.fit()
results.summary()

Optimization terminated successfully.
         Current function value: 0.464512
         Iterations 7


0,1,2,3
Dep. Variable:,won_true,No. Observations:,22868.0
Model:,Logit,Df Residuals:,22858.0
Method:,MLE,Df Model:,9.0
Date:,"Wed, 22 Apr 2020",Pseudo R-squ.:,0.3299
Time:,13:27:29,Log-Likelihood:,-10622.0
converged:,True,LL-Null:,-15851.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-3.2013,0.047,-67.932,0.000,-3.294,-3.109
goals,1.1986,0.017,71.354,0.000,1.166,1.232
shots,-2.0037,0.140,-14.302,0.000,-2.278,-1.729
hits,-0.2699,0.156,-1.732,0.083,-0.575,0.035
pim,-4.7902,0.449,-10.667,0.000,-5.670,-3.910
powerPlayOpportunities,0.0102,0.149,0.068,0.946,-0.281,0.301
powerPlayGoals,-0.7999,0.165,-4.857,0.000,-1.123,-0.477
faceOffWinPercentage,0.5907,0.140,4.230,0.000,0.317,0.864
giveaways,-0.7564,0.131,-5.789,0.000,-1.012,-0.500


Power play opportunities (p = 0.946) and hits (p = 0.083) are not statistically significant predictors of the dependent variable at the 5% level, so we remove both features from the model.

In [14]:
# Drop the two predictors that were not significant in the full logit model.
# One non-inplace call replaces two separate in-place drops.
df = df.drop(columns=['powerPlayOpportunities', 'hits'])

# Rebuild the feature matrix from the reduced frame. The target is excluded
# by name instead of slicing the first seven columns by position, so the
# selection does not silently depend on column count or order.
X = df.drop(columns=['won_true'])
X.head()

Unnamed: 0,goals,shots,pim,powerPlayGoals,faceOffWinPercentage,giveaways,takeaways
0,3,-0.081415,0.009968,0.067941,-0.087329,-0.07356,0.002587
1,4,0.10377,-0.022819,0.067941,0.087329,0.120885,-0.094187
2,4,0.085252,0.009968,-0.098726,0.015411,-0.018004,0.002587
3,1,-0.192526,0.119258,-0.098726,-0.015411,0.009773,-0.029671
4,3,-0.044378,-0.033747,0.067941,0.013699,-0.184671,-0.190961


In [15]:
# Refit the statsmodels logit on the seven remaining predictors and show
# the coefficient table.
logit_reduced = smf.logit(
    formula='won_true ~ goals + shots + pim + powerPlayGoals'
            ' + faceOffWinPercentage + giveaways + takeaways',
    data=df,
)
results = logit_reduced.fit()
results.summary()

Optimization terminated successfully.
         Current function value: 0.464579
         Iterations 7


0,1,2,3
Dep. Variable:,won_true,No. Observations:,22868.0
Model:,Logit,Df Residuals:,22860.0
Method:,MLE,Df Model:,7.0
Date:,"Wed, 22 Apr 2020",Pseudo R-squ.:,0.3298
Time:,13:27:49,Log-Likelihood:,-10624.0
converged:,True,LL-Null:,-15851.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-3.2051,0.047,-68.272,0.000,-3.297,-3.113
goals,1.1998,0.017,71.707,0.000,1.167,1.233
shots,-2.0076,0.138,-14.581,0.000,-2.277,-1.738
pim,-4.8202,0.437,-11.028,0.000,-5.677,-3.963
powerPlayGoals,-0.7942,0.153,-5.186,0.000,-1.094,-0.494
faceOffWinPercentage,0.5937,0.140,4.252,0.000,0.320,0.867
giveaways,-0.7822,0.130,-6.030,0.000,-1.036,-0.528
takeaways,1.2936,0.143,9.062,0.000,1.014,1.573


In [16]:
# accuracy_score is only used in this cell, so it is imported here.
from sklearn.metrics import accuracy_score

# Refit the scikit-learn logistic regression on the reduced feature matrix
# and print its mean accuracy on the data it was fitted on.
lr = LogisticRegression(solver='newton-cg', tol=.0000001, C=100000).fit(X, y)
print(lr.score(X, y))

# Error rate = 1 - accuracy, computed from the model's own predictions on X.
lr_y_pred = lr.predict(X)
lr_a_score = accuracy_score(y, lr_y_pred)
lr_error_rate = 1 - lr_a_score
lr_error_rate

0.7775494140283365


0.22245058597166345