In [29]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, KFold
import pandas as pd
import statsmodels.formula.api as smf
from sklearn.datasets import load_digits


digits = load_digits()

In [30]:
df = pd.read_csv("nhl_game_data/game_teams_stats.csv")
df.head()

Unnamed: 0,game_id,team_id,HoA,won,settled_in,head_coach,goals,shots,hits,pim,powerPlayOpportunities,powerPlayGoals,faceOffWinPercentage,giveaways,takeaways
0,2011030221,1,away,False,OT,Peter DeBoer,3,26,31,12,3,1,44.9,6,7
1,2011030221,4,home,True,OT,Peter Laviolette,4,36,27,6,6,1,55.1,13,4
2,2011030222,1,away,True,REG,Peter DeBoer,4,35,32,12,4,0,50.9,8,7
3,2011030222,4,home,False,REG,Peter Laviolette,1,20,24,32,5,0,49.1,9,6
4,2011030223,4,away,False,OT,Peter Laviolette,3,28,28,4,5,1,50.8,2,1


In [31]:
#df['won_true'] = np.where(df['won'] == 'TRUE', 1, 0)
df['won_true'] = df['won'].astype(int)
# drop unnecessary columns
df.drop(['game_id'], axis=1, inplace=True)
df.drop(['team_id'], axis=1, inplace=True)
df.drop(['HoA'], axis=1, inplace=True)
df.drop(['won'], axis=1, inplace=True)
df.drop(['settled_in'], axis=1, inplace=True)
df.drop(['head_coach'], axis=1, inplace=True)
# normalize 
df['balance'] = (df['balance'] - df['balance'].mean()) / (df['balance'].max() - df['balance'].min())
df.head()

Unnamed: 0,game_id,team_id,HoA,won,settled_in,head_coach,goals,shots,hits,pim,powerPlayOpportunities,powerPlayGoals,faceOffWinPercentage,giveaways,takeaways,won_true
0,2011030221,1,away,False,OT,Peter DeBoer,3,26,31,12,3,1,44.9,6,7,0
1,2011030221,4,home,True,OT,Peter Laviolette,4,36,27,6,6,1,55.1,13,4,1
2,2011030222,1,away,True,REG,Peter DeBoer,4,35,32,12,4,0,50.9,8,7,1
3,2011030222,4,home,False,REG,Peter Laviolette,1,20,24,32,5,0,49.1,9,6,0
4,2011030223,4,away,False,OT,Peter Laviolette,3,28,28,4,5,1,50.8,2,1,0


In [33]:
y = df[df.columns[-1]] # last column
y.shape # only one list
# X is features
X = df[df.columns[0:9]]
X.shape # size is amount o features
X.head()

Unnamed: 0,goals,shots,hits,pim,powerPlayOpportunities,powerPlayGoals,faceOffWinPercentage,giveaways,takeaways
0,3,26,31,12,3,1,44.9,6,7
1,4,36,27,6,6,1,55.1,13,4
2,4,35,32,12,4,0,50.9,8,7
3,1,20,24,32,5,0,49.1,9,6
4,3,28,28,4,5,1,50.8,2,1


In [35]:
# creating logistic regression model with just training set (whole set currently)
lr = LogisticRegression(C=100000, tol=.0000001, solver='newton-cg')
lr.fit(X, y)
lr.score(X, y)
# print out beta, coefficients
# lr.coef_

0.7778555186286514

In [38]:
# first param in '' is the depandant variable, then ~ followed by the 
# features
results = smf.logit('won_true ~ goals + shots + hits + pim + powerPlayOpportunities + powerPlayGoals + faceOffWinPercentage + giveaways + takeaways', data = df).fit()

Optimization terminated successfully.
         Current function value: 0.464512
         Iterations 6


In [39]:
results.summary()

0,1,2,3
Dep. Variable:,won_true,No. Observations:,22868.0
Model:,Logit,Df Residuals:,22858.0
Method:,MLE,Df Model:,9.0
Date:,"Wed, 19 Feb 2020",Pseudo R-squ.:,0.3299
Time:,16:02:19,Log-Likelihood:,-10622.0
converged:,True,LL-Null:,-15851.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-2.2631,0.149,-15.168,0.000,-2.556,-1.971
goals,1.1986,0.017,71.354,0.000,1.166,1.232
shots,-0.0371,0.003,-14.302,0.000,-0.042,-0.032
hits,-0.0035,0.002,-1.732,0.083,-0.007,0.000
pim,-0.0262,0.002,-10.667,0.000,-0.031,-0.021
powerPlayOpportunities,0.0008,0.012,0.068,0.946,-0.023,0.025
powerPlayGoals,-0.1333,0.027,-4.857,0.000,-0.187,-0.080
faceOffWinPercentage,0.0101,0.002,4.230,0.000,0.005,0.015
giveaways,-0.0210,0.004,-5.789,0.000,-0.028,-0.014


In [40]:
from sklearn.metrics import accuracy_score
lr_y_pred = lr.predict(X)
lr_a_score = accuracy_score(y, lr_y_pred)
lr_error_rate = 1 - lr_a_score
lr_error_rate

0.22214448137134857

In [41]:
(((results.predict(X)) > .5) * 1 == y).mean()

#sm_y_pred = results.predict(X)
#sm_a_score = accuracy_score(y, sm_y_pred)

0.7778555186286514

In [42]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=3)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, random_state=3)
X_train.shape, X_valid.shape, X_test.shape

((12863, 9), (4288, 9), (5717, 9))