# Predicting NHL Game Outcomes

## Sandbox Notebook

Working notebook.  
When components (e.g. scraping functions) are working, move them to their eventual production home.

In [1]:
# Standard Packages
import pandas as pd
import numpy as np
import requests
import re
import time
import os
import warnings

# Viz Packages
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline 

# Modeling Packages
## Modeling Prep
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, KFold, \
GridSearchCV, RandomizedSearchCV

## SKLearn Data Prep Modules
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, \
PolynomialFeatures, PowerTransformer, Normalizer, MaxAbsScaler

from sklearn.impute import SimpleImputer

## SKLearn Classification Models
from sklearn.linear_model import LogisticRegression, Ridge, Lasso, ElasticNet
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier,\
ExtraTreesClassifier, VotingClassifier, StackingRegressor

## SKLearn Pipeline Setup
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

## SKLearn Model Optimization
from sklearn.feature_selection import RFE, f_regression

## Boosting
from xgboost import XGBRegressor
from xgboost import XGBClassifier

## SKLearn Metrics
### Classification Scoring/Evaluation
from sklearn.metrics import classification_report, accuracy_score, recall_score, precision_score, f1_score, \
ConfusionMatrixDisplay, log_loss, confusion_matrix, RocCurveDisplay, make_scorer, roc_auc_score

In [2]:
# Notebook Config
## Silence noisy Python warnings (Future, Deprecation)
warnings.simplefilter("ignore", category=FutureWarning)
warnings.simplefilter("ignore", category=DeprecationWarning)

## Silence Pandas chained-assignment warnings (SettingWithCopy)
pd.set_option("mode.chained_assignment", None)

## Pandas Display Config
pd.set_option("display.max_columns", None)
pd.set_option("display.width", None)

## Render SKLearn estimators as diagrams
from sklearn import set_config
set_config(display='diagram')

In [3]:
# Scraping Requirements
from bs4 import BeautifulSoup


## EDA

In [11]:
# Load the 21/22 home game log from the local data folder and preview it
home_game_log_2021 = pd.read_csv(filepath_or_buffer='data/21_22-home-game-log.csv')
home_game_log_2021

Unnamed: 0,Game,Team,Unnamed: 2,TOI,CF/60,CA/60,CF%,FF/60,FA/60,FF%,SF/60,SA/60,SF%,GF/60,GA/60,GF%,xGF/60,xGA/60,xGF%,SCF/60,SCA/60,SCF%,HDCF/60,HDCA/60,HDCF%,HDSF/60,HDSA/60,HDSF%,HDGF/60,HDGA/60,HDGF%,HDSH%,HDSV%,MDCF/60,MDCA/60,MDCF%,MDSF/60,MDSA/60,MDSF%,MDGF/60,MDGA/60,MDGF%,MDSH%,MDSV%,LDCF/60,LDCA/60,LDCF%,LDSF/60,LDSA/60,LDSF%,LDGF/60,LDGA/60,LDGF%,LDSH%,LDSV%,SH%,SV%,PDO,Attendance
0,"2022-03-14 - Coyotes 5, Senators 3",Ottawa Senators,Limited ReportFull Report,60.000000,82.00,27.00,75.23,65.00,22.00,74.71,43.00,15.00,74.14,3.00,5.00,37.50,4.71,2.32,67.02,42.00,15.00,73.68,20.00,7.00,74.07,12.00,4.00,75.00,1.00,2.00,33.33,8.33,50.00,22.00,8.00,73.33,14.00,6.00,70.00,2.00,2.00,50.00,14.29,66.67,33.00,11.00,75.00,15.00,4.00,78.95,0.00,0.00,-,0.00,100.00,6.98,66.67,0.736,9201
1,"2022-03-24 - Stars 4, Hurricanes 3",Carolina Hurricanes,Limited ReportFull Report,64.983333,74.79,27.70,72.97,65.56,19.39,77.17,43.40,13.85,75.81,2.77,2.77,50.00,4.42,1.43,75.61,33.24,18.47,64.29,15.70,9.23,62.96,11.08,7.39,60.00,1.85,1.85,50.00,16.67,75.00,17.54,9.23,65.52,10.16,2.77,78.57,0.92,0.00,100.00,9.09,100.00,36.01,6.46,84.78,21.24,2.77,88.46,0.00,0.92,0.00,0.00,66.67,6.38,80.00,0.864,16421
2,"2022-03-20 - Rangers 2, Hurricanes 0",Carolina Hurricanes,Limited ReportFull Report,60.000000,94.00,36.00,72.31,71.00,29.00,71.00,44.00,18.00,70.97,0.00,2.00,0.00,5.91,2.92,66.96,58.00,20.00,74.36,20.00,9.00,68.97,11.00,7.00,61.11,0.00,2.00,0.00,0.00,71.43,38.00,11.00,77.55,18.00,5.00,78.26,0.00,0.00,-,0.00,100.00,32.00,16.00,66.67,15.00,6.00,71.43,0.00,0.00,-,0.00,100.00,0.00,88.89,0.889,18680
3,"2022-04-12 - Ducks 2, Panthers 3",Florida Panthers,Limited ReportFull Report,63.683333,90.45,34.86,72.18,70.66,30.15,70.09,51.82,22.61,69.62,2.83,1.88,60.00,5.01,1.69,74.78,52.76,17.90,74.67,20.73,5.65,78.57,11.31,4.71,70.59,0.94,0.94,50.00,8.33,80.00,32.03,12.25,72.34,20.73,6.60,75.86,0.00,0.94,0.00,0.00,85.71,32.03,14.13,69.39,17.90,10.36,63.33,1.88,0.00,100.00,10.53,100.00,5.45,91.67,0.971,16204
4,"2022-01-18 - Canadiens 5, Stars 3",Dallas Stars,Limited ReportFull Report,60.000000,112.00,44.00,71.79,81.00,31.00,72.32,51.00,22.00,69.86,3.00,5.00,37.50,6.44,2.04,75.89,59.00,24.00,71.08,23.00,9.00,71.88,16.00,8.00,66.67,3.00,3.00,50.00,18.75,62.50,36.00,15.00,70.59,16.00,7.00,69.57,0.00,2.00,0.00,0.00,71.43,46.00,16.00,74.19,17.00,5.00,77.27,0.00,0.00,-,0.00,100.00,5.88,77.27,0.832,17679
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1307,"2022-02-12 - Maple Leafs 2, Canucks 3",Vancouver Canucks,Limited ReportFull Report,60.000000,40.00,91.00,30.53,36.00,73.00,33.03,24.00,53.00,31.17,3.00,2.00,60.00,2.70,5.19,34.25,20.00,44.00,31.25,5.00,20.00,20.00,4.00,15.00,21.05,2.00,2.00,50.00,50.00,86.67,15.00,24.00,38.46,8.00,16.00,33.33,1.00,0.00,100.00,12.50,100.00,16.00,43.00,27.12,11.00,21.00,34.38,0.00,0.00,-,0.00,100.00,12.50,96.23,1.087,9396
1308,"2022-03-18 - Panthers 3, Ducks 0",Anaheim Ducks,Limited ReportFull Report,60.000000,37.00,86.00,30.08,26.00,66.00,28.26,17.00,42.00,28.81,0.00,3.00,0.00,1.34,4.29,23.84,20.00,44.00,31.25,9.00,15.00,37.50,3.00,11.00,21.43,0.00,1.00,0.00,0.00,90.91,11.00,29.00,27.50,5.00,11.00,31.25,0.00,2.00,0.00,0.00,81.82,13.00,35.00,27.08,7.00,15.00,31.82,0.00,0.00,-,0.00,100.00,0.00,92.86,0.929,12296
1309,"2021-12-18 - Oilers 5, Kraken 3",Seattle Kraken,Limited ReportFull Report,60.000000,30.00,73.00,29.13,25.00,55.00,31.25,17.00,41.00,29.31,3.00,5.00,37.50,1.64,4.15,28.35,13.00,32.00,28.89,7.00,13.00,35.00,7.00,9.00,43.75,1.00,4.00,20.00,14.29,55.56,6.00,19.00,24.00,3.00,10.00,23.08,1.00,0.00,100.00,33.33,100.00,15.00,35.00,30.00,5.00,21.00,19.23,1.00,1.00,50.00,20.00,95.24,17.65,87.80,1.055,17151
1310,"2021-11-08 - Panthers 3, Rangers 4",New York Rangers,Limited ReportFull Report,60.000000,37.00,91.00,28.91,29.00,70.00,29.29,18.00,45.00,28.57,4.00,3.00,57.14,2.02,4.59,30.49,22.00,55.00,28.57,13.00,22.00,37.14,11.00,18.00,37.93,4.00,3.00,57.14,36.36,83.33,9.00,33.00,21.43,2.00,11.00,15.38,0.00,0.00,-,0.00,100.00,13.00,33.00,28.26,4.00,15.00,21.05,0.00,0.00,-,0.00,100.00,22.22,93.33,1.156,14877


In [12]:
# Summarise each column's dtype and non-null count
home_game_log_2021.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1312 entries, 0 to 1311
Data columns (total 59 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Game        1312 non-null   object 
 1   Team        1312 non-null   object 
 2   Unnamed: 2  1312 non-null   object 
 3   TOI         1312 non-null   float64
 4   CF/60       1312 non-null   float64
 5   CA/60       1312 non-null   float64
 6   CF%         1312 non-null   float64
 7   FF/60       1312 non-null   float64
 8   FA/60       1312 non-null   float64
 9   FF%         1312 non-null   float64
 10  SF/60       1312 non-null   float64
 11  SA/60       1312 non-null   float64
 12  SF%         1312 non-null   float64
 13  GF/60       1312 non-null   float64
 14  GA/60       1312 non-null   float64
 15  GF%         1312 non-null   object 
 16  xGF/60      1312 non-null   float64
 17  xGA/60      1312 non-null   float64
 18  xGF%        1312 non-null   float64
 19  SCF/60      1312 non-null  

### Top line feature (column) deletion and addition

Deleting:
- 'Unnamed: 2' held hyperlinks to related pages
- 'C' type (Corsi) shot stats 
  - we will be using 'F' type (Fenwick) shots (goals, shots on net, and misses; excludes blocked shots)
- 'SC' type (Scoring Chance) vars
    - we will be focusing on HD type

In [18]:
# Keep the column names in a plain Python list for easier manipulation
# NOTE(review): the saved output of this cell includes 'date', which is only
# created by the date-extraction cell further down — run that cell first.
game_log_cols = list(home_game_log_2021.columns)
print(game_log_cols)

['Game', 'Team', 'Unnamed: 2', 'TOI', 'CF/60', 'CA/60', 'CF%', 'FF/60', 'FA/60', 'FF%', 'SF/60', 'SA/60', 'SF%', 'GF/60', 'GA/60', 'GF%', 'xGF/60', 'xGA/60', 'xGF%', 'SCF/60', 'SCA/60', 'SCF%', 'HDCF/60', 'HDCA/60', 'HDCF%', 'HDSF/60', 'HDSA/60', 'HDSF%', 'HDGF/60', 'HDGA/60', 'HDGF%', 'HDSH%', 'HDSV%', 'MDCF/60', 'MDCA/60', 'MDCF%', 'MDSF/60', 'MDSA/60', 'MDSF%', 'MDGF/60', 'MDGA/60', 'MDGF%', 'MDSH%', 'MDSV%', 'LDCF/60', 'LDCA/60', 'LDCF%', 'LDSF/60', 'LDSA/60', 'LDSF%', 'LDGF/60', 'LDGA/60', 'LDGF%', 'LDSH%', 'LDSV%', 'SH%', 'SV%', 'PDO', 'Attendance', 'date']


In [None]:
# Imported data was filtered - need to check the validity of a few columns
# PDO - only reported for home team in this version of the data

In [None]:
# Columns to immediately drop
# 'Unnamed: 2' only carries the report-link text ("Limited ReportFull Report"), not a stat
drop_on_load = ['Unnamed: 2']

In [None]:
LDCF/60     1312 non-null   float64
 45  LDCA/60     1312 non-null   float64
 46  LDCF%       1312 non-null   float64
 47  LDSF/60     1312 non-null   float64
 48  LDSA/60     1312 non-null   float64
 49  LDSF%       1312 non-null   float64
 50  LDGF/60     1312 non-null   float64
 51  LDGA/60     1312 non-null   float64
 52  LDGF%       1312 non-null   object 
 53  LDSH%       1312 non-null   float64
 54  LDSV%

In [13]:
# Extract the game date from 'Game': its first 10 characters are the YYYY-MM-DD
# date (e.g. '2022-03-14 - Coyotes 5, Senators 3' -> '2022-03-14'), kept as a string
home_game_log_2021['date'] = home_game_log_2021['Game'].str.slice(stop=10)
home_game_log_2021

Unnamed: 0,Game,Team,Unnamed: 2,TOI,CF/60,CA/60,CF%,FF/60,FA/60,FF%,SF/60,SA/60,SF%,GF/60,GA/60,GF%,xGF/60,xGA/60,xGF%,SCF/60,SCA/60,SCF%,HDCF/60,HDCA/60,HDCF%,HDSF/60,HDSA/60,HDSF%,HDGF/60,HDGA/60,HDGF%,HDSH%,HDSV%,MDCF/60,MDCA/60,MDCF%,MDSF/60,MDSA/60,MDSF%,MDGF/60,MDGA/60,MDGF%,MDSH%,MDSV%,LDCF/60,LDCA/60,LDCF%,LDSF/60,LDSA/60,LDSF%,LDGF/60,LDGA/60,LDGF%,LDSH%,LDSV%,SH%,SV%,PDO,Attendance,date
0,"2022-03-14 - Coyotes 5, Senators 3",Ottawa Senators,Limited ReportFull Report,60.000000,82.00,27.00,75.23,65.00,22.00,74.71,43.00,15.00,74.14,3.00,5.00,37.50,4.71,2.32,67.02,42.00,15.00,73.68,20.00,7.00,74.07,12.00,4.00,75.00,1.00,2.00,33.33,8.33,50.00,22.00,8.00,73.33,14.00,6.00,70.00,2.00,2.00,50.00,14.29,66.67,33.00,11.00,75.00,15.00,4.00,78.95,0.00,0.00,-,0.00,100.00,6.98,66.67,0.736,9201,2022-03-1
1,"2022-03-24 - Stars 4, Hurricanes 3",Carolina Hurricanes,Limited ReportFull Report,64.983333,74.79,27.70,72.97,65.56,19.39,77.17,43.40,13.85,75.81,2.77,2.77,50.00,4.42,1.43,75.61,33.24,18.47,64.29,15.70,9.23,62.96,11.08,7.39,60.00,1.85,1.85,50.00,16.67,75.00,17.54,9.23,65.52,10.16,2.77,78.57,0.92,0.00,100.00,9.09,100.00,36.01,6.46,84.78,21.24,2.77,88.46,0.00,0.92,0.00,0.00,66.67,6.38,80.00,0.864,16421,2022-03-2
2,"2022-03-20 - Rangers 2, Hurricanes 0",Carolina Hurricanes,Limited ReportFull Report,60.000000,94.00,36.00,72.31,71.00,29.00,71.00,44.00,18.00,70.97,0.00,2.00,0.00,5.91,2.92,66.96,58.00,20.00,74.36,20.00,9.00,68.97,11.00,7.00,61.11,0.00,2.00,0.00,0.00,71.43,38.00,11.00,77.55,18.00,5.00,78.26,0.00,0.00,-,0.00,100.00,32.00,16.00,66.67,15.00,6.00,71.43,0.00,0.00,-,0.00,100.00,0.00,88.89,0.889,18680,2022-03-2
3,"2022-04-12 - Ducks 2, Panthers 3",Florida Panthers,Limited ReportFull Report,63.683333,90.45,34.86,72.18,70.66,30.15,70.09,51.82,22.61,69.62,2.83,1.88,60.00,5.01,1.69,74.78,52.76,17.90,74.67,20.73,5.65,78.57,11.31,4.71,70.59,0.94,0.94,50.00,8.33,80.00,32.03,12.25,72.34,20.73,6.60,75.86,0.00,0.94,0.00,0.00,85.71,32.03,14.13,69.39,17.90,10.36,63.33,1.88,0.00,100.00,10.53,100.00,5.45,91.67,0.971,16204,2022-04-1
4,"2022-01-18 - Canadiens 5, Stars 3",Dallas Stars,Limited ReportFull Report,60.000000,112.00,44.00,71.79,81.00,31.00,72.32,51.00,22.00,69.86,3.00,5.00,37.50,6.44,2.04,75.89,59.00,24.00,71.08,23.00,9.00,71.88,16.00,8.00,66.67,3.00,3.00,50.00,18.75,62.50,36.00,15.00,70.59,16.00,7.00,69.57,0.00,2.00,0.00,0.00,71.43,46.00,16.00,74.19,17.00,5.00,77.27,0.00,0.00,-,0.00,100.00,5.88,77.27,0.832,17679,2022-01-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1307,"2022-02-12 - Maple Leafs 2, Canucks 3",Vancouver Canucks,Limited ReportFull Report,60.000000,40.00,91.00,30.53,36.00,73.00,33.03,24.00,53.00,31.17,3.00,2.00,60.00,2.70,5.19,34.25,20.00,44.00,31.25,5.00,20.00,20.00,4.00,15.00,21.05,2.00,2.00,50.00,50.00,86.67,15.00,24.00,38.46,8.00,16.00,33.33,1.00,0.00,100.00,12.50,100.00,16.00,43.00,27.12,11.00,21.00,34.38,0.00,0.00,-,0.00,100.00,12.50,96.23,1.087,9396,2022-02-1
1308,"2022-03-18 - Panthers 3, Ducks 0",Anaheim Ducks,Limited ReportFull Report,60.000000,37.00,86.00,30.08,26.00,66.00,28.26,17.00,42.00,28.81,0.00,3.00,0.00,1.34,4.29,23.84,20.00,44.00,31.25,9.00,15.00,37.50,3.00,11.00,21.43,0.00,1.00,0.00,0.00,90.91,11.00,29.00,27.50,5.00,11.00,31.25,0.00,2.00,0.00,0.00,81.82,13.00,35.00,27.08,7.00,15.00,31.82,0.00,0.00,-,0.00,100.00,0.00,92.86,0.929,12296,2022-03-1
1309,"2021-12-18 - Oilers 5, Kraken 3",Seattle Kraken,Limited ReportFull Report,60.000000,30.00,73.00,29.13,25.00,55.00,31.25,17.00,41.00,29.31,3.00,5.00,37.50,1.64,4.15,28.35,13.00,32.00,28.89,7.00,13.00,35.00,7.00,9.00,43.75,1.00,4.00,20.00,14.29,55.56,6.00,19.00,24.00,3.00,10.00,23.08,1.00,0.00,100.00,33.33,100.00,15.00,35.00,30.00,5.00,21.00,19.23,1.00,1.00,50.00,20.00,95.24,17.65,87.80,1.055,17151,2021-12-1
1310,"2021-11-08 - Panthers 3, Rangers 4",New York Rangers,Limited ReportFull Report,60.000000,37.00,91.00,28.91,29.00,70.00,29.29,18.00,45.00,28.57,4.00,3.00,57.14,2.02,4.59,30.49,22.00,55.00,28.57,13.00,22.00,37.14,11.00,18.00,37.93,4.00,3.00,57.14,36.36,83.33,9.00,33.00,21.43,2.00,11.00,15.38,0.00,0.00,-,0.00,100.00,13.00,33.00,28.26,4.00,15.00,21.05,0.00,0.00,-,0.00,100.00,22.22,93.33,1.156,14877,2021-11-0


In [None]:
# Cols I need to add back in
# (none of these names are present in the game log's current column list)
cols_to_add = ['is_home_team', 'home_team_goals', 'home_team_won']

### EDA was wiped away, so have to circle back
### Luckily the data is pretty clean and most manipulation is derived stats to be imputed

In [None]:
# Check correlation of selected features
# NOTE(review): shots_df is not defined in the visible part of this notebook —
# confirm it is created upstream before running this cell.
fig, ax = plt.subplots(figsize=(20, 10))
cor = shots_df.corr()
sns.heatmap(cor, annot=True, cmap=plt.cm.Reds, ax=ax)
plt.show()

In [None]:
# Run Baseline Model with all available features in our dataset
# NOTE(review): assumes "all available features" means every numeric column of
# the game log — TODO confirm, and review which of these columns should be
# excluded before they are fed to a model.
all_features = home_game_log_2021.select_dtypes(include='number').columns.to_list()

In [None]:
# Logistic Regression w/ Grid Search
#
# Grid-searches a logistic regression on several seeded train/test splits of
# X and y and keeps the fit with the best hold-out accuracy, together with the
# split it was trained and scored on.
# NOTE(review): X and y are not defined in the visible part of this notebook —
# they must be created upstream before this cell runs.

# Parameters
c = [0.1, 1, 10, 100]
max_iter = [100, 1000]
solver = ['liblinear']
penalty = ['l1', 'l2']

logreg_params = {
    'logreg__C': c,
    'logreg__max_iter': max_iter,
    'logreg__solver': solver,
    'logreg__penalty': penalty
}

# One baseline fit plus this many challenger fits, each on its own seeded split
N_LOGREG_CHALLENGERS = 11
LOGREG_TEST_SIZE = 0.20


def _fit_logreg_grid_search(X, y, param_grid, random_state, test_size=LOGREG_TEST_SIZE):
    """Split the data, grid-search a logistic regression, and score the hold-out.

    Parameters
    ----------
    X, y
        Features and target, in any form accepted by ``train_test_split``.
    param_grid : dict
        Grid passed to ``GridSearchCV`` (keys prefixed with ``logreg__``).
    random_state : int
        Seed for the shuffled split, so every split can be reproduced.
    test_size : float
        Fraction of rows held out for scoring.

    Returns
    -------
    tuple
        ``(fitted GridSearchCV, hold-out score, (X_train, X_test, y_train, y_test))``.
    """
    split = train_test_split(X, y, test_size=test_size, shuffle=True, random_state=random_state)
    X_tr, X_te, y_tr, y_te = split
    pipe = Pipeline(steps=[("logreg", LogisticRegression(n_jobs=1))], verbose=False)
    grid_search = GridSearchCV(estimator=pipe, param_grid=param_grid, scoring='accuracy', cv=5, verbose=0)
    grid_search.fit(X_tr, y_tr)
    return grid_search, grid_search.score(X_te, y_te), tuple(split)


# Model: the baseline is fitted on its own explicit split, so the cell does not
# depend on X_train / y_train left over from an earlier run
logreg_top_model, logreg_top_score, (X_train, X_test, y_train, y_test) = _fit_logreg_grid_search(
    X, y, logreg_params, random_state=0
)

# Model Selection
# NOTE: the winner is picked by hold-out accuracy across different splits, so
# logreg_top_score is an optimistic estimate of out-of-sample accuracy.
for i in range(0, N_LOGREG_CHALLENGERS):
    print(f"Model {i}. Current Top Score: {logreg_top_score}")

    # Seeds 1..N keep every challenger split distinct from the baseline split (seed 0)
    logreg_cur_gs, logreg_cur_score, logreg_cur_split = _fit_logreg_grid_search(
        X, y, logreg_params, random_state=i + 1
    )

    # Comparing and Replacing Data: keep the winning split alongside the
    # winning model so later evaluation uses matching data
    if logreg_cur_score > logreg_top_score:
        logreg_top_model = logreg_cur_gs
        logreg_top_score = logreg_cur_score
        X_train, X_test, y_train, y_test = logreg_cur_split

# Displaying Data: predictions of the selected model on its own hold-out set
y_pred = logreg_top_model.predict(X_test)