# Predicting NHL Game Outcomes

## Sandbox Notebook

Working notebook 
When components (e.g. scraping functions) are working, move them to eventual production home

In [82]:
# Standard Packages
import pandas as pd
from pandas.testing import assert_frame_equal
import numpy as np
import requests
import re
import time
import os
import warnings

# Viz Packages
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline 

# Modeling Packages
## Modeling Prep
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, KFold, \
GridSearchCV, RandomizedSearchCV

## SKLearn Data Prep Modules
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, \
PolynomialFeatures, PowerTransformer, Normalizer, MaxAbsScaler

from sklearn.impute import SimpleImputer

## SKLearn Classification Models
from sklearn.linear_model import LogisticRegression, Ridge, Lasso, ElasticNet
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier,\
ExtraTreesClassifier, VotingClassifier, StackingRegressor

## SKLearn Pipeline Setup
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

## SKLearn Model Optimization
from sklearn.feature_selection import RFE, f_regression

# ## Boosting
# from xgboost import XGBRegressor
# from xgboost import XGBClassifier

## SKLearn Metrics
### Classification Scoring/Evaluation
from sklearn.metrics import classification_report, accuracy_score, recall_score, precision_score, f1_score, \
ConfusionMatrixDisplay, log_loss, confusion_matrix, RocCurveDisplay, make_scorer, roc_auc_score

In [83]:
# Notebook Config
from pprintpp import pprint as pp
from tqdm import tqdm
from io import StringIO

## Suppress Python Warnings (Future, Deprecation)
warnings.filterwarnings("ignore", category= FutureWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)

## Suppress Pandas Warnings (SettingWithCopy)
pd.options.mode.chained_assignment = None

## Pandas Display Config
pd.options.display.max_columns = None
pd.options.display.width = None

## Display SKLearn estimators as diagrams
from sklearn import set_config
set_config(display= 'diagram')

In [84]:
# Scraping Requirents
from bs4 import BeautifulSoup


## EDA

In [85]:
home_game_log_2021 = pd.read_csv('data/21_22-home-game-log.csv')
home_game_log_2021

Unnamed: 0,Game,Team,Unnamed: 2,TOI,CF/60,CA/60,CF%,FF/60,FA/60,FF%,SF/60,SA/60,SF%,GF/60,GA/60,GF%,xGF/60,xGA/60,xGF%,SCF/60,SCA/60,SCF%,HDCF/60,HDCA/60,HDCF%,HDSF/60,HDSA/60,HDSF%,HDGF/60,HDGA/60,HDGF%,HDSH%,HDSV%,MDCF/60,MDCA/60,MDCF%,MDSF/60,MDSA/60,MDSF%,MDGF/60,MDGA/60,MDGF%,MDSH%,MDSV%,LDCF/60,LDCA/60,LDCF%,LDSF/60,LDSA/60,LDSF%,LDGF/60,LDGA/60,LDGF%,LDSH%,LDSV%,SH%,SV%,PDO,Attendance
0,"2022-03-14 - Coyotes 5, Senators 3",Ottawa Senators,Limited ReportFull Report,60.000000,82.00,27.00,75.23,65.00,22.00,74.71,43.00,15.00,74.14,3.00,5.00,37.50,4.71,2.32,67.02,42.00,15.00,73.68,20.00,7.00,74.07,12.00,4.00,75.00,1.00,2.00,33.33,8.33,50.00,22.00,8.00,73.33,14.00,6.00,70.00,2.00,2.00,50.00,14.29,66.67,33.00,11.00,75.00,15.00,4.00,78.95,0.00,0.00,-,0.00,100.00,6.98,66.67,0.736,9201
1,"2022-03-24 - Stars 4, Hurricanes 3",Carolina Hurricanes,Limited ReportFull Report,64.983333,74.79,27.70,72.97,65.56,19.39,77.17,43.40,13.85,75.81,2.77,2.77,50.00,4.42,1.43,75.61,33.24,18.47,64.29,15.70,9.23,62.96,11.08,7.39,60.00,1.85,1.85,50.00,16.67,75.00,17.54,9.23,65.52,10.16,2.77,78.57,0.92,0.00,100.00,9.09,100.00,36.01,6.46,84.78,21.24,2.77,88.46,0.00,0.92,0.00,0.00,66.67,6.38,80.00,0.864,16421
2,"2022-03-20 - Rangers 2, Hurricanes 0",Carolina Hurricanes,Limited ReportFull Report,60.000000,94.00,36.00,72.31,71.00,29.00,71.00,44.00,18.00,70.97,0.00,2.00,0.00,5.91,2.92,66.96,58.00,20.00,74.36,20.00,9.00,68.97,11.00,7.00,61.11,0.00,2.00,0.00,0.00,71.43,38.00,11.00,77.55,18.00,5.00,78.26,0.00,0.00,-,0.00,100.00,32.00,16.00,66.67,15.00,6.00,71.43,0.00,0.00,-,0.00,100.00,0.00,88.89,0.889,18680
3,"2022-04-12 - Ducks 2, Panthers 3",Florida Panthers,Limited ReportFull Report,63.683333,90.45,34.86,72.18,70.66,30.15,70.09,51.82,22.61,69.62,2.83,1.88,60.00,5.01,1.69,74.78,52.76,17.90,74.67,20.73,5.65,78.57,11.31,4.71,70.59,0.94,0.94,50.00,8.33,80.00,32.03,12.25,72.34,20.73,6.60,75.86,0.00,0.94,0.00,0.00,85.71,32.03,14.13,69.39,17.90,10.36,63.33,1.88,0.00,100.00,10.53,100.00,5.45,91.67,0.971,16204
4,"2022-01-18 - Canadiens 5, Stars 3",Dallas Stars,Limited ReportFull Report,60.000000,112.00,44.00,71.79,81.00,31.00,72.32,51.00,22.00,69.86,3.00,5.00,37.50,6.44,2.04,75.89,59.00,24.00,71.08,23.00,9.00,71.88,16.00,8.00,66.67,3.00,3.00,50.00,18.75,62.50,36.00,15.00,70.59,16.00,7.00,69.57,0.00,2.00,0.00,0.00,71.43,46.00,16.00,74.19,17.00,5.00,77.27,0.00,0.00,-,0.00,100.00,5.88,77.27,0.832,17679
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1307,"2022-02-12 - Maple Leafs 2, Canucks 3",Vancouver Canucks,Limited ReportFull Report,60.000000,40.00,91.00,30.53,36.00,73.00,33.03,24.00,53.00,31.17,3.00,2.00,60.00,2.70,5.19,34.25,20.00,44.00,31.25,5.00,20.00,20.00,4.00,15.00,21.05,2.00,2.00,50.00,50.00,86.67,15.00,24.00,38.46,8.00,16.00,33.33,1.00,0.00,100.00,12.50,100.00,16.00,43.00,27.12,11.00,21.00,34.38,0.00,0.00,-,0.00,100.00,12.50,96.23,1.087,9396
1308,"2022-03-18 - Panthers 3, Ducks 0",Anaheim Ducks,Limited ReportFull Report,60.000000,37.00,86.00,30.08,26.00,66.00,28.26,17.00,42.00,28.81,0.00,3.00,0.00,1.34,4.29,23.84,20.00,44.00,31.25,9.00,15.00,37.50,3.00,11.00,21.43,0.00,1.00,0.00,0.00,90.91,11.00,29.00,27.50,5.00,11.00,31.25,0.00,2.00,0.00,0.00,81.82,13.00,35.00,27.08,7.00,15.00,31.82,0.00,0.00,-,0.00,100.00,0.00,92.86,0.929,12296
1309,"2021-12-18 - Oilers 5, Kraken 3",Seattle Kraken,Limited ReportFull Report,60.000000,30.00,73.00,29.13,25.00,55.00,31.25,17.00,41.00,29.31,3.00,5.00,37.50,1.64,4.15,28.35,13.00,32.00,28.89,7.00,13.00,35.00,7.00,9.00,43.75,1.00,4.00,20.00,14.29,55.56,6.00,19.00,24.00,3.00,10.00,23.08,1.00,0.00,100.00,33.33,100.00,15.00,35.00,30.00,5.00,21.00,19.23,1.00,1.00,50.00,20.00,95.24,17.65,87.80,1.055,17151
1310,"2021-11-08 - Panthers 3, Rangers 4",New York Rangers,Limited ReportFull Report,60.000000,37.00,91.00,28.91,29.00,70.00,29.29,18.00,45.00,28.57,4.00,3.00,57.14,2.02,4.59,30.49,22.00,55.00,28.57,13.00,22.00,37.14,11.00,18.00,37.93,4.00,3.00,57.14,36.36,83.33,9.00,33.00,21.43,2.00,11.00,15.38,0.00,0.00,-,0.00,100.00,13.00,33.00,28.26,4.00,15.00,21.05,0.00,0.00,-,0.00,100.00,22.22,93.33,1.156,14877


In [86]:
home_game_log_2021.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1312 entries, 0 to 1311
Data columns (total 59 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Game        1312 non-null   object 
 1   Team        1312 non-null   object 
 2   Unnamed: 2  1312 non-null   object 
 3   TOI         1312 non-null   float64
 4   CF/60       1312 non-null   float64
 5   CA/60       1312 non-null   float64
 6   CF%         1312 non-null   float64
 7   FF/60       1312 non-null   float64
 8   FA/60       1312 non-null   float64
 9   FF%         1312 non-null   float64
 10  SF/60       1312 non-null   float64
 11  SA/60       1312 non-null   float64
 12  SF%         1312 non-null   float64
 13  GF/60       1312 non-null   float64
 14  GA/60       1312 non-null   float64
 15  GF%         1312 non-null   object 
 16  xGF/60      1312 non-null   float64
 17  xGA/60      1312 non-null   float64
 18  xGF%        1312 non-null   float64
 19  SCF/60      1312 non-null  

### Top line feature (column) deletion and addition

Deleting:
- 'Unnamed: 2' held hyperlinks to related pages
- 'C' type (Corsi) shot stats 
  - we will be using 'F' type (Fenwick) shots (goals, on net and misses, but excludes blocks)
- 'SC' type (Scoring Chance) vars
    - we will be focusing on HD type

In [87]:
# Store column names in list for easier manipulation
game_log_cols = home_game_log_2021.columns.to_list()
print([col for col in game_log_cols])

['Game', 'Team', 'Unnamed: 2', 'TOI', 'CF/60', 'CA/60', 'CF%', 'FF/60', 'FA/60', 'FF%', 'SF/60', 'SA/60', 'SF%', 'GF/60', 'GA/60', 'GF%', 'xGF/60', 'xGA/60', 'xGF%', 'SCF/60', 'SCA/60', 'SCF%', 'HDCF/60', 'HDCA/60', 'HDCF%', 'HDSF/60', 'HDSA/60', 'HDSF%', 'HDGF/60', 'HDGA/60', 'HDGF%', 'HDSH%', 'HDSV%', 'MDCF/60', 'MDCA/60', 'MDCF%', 'MDSF/60', 'MDSA/60', 'MDSF%', 'MDGF/60', 'MDGA/60', 'MDGF%', 'MDSH%', 'MDSV%', 'LDCF/60', 'LDCA/60', 'LDCF%', 'LDSF/60', 'LDSA/60', 'LDSF%', 'LDGF/60', 'LDGA/60', 'LDGF%', 'LDSH%', 'LDSV%', 'SH%', 'SV%', 'PDO', 'Attendance']


In [88]:
# import data was filtered -  need to check validity of a few
# PDO - only reported for home team in this version of the data

In [89]:
# Columns to immediately drop
drop_on_load = ['Unnamed: 2']

In [91]:
# Cols I need to add back in
cols_to_add = ['is_home_team', 'home_team_goals', 'home_team_won']

## EDA V2

In [93]:
home_df = pd.read_csv('data/home-19-22-5v5-adjusted.csv')
away_df = pd.read_csv('data/away-19-22-5v5-adjusted.csv')
print(home_df.shape, away_df.shape)

(3262, 33) (3262, 33)


In [94]:
# pd.testing.assert_frame_equal(home_df, away_df, check_dtype=False , check_index_type=False, 
#                               check_frame_type=False, check_names=False, 
#                               check_column_type=False, check_categorical=False)

# # 

In [95]:
home_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3262 entries, 0 to 3261
Data columns (total 33 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Game        3262 non-null   object 
 1   Team        3262 non-null   object 
 2   Unnamed: 2  3262 non-null   object 
 3   TOI         3262 non-null   float64
 4   CF/60       3262 non-null   float64
 5   CA/60       3262 non-null   float64
 6   CF%         3262 non-null   float64
 7   FF/60       3262 non-null   float64
 8   FA/60       3262 non-null   float64
 9   FF%         3262 non-null   float64
 10  SF/60       3262 non-null   float64
 11  SA/60       3262 non-null   float64
 12  SF%         3262 non-null   float64
 13  GF/60       3262 non-null   float64
 14  GA/60       3262 non-null   float64
 15  GF%         3262 non-null   object 
 16  xGF/60      3262 non-null   float64
 17  xGA/60      3262 non-null   float64
 18  xGF%        3262 non-null   float64
 19  SCF/60      3262 non-null  

In [96]:
away_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3262 entries, 0 to 3261
Data columns (total 33 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Game        3262 non-null   object 
 1   Team        3262 non-null   object 
 2   Unnamed: 2  3262 non-null   object 
 3   TOI         3262 non-null   float64
 4   CF/60       3262 non-null   float64
 5   CA/60       3262 non-null   float64
 6   CF%         3262 non-null   float64
 7   FF/60       3262 non-null   float64
 8   FA/60       3262 non-null   float64
 9   FF%         3262 non-null   float64
 10  SF/60       3262 non-null   float64
 11  SA/60       3262 non-null   float64
 12  SF%         3262 non-null   float64
 13  GF/60       3262 non-null   float64
 14  GA/60       3262 non-null   float64
 15  GF%         3262 non-null   object 
 16  xGF/60      3262 non-null   float64
 17  xGA/60      3262 non-null   float64
 18  xGF%        3262 non-null   float64
 19  SCF/60      3262 non-null  

In [97]:
home_df.index

RangeIndex(start=0, stop=3262, step=1)

In [98]:
# Store column names in list for easier manipulation
home_cols = home_df.columns.to_list()
print([col for col in home_cols])

['Game', 'Team', 'Unnamed: 2', 'TOI', 'CF/60', 'CA/60', 'CF%', 'FF/60', 'FA/60', 'FF%', 'SF/60', 'SA/60', 'SF%', 'GF/60', 'GA/60', 'GF%', 'xGF/60', 'xGA/60', 'xGF%', 'SCF/60', 'SCA/60', 'SCF%', 'HDCF/60', 'HDCA/60', 'HDCF%', 'HDGF/60', 'HDGA/60', 'HDGF%', 'HDSH%', 'HDSV%', 'SH%', 'SV%', 'PDO']


In [99]:
# check colums are equal now that names are issolated
away_cols = away_df.columns.to_list()
assert home_cols == away_cols

In [100]:
cols_to_drop = ['Unnamed: 2', 'CF/60', 'CA/60', 'CF%', 'SF/60', 'SA/60', 'SF%', 'GF%',
                'SCF/60', 'SCA/60', 'SCF%', 'HDGF/60', 'HDGA/60', 'HDGF%', 'HDSH%', 'HDSV%']

# cols to engineer in future
## standing points gained to date and for several moving averages (5 , 10, 20) - explain hot and cold streaks?

In [101]:
away_df

Unnamed: 0,Game,Team,Unnamed: 2,TOI,CF/60,CA/60,CF%,FF/60,FA/60,FF%,SF/60,SA/60,SF%,GF/60,GA/60,GF%,xGF/60,xGA/60,xGF%,SCF/60,SCA/60,SCF%,HDCF/60,HDCA/60,HDCF%,HDGF/60,HDGA/60,HDGF%,HDSH%,HDSV%,SH%,SV%,PDO
0,"2019-10-02 - Senators 3, Maple Leafs 5",Ottawa Senators,Limited ReportFull Report,44.133333,54.02,78.00,40.92,43.75,59.15,42.52,29.01,43.23,40.16,4.18,5.16,44.73,2.22,3.44,39.27,19.18,34.03,36.04,8.32,15.74,34.58,4.20,5.12,45.08,75.21,44.01,14.40,88.06,1.025
1,"2019-10-02 - Capitals 3, Blues 2",Washington Capitals,Limited ReportFull Report,50.866667,47.21,33.43,58.55,39.14,27.17,59.03,28.43,20.01,58.69,1.20,1.11,51.81,2.26,1.35,62.52,23.09,13.76,62.66,6.14,5.59,52.34,0.00,1.11,0.00,0.00,80.36,4.21,94.44,0.986
2,"2019-10-02 - Canucks 2, Oilers 3",Vancouver Canucks,Limited ReportFull Report,47.066667,67.92,44.76,60.28,48.03,30.38,61.25,30.41,24.10,55.79,2.64,3.58,42.39,1.99,1.71,53.76,31.14,19.81,61.12,9.11,6.04,60.13,1.35,2.40,35.98,34.15,60.71,8.67,85.13,0.938
3,"2019-10-02 - Sharks 1, Golden Knights 4",San Jose Sharks,Limited ReportFull Report,45.666667,40.18,62.73,39.04,30.07,51.94,36.66,19.82,32.21,38.10,1.33,2.60,33.91,1.40,3.84,26.72,8.95,32.63,21.52,5.27,16.02,24.75,0.00,1.30,0.00,0.00,87.94,6.72,91.93,0.987
4,"2019-10-03 - Panthers 2, Lightning 5",Florida Panthers,Limited ReportFull Report,45.500000,57.60,46.81,55.17,46.56,32.45,58.93,37.57,25.64,59.43,1.33,2.55,34.36,1.69,1.86,47.57,22.62,19.77,53.36,9.26,11.77,44.02,1.33,1.31,50.51,16.98,80.19,3.55,90.06,0.936
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3257,"2022-04-29 - Avalanche 1, Wild 4",Colorado Avalanche,Limited ReportFull Report,39.400000,57.96,37.86,60.49,46.61,30.65,60.33,34.01,22.45,60.24,1.53,4.44,25.61,1.95,1.95,49.95,27.82,17.22,61.77,5.99,7.71,43.70,0.00,2.98,0.00,0.00,52.17,4.50,80.21,0.847
3258,"2022-04-29 - Flames 1, Jets 3",Calgary Flames,Limited ReportFull Report,47.916667,57.16,77.20,42.54,45.89,59.43,43.57,33.25,43.77,43.17,1.32,2.34,36.13,2.44,4.08,37.48,20.62,43.01,32.41,9.15,18.27,33.38,1.33,1.18,52.92,16.92,92.14,3.98,94.66,0.986
3259,"2022-04-29 - Predators 4, Coyotes 5",Nashville Predators,Limited ReportFull Report,48.750000,74.72,41.64,64.22,58.08,31.75,64.66,32.01,22.81,58.40,5.25,5.73,47.82,3.01,1.60,65.22,36.52,18.43,66.46,13.88,4.30,76.33,3.99,2.26,63.85,48.16,47.48,16.40,74.89,0.913
3260,"2022-04-29 - Sharks 0, Kraken 3",San Jose Sharks,Limited ReportFull Report,52.416667,32.76,64.92,33.54,26.93,48.21,35.84,21.61,32.41,40.00,0.00,2.21,0.00,1.42,2.42,37.02,12.21,29.28,29.44,4.54,11.58,28.18,0.00,0.00,-,0.00,100.00,0.00,93.18,0.932


In [102]:
home_df = home_df.drop(columns= cols_to_drop, axis=1)
home_df

Unnamed: 0,Game,Team,TOI,FF/60,FA/60,FF%,GF/60,GA/60,xGF/60,xGA/60,xGF%,HDCF/60,HDCA/60,HDCF%,SH%,SV%,PDO
0,"2019-10-02 - Senators 3, Maple Leafs 5",Toronto Maple Leafs,44.133333,59.15,43.75,57.48,5.16,4.18,3.44,2.22,60.73,15.74,8.32,65.42,11.94,85.60,0.975
1,"2019-10-02 - Capitals 3, Blues 2",St Louis Blues,50.866667,27.17,39.14,40.97,1.11,1.20,1.35,2.26,37.48,5.59,6.14,47.66,5.56,95.79,1.014
2,"2019-10-02 - Canucks 2, Oilers 3",Edmonton Oilers,47.066667,30.38,48.03,38.75,3.58,2.64,1.71,1.99,46.24,6.04,9.11,39.87,14.87,91.33,1.062
3,"2019-10-02 - Sharks 1, Golden Knights 4",Vegas Golden Knights,45.666667,51.94,30.07,63.34,2.60,1.33,3.84,1.40,73.28,16.02,5.27,75.25,8.07,93.28,1.013
4,"2019-10-03 - Panthers 2, Lightning 5",Tampa Bay Lightning,45.500000,32.45,46.56,41.07,2.55,1.33,1.86,1.69,52.43,11.77,9.26,55.98,9.94,96.45,1.064
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3257,"2022-04-29 - Avalanche 1, Wild 4",Minnesota Wild,39.400000,30.65,46.61,39.67,4.44,1.53,1.95,1.95,50.05,7.71,5.99,56.30,19.79,95.50,1.153
3258,"2022-04-29 - Flames 1, Jets 3",Winnipeg Jets,47.916667,59.43,45.89,56.43,2.34,1.32,4.08,2.44,62.52,18.27,9.15,66.62,5.34,96.02,1.014
3259,"2022-04-29 - Predators 4, Coyotes 5",Arizona Coyotes,48.750000,31.75,58.08,35.34,5.73,5.25,1.60,3.01,34.78,4.30,13.88,23.67,25.11,83.60,1.087
3260,"2022-04-29 - Sharks 0, Kraken 3",Seattle Kraken,52.416667,48.21,26.93,64.16,2.21,0.00,2.42,1.42,62.98,11.58,4.54,71.82,6.82,100.00,1.068


In [103]:
home_df.loc[home_df['TOI'] == 60]

Unnamed: 0,Game,Team,TOI,FF/60,FA/60,FF%,GF/60,GA/60,xGF/60,xGA/60,xGF%,HDCF/60,HDCA/60,HDCF%,SH%,SV%,PDO
1369,"2021-02-25 - Flames 1, Senators 6",Ottawa Senators,60.0,45.74,38.83,54.09,5.9,1.01,1.89,1.54,55.16,11.09,4.93,69.23,18.02,96.31,1.143


In [104]:
home_df['TOI'].equals(away_df['TOI'])

True

In [105]:
# drop same cols from away_df
away_df = away_df.drop(columns= cols_to_drop, axis=1)
assert away_df.shape  == home_df.shape

In [106]:
# extract date from game field
home_df['date'] = home_df['Game'].str[:11]
away_df['date'] = away_df['Game'].str[:11]
# change 'Team' field to 'isHomeTeam'
home_df['isHomeTeam'] = home_df['Team']
away_df['isAwayTeam'] = away_df['Team']

In [107]:
home_df

Unnamed: 0,Game,Team,TOI,FF/60,FA/60,FF%,GF/60,GA/60,xGF/60,xGA/60,xGF%,HDCF/60,HDCA/60,HDCF%,SH%,SV%,PDO,date,isHomeTeam
0,"2019-10-02 - Senators 3, Maple Leafs 5",Toronto Maple Leafs,44.133333,59.15,43.75,57.48,5.16,4.18,3.44,2.22,60.73,15.74,8.32,65.42,11.94,85.60,0.975,2019-10-02,Toronto Maple Leafs
1,"2019-10-02 - Capitals 3, Blues 2",St Louis Blues,50.866667,27.17,39.14,40.97,1.11,1.20,1.35,2.26,37.48,5.59,6.14,47.66,5.56,95.79,1.014,2019-10-02,St Louis Blues
2,"2019-10-02 - Canucks 2, Oilers 3",Edmonton Oilers,47.066667,30.38,48.03,38.75,3.58,2.64,1.71,1.99,46.24,6.04,9.11,39.87,14.87,91.33,1.062,2019-10-02,Edmonton Oilers
3,"2019-10-02 - Sharks 1, Golden Knights 4",Vegas Golden Knights,45.666667,51.94,30.07,63.34,2.60,1.33,3.84,1.40,73.28,16.02,5.27,75.25,8.07,93.28,1.013,2019-10-02,Vegas Golden Knights
4,"2019-10-03 - Panthers 2, Lightning 5",Tampa Bay Lightning,45.500000,32.45,46.56,41.07,2.55,1.33,1.86,1.69,52.43,11.77,9.26,55.98,9.94,96.45,1.064,2019-10-03,Tampa Bay Lightning
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3257,"2022-04-29 - Avalanche 1, Wild 4",Minnesota Wild,39.400000,30.65,46.61,39.67,4.44,1.53,1.95,1.95,50.05,7.71,5.99,56.30,19.79,95.50,1.153,2022-04-29,Minnesota Wild
3258,"2022-04-29 - Flames 1, Jets 3",Winnipeg Jets,47.916667,59.43,45.89,56.43,2.34,1.32,4.08,2.44,62.52,18.27,9.15,66.62,5.34,96.02,1.014,2022-04-29,Winnipeg Jets
3259,"2022-04-29 - Predators 4, Coyotes 5",Arizona Coyotes,48.750000,31.75,58.08,35.34,5.73,5.25,1.60,3.01,34.78,4.30,13.88,23.67,25.11,83.60,1.087,2022-04-29,Arizona Coyotes
3260,"2022-04-29 - Sharks 0, Kraken 3",Seattle Kraken,52.416667,48.21,26.93,64.16,2.21,0.00,2.42,1.42,62.98,11.58,4.54,71.82,6.82,100.00,1.068,2022-04-29,Seattle Kraken


In [108]:
home_df['Game'][0][-1]

'5'

In [109]:
home_df['home_score'] = home_df['Game'][0][-1]

In [110]:
home_df

Unnamed: 0,Game,Team,TOI,FF/60,FA/60,FF%,GF/60,GA/60,xGF/60,xGA/60,xGF%,HDCF/60,HDCA/60,HDCF%,SH%,SV%,PDO,date,isHomeTeam,home_score
0,"2019-10-02 - Senators 3, Maple Leafs 5",Toronto Maple Leafs,44.133333,59.15,43.75,57.48,5.16,4.18,3.44,2.22,60.73,15.74,8.32,65.42,11.94,85.60,0.975,2019-10-02,Toronto Maple Leafs,5
1,"2019-10-02 - Capitals 3, Blues 2",St Louis Blues,50.866667,27.17,39.14,40.97,1.11,1.20,1.35,2.26,37.48,5.59,6.14,47.66,5.56,95.79,1.014,2019-10-02,St Louis Blues,5
2,"2019-10-02 - Canucks 2, Oilers 3",Edmonton Oilers,47.066667,30.38,48.03,38.75,3.58,2.64,1.71,1.99,46.24,6.04,9.11,39.87,14.87,91.33,1.062,2019-10-02,Edmonton Oilers,5
3,"2019-10-02 - Sharks 1, Golden Knights 4",Vegas Golden Knights,45.666667,51.94,30.07,63.34,2.60,1.33,3.84,1.40,73.28,16.02,5.27,75.25,8.07,93.28,1.013,2019-10-02,Vegas Golden Knights,5
4,"2019-10-03 - Panthers 2, Lightning 5",Tampa Bay Lightning,45.500000,32.45,46.56,41.07,2.55,1.33,1.86,1.69,52.43,11.77,9.26,55.98,9.94,96.45,1.064,2019-10-03,Tampa Bay Lightning,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3257,"2022-04-29 - Avalanche 1, Wild 4",Minnesota Wild,39.400000,30.65,46.61,39.67,4.44,1.53,1.95,1.95,50.05,7.71,5.99,56.30,19.79,95.50,1.153,2022-04-29,Minnesota Wild,5
3258,"2022-04-29 - Flames 1, Jets 3",Winnipeg Jets,47.916667,59.43,45.89,56.43,2.34,1.32,4.08,2.44,62.52,18.27,9.15,66.62,5.34,96.02,1.014,2022-04-29,Winnipeg Jets,5
3259,"2022-04-29 - Predators 4, Coyotes 5",Arizona Coyotes,48.750000,31.75,58.08,35.34,5.73,5.25,1.60,3.01,34.78,4.30,13.88,23.67,25.11,83.60,1.087,2022-04-29,Arizona Coyotes,5
3260,"2022-04-29 - Sharks 0, Kraken 3",Seattle Kraken,52.416667,48.21,26.93,64.16,2.21,0.00,2.42,1.42,62.98,11.58,4.54,71.82,6.82,100.00,1.068,2022-04-29,Seattle Kraken,5


In [111]:
# Custom function to extract the single character before a comma
def extract_num_before_comma(s):
    pattern = r"(?<=\d)(?=,)"
    matches = [s[m.start() - 1] for m in re.finditer(pattern, s)]
    return int(matches[0]) if matches else None

home_df['away_score'] = home_df['Game'].apply(extract_num_before_comma)

In [112]:
home_df.dtypes

Game           object
Team           object
TOI           float64
FF/60         float64
FA/60         float64
FF%           float64
GF/60         float64
GA/60         float64
xGF/60        float64
xGA/60        float64
xGF%          float64
HDCF/60       float64
HDCA/60       float64
HDCF%         float64
SH%           float64
SV%           float64
PDO           float64
date           object
isHomeTeam     object
home_score     object
away_score      int64
dtype: object

In [113]:
home_df['date'] = pd.to_datetime(home_df['date'])
home_df['home_score'] = home_df['home_score'].astype(int)

In [114]:
home_df

Unnamed: 0,Game,Team,TOI,FF/60,FA/60,FF%,GF/60,GA/60,xGF/60,xGA/60,xGF%,HDCF/60,HDCA/60,HDCF%,SH%,SV%,PDO,date,isHomeTeam,home_score,away_score
0,"2019-10-02 - Senators 3, Maple Leafs 5",Toronto Maple Leafs,44.133333,59.15,43.75,57.48,5.16,4.18,3.44,2.22,60.73,15.74,8.32,65.42,11.94,85.60,0.975,2019-10-02,Toronto Maple Leafs,5,3
1,"2019-10-02 - Capitals 3, Blues 2",St Louis Blues,50.866667,27.17,39.14,40.97,1.11,1.20,1.35,2.26,37.48,5.59,6.14,47.66,5.56,95.79,1.014,2019-10-02,St Louis Blues,5,3
2,"2019-10-02 - Canucks 2, Oilers 3",Edmonton Oilers,47.066667,30.38,48.03,38.75,3.58,2.64,1.71,1.99,46.24,6.04,9.11,39.87,14.87,91.33,1.062,2019-10-02,Edmonton Oilers,5,2
3,"2019-10-02 - Sharks 1, Golden Knights 4",Vegas Golden Knights,45.666667,51.94,30.07,63.34,2.60,1.33,3.84,1.40,73.28,16.02,5.27,75.25,8.07,93.28,1.013,2019-10-02,Vegas Golden Knights,5,1
4,"2019-10-03 - Panthers 2, Lightning 5",Tampa Bay Lightning,45.500000,32.45,46.56,41.07,2.55,1.33,1.86,1.69,52.43,11.77,9.26,55.98,9.94,96.45,1.064,2019-10-03,Tampa Bay Lightning,5,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3257,"2022-04-29 - Avalanche 1, Wild 4",Minnesota Wild,39.400000,30.65,46.61,39.67,4.44,1.53,1.95,1.95,50.05,7.71,5.99,56.30,19.79,95.50,1.153,2022-04-29,Minnesota Wild,5,1
3258,"2022-04-29 - Flames 1, Jets 3",Winnipeg Jets,47.916667,59.43,45.89,56.43,2.34,1.32,4.08,2.44,62.52,18.27,9.15,66.62,5.34,96.02,1.014,2022-04-29,Winnipeg Jets,5,1
3259,"2022-04-29 - Predators 4, Coyotes 5",Arizona Coyotes,48.750000,31.75,58.08,35.34,5.73,5.25,1.60,3.01,34.78,4.30,13.88,23.67,25.11,83.60,1.087,2022-04-29,Arizona Coyotes,5,4
3260,"2022-04-29 - Sharks 0, Kraken 3",Seattle Kraken,52.416667,48.21,26.93,64.16,2.21,0.00,2.42,1.42,62.98,11.58,4.54,71.82,6.82,100.00,1.068,2022-04-29,Seattle Kraken,5,0


In [74]:
home_df['home_win'] = ((home_df['home_score'] > home_df['away_score'])).astype(int)
home_df['home_goal_dif'] = home_df['home_score'] - home_df['away_score']
home_df

Unnamed: 0,Game,Team,TOI,FF/60,FA/60,FF%,GF/60,GA/60,xGF/60,xGA/60,xGF%,HDCF/60,HDCA/60,HDCF%,SH%,SV%,PDO,date,isHomeTeam,home_score,away_score,home_win,home_goal_dif
0,"2019-10-02 - Senators 3, Maple Leafs 5",Toronto Maple Leafs,44.133333,59.15,43.75,57.48,5.16,4.18,3.44,2.22,60.73,15.74,8.32,65.42,11.94,85.60,0.975,2019-10-02,Toronto Maple Leafs,5,3,1,2
1,"2019-10-02 - Capitals 3, Blues 2",St Louis Blues,50.866667,27.17,39.14,40.97,1.11,1.20,1.35,2.26,37.48,5.59,6.14,47.66,5.56,95.79,1.014,2019-10-02,St Louis Blues,5,3,1,2
2,"2019-10-02 - Canucks 2, Oilers 3",Edmonton Oilers,47.066667,30.38,48.03,38.75,3.58,2.64,1.71,1.99,46.24,6.04,9.11,39.87,14.87,91.33,1.062,2019-10-02,Edmonton Oilers,5,2,1,3
3,"2019-10-02 - Sharks 1, Golden Knights 4",Vegas Golden Knights,45.666667,51.94,30.07,63.34,2.60,1.33,3.84,1.40,73.28,16.02,5.27,75.25,8.07,93.28,1.013,2019-10-02,Vegas Golden Knights,5,1,1,4
4,"2019-10-03 - Panthers 2, Lightning 5",Tampa Bay Lightning,45.500000,32.45,46.56,41.07,2.55,1.33,1.86,1.69,52.43,11.77,9.26,55.98,9.94,96.45,1.064,2019-10-03,Tampa Bay Lightning,5,2,1,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3257,"2022-04-29 - Avalanche 1, Wild 4",Minnesota Wild,39.400000,30.65,46.61,39.67,4.44,1.53,1.95,1.95,50.05,7.71,5.99,56.30,19.79,95.50,1.153,2022-04-29,Minnesota Wild,5,1,1,4
3258,"2022-04-29 - Flames 1, Jets 3",Winnipeg Jets,47.916667,59.43,45.89,56.43,2.34,1.32,4.08,2.44,62.52,18.27,9.15,66.62,5.34,96.02,1.014,2022-04-29,Winnipeg Jets,5,1,1,4
3259,"2022-04-29 - Predators 4, Coyotes 5",Arizona Coyotes,48.750000,31.75,58.08,35.34,5.73,5.25,1.60,3.01,34.78,4.30,13.88,23.67,25.11,83.60,1.087,2022-04-29,Arizona Coyotes,5,4,1,1
3260,"2022-04-29 - Sharks 0, Kraken 3",Seattle Kraken,52.416667,48.21,26.93,64.16,2.21,0.00,2.42,1.42,62.98,11.58,4.54,71.82,6.82,100.00,1.068,2022-04-29,Seattle Kraken,5,0,1,5


In [75]:
len(home_df.loc[home_df['home_win'] == 1])

2677

In [76]:
tie_check = home_df[home_df['home_score'] == home_df['away_score']]
tie_check


Unnamed: 0,Game,Team,TOI,FF/60,FA/60,FF%,GF/60,GA/60,xGF/60,xGA/60,xGF%,HDCF/60,HDCA/60,HDCF%,SH%,SV%,PDO,date,isHomeTeam,home_score,away_score,home_win,home_goal_dif
12,"2019-10-04 - Jets 5, Devils 4",New Jersey Devils,52.000000,46.84,39.32,54.36,4.52,4.65,2.81,1.80,60.90,13.78,8.22,62.63,12.85,84.12,0.970,2019-10-04,New Jersey Devils,5,5,0,0
16,"2019-10-04 - Golden Knights 5, Sharks 1",San Jose Sharks,45.233333,38.33,38.53,49.87,1.22,4.27,2.58,3.26,44.17,11.12,19.35,36.49,4.57,86.22,0.908,2019-10-04,San Jose Sharks,5,5,0,0
24,"2019-10-05 - Red Wings 5, Predators 3",Nashville Predators,51.433333,55.46,37.53,59.64,2.16,4.95,2.26,1.26,64.07,6.19,2.55,70.87,5.15,79.12,0.843,2019-10-05,Nashville Predators,5,5,0,0
28,"2019-10-05 - Kings 5, Oilers 6",Edmonton Oilers,44.083333,33.43,40.55,45.19,3.80,5.69,2.00,2.74,42.16,7.49,14.71,33.75,13.99,82.23,0.962,2019-10-05,Edmonton Oilers,5,5,0,0
36,"2019-10-08 - Oilers 5, Islanders 2",New York Islanders,47.816667,36.06,38.27,48.51,1.15,3.94,1.26,1.62,43.80,5.38,6.97,43.54,4.19,82.95,0.871,2019-10-08,New York Islanders,5,5,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3228,"2022-04-26 - Coyotes 5, Wild 3",Minnesota Wild,37.800000,52.02,51.20,50.40,4.47,4.99,2.70,3.28,45.15,14.76,18.09,44.94,10.76,83.67,0.944,2022-04-26,Minnesota Wild,5,5,0,0
3231,"2022-04-26 - Ducks 5, Sharks 2",San Jose Sharks,51.550000,45.78,45.40,50.21,1.11,3.73,2.31,2.52,47.81,12.15,15.79,43.48,3.78,90.03,0.938,2022-04-26,San Jose Sharks,5,5,0,0
3236,"2022-04-27 - Kings 5, Kraken 3",Seattle Kraken,47.150000,63.04,45.94,57.85,3.66,5.21,3.16,2.46,56.24,15.60,10.58,59.58,8.03,83.50,0.915,2022-04-27,Seattle Kraken,5,5,0,0
3242,"2022-04-28 - Predators 5, Avalanche 4",Colorado Avalanche,42.733333,61.83,50.49,55.05,4.16,4.27,3.91,3.46,53.06,16.86,12.98,56.50,8.50,89.88,0.984,2022-04-28,Colorado Avalanche,5,5,0,0


['']

In [None]:
x

### EDA was wiped away, so have to circle back
### Luckily the data is pretty clean and most manipulation is derived stats to be imputed

In [None]:
# Check correlation of selected features
plt.figure(figsize=(20,10))
cor = shots_df.corr()
sns.heatmap(cor, annot=True, cmap=plt.cm.Reds)
plt.show()

In [None]:
# Run Baseline Model with all available features in our dataset
all_features = 

In [None]:
# Logistic Regression w/ Grid Search

# Parameters
c = [0.1, 1, 10, 100]
max_iter = [100, 1000]
solver = ['liblinear']
penalty = ['l1', 'l2']

logreg_params = {
    'logreg__C': c,
    'logreg__max_iter': max_iter,
    'logreg__solver': solver,
    'logreg__penalty': penalty
}

# Model
logreg_top_model_pipe = Pipeline(steps=[("logreg", LogisticRegression(n_jobs=1))], verbose=False)
logreg_top_model = GridSearchCV(estimator=logreg_top_model_pipe, param_grid=logreg_params, scoring='accuracy', cv=5, verbose=0)
logreg_top_model.fit(X_train, y_train)

# Displaying Data
y_pred = logreg_top_model.predict(X_test)
logreg_top_score = logreg_top_model.score(X_test, y_test)

# Model Selection
for i in range(0, 11):
    print(f"Model {i}. Current Top Score: {logreg_top_score}")
        
    # Split X and y
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, shuffle=True)
    
    # Model Building
    logreg_cur_pipe = Pipeline(steps=[("logreg", LogisticRegression(n_jobs=1))], verbose=False)
    logreg_cur_gs = GridSearchCV(estimator=logreg_cur_pipe, param_grid=logreg_params, scoring='accuracy', cv=5, verbose=0)
    logreg_cur_gs.fit(X_train, y_train)
    
    # Comparing and Replacing Data
    y_pred = logreg_cur_gs.predict(X_test)
    logreg_cur_score = logreg_cur_gs.score(X_test, y_test)
    
    if logreg_cur_score > logreg_top_score:
        logreg_top_model = logreg_cur_gs
        logreg_top_score = logreg_cur_score