In [3]:
import numpy as np
import requests
import json
import time
import pandas as pd

The first 4 digits of the game ID identify the season of the game (e.g., 2017 for the 2017-2018 season). The next 2 digits give the type of game: 01 = preseason, 02 = regular season, 03 = playoffs, 04 = all-star. The final 4 digits identify the specific game number. For regular season and preseason games, this ranges from 0001 to the number of games played (1271 for seasons with 31 teams, i.e., 2017 and onwards, and 1230 for seasons with 30 teams). For playoff games, the 2nd digit of the game number gives the round of the playoffs, the 3rd digit specifies the matchup, and the 4th digit specifies the game (out of 7).

See https://gitlab.com/dword4/nhlapi for full documentation.

In [None]:
# One [startDate, endDate] row per season; each row is turned into one schedule
# query by the download loop further down.
start_end = np.array([
    ['2010-10-07', '2011-06-15'],
    ['2011-10-06', '2012-06-11'],
    ['2013-01-19', '2013-06-24'],
    ['2013-10-01', '2014-06-13'],
    ['2014-10-08', '2015-06-15'],
    ['2015-10-07', '2016-06-12'],
    ['2016-10-12', '2017-06-11'],
    ['2017-10-04', '2018-06-07'],
    ['2018-10-03', '2019-05-08'],
])

In [None]:
# Collect the gamePk of every scheduled game in each date window of
# `start_end`, skipping all-star games, and leave the sorted IDs in `games`.
# NOTE(review): confirm that the statsapi.web.nhl.com endpoint is still served.
games = []

t1 = time.perf_counter()

for season_start, season_end in start_end:
    url = f'https://statsapi.web.nhl.com/api/v1/schedule?startDate={season_start}&endDate={season_end}'
    # A timeout keeps a stalled connection from hanging the notebook, and
    # raise_for_status() surfaces HTTP errors here instead of as a confusing
    # JSON/KeyError failure a few lines later.
    r = requests.get(url, timeout=30)
    r.raise_for_status()
    game_data = r.json()

    for day in game_data['dates']:
        for scheduled_game in day['games']:
            game_pk = scheduled_game['gamePk']
            # Digits 5-6 of the ID encode the game type; a second digit of
            # '4' (type 04) marks an all-star game.
            if str(game_pk)[5] != '4':
                games.append(game_pk)

    # Be polite to the API: pause for as long as the request itself took.
    time.sleep(r.elapsed.total_seconds())

    print(f'Got dates {season_start} through {season_end} in {round(r.elapsed.total_seconds(),2)} seconds')

games.sort()

t2 = time.perf_counter()

In [None]:
# Wall-clock duration of the schedule download, in seconds.
print('Finished in {} seconds'.format(round(t2 - t1, 2)))

In [None]:
# Download the live feed of every game in `games` and flatten each play into
# one 14-field row appended to `mydata`. Field order: pk, season, dateTime,
# venue UTC offset, first player's name, team, event, secondaryType, emptyNet,
# strength, eventId, period, x, y.
mydata = []

t1 = time.perf_counter()

for game in games:
    url = f'https://statsapi.web.nhl.com/api/v1/game/{game}/feed/live'
    # Fail fast on stalled connections and HTTP errors rather than hanging or
    # raising an unrelated KeyError on an error payload.
    r = requests.get(url, timeout=30)
    r.raise_for_status()
    game_data = r.json()

    plays_section = game_data['liveData']['plays']
    # Each playsByPeriod entry lists indices into allPlays; flatten them in
    # period order.
    play_indices = [play_index
                    for period in plays_section['playsByPeriod']
                    for play_index in period['plays']]

    if play_indices:
        all_plays = plays_section['allPlays']

        # Game-level fields are identical for every play of the game, so they
        # are built once per game instead of once per play.
        game_info = game_data['gameData']
        game_fields = [game_info['game'].get(key) for key in ['pk', 'season']]
        game_fields.append(game_info['datetime']['dateTime'])
        game_fields.append(game_info['teams']['home']['venue']['timeZone']['offset'])

        for play_index in play_indices:
            details = all_plays[play_index]
            result = details['result']
            # Plays without players (or with an empty list) get a None name.
            players = details.get('players') or [{'player': {}}]

            final_list = list(game_fields)
            final_list.append(players[0]['player'].get('fullName'))
            final_list.append(details.get('team', {}).get('name'))
            final_list.extend(result.get(key)
                              for key in ['event', 'secondaryType', 'emptyNet'])
            final_list.append(result.get('strength', {'name': None})['name'])
            final_list.extend(details['about'].get(key)
                              for key in ['eventId', 'period'])
            final_list.extend(details['coordinates'].get(key)
                              for key in ['x', 'y'])

            mydata.append(final_list)

    # Be polite to the API: pause for as long as the request itself took.
    time.sleep(r.elapsed.total_seconds())

    print(f'Got game #{game} in {round(r.elapsed.total_seconds(),2)} seconds')

t2 = time.perf_counter()

In [None]:
# Wall-clock duration of the play-by-play download, in minutes.
print('Finished in {} minutes'.format(round((t2 - t1) / 60, 2)))

In [None]:
# Turn the flat play rows into a typed DataFrame with a local calendar date.
# Passing the column names to the constructor also works when `mydata` is empty.
df = pd.DataFrame(mydata, columns=['gameId',
                                   'season',
                                   'dateTime',
                                   'offset',
                                   'player',
                                   'team',
                                   'event',
                                   'shotType',
                                   'emptyNet',
                                   'strength',
                                   'eventId',
                                   'period',
                                   'x',
                                   'y'])

# The str casts turn missing values into the literal strings 'None' / 'True' /
# 'False'; later cells compare against those strings, so keep them as-is.
df = df.astype({'gameId': int,
                'season': str,
                'player': str,
                'team': str,
                'event': str,
                'shotType': str,
                'emptyNet': str,
                'strength': str,
                'eventId': int,
                'period': str,
                'x': float,
                'y': float})

# Parse the timestamp and hour offset explicitly: hour-resolution
# datetime64[h] / timedelta64[h] casts via astype() are rejected by
# pandas >= 2.0. Only the resulting calendar date is kept below, so the
# sub-hour precision retained here does not change the output.
df['dateTime'] = pd.to_datetime(df['dateTime'], utc=True).dt.tz_localize(None)
df['offset'] = pd.to_timedelta(df['offset'], unit='h')

# Shift by the home venue's offset to obtain the local wall-clock time.
df['dateTime'] = df['dateTime']+df['offset']
df['date'] = df['dateTime'].dt.date

df = df[['gameId',
         'season',
         'date',
         'eventId',
         'player',
         'team',
         'event',
         'shotType',
         'emptyNet',
         'strength',
         'period',
         'x',
         'y']].sort_values(['gameId','eventId'])

In [None]:
# Show the first and last rows of the assembled frame as plain text.
print(df.head(), df.tail(), sep='\n')

In [None]:
# Cache the assembled frame on disk; a later cell reloads it with pd.read_pickle.
df.to_pickle("./df.pkl")

In [1]:
import plotly as py
import plotly.graph_objs as go
# Set up plotly's offline notebook rendering before the py.offline.iplot calls
# further down.
py.offline.init_notebook_mode(connected=True)

In [4]:
# Reload the frame cached by df.to_pickle above.
# NOTE(review): unpickling can execute arbitrary code; only load files you created.
df = pd.read_pickle("./df.pkl")

In [4]:
# Drop rows containing NaN, then mirror every event with a negative x through
# the origin so that all x2 values are non-negative (y is flipped with it).
df = df.dropna()
negative_x = df['x'] < 0
df['x2'] = df['x'].mask(negative_x, df['x'] * -1)
df['y2'] = df['y'].mask(negative_x, df['y'] * -1)

In [None]:
# Keep only shot-attempt events for the heatmap.
viz_df = df.loc[df['event'].isin(['Missed Shot', 'Shot', 'Goal'])]

In [120]:
# Count events per mirrored (x2, y2) location once and reuse the result for
# all three heatmap axes (the same groupby was previously evaluated three times).
shot_counts = viz_df.groupby(["x2", "y2"]).size().reset_index(name="freq")

data = [go.Heatmap(z=shot_counts['freq'],
                   x=shot_counts['x2'],
                   y=shot_counts['y2'],
                   # Transparent at low counts so the rink image shows through.
                   colorscale=[[0.00, 'rgba(69,117,180,0.00)'],
                               [0.25, 'rgba(116,173,209,0.500)'],
                               [0.50, 'rgba(254,224,144,0.667)'],
                               [0.75, 'rgba(215,48,39,0.833)'],
                               [1.00, 'rgba(165,0,38,1.00)']],
                   colorbar=dict(title='Shots & Goals'),
                   zsmooth='best')]

# Hide all axis decoration and place the rink image behind the heatmap,
# stretched over x in [0, 100] and y in [-42.5, 42.5].
layout = go.Layout(xaxis=dict(range=[0,100],
                              showgrid=False,
                              zeroline=False,
                              showline=False,
                              ticks='',
                              showticklabels=False),
                   yaxis=dict(range=[-42.5,42.5],
                              showgrid=False,
                              zeroline=False,
                              showline=False,
                              ticks='',
                              showticklabels=False,
                              scaleanchor = 'x',
                              scaleratio = 0.85),
                   images=[dict(source="./NHL_Hockey_Rink.png",
                                xref="x",
                                yref="y",
                                x= 0,
                                y= 42.5,
                                sizex= 100,
                                sizey= 85,
                                sizing="stretch",
                                opacity= 0.5,
                                layer= "below")])

fig = go.Figure(data=data,layout=layout)

py.offline.iplot(fig)

In [115]:
# Column -> list of allowed values; None means "do not filter on this column".
filters = dict(
    season=None,
    player=['Alex Ovechkin'],
    team=None,
    event=['Missed Shot', 'Shot', 'Goal'],
    shotType=None,
    emptyNet=None,
    strength=None,
    period=None,
)

In [116]:
# Keep only the filters that were actually set.
filters = {k: v for k, v in filters.items() if v is not None}

# AND the per-column membership tests together. Starting from an all-True
# mask means an empty filter set selects every row instead of raising
# "No objects to concatenate" from pd.concat([]).
mask = pd.Series(True, index=df.index)
for column, allowed_values in filters.items():
    mask &= df[column].isin(allowed_values)

filtered_df = df[mask]

In [117]:
# Preview the first rows that passed the filters.
filtered_df.head()

Unnamed: 0,gameId,season,date,eventId,player,team,event,shotType,emptyNet,strength,period,x,y,x2,y2
3353,2010020011,20102011,2010-10-08,42,Alex Ovechkin,Washington Capitals,Missed Shot,,,,1,-59.0,9.0,59.0,-9.0
3378,2010020011,20102011,2010-10-08,210,Alex Ovechkin,Washington Capitals,Shot,Snap Shot,,,1,-30.0,-24.0,30.0,24.0
3379,2010020011,20102011,2010-10-08,211,Alex Ovechkin,Washington Capitals,Missed Shot,,,,1,-36.0,-16.0,36.0,16.0
3422,2010020011,20102011,2010-10-08,234,Alex Ovechkin,Washington Capitals,Shot,Snap Shot,,,2,62.0,13.0,62.0,13.0
3470,2010020011,20102011,2010-10-08,413,Alex Ovechkin,Washington Capitals,Shot,Wrist Shot,,,2,60.0,15.0,60.0,15.0


In [121]:
# Count the filtered events per mirrored (x2, y2) location once and reuse the
# result for all three heatmap axes (the same groupby was previously evaluated
# three times).
filtered_counts = filtered_df.groupby(["x2", "y2"]).size().reset_index(name="freq")

data = [go.Heatmap(z=filtered_counts['freq'],
                   x=filtered_counts['x2'],
                   y=filtered_counts['y2'],
                   # Transparent at low counts so the rink image shows through.
                   colorscale=[[0.00, 'rgba(69,117,180,0.000)'],
                               [0.25, 'rgba(116,173,209,0.500)'],
                               [0.50, 'rgba(254,224,144,0.667)'],
                               [0.75, 'rgba(215,48,39,0.833)'],
                               [1.00, 'rgba(165,0,38,1.000)']],
                   zsmooth='best')]

# Hide all axis decoration and place the rink image behind the heatmap,
# stretched over x in [0, 100] and y in [-42.5, 42.5].
layout = go.Layout(xaxis=dict(range=[0,100],
                              showgrid=False,
                              zeroline=False,
                              showline=False,
                              ticks='',
                              showticklabels=False),
                   yaxis=dict(range=[-42.5,42.5],
                              showgrid=False,
                              zeroline=False,
                              showline=False,
                              ticks='',
                              showticklabels=False,
                              scaleanchor = 'x',
                              scaleratio = 0.85),
                   images=[dict(source="./NHL_Hockey_Rink.png",
                                xref="x",
                                yref="y",
                                x= 0,
                                y= 42.5,
                                sizex= 100,
                                sizey= 85,
                                sizing="stretch",
                                opacity= 0.5,
                                layer= "below")])

fig = go.Figure(data=data,layout=layout)

py.offline.iplot(fig)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [None]:
# Modelling frame: shots and goals with a known shot type, excluding
# empty-net events. 'True' and 'None' are string comparisons because these
# columns were cast to str when the frame was built.
model_df = df[(df['event'].isin(['Shot','Goal'])) &
              (df['emptyNet'] != 'True') &
              (df['shotType'] != 'None')]
model_df = model_df[['event','shotType','x2','y2']].copy()

# Encode the target explicitly as an integer (Goal -> 1, Shot -> 0). A
# frame-wide replace() depends on pandas silently downcasting the object
# column; if it stays object, get_dummies() would one-hot encode the target
# and shift the column positions that later cells index by position.
model_df['event'] = (model_df['event'] == 'Goal').astype(int)

# One-hot encode the remaining string column (shotType).
model_df = pd.get_dummies(model_df)

In [None]:
# The first column is the target; every remaining column is a feature.
y = model_df.iloc[:, 0]
X = model_df.iloc[:, 1:]

# Stratified split so both parts keep the same class ratio; seeded for
# repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    stratify=y,
    random_state=1,
)

In [None]:
# Baseline random forest with fixed settings and seed.
base_model = RandomForestClassifier(
    n_estimators=100,
    max_features='sqrt',
    bootstrap=True,
    random_state=1,
)
base_model.fit(X_train, y_train)

# Mean accuracy on the training and held-out splits.
print("Base Model Accuracy")
print(f"Training: {base_model.score(X_train, y_train):.3f}")
print(f"Test: {base_model.score(X_test, y_test):.3f}", "\n")

In [None]:
from sklearn.metrics import f1_score
# F1 score of the baseline forest on the held-out test split.
y_pred = base_model.predict(X_test)
f1_score(y_test, y_pred)

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from pprint import pprint

# Candidate values for each random-forest hyper-parameter.
n_estimators = list(range(100, 501, 50))      # 100, 150, ..., 500
max_features = list(range(1, 10))             # 1 .. 9
max_depth = list(range(10, 101, 10)) + [None] # 10, 20, ..., 100, then unlimited
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True]

# Search space handed to RandomizedSearchCV.
random_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    bootstrap=bootstrap,
)

pprint(random_grid)

In [None]:
# Base estimator to tune, seeded like the other models in this notebook so
# the search result is repeatable.
rf = RandomForestClassifier(random_state=1)

# Sample 10 parameter combinations (n_iter) from random_grid, score each with
# 5-fold cross-validation, and use all available cores (n_jobs = -1).
rf_random = RandomizedSearchCV(estimator = rf,
                               param_distributions = random_grid,
                               n_iter = 10,
                               cv = 5,
                               verbose = 2,
                               random_state = 1,
                               n_jobs = -1)

# Fit the random search model
rf_random.fit(X_train, y_train)

In [None]:
# Parameter combination selected by the randomized search.
rf_random.best_params_

In [None]:
# Score the best estimator found by the randomized search.
best_random = rf_random.best_estimator_
print("Best Random Accuracy")
print(f"Training: {best_random.score(X_train, y_train):.3f}")
print(f"Test: {best_random.score(X_test, y_test):.3f}")
print(f"F1: {f1_score(y_test, best_random.predict(X_test)):.3f}")

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
# Gradient boosting with default settings and a fixed seed, scored the same
# way as the random forests.
gb = GradientBoostingClassifier(random_state=1)
gb.fit(X_train, y_train)
print("Gradient Boosting Accuracy")
print(f"Training: {gb.score(X_train, y_train):.3f}")
print(f"Test: {gb.score(X_test, y_test):.3f}")
print(f"F1: {f1_score(y_test, gb.predict(X_test)):.5f}")

In [None]:
# F1 score of the gradient-boosting model on the held-out test split.
# Rebinds y_pred, which previously held the baseline forest's predictions.
y_pred = gb.predict(X_test)
f1_score(y_test, y_pred)

In [None]:
# Feature importances of the baseline forest, labelled with the feature
# column names (every model_df column after the first) and sorted descending.
feature_importances = pd.DataFrame(
    {'importance': base_model.feature_importances_},
    index=model_df.columns[1:],
).sort_values(by='importance', ascending=False)
print(feature_importances)

In [None]:
# Keep only the first two feature columns of the test split (presumably x2 and
# y2 -- verify against model_df's column order).
# NOTE: this rebinds X, which previously held the full feature matrix.
X = X_test.iloc[:,0:2]

In [None]:
# Rescale the data to zero mean and unit variance.
# NOTE(review): X_scaled is not referenced by any later cell visible here --
# confirm whether the neighbour graph below was meant to use it.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

# Neighbour graph over the *unscaled* X: samples within radius 20 of each
# other are linked, and mode='distance' requests distances as the edge values.
from sklearn.neighbors import radius_neighbors_graph
dm = radius_neighbors_graph(X=X, radius=20, mode='distance')

In [None]:
# Inspect the neighbour graph as text.
print(dm)

In [None]:
# Spill the neighbour-distance matrix to a disk-backed float32 array.
filename = "mydata.data"
m, n = dm.shape

mm = np.memmap(filename, dtype='float32', mode='write', shape=(m, n))

# Copy the data INTO the memmap. A plain `mm = dm` only rebinds the name and
# leaves the file untouched. Rows are densified in chunks so the full dense
# matrix never has to sit in memory at once.
# Assumes dm is a SciPy sparse matrix supporting row slicing and .toarray()
# -- TODO confirm.
chunk_rows = 1024
for row_start in range(0, m, chunk_rows):
    row_stop = min(row_start + chunk_rows, m)
    mm[row_start:row_stop] = dm[row_start:row_stop].toarray()

# Push the written data to disk.
mm.flush()

In [None]:
# Remove the name `mm` from the namespace before the file is re-opened read-only.
del mm

In [None]:
# Re-open the same file as a read-only float32 memmap of shape (m, n).
mm = np.memmap(filename, dtype="float32", mode="readonly", shape=(m, n))

In [None]:
# Distinct values stored in the memmap.
# NOTE(review): this scans all m*n entries and may be slow for a large matrix.
np.unique(mm)

In [None]:
from sklearn.cluster import DBSCAN
# Cluster the samples with DBSCAN, using dm as a precomputed distance matrix.
# NOTE(review): confirm that algorithm='ball_tree' is accepted together with
# metric='precomputed', and that eps=1 is intended given the graph was built
# with radius=20.
clusters = DBSCAN(eps=1, min_samples=2, algorithm='ball_tree', metric='precomputed', n_jobs=-1).fit_predict(dm)

In [None]:
# Plot the cluster assignments: one point per sample, coloured by DBSCAN label.
import matplotlib.pyplot as plt
# X is a DataFrame here, so columns must be selected positionally with .iloc;
# X[:, 0] is NumPy-array syntax and raises on a DataFrame.
plt.scatter(X.iloc[:, 0], X.iloc[:, 1], c=clusters)
# Call show(); a bare `plt.show` only references the function.
plt.show()

In [None]:
# Distinct labels assigned by DBSCAN.
np.unique(clusters)

In [None]:
# One row per distinct (x2, y2) location together with its number of events.
test = model_df.groupby(['x2', 'y2']).size().reset_index()

# Coordinates go into X, the per-location counts into sw.
X, sw = test.iloc[:, :2], test.iloc[:, 2]

In [None]:
# Display the distinct shot locations.
X

In [None]:
from sklearn.cluster import KMeans

# Partition the distinct shot locations into 12 clusters. Seeded like the
# other models in this notebook so the labels are repeatable between runs.
# NOTE(review): the per-location counts in `sw` are not passed to fit() --
# confirm whether sample_weight=sw was intended.
kmeans = KMeans(n_clusters=12, random_state=1)
kmeans.fit(X)

In [None]:
import mglearn
# Scatter the locations, with the KMeans label of each point passed as the
# third argument.
mglearn.discrete_scatter(X['x2'], X['y2'], kmeans.labels_, markers='o')

In [None]:
# Scatter the distinct shot locations over the rink image, coloured by their
# KMeans cluster. kmeans.labels_ has one entry per row of X; the DBSCAN
# `clusters` array was computed on a different X and does not line up with
# these points.
data = [go.Scatter(x=X['x2'],
                   y=X['y2'],
                   mode='markers',
                   opacity=0.5,
                   marker=dict(color=kmeans.labels_))]

# Hide all axis decoration and place the rink image behind the points,
# stretched over x in [0, 100] and y in [-42.5, 42.5].
layout = go.Layout(xaxis=dict(range=[0,100],
                              showgrid=False,
                              zeroline=False,
                              showline=False,
                              ticks='',
                              showticklabels=False),
                   yaxis=dict(range=[-42.5,42.5],
                              showgrid=False,
                              zeroline=False,
                              showline=False,
                              ticks='',
                              showticklabels=False,
                              scaleanchor = 'x',
                              scaleratio = 0.85),
                   images=[dict(source="./NHL_Hockey_Rink.png",
                                xref="x",
                                yref="y",
                                x= 0,
                                y= 42.5,
                                sizex= 100,
                                sizey= 85,
                                sizing="stretch",
                                opacity= 0.5,
                                layer= "below")])

fig = go.Figure(data=data,layout=layout)

In [None]:
# Render the most recently built figure inline.
py.offline.iplot(fig)