In [7]:
# Note: before running this notebook, you will need to install the nhlpy package
# Using pip in he commthis is simply:
# pip install nhlpy

In [8]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import requests
import json

from nhlpy.constants import BASE_URL
from nhlpy import team,game,schedule,player #There are other modules but this should do it for now

In [9]:
# Function that takes a game object created by nhlpy.game and 
# returns a 0 for a home team win and 1 for an away team win.
# Somehow, I could not find this data more directly using this
# package.

## Changed: add teams' names and abbrevs information 
## Changed: separated home and away dictionary keys

def whoWon(game):
    scoreData = game.all_stats()['liveData']['linescore']
    periodData = scoreData['periods']
    homegoals = 0
    awaygoals = 0
    
    # changed: 3 -> len(periodData) 
    # since not all games has 3 periods, eg.2019120001
    
    for i in range(len(periodData)):
        homegoals = homegoals + periodData[i]['home']['goals']
        awaygoals = awaygoals + periodData[i]['away']['goals']
    if homegoals == awaygoals:
        shootoutData = scoreData['shootoutInfo'] 
        homeshootout = shootoutData['home']['scores']
        awayshootout = shootoutData['away']['scores']
        winner = homeshootout < awayshootout
    else:
        winner = homegoals < awaygoals
    
    home_name = game.all_stats()['gameData']['teams']['home']['name']
    away_name = game.all_stats()['gameData']['teams']['away']['name']
    
    home_abbrev = game.all_stats()['gameData']['teams']['home']['abbreviation']
    away_abbrev = game.all_stats()['gameData']['teams']['away']['abbreviation']
    
    
    winner_dict = {'homeTeam': home_name,
                   'awayTeam': away_name,
                   'homeAbbrev': home_abbrev,
                   'awayAbbrev': away_abbrev,
                   'GamePK': game.all_stats()['gameData']['game']['pk'], 
                   'Winner': int(winner)}
    # return int(winner)
    return winner_dict

# A few notes: this will not work to determine ties (which happened prior to 2004),
# nor will it work in playoff games that go into overtime. I think it's okay for 
# now but I will need to fix if we include those games.

In [10]:
# Example game: Golden Knights @ Blue Jackets 1-4 (home team won)
exgame = game.Game(2017021023) # This number is the GamePk, the numerical code associated with a game
whoWon(exgame)

{'homeTeam': 'Columbus Blue Jackets',
 'awayTeam': 'Vegas Golden Knights',
 'homeAbbrev': 'CBJ',
 'awayAbbrev': 'VGK',
 'GamePK': 2017021023,
 'Winner': 0}

In [11]:
# This function takes a schedule object, and returns a dictionary
# with all of the information for the games in that season

# This was a method in nhlpy that is on Github, but not PyPI
# Defining it here as a function to make life easier
def season(schedule, seasonid=None):
    response = requests.get("{0}/schedule{1}{2}".format(BASE_URL, "?season=", seasonid))
    schedule.data = response.json()
    #del schedule.data["copyright"]
    return schedule.data

In [12]:
# Example season data:
thisschedule = schedule.Schedule()
thisseason = season(thisschedule,20202021)
print(thisseason.keys())
print(thisseason['totalGames'])
print(thisseason['dates'][0]['games'][4].keys())
print(thisseason['dates'][0]['games'][4]['gamePk'])


dict_keys(['copyright', 'totalItems', 'totalEvents', 'totalGames', 'totalMatches', 'metaData', 'wait', 'dates'])
952
dict_keys(['gamePk', 'link', 'gameType', 'season', 'gameDate', 'status', 'teams', 'venue', 'content'])
2020020005


In [13]:
# This is a function that gets all of the game pks from a given season
# Let's see if this is faster than hockey-scraper

def getPkList(season):
    numGames = season['totalGames']
    pklist = np.zeros(numGames)
    i = 0
    for date in season['dates']:
        for game in date['games']:
            pklist[i] = game['gamePk']
            i = i + 1
    return pklist

# In the future, it might make more sense to just grab everything we 
# need while looping through the season, but this should be a useful 
# tool for now.


In [14]:
# Example Pk List:
expklist = getPkList(thisseason)
expklist

array([2.02002000e+09, 2.02002000e+09, 2.02002000e+09, 2.02002000e+09,
       2.02002000e+09, 2.02002001e+09, 2.02002001e+09, 2.02002001e+09,
       2.02002001e+09, 2.02002001e+09, 2.02002001e+09, 2.02002001e+09,
       2.02002001e+09, 2.02002002e+09, 2.02002002e+09, 2.02002002e+09,
       2.02002002e+09, 2.02002002e+09, 2.02002002e+09, 2.02002002e+09,
       2.02002002e+09, 2.02002003e+09, 2.02002002e+09, 2.02002002e+09,
       2.02002003e+09, 2.02002003e+09, 2.02002003e+09, 2.02002003e+09,
       2.02002003e+09, 2.02002003e+09, 2.02002004e+09, 2.02002003e+09,
       2.02002004e+09, 2.02002004e+09, 2.02002004e+09, 2.02002004e+09,
       2.02002004e+09, 2.02002004e+09, 2.02002004e+09, 2.02002004e+09,
       2.02002004e+09, 2.02002004e+09, 2.02002005e+09, 2.02002005e+09,
       2.02002005e+09, 2.02002005e+09, 2.02002005e+09, 2.02002005e+09,
       2.02002005e+09, 2.02002006e+09, 2.02002006e+09, 2.02002006e+09,
       2.02002006e+09, 2.02002006e+09, 2.02002006e+09, 2.02002006e+09,
      

In [None]:
# Win list for a season: perhaps this is what we regress onto?
winlist = np.zeros(len(expklist))
for i, pk in enumerate(expklist):
    thisgame = game.Game(int(pk))
    win_dict = whoWon(thisgame)
    winlist[i] = win_dict['Winner']
    
# This loop takes a long time and isn't necessary to run anymore


### create winner_home_or_away csv dataset

In [None]:
## This takes a while. The file is already created, so it is not necessary to run this again.

a = 20202021
for i in range(10):
    a = a-1
    a = a-10000
    print('creating season data: ', a)
    filename = str(a)+'_winner.json'

    thisschedule = schedule.Schedule()
    thisseason = season(thisschedule, a)
    expklist = getPkList(thisseason)

    season_stats = []
    for i, pk in enumerate(expklist):
        thisgame = game.Game(int(pk))
        season_stats.append(whoWon(thisgame))

    with open(filename, 'w') as outfile:
        json.dump(season_stats, outfile)
    outfile.close()



In [19]:
# Demonstrating the player object
exforward = player.Player(8471214)
exdefenseman = player.Player(8476850)
exgoalie = player.Player(8475683)
print('forward:')
print(exforward.season(2020,2021)['stats'][0]['splits'][0]['stat'])
print('goalie:')
print(exgoalie.season(2017,2018)['stats'][0]['splits'][0]['stat'])
print('defenseman')
print(exdefenseman.season(2017,2018)['stats'][0]['splits'][0]['stat'])

forward:
{'timeOnIce': '877:35', 'assists': 18, 'goals': 24, 'pim': 12, 'shots': 182, 'games': 45, 'hits': 98, 'powerPlayGoals': 9, 'powerPlayPoints': 17, 'powerPlayTimeOnIce': '189:41', 'evenTimeOnIce': '686:59', 'penaltyMinutes': '12', 'faceOffPct': 0.0, 'shotPct': 13.2, 'gameWinningGoals': 6, 'overTimeGoals': 1, 'shortHandedGoals': 0, 'shortHandedPoints': 0, 'shortHandedTimeOnIce': '00:55', 'blocked': 23, 'plusMinus': -7, 'points': 42, 'shifts': 935, 'timeOnIcePerGame': '19:30', 'evenTimeOnIcePerGame': '15:15', 'shortHandedTimeOnIcePerGame': '00:01', 'powerPlayTimeOnIcePerGame': '04:12'}
goalie:
{'timeOnIce': '3911:34', 'ot': 6, 'shutouts': 5, 'ties': 0, 'wins': 37, 'losses': 22, 'saves': 1835, 'powerPlaySaves': 207, 'shortHandedSaves': 41, 'evenSaves': 1587, 'shortHandedShots': 46, 'evenShots': 1698, 'powerPlayShots': 249, 'savePercentage': 0.920723, 'goalAgainstAverage': 2.423581, 'games': 65, 'gamesStarted': 65, 'shotsAgainst': 1993, 'goalsAgainst': 158, 'timeOnIcePerGame': '60:1

In [22]:
# This function takes a gamepk, the code associated with any given game, and returns a dictionary with the
# weighted average of the player statistics at each position. The weights are based on the average time on ice
# for each player that played in this particular game.

def gameRosterStats(gamepk):
    # All stats are normalized to per 60 minutes
    F_home_assists = []
    F_home_goals = []
    F_home_pim = []
    F_home_shots = []
    F_home_blocked = []
    F_home_hits = []
    F_home_pm = []
    F_home_atoi = []
    
    D_home_assists = []
    D_home_goals = []
    D_home_pim = []
    D_home_shots = []
    D_home_blocked = []
    D_home_hits = []
    D_home_pm = []
    D_home_atoi = []
    
    G_home_GAA = []
    G_home_atoi = []
    
    F_away_assists = []
    F_away_goals = []
    F_away_pim = []
    F_away_shots = []
    F_away_blocked = []
    F_away_hits = []
    F_away_pm = []
    F_away_atoi = []
    
    D_away_assists = []
    D_away_goals = []
    D_away_pim = []
    D_away_shots = []
    D_away_blocked = []
    D_away_hits = []
    D_away_pm = []
    D_away_atoi = []
    
    G_away_GAA = []
    G_away_atoi = []
    
    thisgame = game.Game(gamepk)
    try:
        hometeam = thisgame.all_stats()['gameData']['teams']['home']['triCode']
    except:
        hometeam = thisgame.all_stats()['gameData']['teams']['home']['abbreviation']
    homeroster = thisgame.boxscore()['teams']['home']['players']
    awayroster = thisgame.boxscore()['teams']['away']['players']
    season = thisgame.all_stats()['gameData']['game']['season']
    year = int(season[0:4])
    for thisplayer in homeroster.keys():
        explayer = player.Player(homeroster[thisplayer]['person']['id'])
        splits = explayer.season(year,year+1)['stats'][0]['splits']
        if splits:
            stats = splits[0]['stat']
        else:
            continue
        try:
            position = homeroster[thisplayer]['person']['primaryPosition']['code']
        except:
            position = homeroster[thisplayer]['position']['code']
        try:
            atoi_string = stats['timeOnIcePerGame']
        except:
            continue
        atoi_list = atoi_string.split(':')
        atoi_float = float(atoi_list[0])+float(atoi_list[1])/60
        toi_string = stats['timeOnIce']
        toi_list = toi_string.split(':')
        toi_float = float(toi_list[0])+float(toi_list[1])/60
        hours = toi_float/60
        if position in 'N/A':
            continue
        elif position in 'G':
            gaa = stats['goalAgainstAverage']
            G_home_GAA.append(gaa)
            G_home_atoi.append(atoi_float)
        else:
            assists = stats['assists']/hours
            goals = stats['goals']/hours
            shots = stats['shots']/hours
            blocked = stats['blocked']/hours
            hits = stats['hits']/hours
            pim = stats['pim']/hours
            pm = stats['plusMinus']/hours
            if 'D' in position:
                D_home_assists.append(assists)
                D_home_goals.append(goals)
                D_home_pim.append(pim)
                D_home_shots.append(shots)
                D_home_blocked.append(blocked)
                D_home_hits.append(hits)
                D_home_pm.append(pm)
                D_home_atoi.append(atoi_float)
            else:
                F_home_assists.append(assists)
                F_home_goals.append(goals)
                F_home_pim.append(pim)
                F_home_shots.append(shots)
                F_home_blocked.append(blocked)
                F_home_hits.append(hits)
                F_home_pm.append(pm)
                F_home_atoi.append(atoi_float)
                
    for thisplayer in awayroster.keys():
        explayer = player.Player(awayroster[thisplayer]['person']['id'])
        splits = explayer.season(year,year+1)['stats'][0]['splits']
        if splits:
            stats = splits[0]['stat']
        else:
            continue
        try:
            position = awayroster[thisplayer]['person']['primaryPosition']['code']
        except:
            position = awayroster[thisplayer]['position']['code']
        try:
            atoi_string = stats['timeOnIcePerGame']
        except:
            continue
        atoi_list = atoi_string.split(':')
        atoi_float = float(atoi_list[0])+float(atoi_list[1])/60
        toi_string = stats['timeOnIce']
        toi_list = toi_string.split(':')
        toi_float = float(toi_list[0])+float(toi_list[1])/60
        hours = toi_float/60
        if position in 'N/A':
            continue
        elif position in 'G':
            gaa = stats['goalAgainstAverage']
            G_away_GAA.append(gaa)
            G_away_atoi.append(atoi_float)
        else:
            assists = stats['assists']/hours
            goals = stats['goals']/hours
            shots = stats['shots']/hours
            blocked = stats['blocked']/hours
            hits = stats['hits']/hours
            pim = stats['pim']/hours
            pm = stats['plusMinus']/hours
            if 'D' in position:
                D_away_assists.append(assists)
                D_away_goals.append(goals)
                D_away_pim.append(pim)
                D_away_shots.append(shots)
                D_away_blocked.append(blocked)
                D_away_hits.append(hits)
                D_away_pm.append(pm)
                D_away_atoi.append(atoi_float)
            else:
                F_away_assists.append(assists)
                F_away_goals.append(goals)
                F_away_pim.append(pim)
                F_away_shots.append(shots)
                F_away_blocked.append(blocked)
                F_away_hits.append(hits)
                F_away_pm.append(pm)
                F_away_atoi.append(atoi_float)


    stat_dict = {'F_home_assists' : (np.array(F_home_assists)*np.array(F_home_atoi)).sum()/sum(F_home_atoi),
                'F_home_goals' : (np.array(F_home_goals)*np.array(F_home_atoi)).sum()/sum(F_home_atoi),
                'F_home_pim' : (np.array(F_home_pim)*np.array(F_home_atoi)).sum()/sum(F_home_atoi),
                'F_home_shots' : (np.array(F_home_shots)*np.array(F_home_atoi)).sum()/sum(F_home_atoi),
                'F_home_blocked' : (np.array(F_home_blocked)*np.array(F_home_atoi)).sum()/sum(F_home_atoi),
                'F_home_hits' : (np.array(F_home_hits)*np.array(F_home_atoi)).sum()/sum(F_home_atoi),
                'F_home_pm' : (np.array(F_home_pm)*np.array(F_home_atoi)).sum()/sum(F_home_atoi),
                'D_home_assists' : (np.array(D_home_assists)*np.array(D_home_atoi)).sum()/sum(D_home_atoi),
                'D_home_goals' : (np.array(D_home_goals)*np.array(D_home_atoi)).sum()/sum(D_home_atoi),
                'D_home_pim' : (np.array(D_home_pim)*np.array(D_home_atoi)).sum()/sum(D_home_atoi),
                'D_home_shots' : (np.array(D_home_shots)*np.array(D_home_atoi)).sum()/sum(D_home_atoi),
                'D_home_blocked' : (np.array(D_home_blocked)*np.array(D_home_atoi)).sum()/sum(D_home_atoi),
                'D_home_hits' : (np.array(D_home_hits)*np.array(D_home_atoi)).sum()/sum(D_home_atoi),
                'D_home_pm' : (np.array(D_home_pm)*np.array(D_home_atoi)).sum()/sum(D_home_atoi),
                'G_home_GAA' : (np.array(G_home_GAA)*np.array(G_home_atoi)).sum()/sum(G_home_atoi),
                'F_away_assists' : (np.array(F_away_assists)*np.array(F_away_atoi)).sum()/sum(F_away_atoi),
                'F_away_goals' : (np.array(F_away_goals)*np.array(F_away_atoi)).sum()/sum(F_away_atoi),
                'F_away_pim' : (np.array(F_away_pim)*np.array(F_away_atoi)).sum()/sum(F_away_atoi),
                'F_away_shots' : (np.array(F_away_shots)*np.array(F_away_atoi)).sum()/sum(F_away_atoi),
                'F_away_blocked' : (np.array(F_away_blocked)*np.array(F_away_atoi)).sum()/sum(F_away_atoi),
                'F_away_hits' : (np.array(F_away_hits)*np.array(F_away_atoi)).sum()/sum(F_away_atoi),
                'F_away_pm' : (np.array(F_away_pm)*np.array(F_away_atoi)).sum()/sum(F_away_atoi),
                'D_away_assists' : (np.array(D_away_assists)*np.array(D_away_atoi)).sum()/sum(D_away_atoi),
                'D_away_goals' : (np.array(D_away_goals)*np.array(D_away_atoi)).sum()/sum(D_away_atoi),
                'D_away_pim' : (np.array(D_away_pim)*np.array(D_away_atoi)).sum()/sum(D_away_atoi),
                'D_away_shots' : (np.array(D_away_shots)*np.array(D_away_atoi)).sum()/sum(D_away_atoi),
                'D_away_blocked' : (np.array(D_away_blocked)*np.array(D_away_atoi)).sum()/sum(D_away_atoi),
                'D_away_hits' : (np.array(D_away_hits)*np.array(D_away_atoi)).sum()/sum(D_away_atoi),
                'D_away_pm' : (np.array(D_away_pm)*np.array(D_away_atoi)).sum()/sum(D_away_atoi),
                'G_away_GAA' : (np.array(G_away_GAA)*np.array(G_away_atoi)).sum()/sum(G_away_atoi),}
    return stat_dict

In [24]:
# An example:
pk = expklist[5]
gameRosterStats(int(pk))
# Note: gameRosterStats must take an integer, while the getPkList function returns a list of floats

{'F_home_assists': 1.0030582363535854,
 'F_home_goals': 0.7502693542745408,
 'F_home_pim': 1.598510978700081,
 'F_home_shots': 6.765785938101021,
 'F_home_blocked': 1.7304210569698772,
 'F_home_hits': 5.487407061977955,
 'F_home_pm': 0.08348255271830725,
 'D_home_assists': 0.7376073898790669,
 'D_home_goals': 0.27179707165023215,
 'D_home_pim': 1.3205618483438712,
 'D_home_shots': 4.1503702110428184,
 'D_home_blocked': 3.552533293314086,
 'D_home_hits': 4.889849420293088,
 'D_home_pm': -0.02391018851706134,
 'G_home_GAA': 2.907254776863085,
 'F_away_assists': 0.8884107165581252,
 'F_away_goals': 0.7534061238111284,
 'F_away_pim': 2.4076794933590278,
 'F_away_shots': 6.381481105795843,
 'F_away_blocked': 2.397263738560627,
 'F_away_hits': 8.645044015362279,
 'F_away_pm': 0.13073131891648743,
 'D_away_assists': 0.7109039867038689,
 'D_away_goals': 0.19232255619998462,
 'D_away_pim': 1.283898615569362,
 'D_away_shots': 4.626915435893541,
 'D_away_blocked': 5.0010846144785885,
 'D_away_hit

In [None]:
# This code chunk loops through the years given and creates csv files containing the team data and 
# the averaged positional data. It takes a VERY long time (45-60 minutes per season on my computer),
# and the files have already been created.

# After running this, I realized that these files included the preseason and playoff games on addition
# to the regular season games. Rather than fix and rerun the code, it was easier to remove these games
# mannually. Preseason and playoff games are indicated by a 1 or 3 in the 6th digit in the gamepk,
# respectively. Playoff games were saved in their own file and used as a our test set.

# This may not work prior to 2005, given the rule changes in the game.

thisschedule = schedule.Schedule()
pklist = getPkList(thisseason)
years = [2010,2009,2008,2007,2006,2005] # Change this to only pull years that are still needed
for j in years:
    year = int(str(j)+str(j+1))
    filename = str(j)+'-'+str(j+1)+'_positions_by_game.csv'
    thisseason = season(thisschedule,year)
    pklist = getPkList(thisseason)
    seasondata = pd.DataFrame(columns=['homeTeam','awayTeam','homeAbbrev','awayAbbrev','GamePK','Winner',
                                       'F_home_assists',
                                       'F_home_goals',
                                       'F_home_pim',
                                       'F_home_shots',
                                       'F_home_blocked',
                                       'F_home_pm',
                                       'D_home_assists',
                                       'D_home_goals',
                                       'D_home_pim',
                                       'D_home_shots',
                                       'D_home_blocked',
                                       'D_home_hits',
                                       'D_home_pm',
                                       'G_home_GAA',
                                       'F_away_assists',
                                       'F_away_goals',
                                       'F_away_pim',
                                       'F_away_shots',
                                       'F_away_blocked',
                                       'F_away_hits',
                                       'F_away_pm',
                                       'D_away_assists',
                                       'D_away_goals',
                                       'D_away_pim',
                                       'D_away_shots',
                                       'D_away_blocked',
                                       'D_away_hits',
                                       'D_away_pm',
                                       'G_away_GAA'])

    lines = []
    for i,pk in enumerate(pklist):
        teamdata = game.Game(int(pk)).all_stats()['gameData']['teams']
        if (teamdata['home']['id'] < 56) & (teamdata['away']['id'] < 56):  # Excluded exhibition games with teams outside the nhl
            this_dict = whoWon(game.Game(int(pk)))
            playerstats = gameRosterStats(int(pk))
            this_dict.update(playerstats)
            line = pd.DataFrame(this_dict, index = [0])
            seasondata = pd.concat([seasondata,line])
        else:
            continue
    seasondata.to_csv('../Positions by Game/'+filename)
    print(filename,"is done!")
        