# NHL.com scrapers

In [2]:
# Common imports
# import datetime
from datetime import date, datetime, timedelta
import time
import sys
import os
import requests
import logging
import pandas as pd

In [3]:
# NHL.com Today's Schedule Scraper
# get game metadata from NHL.com
# underlying function of harry schomer's hockey scraper package
# return df with unique id & name fields for games and teams 
## columns to scrape: game_id, game_date, home_team, home_team_id, away_team, away_team_id

def get_today_sched(date):
    '''
    args: 
    - date | format: YYYY-MM-DD

    output:
    - pandas df of given date's game slate
    '''

    url = f'https://statsapi.web.nhl.com/api/v1/schedule?date={date}'

    req = requests.get(url)
    schedule_dict = req.json()

    today_games = {}

#if there are no games exit the function with an empty dataframe
    if not schedule_dict['dates']:
        return pd.DataFrame()

#parse the schedule to create a dataframe to feed to the prediction model
    for x in schedule_dict['dates']:
        for game in x['games']:
            today_games[game['gamePk']] = {}
            today_games[game['gamePk']]['date'] = date
            today_games[game['gamePk']]['home_team'] = game['teams']['home']['team']['name']
            today_games[game['gamePk']]['home_team_id'] = game['teams']['home']['team']['id']
            today_games[game['gamePk']]['away_team'] = game['teams']['away']['team']['name']
            today_games[game['gamePk']]['away_team_id'] = game['teams']['away']['team']['id']

#turn dictionary of daily games to a dataframe:
    daily_games_df = pd.DataFrame.from_dict(today_games, orient='index')
    daily_games_df = daily_games_df.reset_index()
    daily_games_df.columns = ['game_id', 'game_date', 'home_team', 'home_team_id',
                              'away_team', 'away_team_id']

    return daily_games_df

In [4]:
# Test daily Scraper
get_today_sched('2023-04-11')

Unnamed: 0,game_id,game_date,home_team,home_team_id,away_team,away_team_id
0,2022021285,2023-04-11,New Jersey Devils,1,Buffalo Sabres,7
1,2022021287,2023-04-11,Philadelphia Flyers,4,Columbus Blue Jackets,29
2,2022021288,2023-04-11,Carolina Hurricanes,12,Detroit Red Wings,17
3,2022021289,2023-04-11,Tampa Bay Lightning,14,Toronto Maple Leafs,10
4,2022021290,2023-04-11,Boston Bruins,6,Washington Capitals,15
5,2022021286,2023-04-11,Pittsburgh Penguins,5,Chicago Blackhawks,16
6,2022021291,2023-04-11,Minnesota Wild,30,Winnipeg Jets,52
7,2022021292,2023-04-11,Colorado Avalanche,21,Edmonton Oilers,22
8,2022021293,2023-04-11,Vegas Golden Knights,54,Seattle Kraken,55
9,2022021294,2023-04-11,Anaheim Ducks,24,Vancouver Canucks,23


In [8]:
# Test forward and backward looking capability and variable date passing
%time
yesterday = '2023-04-10'
get_today_sched(yesterday) # works for backwards looking dates 

CPU times: user 2 µs, sys: 16 µs, total: 18 µs
Wall time: 20 µs


Unnamed: 0,game_id,game_date,home_team,home_team_id,away_team,away_team_id
0,2022021278,2023-04-10,Winnipeg Jets,52,San Jose Sharks,28
1,2022021275,2023-04-10,New York Rangers,3,Buffalo Sabres,7
2,2022021276,2023-04-10,Ottawa Senators,9,Carolina Hurricanes,12
3,2022021277,2023-04-10,Washington Capitals,15,New York Islanders,2
4,2022021279,2023-04-10,Florida Panthers,13,Toronto Maple Leafs,10
5,2022021280,2023-04-10,Detroit Red Wings,17,Dallas Stars,25
6,2022021281,2023-04-10,Chicago Blackhawks,16,Minnesota Wild,30
7,2022021282,2023-04-10,Calgary Flames,20,Nashville Predators,18
8,2022021283,2023-04-10,Arizona Coyotes,53,Seattle Kraken,55
9,2022021284,2023-04-10,Los Angeles Kings,26,Vancouver Canucks,23


In [9]:
# Test forward and backward looking capability
%time
tomorrow = '2023-04-12'
get_today_sched(tomorrow) #  works for forwards looking dates 

CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 3.81 µs


Unnamed: 0,game_id,game_date,home_team,home_team_id,away_team,away_team_id
0,2022021295,2023-04-12,New York Islanders,2,Montréal Canadiens,8
1,2022021296,2023-04-12,St. Louis Blues,19,Dallas Stars,25
2,2022021297,2023-04-12,Calgary Flames,20,San Jose Sharks,28


In [15]:
# Past results scraper
%time

def get_yest_schedule(date):
    '''
    This function gets the NHL schedule from the NHL api and
    returns a dictionary
    Inputs:
    start_date - string of the first date to pass to the api url
    end_date - string of the end date for the api url
    Outputs:
    schedule_dict - dictionary created from api JSON
    '''

    api_url = ('https://statsapi.web.nhl.com/api/v1/schedule?'
               'date={}').format(date)
    logging.info(api_url)

    req = requests.get(api_url)
    schedule_dict = req.json()

    return schedule_dict

def get_game_ids(schedule_dict):
    '''
    This function flatten out the API json into a flat table scructure
    with the relevant stats for the SQL table
    Inputs:
    schedule_dict - dicitonary of the API GET request
    Outputs
    sched_df - pandas dataframe to be inserted into schedule table
    '''

    game_ids = []
    for item in schedule_dict['dates']:
        games = item['games']

        for game in games:
            game_ids.append(game['gamePk'])

    return game_ids

def create_sched_df(pbp_dict, date):
    '''
    this function takes a pbp JSON object and converts it into a list of values
    that will be compiled into a dataframe to be inserted into SQL table
    Inputs:
    game_dict - pbp JSON
    Outputs:
    outcome - list of results of game that will become row in data frame
    '''

    outcome = []
    linescore = pbp_dict['liveData']['linescore']

    outcome.append(pbp_dict['gamePk'])
    outcome.append(pbp_dict['gameData']['game']['type'])
    outcome.append(pbp_dict['gameData']['game']['season'])
    outcome.append(date)
    outcome.append(pbp_dict['liveData']['linescore']['teams']['home']['team']['id'])
    outcome.append(pbp_dict['liveData']['linescore']['teams']['home']['team']['name'])
    outcome.append(pbp_dict['liveData']['linescore']['teams']['home']['team']['abbreviation'])
    outcome.append(pbp_dict['liveData']['linescore']['teams']['home']['goals'])
    outcome.append(pbp_dict['liveData']['linescore']['teams']['away']['team']['id'])
    outcome.append(pbp_dict['liveData']['linescore']['teams']['away']['team']['name'])
    outcome.append(pbp_dict['liveData']['linescore']['teams']['away']['team']['abbreviation'])
    outcome.append(pbp_dict['liveData']['linescore']['teams']['away']['goals'])
    if pbp_dict['liveData']['linescore']['currentPeriod'] == 4:
        outcome.append(1)
    else:
        outcome.append(0)

    if pbp_dict['liveData']['linescore']['currentPeriod'] == 5:
        outcome.append(1)
    else:
        outcome.append(0)

    if pbp_dict['liveData']['linescore']['currentPeriod'] == 4:
        try:
            game_end_time = pbp_dict['liveData']['plays']['currentPlay']['about']['periodTime'].split(':')
            seconds = int(game_end_time[0]) * 60 + int(game_end_time[1])
            outcome.append(seconds)
        except KeyError:
            logging.exception('Error in NHL pbp')
            outcome.append(0)

    elif pbp_dict['liveData']['linescore']['currentPeriod'] == 5:
        outcome.append(300)

    else:
        outcome.append(0)

    if outcome[7] > outcome[11]:
        outcome.append(1)
    else:
        outcome.append(0)

    print(outcome)
    return outcome

def get_pbp(game_id):
    '''
    This function gets the NHL schedule from the NHL api and
    returns a dictionary
    Inputs:
    start_date - string of the first date to pass to the api url
    end_date - string of the end date for the api urwl
    Outputs:
    schedule_dict - dictionary created from api JSON
    '''

    api_url = f'http://statsapi.web.nhl.com/api/v1/game/{game_id}/feed/live'

    logging.info(api_url)
    req = requests.get(api_url)
    schedule_dict = req.json()

    return schedule_dict

def main():
    '''
    This script pulls the schedule data of past games and the results
    of each game and inserts them into an Postgres table
    '''
    date = (datetime.now() - timedelta(1)).strftime('%Y-%m-%d')

    # added
    start_time = time.time()

    logging.basicConfig(
                        filename='results.log',
                        format="%(asctime)s:%(levelname)s:%(message)s",
                        level=logging.INFO)
    rows = []
    schedule_dict = get_yest_schedule(date)
    games = get_game_ids(schedule_dict)

    if schedule_dict['totalItems'] == 0:
        logging.info("No Games Today")
        return
    else:
        for game in games:
            try:
                pbp_dict = get_pbp(game)
                rows.append(create_sched_df(pbp_dict, date))
            except:
                logging.exception('Exception')
                continue

    sched_df_columns = ['game_id', 'game_type', 'season', 'game_date',
                        'home_team_id', 'home_team', 'home_abbrev', 'home_score',
                        'away_team_id', 'away_team', 'away_abbrev', 'away_score',
                        'ot_flag', 'shootout_flag', 'seconds_in_ot',
                        'home_win']

    yest_df = pd.DataFrame(rows, columns=sched_df_columns)

    elapsed_time_secs = time.time() - start_time
    msg = "Execution took: %s secs (Wall clock time)" % timedelta(seconds=round(elapsed_time_secs))
    print(msg) 

    logging.info(yest_df)

    yest_df.to_csv('yest_games.csv', index=False)

    return yest_df

# if __name__ == '__main__':
#     main()

CPU times: user 1 µs, sys: 0 ns, total: 1 µs
Wall time: 3.1 µs


In [16]:
# test yesterday results scraper
main()

2023-04-11 21:36:55,542:INFO:https://statsapi.web.nhl.com/api/v1/schedule?date=2023-04-10
2023-04-11 21:36:55,683:INFO:http://statsapi.web.nhl.com/api/v1/game/2022021278/feed/live
2023-04-11 21:36:55,757:INFO:http://statsapi.web.nhl.com/api/v1/game/2022021275/feed/live
2023-04-11 21:36:55,833:INFO:http://statsapi.web.nhl.com/api/v1/game/2022021276/feed/live
2023-04-11 21:36:55,913:INFO:http://statsapi.web.nhl.com/api/v1/game/2022021277/feed/live


[2022021278, 'R', '20222023', '2023-04-10', 52, 'Winnipeg Jets', 'WPG', 6, 28, 'San Jose Sharks', 'SJS', 2, 0, 0, 0, 1]
[2022021275, 'R', '20222023', '2023-04-10', 3, 'New York Rangers', 'NYR', 2, 7, 'Buffalo Sabres', 'BUF', 3, 0, 1, 300, 0]
[2022021276, 'R', '20222023', '2023-04-10', 9, 'Ottawa Senators', 'OTT', 3, 12, 'Carolina Hurricanes', 'CAR', 2, 0, 0, 0, 1]


2023-04-11 21:36:55,989:INFO:http://statsapi.web.nhl.com/api/v1/game/2022021279/feed/live
2023-04-11 21:36:56,079:INFO:http://statsapi.web.nhl.com/api/v1/game/2022021280/feed/live
2023-04-11 21:36:56,148:INFO:http://statsapi.web.nhl.com/api/v1/game/2022021281/feed/live


[2022021277, 'R', '20222023', '2023-04-10', 15, 'Washington Capitals', 'WSH', 5, 2, 'New York Islanders', 'NYI', 2, 0, 0, 0, 1]
[2022021279, 'R', '20222023', '2023-04-10', 13, 'Florida Panthers', 'FLA', 1, 10, 'Toronto Maple Leafs', 'TOR', 2, 1, 0, 278, 0]
[2022021280, 'R', '20222023', '2023-04-10', 17, 'Detroit Red Wings', 'DET', 1, 25, 'Dallas Stars', 'DAL', 6, 0, 0, 0, 0]


2023-04-11 21:36:56,226:INFO:http://statsapi.web.nhl.com/api/v1/game/2022021282/feed/live
2023-04-11 21:36:56,330:INFO:http://statsapi.web.nhl.com/api/v1/game/2022021283/feed/live
2023-04-11 21:36:56,398:INFO:http://statsapi.web.nhl.com/api/v1/game/2022021284/feed/live


[2022021281, 'R', '20222023', '2023-04-10', 16, 'Chicago Blackhawks', 'CHI', 2, 30, 'Minnesota Wild', 'MIN', 4, 0, 0, 0, 0]
[2022021282, 'R', '20222023', '2023-04-10', 20, 'Calgary Flames', 'CGY', 2, 18, 'Nashville Predators', 'NSH', 3, 0, 1, 300, 0]
[2022021283, 'R', '20222023', '2023-04-10', 53, 'Arizona Coyotes', 'ARI', 1, 55, 'Seattle Kraken', 'SEA', 4, 0, 0, 0, 0]


2023-04-11 21:36:56,487:INFO:      game_id game_type    season   game_date  home_team_id  \
0  2022021278         R  20222023  2023-04-10            52   
1  2022021275         R  20222023  2023-04-10             3   
2  2022021276         R  20222023  2023-04-10             9   
3  2022021277         R  20222023  2023-04-10            15   
4  2022021279         R  20222023  2023-04-10            13   
5  2022021280         R  20222023  2023-04-10            17   
6  2022021281         R  20222023  2023-04-10            16   
7  2022021282         R  20222023  2023-04-10            20   
8  2022021283         R  20222023  2023-04-10            53   
9  2022021284         R  20222023  2023-04-10            26   

             home_team home_abbrev  home_score  away_team_id  \
0        Winnipeg Jets         WPG           6            28   
1     New York Rangers         NYR           2             7   
2      Ottawa Senators         OTT           3            12   
3  Washington Capital

[2022021284, 'R', '20222023', '2023-04-10', 26, 'Los Angeles Kings', 'LAK', 3, 23, 'Vancouver Canucks', 'VAN', 0, 0, 0, 0, 1]
Execution took: 0:00:01 secs (Wall clock time)


Unnamed: 0,game_id,game_type,season,game_date,home_team_id,home_team,home_abbrev,home_score,away_team_id,away_team,away_abbrev,away_score,ot_flag,shootout_flag,seconds_in_ot,home_win
0,2022021278,R,20222023,2023-04-10,52,Winnipeg Jets,WPG,6,28,San Jose Sharks,SJS,2,0,0,0,1
1,2022021275,R,20222023,2023-04-10,3,New York Rangers,NYR,2,7,Buffalo Sabres,BUF,3,0,1,300,0
2,2022021276,R,20222023,2023-04-10,9,Ottawa Senators,OTT,3,12,Carolina Hurricanes,CAR,2,0,0,0,1
3,2022021277,R,20222023,2023-04-10,15,Washington Capitals,WSH,5,2,New York Islanders,NYI,2,0,0,0,1
4,2022021279,R,20222023,2023-04-10,13,Florida Panthers,FLA,1,10,Toronto Maple Leafs,TOR,2,1,0,278,0
5,2022021280,R,20222023,2023-04-10,17,Detroit Red Wings,DET,1,25,Dallas Stars,DAL,6,0,0,0,0
6,2022021281,R,20222023,2023-04-10,16,Chicago Blackhawks,CHI,2,30,Minnesota Wild,MIN,4,0,0,0,0
7,2022021282,R,20222023,2023-04-10,20,Calgary Flames,CGY,2,18,Nashville Predators,NSH,3,0,1,300,0
8,2022021283,R,20222023,2023-04-10,53,Arizona Coyotes,ARI,1,55,Seattle Kraken,SEA,4,0,0,0,0
9,2022021284,R,20222023,2023-04-10,26,Los Angeles Kings,LAK,3,23,Vancouver Canucks,VAN,0,0,0,0,1


## Drew Hyne's NHL API Documentation

Positions list
GET https://statsapi.web.nhl.com/api/v1/positions
- abbrev: "G",
- code: "G",
- fullName: "Goalie",
- type: "Goalie"

People

player bio
- GET https://statsapi.web.nhl.com/api/v1/people/ID 
- Gets details for a player, must specify the id value in order to return data.


Player stats
- GET https://statsapi.web.nhl.com/api/v1/people/ID/stats 
- Complex endpoint with lots of append options to change what kind of stats you wish to obtain
- ex: https://statsapi.web.nhl.com/api/v1/people/8471214/stats?stats=gameLog

Modifiers
- suffix ex: ?stats=statsSingleSeason&season=19801981 
- Obtains single season statistics for a player note - stats have changed over the years, the below sample is for Wayne Gretzky and does not include things like evenTimeOnIce and other time related stats


In [3]:
GET https://statsapi.web.nhl.com/api/v1/people/ID/stats

SyntaxError: invalid syntax (3025771123.py, line 1)