In [1]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import urllib.request
import urllib.error
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
import re
%matplotlib inline

In [2]:
#Dictionary of team names and abbreviations. VEG is used instead of VGK.
teamAbvs = {'Anaheim Ducks':'ANA', 'Arizona Coyotes':'ARI', 'Boston Bruins': 'BOS', 'Buffalo Sabres':'BUF', 'Carolina Hurricanes':'CAR',
            'Calgary Flames':'CGY', 'Chicago Blackhawks':'CHI', 'Columbus Blue Jackets':'CBJ', 'Colorado Avalanche':'COL', 'Dallas Stars':'DAL',
           'Detroit Red Wings':'DET', 'Edmonton Oilers':'EDM', 'Florida Panthers':'FLA', 'Los Angeles Kings':'LAK',
           'Minnesota Wild':'MIN', 'Montreal Canadiens':'MTL', 'Nashville Predators':'NSH', 'New Jersey Devils':'NJD',
           'New York Islanders':'NYI', 'New York Rangers':'NYR', 'Ottawa Senators':'OTT', 'Phoenix Coyotes':'PHX', 'Philadelphia Flyers':'PHI',
           'Pittsburgh Penguins':'PIT', 'San Jose Sharks':'SJS', 'St. Louis Blues':'STL', 'Tampa Bay Lightning':'TBL',
           'Toronto Maple Leafs':'TOR','Vancouver Canucks':'VAN', 'Vegas Golden Knights':'VEG', 'Washington Capitals':'WSH', 'Winnipeg Jets':'WPG',
           }

In [16]:
def startSeason(season_url):
    season_year = season_url.split('_')[1]
    
    #Retrieving the initial url and HTML.
    season_page = urlopen(season_url)
    season_soup = BeautifulSoup(season_page)
    
    #Building the schedule table.
    season_games = season_soup.find(id='games')
    Date =[]
    Vis=[]
    VG=[]
    Home=[]
    HG=[]
    OT=[]
    Att=[]
    LOG=[]
    Notes=[]

    #Create a table with a list for each column of data.
    headers = [Date, Vis, VG, Home, HG, OT, Att, LOG, Notes]
    
    #Iterate through each row and column, assigning the corresponding data to the location in 'headers'.
    for row in season_games.findAll('tr'):
        cells = row.findAll(['td','th'])
        index = np.arange(0,len(cells)).tolist()
        for num in index:
            headers[num].append(cells[num].find(text=True))
            
    #Create the games Dataframe, set the column names to names instead of index values.
    season_gamesDf = pd.DataFrame(headers).transpose()
    season_gamesDf.columns = season_gamesDf.iloc[0]
    season_gamesDf = season_gamesDf.reindex(season_gamesDf.index.drop(0))
    
    #Create game HTML string for every game.
    gameHTML = []
    for entry in season_gamesDf.index:
        abv = teamAbvs[season_gamesDf['Home'][entry]]
        daystr = ''.join(season_gamesDf['Date'][entry].split('-'))
        gamestr = (daystr+'0'+abv)
        gamepage = ('https://www.hockey-reference.com/boxscores/'+gamestr+'.html')
        gameHTML.append(gamepage)
    #Add the new gameHTML column to gamesDf.
    season_gamesDf['Game HTML'] = gameHTML
    number_of_games = len(gameHTML)

    #Create the penalty columns for each game and the columns for running totals of each penalty.
    initialCalls = np.zeros(number_of_games)
    initialCalls.tolist()
    season_gamesDf['Slashing Calls in Game'] = initialCalls 
    season_gamesDf['Total Slashing Calls'] = initialCalls  

    season_gamesDf['roughing Calls in Game'] = initialCalls 
    season_gamesDf['Total roughing Calls'] = initialCalls 

    season_gamesDf['hooking Calls in Game'] = initialCalls  
    season_gamesDf['Total hooking Calls'] = initialCalls  

    season_gamesDf['interfering Calls in Game'] = initialCalls  
    season_gamesDf['Total interfering Calls'] = initialCalls  

    season_gamesDf['tripping Calls in Game'] = initialCalls  
    season_gamesDf['Total tripping Calls'] = initialCalls  

    season_gamesDf['highsticking Calls in Game'] = initialCalls  
    season_gamesDf['Total highsticking Calls'] = initialCalls 

    season_gamesDf['fighting Calls in Game'] = initialCalls 
    season_gamesDf['Total fighting Calls'] = initialCalls
    
    season_gamesDf['Updated'] = initialCalls
    
    #Get all the column names for the overall penalty table.
    penalty_columns = season_gamesDf.columns.tolist()

    #Output the dataframe object with all of the retrieved data.
    season_gamesDf.to_csv('NHL_penalties_'+season_year, index=False,  header=penalty_columns)
    
    #Creating an empty dataframe of penalties for each team in the season.
    teams = sorted(list(teamAbvs.values()))
    penalties = ['slashing', 'hooking', 'tripping', 'highsticking', 'interference', 'roughing', 'fighting', 'total']
    games = np.arange(1,number_of_games+1,1)
    team_penalties = pd.DataFrame(np.zeros((number_of_games,256), dtype='int32'),
                                  index=games,
                                  columns=pd.MultiIndex.from_product([teams, penalties], names=['Team', 'Penalty'])
                                 )
    
    team_penalties.to_csv('team_penalties_'+season_year, index=False)

In [17]:
#Proof of concept cell
startSeason('https://www.hockey-reference.com/leagues/NHL_2018_games.html')

In [22]:
def updateSeason(season_url):
    #Need to check that the season has already been 'started' by startSeason() and the dfs exist.
    import os.path
    season_year = season_url.split('_')[1]
    
    season_df_name = 'NHL_penalties_'+season_year
    team_season_df = 'team_penalties_'+season_year
    
    if os.path.isfile(season_df_name) == False:
        return 'Season data does not exist, has the season been initialized with startSeason?'
    else:
        local_season_df = pd.read_csv(season_df_name)
    if os.path.isfile(team_season_df) == False:
        return 'Teams data does not exist, has the season been initialized?'
    else:
        local_team_season_df = pd.read_csv(team_season_df, header=[0,1])
    
    #Find the last row with valid data and pick the next one as the next row to be entered.
    next_entry = (local_season_df['Updated'].values != 1).argmax()
    columns = local_season_df.columns.tolist()
    
    #Need to rebuild the existing online table to compare to local data.
    #Retrieving the initial url and HTML.
    season_page = urlopen(season_url)
    season_soup = BeautifulSoup(season_page)
    
    #Building the schedule table
    season_games = season_soup.find(id='games')
    Date =[]
    Vis=[]
    VG=[]
    Home=[]
    HG=[]
    OT=[]
    Att=[]
    LOG=[]
    Notes=[]

    #Create a table with a list for each column of data.
    headers = [Date, Vis, VG, Home, HG, OT, Att, LOG, Notes]
    
    #Iterate through each row and column, assigning the corresponding data to the location in 'headers'.
    for row in season_games.findAll('tr'):
        cells = row.findAll(['td','th'])
        index = np.arange(0,len(cells)).tolist()
        for num in index:
            headers[num].append(cells[num].find(text=True))
            
    #Create the games Dataframe, set the column names to names instead of index values.
    season_gamesDf = pd.DataFrame(headers).transpose()
    season_gamesDf.columns = season_gamesDf.iloc[0]
    season_gamesDf = season_gamesDf.reindex(season_gamesDf.index.drop(0))
    
    #Find the most recent game updated. Defines the span of games which need to be updated.
    latest_entry = season_gamesDf['LOG'].last_valid_index()
    
    #Make the list of teams and penalties to be compared against while updating.
    teams = sorted(list(teamAbvs.values()))
    penalties = ['slashing', 'hooking', 'tripping', 'highsticking', 'interference', 'roughing', 'fighting']

    #Begin looping through the games.
    while next_entry < latest_entry:
        gamepage = local_season_df['Game HTML'][next_entry]
        openedgame = urlopen(gamepage)
        gamesoup = BeautifulSoup(openedgame)    
        penaltytable = gamesoup.find(id='penalty')
        
        #Initialize an empty penalty dictionary for each game.
        penalty_dict = {
            'slashing':0,
            'hooking':0,
            'tripping':0,
            'highsticking':0,
            'interference':0,
            'roughing':0,
            'fighting':0
        }
        
        #Going through the penalty entries to find calls.
        for row in penaltytable.findAll('tr'):
            rowtext = row.get_text().lower().split()
            #Record the call made on each line of the penalty table and update the value in the dictionary
            call = [word for word in rowtext if word in penalties]
            if len(call) > 0:
                penalty_dict[call[0]] += 1
                #Record the team who it was called on
                name = [word.upper() for word in rowtext if word.upper() in teams]
                local_team_season_df[name[0]].iloc[next_entry][call[0]] += 1 
            
        #Adding relevant entries to each list.
        #Using next_entry values to avoid indexing issues.
        #Updating in game totals.
        local_season_df['Slashing Calls in Game'][next_entry] = penalty_dict['slashing']
        local_season_df['roughing Calls in Game'][next_entry] = penalty_dict['roughing']
        local_season_df['hooking Calls in Game'][next_entry] = penalty_dict['hooking']
        local_season_df['interfering Calls in Game'][next_entry] = penalty_dict['interference']
        local_season_df['tripping Calls in Game'][next_entry] = penalty_dict['tripping']
        local_season_df['highsticking Calls in Game'][next_entry] = penalty_dict['highsticking']
        local_season_df['fighting Calls in Game'][next_entry] = penalty_dict['fighting']
        
        #Updating full season totals.
        if next_entry == 1:
            local_season_df['Total Slashing Calls'].iloc[next_entry] = penalty_dict['slashing']
            local_season_df['Total roughing Calls'].iloc[next_entry] = penalty_dict['roughing']
            local_season_df['Total hooking Calls'].iloc[next_entry] = penalty_dict['hooking']
            local_season_df['Total interfering Calls'].iloc[next_entry] = penalty_dict['interference']
            local_season_df['Total tripping Calls'].iloc[next_entry] = penalty_dict['tripping']
            local_season_df['Total highsticking Calls'].iloc[next_entry] = penalty_dict['highsticking']
            local_season_df['Total fighting Calls'].iloc[next_entry] = penalty_dict['fighting']
        else:
            local_season_df['Total Slashing Calls'].iloc[next_entry] += penalty_dict['slashing']
            local_season_df['Total roughing Calls'].iloc[next_entry] += penalty_dict['roughing']
            local_season_df['Total hooking Calls'].iloc[next_entry] += penalty_dict['hooking']
            local_season_df['Total interfering Calls'].iloc[next_entry] += penalty_dict['interference']
            local_season_df['Total tripping Calls'].iloc[next_entry] += penalty_dict['tripping']
            local_season_df['Total highsticking Calls'].iloc[next_entry] += penalty_dict['highsticking']
            local_season_df['Total fighting Calls'].iloc[next_entry] += penalty_dict['fighting']
        
        #Marking the game as 'updated'.
        local_season_df['Updated'].iloc[next_entry] = 1.0
        
        #Increment to the next entry
        next_entry += 1
    
    #Save the updated data and overwrite the previous file.
    local_season_df.to_csv('NHL_penalties_'+season_year, index = False,  header=columns)
    local_team_season_df.to_csv('team_penalties_'+season_year, index=False)

In [23]:
updateSeason('https://www.hockey-reference.com/leagues/NHL_2018_games.html')

['1st', 'period']
['12:12', 'edm', 'kris', 'russell:', 'holding', 'the', 'stick', '—', '2', 'min']
['15:43', 'cgy', 'tanner', 'glass:', 'fighting', '—', '5', 'min']
['fighting']
CGY
['15:43', 'edm', 'zack', 'kassian:', 'fighting', '—', '5', 'min']
['fighting']
EDM
['17:16', 'cgy', 'matthew', 'tkachuk:', 'holding', 'the', 'stick', '—', '2', 'min']
['2nd', 'period']
['06:23', 'cgy', 'hooking', '—', '2', 'min']
['hooking']
CGY


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a

['1st', 'period']
['00:50', 'pit', 'sidney', 'crosby:', 'hooking', '—', '2', 'min']
['hooking']
PIT
['16:26', 'pit', 'olli', 'maatta:', 'tripping', '—', '2', 'min']
['tripping']
PIT
['2nd', 'period']
['06:15', 'pit', 'sidney', 'crosby:', 'tripping', '—', '2', 'min']
['tripping']
PIT
['12:52', 'stl', 'robert', 'bortuzzo:', 'holding', '—', '2', 'min']
['14:03', 'pit', 'jake', 'guentzel:', 'slashing', '—', '2', 'min']
['slashing']
PIT
['14:55', 'stl', 'too', 'many', 'men', 'on', 'ice', '—', '2', 'min']
['3rd', 'period']
['05:26', 'stl', 'alex', 'pietrangelo:', 'slashing', '—', '2', 'min']
['slashing']
STL
['12:10', 'stl', 'robert', 'bortuzzo:', 'tripping', '—', '2', 'min']
['tripping']
STL
['12:26', 'stl', 'brayden', 'schenn:', 'closing', 'hand', 'on', 'puck', '—', '2', 'min']
['1st', 'period']
['10:21', 'sjs', 'kevin', 'labanc:', 'goaltender', 'interference', '—', '2', 'min']
['interference']
SJS
['18:09', 'phi', 'brandon', 'manning:', 'hooking', '—', '2', 'min']
['hooking']
PHI
['2nd', 

In [None]:
##Want to update with: handling of cancelled games