# Overview

* We seek to model the outcome of a given playoff bracket based on historical performance data.

* We do this via a tried-and-true method: Monte Carlo simulation.

* We must first gather the data for this analysis; to do this, we will access a hockey statistics website.

* We require the historical game data for the year we'd like to model, as well as for some previous years.



## Legal Disclaimer
The analysis presented here is strictly for teaching purposes.

The ToS of the website accessed is here:
* https://www.sports-reference.com/termsofuse.html

A statement about data requests here:
* https://www.sports-reference.com/data_use.html

And the associated robots.txt is here:
* https://www.sports-reference.com/robots.txt

As well as a guide to understanding these:
* https://www.promptcloud.com/blog/how-to-read-and-respect-robots-file/
    

## Imports

In [None]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import os
import json
import regex

from IPython.display import HTML

import time

# Helpers to fetch historical game data and historical bracket data.

In [None]:
def get_game_data(year):
    """Download one season's game listing and return its tables as DataFrames.

    Requests ``https://www.hockey-reference.com/leagues/NHL_{year}_games.html``
    and converts every captioned ``<table>`` on the page into a DataFrame.

    Args:
        year: Season identifier interpolated into the URL.

    Returns:
        dict mapping each table's caption text to a ``pandas.DataFrame``.
        Columns are named after the cells' ``data-stat`` attribute and hold
        the raw cell text (strings). The first ``<tr>`` of every table is
        skipped.

    Raises:
        requests.HTTPError: If the server responds with an error status.
        requests.Timeout: If the server does not answer within 30 seconds.
    """
    response = requests.get(
        f'https://www.hockey-reference.com/leagues/NHL_{year}_games.html',
        timeout=30,
    )
    # Fail loudly instead of parsing an error page into empty tables.
    response.raise_for_status()

    # Name the parser explicitly so the result does not depend on which
    # optional parser libraries happen to be installed.
    soup = BeautifulSoup(response.content, 'html.parser')

    dataframes = {}

    for table in soup.find_all('table'):
        caption = table.find('caption')
        if caption is None:
            # Without a caption there is no key to store the table under.
            continue

        data = {}
        for row in table.find_all('tr')[1:]:
            # Data cells first, then header cells, as one flat list per row.
            cells = row.find_all('td') + row.find_all('th')
            for cell in cells:
                data.setdefault(cell.get('data-stat'), []).append(cell.text)

        dataframes[caption.text] = pd.DataFrame(data=data)

    return dataframes

In [None]:
def get_bracket_data(year):
    """Scrape one season's playoff bracket from hockey-reference.com.

    Requests ``https://www.hockey-reference.com/leagues/NHL_{year}.html``,
    builds the first-round ordering from the two conference standings tables
    and reads the later rounds from the playoff results table.

    When the seeding cannot be decided from points alone (all wild cards share
    the same points, or both division leaders do), the two candidate seedings
    are printed, a Wikipedia link is displayed, and the user is asked via
    ``input()`` to choose one. The function is therefore interactive in that
    case.

    Args:
        year: Season identifier interpolated into the URLs.

    Returns:
        dict with the keys ``'first_round'``, ``'second_round'``,
        ``'conference_finals'``, ``'final'`` and ``'winner'``, each a list of
        team names. Within ``'first_round'`` every division contributes, in
        order: its leader, the wild card assigned to it, then its second and
        third team. Each later round is re-listed in the order its teams
        appear in the preceding round.

    Raises:
        requests.HTTPError: If the server responds with an error status.
        requests.Timeout: If the server does not answer within 30 seconds.
        ValueError: If the seeding prompt is answered with anything other
            than ``'1'`` or ``'2'``.
    """
    playoff_pattern = '(?P<win>.*) over (?P<lose>.*)'
    pp = regex.compile(playoff_pattern)

    contents = requests.get(
        f'https://www.hockey-reference.com/leagues/NHL_{year}.html',
        timeout=30,
    )
    # Fail loudly instead of parsing an error page.
    contents.raise_for_status()

    # Name the parser explicitly so the result does not depend on which
    # optional parser libraries happen to be installed.
    soup = BeautifulSoup(contents.content, 'html.parser')

    divisions = {"Atlantic Division": [],
                 "Metropolitan Division": [],
                 "Central Division": [],
                 "Pacific Division": []}

    standings = [soup.find('table', id='standings_EAS'),
                 soup.find('table', id='standings_WES')]

    # Name of the division whose header row was seen most recently.
    division = None

    for table in standings:
        rows = table.find_all('tr')
        for row in rows[1:]:
            row_classes = row.get('class') or []
            if not row_classes:
                # Rows without a class are neither headers nor team rows.
                continue

            if row_classes[0] == 'thead':
                division = row.get_text().strip()
            elif row_classes[0] == 'full_table':
                team_name = [x.text for x in row.find_all('th')
                             if x.get('data-stat') == 'team_name'][0]
                pts = [x.text for x in row.find_all('td')
                       if x.get('data-stat') == 'points'][0]
                # Only starred teams are kept (presumably the playoff
                # qualifiers -- confirm against the site's legend).
                if '*' in team_name:
                    divisions[division].append((int(pts), team_name.replace('*', '')))

    conferences = [('Atlantic Division', 'Metropolitan Division'),
                   ('Central Division', 'Pacific Division')]

    results = {'first_round': [],
               'second_round': [],
               'conference_finals': [],
               'final': [],
               'winner': []}

    for conference in conferences:
        # First three kept teams of each division; the rest are wild cards.
        first_div = divisions[conference[0]][0:3]
        second_div = divisions[conference[1]][0:3]
        wild_cards = divisions[conference[0]][3:] + divisions[conference[1]][3:]

        wild_cards_tied = len({x[0] for x in wild_cards}) == 1
        if wild_cards_tied or first_div[0][0] == second_div[0][0]:
            # Points alone cannot settle the seeding: ask the user.
            # ``display`` is only a builtin inside IPython sessions, so import
            # it explicitly to keep this branch usable elsewhere.
            from IPython.display import display

            print('SEEDING TIE!\n')
            print(f'1    {first_div[0][1]} vs {wild_cards[0][1]} AND {second_div[0][1]} vs {wild_cards[1][1]}')
            print('OR')
            print(f'2    {first_div[0][1]} vs {wild_cards[1][1]} AND {second_div[0][1]} vs {wild_cards[0][1]}')

            display(HTML(f'<a href="https://en.wikipedia.org/wiki/{year}_Stanley_Cup_playoffs#Playoff_bracket">Wikipedia Bracket</a>'))

            inp = input('Please verify the correct seed (1 or 2):')

            if inp == '1':
                first_div.insert(1, wild_cards[0])
                second_div.insert(1, wild_cards[1])
            elif inp == '2':
                first_div.insert(1, wild_cards[1])
                second_div.insert(1, wild_cards[0])
            else:
                raise ValueError('Incorrect Input Selected!')

        else:
            # Highest-points wild card first, lowest last.
            ordered = sorted(wild_cards, reverse=True)

            # The division leader with more points is paired with the
            # lowest-ranked wild card.
            if first_div[0][0] > second_div[0][0]:
                first_div.insert(1, ordered[-1])
                second_div.insert(1, ordered[0])
            else:
                second_div.insert(1, ordered[-1])
                first_div.insert(1, ordered[0])

        results['first_round'] = results['first_round'] + first_div + second_div

    # Drop the points, keep only the team names.
    results['first_round'] = [x[1] for x in results['first_round']]

    # Get remaining rounds and reorder accordingly

    poff_table = soup.find('table', id='all_playoffs')

    poff_rows = poff_table.find_all('tr')

    # Round that the rows currently being read belong to. It starts as None
    # so that rows seen before any round label are ignored, and it is kept
    # separate from ``division`` so no stale division name can be used as a
    # key into ``results``.
    round_key = None

    for row in poff_rows:
        cells = [c.text for c in row.find_all('td')]

        # First-round results are not collected here; that round is built
        # from the standings above.
        if 'First Round' in cells:
            round_key = None
        if 'Second Round' in cells:
            round_key = 'second_round'
        elif 'Conference Finals' in cells:
            round_key = 'conference_finals'
        elif 'Final' in cells:
            round_key = 'final'

        if round_key:
            for cell in cells:
                m = pp.match(cell)
                if m:
                    d = m.capturesdict()
                    winner = d['win'][0]
                    loser = d['lose'][0]
                    results[round_key].append(winner)
                    results[round_key].append(loser)

                    if round_key == 'final':
                        results['winner'].append(winner)

    # Reorder for compatibility: list each later round in the order its teams
    # appear in the (already reordered) previous round.

    keys = ['first_round', 'second_round', 'conference_finals', 'final']

    for i in range(1, len(keys)):
        results[keys[i]] = [team for team in results[keys[i - 1]]
                            if team in results[keys[i]]]

    return results

In [None]:
def save_data(year):
    """Fetch one season's data and write it under ``data/<year>/``.

    Calls ``get_game_data`` and ``get_bracket_data`` for ``year`` and writes
    three files into ``data/<year>/``:

    * ``bracket.json`` -- the bracket dictionary, JSON-encoded
    * ``regular.csv``  -- the table captioned ``'Regular Season Table'``
    * ``playoffs.csv`` -- the table captioned ``'Playoffs Table'``

    Args:
        year: Season identifier; also used as the output directory name.

    Returns:
        Always ``0``.

    Raises:
        KeyError: If either expected table caption is missing from the
            scraped game data.
    """
    games = get_game_data(year)
    bracket = get_bracket_data(year)

    directory = os.path.join('data', str(year))

    # makedirs also creates the parent 'data' directory on the first run,
    # where a plain mkdir would raise FileNotFoundError.
    os.makedirs(directory, exist_ok=True)

    with open(os.path.join(directory, 'bracket.json'), 'w') as f:
        json.dump(bracket, f)

    games['Regular Season Table'].to_csv(os.path.join(directory, 'regular.csv'), index=False)
    games['Playoffs Table'].to_csv(os.path.join(directory, 'playoffs.csv'), index=False)

    return 0

# Aggregate datasets

In [None]:
# Pause, in seconds, taken after each season has been downloaded.
TIMEOUT = 10

# Download and store seasons 2014 through 2019 (inclusive), one at a time.
for year in range(2014, 2019 + 1):
    print(f'Fetching {year}')
    save_data(year)
    # Wait before the next request so the remote servers are not overloaded.
    time.sleep(TIMEOUT)