# Bonus Analysis

1) We seek to find the most common bracket, as opposed to the most common winner.

2) We seek to generate a list of top contenders from which to make our hockey pool picks!

In [None]:
import numpy as np
import pandas as pd
import json
from scipy import stats

from bracket_utils import simulate,trial
from generate_probabilities import gen_prob

In [None]:
def define_seasons(years, playoff_years):
    """Load the per-season game data used to build the model.

    Reads ``data/<year>/regular.csv`` for every entry of ``years`` and
    ``data/<year>/playoffs.csv`` for every entry of ``playoff_years``.

    Returns:
        A list of DataFrames: the regular seasons first (in the order given),
        followed by the playoff seasons (in the order given).
    """
    regular = [pd.read_csv(f'data/{yr}/regular.csv') for yr in years]
    playoffs = [pd.read_csv(f'data/{yr}/playoffs.csv') for yr in playoff_years]
    return regular + playoffs

In [None]:
# Season whose playoffs are being predicted.
prediction_year = 2019

# Only include 4 years of regular-season data: the prediction year itself
# and the three seasons before it.
reg_included = list(range(prediction_year-3,prediction_year+1))

# Playoff seasons to include; currently none (e.g. [2014,2015,2016,2017,2018]).
poffs_included = []

# Guard against feeding the outcome being predicted back in as an input.
assert prediction_year not in poffs_included, "Included playoff data from prediction year!"

seasons = define_seasons(reg_included,poffs_included)

# JSON is UTF-8 by specification; pass the encoding explicitly so the read
# does not depend on the platform's default locale encoding.
with open(f'data/{prediction_year}/bracket.json','r',encoding='utf-8') as f:
    bracket = json.load(f)

In [None]:
# Build the simulation input from the bracket and the loaded seasons.
# NOTE(review): presumably a matrix of head-to-head win probabilities --
# confirm against generate_probabilities.gen_prob.
pmat = gen_prob(bracket,seasons)

# Number of Monte Carlo trials to run.
num_trials = 100000

# Seed NumPy's global RNG before simulating; presumably simulate() draws from
# it, which would make the run reproducible -- TODO confirm in bracket_utils.
np.random.seed(0)

# TGP is later averaged over axis 0 to get expected games played per team;
# rnds holds the per-trial round results requested via include_rounds=True.
TGP,winners,rnds = simulate(num_trials,pmat,include_rounds=True)

## Most Common Bracket

* If we retain the round results for every trial then we can count the most common bracket!
* In this case it turns out even the most common bracket is vanishingly unlikely, so this isn't a money maker!

In [None]:
from collections import Counter

# One hashable signature per trial: the flattened round results as a tuple of
# ints. A tuple keeps multi-digit team indices distinct, whereas joining the
# digits into one string lets e.g. (1, 15) and (11, 5) collide as "115".
signatures = [tuple(int(x) for x in rnd.flatten()) for rnd in rnds]

# Count identical brackets directly instead of using scipy.stats.mode, which
# no longer accepts non-numeric input. Ties go to the smallest signature so
# the chosen bracket is deterministic.
bracket_counts = Counter(signatures)
el = min(bracket_counts, key=lambda sig: (-bracket_counts[sig], sig))
count = bracket_counts[el]

# Recover the round results of the first trial that produced that bracket.
idx = signatures.index(el)
arr = rnds[idx].T

# Each row of the transposed array is read as one round (assumes rnds[idx] is
# 2-D with one column per round -- confirm); negative entries are dropped.
rounds = [[int(team) for team in col if team >= 0] for col in arr]

print(f"Probability of most common bracket: {count}/{num_trials} ~ {count/num_trials}")

In [None]:
import networkx as nx
import io
import pygraphviz
from networkx.drawing.nx_agraph import write_dot, graphviz_layout
from PIL import Image

# First-round team names; the integer entries of `rounds` index into this list.
teams = bracket['first_round']

G = nx.DiGraph()

last_round = len(rounds)-1

# Walk from the final round back to the first, linking each team's node in
# round i to the node of the team found at the parent slot of round i+1.
for i in reversed(range(len(rounds))):
    for j, entry in enumerate(rounds[i]):
        idx = int(entry)

        if idx < 0:
            continue

        node = teams[idx]+f'(R{i})'
        G.add_node(node)

        if i < last_round:
            # Slots 2k and 2k+1 both map to slot k of the next round.
            parent = j // 2
            G.add_edge(node,teams[int(rounds[i+1][parent])]+f'(R{i+1})')

A = nx.nx_agraph.to_agraph(G)
A.graph_attr.update(landscape='false',ranksep='3',strict='false')

# Possible layouts: 'neato', 'dot', 'twopi', 'circo', 'fdp', 'nop'.
# WARNING: nop and fdp might crash the container.
A.layout('twopi', args='-Nfontsize=8 -Nwidth=".2" -Nheight=".2" -Nmargin=0 -Gfontsize=6 -Goverlap=True')

# Render to disk, then load the image back for inline display.
output_file = 'bracket.png'
A.draw(output_file)
im = Image.open(output_file)
display(im)

## Hockey pool picks

* Via the Monte Carlo simulation we are able to estimate the expected number of games played for a given team.
* If we assume that a player's games played and points per game are independent (a rough assumption), then the expectation of their product factors into the product of their expectations.
* This lets us define the expected points over the playoffs:

## E[Points] = E[(Points per game)(Games Played)] = E[(Points per game)]E[(Games Played)]

* Given this metric we can then rank players accordingly and make our picks for the pool!

### Define Player Type

* We've pre-gathered this data via another Python package, but you can gather it in a similar fashion to the prior datasets.

In [None]:
# Position group to rank; it selects which
# data/players/df_<player_type>_final.csv file is loaded further down.
# Leave exactly one of the options below uncommented.
player_type = 'defense'
#player_type = 'goalie'
#player_type = 'forward'

In [None]:
# Average TGP over its first axis. The result is later paired one-to-one with
# the bracket's first-round teams, i.e. one expected games-played value per
# team (assumes axis 0 of TGP indexes trials -- confirm against simulate()).
expected_games_played = np.mean(TGP,axis=0)

In [None]:
# Expected games played for each first-round team: TGP averaged over axis 0.
expected_games_played = np.mean(TGP,axis=0)

# One row per bracket team, carrying that team's expected games played.
team_columns = {'team_full':bracket['first_round'],'EGP':expected_games_played}
team_data = pd.DataFrame(data=team_columns)

# Player statistics for the selected position group.
player_csv = f'data/players/df_{player_type}_final.csv'
player_data = pd.read_csv(player_csv)

# Left join keeps every bracket team and attaches player rows by full team name.
df = pd.merge(team_data,player_data,how='left',on=['team_full'])

In [None]:
# E[Points] = E[Games Played] * E[Points per game], using the independence
# assumption stated in the notes above. Assumes 'prob_pts' is the player's
# points-per-game estimate -- TODO confirm against the player CSV.
df['expected_points'] = df['EGP']*df['prob_pts']

In [None]:
# Rank players by expected playoff points, best first, then temporarily raise
# the row display limit to 500 while showing the table.
ranked = df.sort_values('expected_points',ascending=False)
with pd.option_context('display.max_rows',500):
    display(ranked)