In [1]:
%load_ext autoreload
%autoreload 2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('ggplot')

from sklearn.ensemble import (GradientBoostingRegressor, 
                              GradientBoostingClassifier, 
                              AdaBoostClassifier,
                              RandomForestClassifier)

from sklearn.inspection import partial_dependence

import sklearn.model_selection as cv
from sklearn.model_selection import KFold, RandomizedSearchCV, GridSearchCV
from sklearn.inspection import plot_partial_dependence

## *EDA*

In [2]:
# Load dataframe containing individual games

game_df = pd.read_csv('data/game.csv')
game_df.head()

Unnamed: 0,game_id,season,type,date_time,date_time_GMT,away_team_id,home_team_id,away_goals,home_goals,outcome,home_rink_side_start,venue,venue_link,venue_time_zone_id,venue_time_zone_offset,venue_time_zone_tz
0,2011030221,20112012,P,2012-04-29,2012-04-29T19:00:00Z,1,4,3,4,home win OT,right,Wells Fargo Center,/api/v1/venues/null,America/New_York,-4,EDT
1,2011030222,20112012,P,2012-05-01,2012-05-01T23:30:00Z,1,4,4,1,away win REG,right,Wells Fargo Center,/api/v1/venues/null,America/New_York,-4,EDT
2,2011030223,20112012,P,2012-05-03,2012-05-03T23:30:00Z,4,1,3,4,home win OT,left,Prudential Center,/api/v1/venues/null,America/New_York,-4,EDT
3,2011030224,20112012,P,2012-05-06,2012-05-06T23:30:00Z,4,1,2,4,home win REG,left,Prudential Center,/api/v1/venues/null,America/New_York,-4,EDT
4,2011030225,20112012,P,2012-05-08,2012-05-08T23:30:00Z,1,4,3,1,away win REG,right,Wells Fargo Center,/api/v1/venues/null,America/New_York,-4,EDT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11429,2018030413,20182019,P,2019-06-02,2019-06-02T00:00:00Z,6,19,7,2,away win REG,left,Enterprise Center,/api/v1/venues/5076,America/Chicago,-5,CDT
11430,2018030414,20182019,P,2019-06-04,2019-06-04T00:00:00Z,6,19,2,4,home win REG,left,Enterprise Center,/api/v1/venues/5076,America/Chicago,-5,CDT
11431,2018030415,20182019,P,2019-06-07,2019-06-07T00:00:00Z,19,6,2,1,away win REG,left,TD Garden,/api/v1/venues/5085,America/New_York,-4,EDT
11432,2018030416,20182019,P,2019-06-10,2019-06-10T00:00:00Z,6,19,5,1,away win REG,left,Enterprise Center,/api/v1/venues/5076,America/Chicago,-5,CDT


In [3]:
# Add column goal spread per game and calculate the average
# Do the same for total goals per game

game_df['goal_spread'] = abs(game_df['away_goals'] - game_df['home_goals'])
print('Average goal spread: ', game_df['goal_spread'].mean())

game_df['total_goals'] = game_df['away_goals'] + game_df['home_goals']
print('Average goals per game: ', game_df['total_goals'].mean())

In [7]:
# Adjust win/loss column based on home team

game_df.loc[game_df['outcome'].str.contains('home'), 'outcome'] = 'win'
game_df.loc[game_df['outcome'].str.contains('away'), 'outcome'] = 'loss'

In [8]:
# Load dataframe containing team info
# This might be useful for finding out specific team names but won't be used for now.

team_df = pd.read_csv('data/team_info.csv')
team_df.head(6)

Unnamed: 0,team_id,franchiseId,shortName,teamName,abbreviation,link
0,1,23,New Jersey,Devils,NJD,/api/v1/teams/1
1,4,16,Philadelphia,Flyers,PHI,/api/v1/teams/4
2,26,14,Los Angeles,Kings,LAK,/api/v1/teams/26
3,14,31,Tampa Bay,Lightning,TBL,/api/v1/teams/14
4,6,6,Boston,Bruins,BOS,/api/v1/teams/6
5,3,10,NY Rangers,Rangers,NYR,/api/v1/teams/3


In [9]:
team_df[team_df['team_id'] == 5]

Unnamed: 0,team_id,franchiseId,shortName,teamName,abbreviation,link
6,5,17,Pittsburgh,Penguins,PIT,/api/v1/teams/5


* The dataframe below will be used to compile cumulative statistics as a season progresses.

In [10]:
# Load dataframe of game outcomes grouped by teams
# Each row is one game for one team.

team_game_df = pd.read_csv('data/game_teams_stats.csv')
team_game_df.loc[team_game_df.team_id == 1]

Unnamed: 0,game_id,team_id,HoA,won,settled_in,head_coach,goals,shots,hits,pim,powerPlayOpportunities,powerPlayGoals,faceOffWinPercentage,giveaways,takeaways
0,2011030221,1,away,False,OT,Peter DeBoer,3,26,31,12,3,1,44.9,6,7
2,2011030222,1,away,True,REG,Peter DeBoer,4,35,32,12,4,0,50.9,8,7
5,2011030223,1,home,True,OT,Peter DeBoer,4,31,30,10,2,1,49.2,11,4
7,2011030224,1,home,True,REG,Peter DeBoer,4,43,19,4,5,1,37.5,5,3
8,2011030225,1,away,True,REG,Peter DeBoer,3,30,26,2,4,1,55.0,6,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22548,2018021199,1,away,False,REG,John Hynes,0,28,44,2,1,0,47.1,13,5
22567,2018021207,1,home,False,OT,John Hynes,2,27,20,2,3,1,47.8,11,10
22595,2018021222,1,home,True,REG,John Hynes,4,41,23,21,4,1,66.7,10,5
22644,2018021247,1,away,False,REG,John Hynes,1,37,30,4,3,0,50.0,7,10


In [11]:
# Create column showing previous game's outcome

team_game_df['streak'] = team_game_df.groupby('team_id')['won'].shift(fill_value=0)
team_game_df.head()

Unnamed: 0,game_id,team_id,HoA,won,settled_in,head_coach,goals,shots,hits,pim,powerPlayOpportunities,powerPlayGoals,faceOffWinPercentage,giveaways,takeaways,streak
0,2011030221,1,away,False,OT,Peter DeBoer,3,26,31,12,3,1,44.9,6,7,0
1,2011030221,4,home,True,OT,Peter Laviolette,4,36,27,6,6,1,55.1,13,4,0
2,2011030222,1,away,True,REG,Peter DeBoer,4,35,32,12,4,0,50.9,8,7,False
3,2011030222,4,home,False,REG,Peter Laviolette,1,20,24,32,5,0,49.1,9,6,True
4,2011030223,4,away,False,OT,Peter Laviolette,3,28,28,4,5,1,50.8,2,1,False


In [12]:
# Replace 0s in the 'streak' column with False due a team not having a previous game in the dataset

team_game_df = team_game_df.replace({'streak': 0}, False)
team_game_df.head()

Unnamed: 0,game_id,team_id,HoA,won,settled_in,head_coach,goals,shots,hits,pim,powerPlayOpportunities,powerPlayGoals,faceOffWinPercentage,giveaways,takeaways,streak
0,2011030221,1,away,False,OT,Peter DeBoer,3,26,31,12,3,1,44.9,6,7,False
1,2011030221,4,home,True,OT,Peter Laviolette,4,36,27,6,6,1,55.1,13,4,False
2,2011030222,1,away,True,REG,Peter DeBoer,4,35,32,12,4,0,50.9,8,7,False
3,2011030222,4,home,False,REG,Peter Laviolette,1,20,24,32,5,0,49.1,9,6,True
4,2011030223,4,away,False,OT,Peter Laviolette,3,28,28,4,5,1,50.8,2,1,False


In [13]:
# Merge game dataframe to pull timezone data

df = pd.merge(team_game_df, game_df[['game_id','venue_time_zone_offset']],on='game_id', how='left')
df.head()

Unnamed: 0,game_id,team_id,HoA,won,settled_in,head_coach,goals,shots,hits,pim,powerPlayOpportunities,powerPlayGoals,faceOffWinPercentage,giveaways,takeaways,streak,venue_time_zone_offset
0,2011030221,1,away,False,OT,Peter DeBoer,3,26,31,12,3,1,44.9,6,7,False,-4
1,2011030221,4,home,True,OT,Peter Laviolette,4,36,27,6,6,1,55.1,13,4,False,-4
2,2011030222,1,away,True,REG,Peter DeBoer,4,35,32,12,4,0,50.9,8,7,False,-4
3,2011030222,4,home,False,REG,Peter Laviolette,1,20,24,32,5,0,49.1,9,6,True,-4
4,2011030223,4,away,False,OT,Peter Laviolette,3,28,28,4,5,1,50.8,2,1,False,-4


In [14]:
# Create columns to note previous game's time zone and time traveled between games

df['prev_time'] = df.groupby('team_id')['venue_time_zone_offset'].shift(fill_value=0)
df['time_travel'] = abs(df['prev_time'] - df['venue_time_zone_offset'])
df.head()

Unnamed: 0,game_id,team_id,HoA,won,settled_in,head_coach,goals,shots,hits,pim,powerPlayOpportunities,powerPlayGoals,faceOffWinPercentage,giveaways,takeaways,streak,venue_time_zone_offset,prev_time,time_travel
0,2011030221,1,away,False,OT,Peter DeBoer,3,26,31,12,3,1,44.9,6,7,False,-4,0,4
1,2011030221,4,home,True,OT,Peter Laviolette,4,36,27,6,6,1,55.1,13,4,False,-4,0,4
2,2011030222,1,away,True,REG,Peter DeBoer,4,35,32,12,4,0,50.9,8,7,False,-4,-4,0
3,2011030222,4,home,False,REG,Peter Laviolette,1,20,24,32,5,0,49.1,9,6,True,-4,-4,0
4,2011030223,4,away,False,OT,Peter Laviolette,3,28,28,4,5,1,50.8,2,1,False,-4,-4,0


In [15]:
# Replace any values greater than 3 with 0 due to no previous game in the dateset

df['time_travel'] = df['time_travel'].apply(lambda x: 0 if abs(x) > 3 else x)
df.tail(5)

Unnamed: 0,game_id,team_id,HoA,won,settled_in,head_coach,goals,shots,hits,pim,powerPlayOpportunities,powerPlayGoals,faceOffWinPercentage,giveaways,takeaways,streak,venue_time_zone_offset,prev_time,time_travel
22863,2018030415,6,home,False,REG,Bruce Cassidy,1,39,43,2,3,0,40.6,4,11,False,-4,-5,1
22864,2018030416,6,away,True,REG,Bruce Cassidy,5,32,27,10,4,1,41.3,4,10,False,-5,-4,1
22865,2018030416,19,home,False,REG,Craig Berube,1,29,29,20,4,0,58.7,12,11,True,-5,-4,1
22866,2018030417,19,away,True,REG,Craig Berube,4,20,36,2,0,0,49.0,7,8,False,-4,-5,1
22867,2018030417,6,home,False,REG,Bruce Cassidy,1,33,28,0,1,0,51.0,13,6,True,-4,-5,1


In [16]:
# Replace bools with 1, 0

df = df.replace({'won': False}, 0)
df = df.replace({'won': True}, 1)
df = df.replace({'streak': False}, 0)
df = df.replace({'streak': True}, 1)
df = df.replace({'HoA': 'away'}, 0)
df = df.replace({'HoA': 'home'}, 1)

X = df.drop(columns=['game_id', 'won', 'settled_in', 'head_coach', 'venue_time_zone_offset', 'prev_time'])
y = df.won

* Analyzing wins/losses for teams below

In [17]:
df[df['team_id'] == 1]['won'].sum()

328.0

In [18]:
df['team_id'].value_counts()

5     815
6     812
28    805
15    804
3     802
16    793
14    790
19    788
18    787
26    783
24    774
8     751
17    751
30    748
9     746
4     745
23    744
25    736
2     736
1     733
10    731
29    731
21    729
20    724
12    719
13    717
22    717
7     711
52    649
53    410
27    314
54    191
11     82
Name: team_id, dtype: int64

In [19]:
df['team_id'].value_counts()[1]

733

In [20]:
# wins/losses/winning % per team_id

team_ids = list(df['team_id'].unique())
team_ids = sorted(team_ids)

team_win_loss = {}
for team in team_ids:
    w = df[df['team_id'] == team]['won'].sum()
    l = df['team_id'].value_counts()[team] - w
    per = w / (w + l)
    team_win_loss[team] = (w, l, per)

* looks like team 5 has the most wins (Pittsburgh)

In [21]:
team_win_loss

{1: (328.0, 405.0, 0.44747612551159616),
 2: (352.0, 384.0, 0.4782608695652174),
 3: (426.0, 376.0, 0.5311720698254364),
 4: (367.0, 378.0, 0.49261744966442955),
 5: (478.0, 337.0, 0.5865030674846625),
 6: (465.0, 347.0, 0.5726600985221675),
 7: (276.0, 435.0, 0.3881856540084388),
 8: (380.0, 371.0, 0.5059920106524634),
 9: (338.0, 408.0, 0.45308310991957107),
 10: (341.0, 390.0, 0.466484268125855),
 11: (34.0, 48.0, 0.4146341463414634),
 12: (319.0, 400.0, 0.44367176634214184),
 13: (317.0, 400.0, 0.4421199442119944),
 14: (449.0, 341.0, 0.5683544303797469),
 15: (461.0, 343.0, 0.5733830845771144),
 16: (436.0, 357.0, 0.5498108448928121),
 17: (357.0, 394.0, 0.47536617842876167),
 18: (418.0, 369.0, 0.531130876747141),
 19: (445.0, 343.0, 0.5647208121827412),
 20: (350.0, 374.0, 0.48342541436464087),
 21: (332.0, 397.0, 0.4554183813443073),
 22: (285.0, 432.0, 0.39748953974895396),
 23: (360.0, 384.0, 0.4838709677419355),
 24: (423.0, 351.0, 0.5465116279069767),
 25: (372.0, 364.0, 0.

In [22]:
# begin compiling pregame statistics

df_pregame = df
df_pregame.head()

Unnamed: 0,game_id,team_id,HoA,won,settled_in,head_coach,goals,shots,hits,pim,powerPlayOpportunities,powerPlayGoals,faceOffWinPercentage,giveaways,takeaways,streak,venue_time_zone_offset,prev_time,time_travel
0,2011030221,1,0,0.0,OT,Peter DeBoer,3,26,31,12,3,1,44.9,6,7,0.0,-4,0,0
1,2011030221,4,1,1.0,OT,Peter Laviolette,4,36,27,6,6,1,55.1,13,4,0.0,-4,0,0
2,2011030222,1,0,1.0,REG,Peter DeBoer,4,35,32,12,4,0,50.9,8,7,0.0,-4,-4,0
3,2011030222,4,1,0.0,REG,Peter Laviolette,1,20,24,32,5,0,49.1,9,6,1.0,-4,-4,0
4,2011030223,4,0,0.0,OT,Peter Laviolette,3,28,28,4,5,1,50.8,2,1,0.0,-4,-4,0


## *Create working df to compare teams head to head*

In [23]:
# goals per game.  (this is goals per game, not just 'home' goals per game.  rename the column)
df_pregame['home_gpg'] = (df_pregame.groupby('team_id')['goals'].cumsum() - df_pregame['goals']) / df_pregame.groupby('team_id').cumcount()

In [24]:
# win percentage
df_pregame['win_per'] = (df_pregame.groupby('team_id')['won'].cumsum() - df_pregame['won']) / df_pregame.groupby('team_id').cumcount()

In [25]:
# more cumulative stats

df_pregame['shots_pg'] = (df_pregame.groupby('team_id')['shots'].cumsum() - df_pregame['shots']) / df_pregame.groupby('team_id').cumcount()
df_pregame['hits_pg'] = (df_pregame.groupby('team_id')['hits'].cumsum() - df_pregame['hits']) / df_pregame.groupby('team_id').cumcount()
df_pregame['pim_pg'] = (df_pregame.groupby('team_id')['pim'].cumsum() - df_pregame['pim']) / df_pregame.groupby('team_id').cumcount()
df_pregame['hits_pg'] = (df_pregame.groupby('team_id')['hits'].cumsum() - df_pregame['hits']) / df_pregame.groupby('team_id').cumcount()
df_pregame['fo_per'] = (df_pregame.groupby('team_id')['faceOffWinPercentage'].cumsum() - df_pregame['faceOffWinPercentage']) / df_pregame.groupby('team_id').cumcount()
df_pregame['giveaways_pg'] = (df_pregame.groupby('team_id')['giveaways'].cumsum() - df_pregame['giveaways']) / df_pregame.groupby('team_id').cumcount()
df_pregame['takeaways_pg'] = (df_pregame.groupby('team_id')['takeaways'].cumsum() - df_pregame['takeaways']) / df_pregame.groupby('team_id').cumcount()
df_pregame['ppg_per'] = ((df_pregame.groupby('team_id')['powerPlayGoals'].cumsum() - df_pregame['powerPlayGoals'])) / ((df_pregame.groupby('team_id')['powerPlayOpportunities'].cumsum() - df_pregame['powerPlayOpportunities']))

df_pregame

Unnamed: 0,game_id,team_id,HoA,won,settled_in,head_coach,goals,shots,hits,pim,...,time_travel,home_gpg,win_per,shots_pg,hits_pg,pim_pg,fo_per,giveaways_pg,takeaways_pg,ppg_per
0,2011030221,1,0,0.0,OT,Peter DeBoer,3,26,31,12,...,0,,,,,,,,,
1,2011030221,4,1,1.0,OT,Peter Laviolette,4,36,27,6,...,0,,,,,,,,,
2,2011030222,1,0,1.0,REG,Peter DeBoer,4,35,32,12,...,0,3.000000,0.000000,26.000000,31.000000,12.000000,44.900000,6.000000,7.000000,0.333333
3,2011030222,4,1,0.0,REG,Peter Laviolette,1,20,24,32,...,0,4.000000,1.000000,36.000000,27.000000,6.000000,55.100000,13.000000,4.000000,0.166667
4,2011030223,4,0,0.0,OT,Peter Laviolette,3,28,28,4,...,0,2.500000,0.500000,28.000000,25.500000,19.000000,52.100000,11.000000,5.000000,0.090909
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22863,2018030415,6,1,0.0,REG,Bruce Cassidy,1,39,43,2,...,1,2.975278,0.573548,32.428925,24.796044,11.001236,52.436341,8.194067,6.646477,0.204517
22864,2018030416,6,0,1.0,REG,Bruce Cassidy,5,32,27,10,...,1,2.972840,0.572840,32.437037,24.818519,10.990123,52.421728,8.188889,6.651852,0.204261
22865,2018030416,19,1,0.0,REG,Craig Berube,1,29,29,20,...,1,2.760814,0.564885,30.194656,23.487277,10.914758,50.743130,5.512723,6.726463,0.192056
22866,2018030417,19,0,1.0,REG,Craig Berube,4,20,36,2,...,1,2.758577,0.564168,30.193139,23.494282,10.926302,50.753240,5.520966,6.731893,0.191742


In [26]:
# create new df to compare head-to-head stats

test_game_df = game_df
test_game_df.drop(labels=['home_rink_side_start', 'venue', 'venue_link', 'venue_time_zone_id', 'venue_time_zone_tz'], axis=1, inplace=True)
test_game_df.drop(labels=['away_goals', 'home_goals', 'venue_time_zone_offset'], axis=1, inplace=True)
test_game_df.head()

Unnamed: 0,game_id,season,type,date_time,date_time_GMT,away_team_id,home_team_id,outcome,goal_spread,total_goals
0,2011030221,20112012,P,2012-04-29,2012-04-29T19:00:00Z,1,4,win,1,7
1,2011030222,20112012,P,2012-05-01,2012-05-01T23:30:00Z,1,4,loss,3,5
2,2011030223,20112012,P,2012-05-03,2012-05-03T23:30:00Z,4,1,win,1,7
3,2011030224,20112012,P,2012-05-06,2012-05-06T23:30:00Z,4,1,win,2,6
4,2011030225,20112012,P,2012-05-08,2012-05-08T23:30:00Z,1,4,loss,2,4


In [27]:
# filter pregame stats by home team

filtered = df_pregame[df_pregame.HoA == 1]
filtered = filtered.filter(items=['game_id', 'streak', 'time_travel', 'home_gpg', 'win_per', 'ppg_per', 'shots_pg', 'hits_pg', 'pim_pg', 'fo_per', 'giveaways_pg', 'takeaways_pg'])
filtered.head()

Unnamed: 0,game_id,streak,time_travel,home_gpg,win_per,ppg_per,shots_pg,hits_pg,pim_pg,fo_per,giveaways_pg,takeaways_pg
1,2011030221,0.0,0,,,,,,,,,
3,2011030222,1.0,0,4.0,1.0,0.166667,36.0,27.0,6.0,55.1,13.0,4.0
5,2011030223,1.0,0,3.5,0.5,0.142857,30.5,31.5,12.0,47.9,7.0,7.0
7,2011030224,1.0,0,3.666667,0.666667,0.222222,30.666667,31.0,11.333333,48.333333,8.333333,6.0
9,2011030225,0.0,0,2.5,0.25,0.166667,26.5,25.5,13.0,54.375,11.0,4.0


In [28]:
# merge home team pregame stats

test_game_df = test_game_df.merge(filtered, on='game_id', how='left')
test_game_df = test_game_df.rename(columns={'streak': 'home_streak', 'time_travel': 'home_time_travel', "win_per": "home_win_per", "ppg_per": "home_ppg_per", 'shots_pg': 'home_shots_pg', 'hits_pg': 'home_hits_pg', 'pim_pg': 'home_pim_pg', 'fo_per': 'home_fo_per', 'giveaways_pg': 'home_giveaways_pg',
       'takeaways_pg': 'home_takeaways_pg'})
test_game_df.head()

Unnamed: 0,game_id,season,type,date_time,date_time_GMT,away_team_id,home_team_id,outcome,goal_spread,total_goals,...,home_time_travel,home_gpg,home_win_per,home_ppg_per,home_shots_pg,home_hits_pg,home_pim_pg,home_fo_per,home_giveaways_pg,home_takeaways_pg
0,2011030221,20112012,P,2012-04-29,2012-04-29T19:00:00Z,1,4,win,1,7,...,0,,,,,,,,,
1,2011030222,20112012,P,2012-05-01,2012-05-01T23:30:00Z,1,4,loss,3,5,...,0,4.0,1.0,0.166667,36.0,27.0,6.0,55.1,13.0,4.0
2,2011030223,20112012,P,2012-05-03,2012-05-03T23:30:00Z,4,1,win,1,7,...,0,3.5,0.5,0.142857,30.5,31.5,12.0,47.9,7.0,7.0
3,2011030224,20112012,P,2012-05-06,2012-05-06T23:30:00Z,4,1,win,2,6,...,0,3.666667,0.666667,0.222222,30.666667,31.0,11.333333,48.333333,8.333333,6.0
4,2011030225,20112012,P,2012-05-08,2012-05-08T23:30:00Z,1,4,loss,2,4,...,0,2.5,0.25,0.166667,26.5,25.5,13.0,54.375,11.0,4.0


In [29]:
# filter away team pregame stats

filtered = df_pregame[df_pregame.HoA == 0]
filtered = filtered.filter(items=['game_id', 'streak', 'time_travel', 'home_gpg', 'win_per', 'ppg_per', 'shots_pg', 'hits_pg', 'pim_pg', 'fo_per', 'giveaways_pg', 'takeaways_pg'])
filtered = filtered.rename(columns={"home_gpg": "away_gpg"})
filtered.head()

Unnamed: 0,game_id,streak,time_travel,away_gpg,win_per,ppg_per,shots_pg,hits_pg,pim_pg,fo_per,giveaways_pg,takeaways_pg
0,2011030221,0.0,0,,,,,,,,,
2,2011030222,0.0,0,3.0,0.0,0.333333,26.0,31.0,12.0,44.9,6.0,7.0
4,2011030223,0.0,0,2.5,0.5,0.090909,28.0,25.5,19.0,52.1,11.0,5.0
6,2011030224,0.0,0,2.666667,0.333333,0.125,28.0,26.333333,14.0,51.666667,8.0,3.666667
8,2011030225,1.0,0,3.75,0.75,0.214286,33.75,28.0,9.5,45.625,7.5,5.25


In [30]:
# merge away team pregame stats

test_game_df = test_game_df.merge(filtered, on='game_id', how='left')
test_game_df = test_game_df.rename(columns={'streak': 'away_streak', 'time_travel': 'away_time_travel',"win_per": "away_win_per", "ppg_per": "away_ppg_per", 'shots_pg': 'away_shots_pg', 'hits_pg': 'away_hits_pg', 'pim_pg': 'away_pim_pg', 'fo_per': 'away_fo_per', 'giveaways_pg': 'away_giveaways_pg',
       'takeaways_pg': 'away_takeaways_pg'})
test_game_df.head()

Unnamed: 0,game_id,season,type,date_time,date_time_GMT,away_team_id,home_team_id,outcome,goal_spread,total_goals,...,away_time_travel,away_gpg,away_win_per,away_ppg_per,away_shots_pg,away_hits_pg,away_pim_pg,away_fo_per,away_giveaways_pg,away_takeaways_pg
0,2011030221,20112012,P,2012-04-29,2012-04-29T19:00:00Z,1,4,win,1,7,...,0,,,,,,,,,
1,2011030222,20112012,P,2012-05-01,2012-05-01T23:30:00Z,1,4,loss,3,5,...,0,3.0,0.0,0.333333,26.0,31.0,12.0,44.9,6.0,7.0
2,2011030223,20112012,P,2012-05-03,2012-05-03T23:30:00Z,4,1,win,1,7,...,0,2.5,0.5,0.090909,28.0,25.5,19.0,52.1,11.0,5.0
3,2011030224,20112012,P,2012-05-06,2012-05-06T23:30:00Z,4,1,win,2,6,...,0,2.666667,0.333333,0.125,28.0,26.333333,14.0,51.666667,8.0,3.666667
4,2011030225,20112012,P,2012-05-08,2012-05-08T23:30:00Z,1,4,loss,2,4,...,0,3.75,0.75,0.214286,33.75,28.0,9.5,45.625,7.5,5.25


In [31]:
# new working df with pregame stats for home and away to allow for head to head comparisons.

test_game_df.columns
test_game_df = test_game_df.fillna(0)

In [32]:
test_game_df.head()

Unnamed: 0,game_id,season,type,date_time,date_time_GMT,away_team_id,home_team_id,outcome,goal_spread,total_goals,...,away_time_travel,away_gpg,away_win_per,away_ppg_per,away_shots_pg,away_hits_pg,away_pim_pg,away_fo_per,away_giveaways_pg,away_takeaways_pg
0,2011030221,20112012,P,2012-04-29,2012-04-29T19:00:00Z,1,4,win,1,7,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2011030222,20112012,P,2012-05-01,2012-05-01T23:30:00Z,1,4,loss,3,5,...,0,3.0,0.0,0.333333,26.0,31.0,12.0,44.9,6.0,7.0
2,2011030223,20112012,P,2012-05-03,2012-05-03T23:30:00Z,4,1,win,1,7,...,0,2.5,0.5,0.090909,28.0,25.5,19.0,52.1,11.0,5.0
3,2011030224,20112012,P,2012-05-06,2012-05-06T23:30:00Z,4,1,win,2,6,...,0,2.666667,0.333333,0.125,28.0,26.333333,14.0,51.666667,8.0,3.666667
4,2011030225,20112012,P,2012-05-08,2012-05-08T23:30:00Z,1,4,loss,2,4,...,0,3.75,0.75,0.214286,33.75,28.0,9.5,45.625,7.5,5.25


In [33]:
model_df = test_game_df.drop(labels=['game_id', 'season', 'type', 'date_time','date_time_GMT'], axis=1)
model_df.head()

Unnamed: 0,away_team_id,home_team_id,outcome,goal_spread,total_goals,home_streak,home_time_travel,home_gpg,home_win_per,home_ppg_per,...,away_time_travel,away_gpg,away_win_per,away_ppg_per,away_shots_pg,away_hits_pg,away_pim_pg,away_fo_per,away_giveaways_pg,away_takeaways_pg
0,1,4,win,1,7,0.0,0,0.0,0.0,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,4,loss,3,5,1.0,0,4.0,1.0,0.166667,...,0,3.0,0.0,0.333333,26.0,31.0,12.0,44.9,6.0,7.0
2,4,1,win,1,7,1.0,0,3.5,0.5,0.142857,...,0,2.5,0.5,0.090909,28.0,25.5,19.0,52.1,11.0,5.0
3,4,1,win,2,6,1.0,0,3.666667,0.666667,0.222222,...,0,2.666667,0.333333,0.125,28.0,26.333333,14.0,51.666667,8.0,3.666667
4,1,4,loss,2,4,0.0,0,2.5,0.25,0.166667,...,0,3.75,0.75,0.214286,33.75,28.0,9.5,45.625,7.5,5.25


In [34]:
model_df.columns

Index(['away_team_id', 'home_team_id', 'outcome', 'goal_spread', 'total_goals',
       'home_streak', 'home_time_travel', 'home_gpg', 'home_win_per',
       'home_ppg_per', 'home_shots_pg', 'home_hits_pg', 'home_pim_pg',
       'home_fo_per', 'home_giveaways_pg', 'home_takeaways_pg', 'away_streak',
       'away_time_travel', 'away_gpg', 'away_win_per', 'away_ppg_per',
       'away_shots_pg', 'away_hits_pg', 'away_pim_pg', 'away_fo_per',
       'away_giveaways_pg', 'away_takeaways_pg'],
      dtype='object')

In [35]:
from sklearn.linear_model import LogisticRegression, LinearRegression

In [36]:
X = model_df.drop(labels=['outcome'], axis=1)
y = model_df.outcome

* model_df to be used for modeling.  Contains stats for each team up to but not including the current game.

### **Initial logistic regression**

My initial models below all had ~55% accuracy.  Eliminated some features based on p-values and
dependence plots.  My target is ~60% so the next step is to breakdown team stats by season since
teams change from year to year.

In [37]:
X_train, X_test, y_train, y_test = cv.train_test_split(X, y, test_size=0.25, random_state=1)

In [38]:
model = LogisticRegression(C=1000)
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1000)

In [39]:
coefs = []
for i in model.coef_:
    for coef in i:
        coefs.append(coef)

betas = {k: v for k, v in zip(list(X.columns), coefs)}
betas

{'away_team_id': 0.0024483241139516237,
 'home_team_id': -0.0002123379155126861,
 'goal_spread': 0.11778838271147228,
 'total_goals': 0.013019577348397223,
 'home_streak': -0.04782089532991535,
 'home_time_travel': 0.0190907261213392,
 'home_gpg': 0.39936093206214096,
 'home_win_per': 0.15736157763683936,
 'home_ppg_per': 0.0398070527806266,
 'home_shots_pg': 0.016065334911097517,
 'home_hits_pg': 0.0022031429069358744,
 'home_pim_pg': -0.025743741272660665,
 'home_fo_per': 0.001738253698391103,
 'home_giveaways_pg': -0.0038693748182243295,
 'home_takeaways_pg': -0.031868382921486166,
 'away_streak': 0.04600526091668979,
 'away_time_travel': -0.0008062300799518117,
 'away_gpg': -0.3931470014804633,
 'away_win_per': -0.15590151083575682,
 'away_ppg_per': -0.03263389133153694,
 'away_shots_pg': -0.0416356634103675,
 'away_hits_pg': -0.0019002757876353685,
 'away_pim_pg': 0.03713117244567162,
 'away_fo_per': 0.0035227028533132304,
 'away_giveaways_pg': 0.022240654311177666,
 'away_takeawa

### **Logistic regression with time travel removed**

In [40]:
# no time travel
df_nott = model_df.drop(labels=['home_time_travel', 'away_time_travel'], axis=1)

In [41]:
X2 = df_nott.drop(labels=['outcome'], axis=1)
y2 = df_nott.outcome

In [42]:
y2 = y2.apply(lambda x: 1 if x == 'win' else 0)

In [43]:
X2_train, X2_test, y2_train, y2_test = cv.train_test_split(X2, y2, test_size=0.25, random_state=1)

In [44]:
model2 = LogisticRegression(C=1000)
model2.fit(X2_train, y2_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1000)

In [45]:
coefs2 = []
for i in model2.coef_:
    for coef in i:
        coefs2.append(coef)

betas2 = {k: v for k, v in zip(list(X2.columns), coefs)}
betas2

{'away_team_id': 0.0024483241139516237,
 'home_team_id': -0.0002123379155126861,
 'goal_spread': 0.11778838271147228,
 'total_goals': 0.013019577348397223,
 'home_streak': -0.04782089532991535,
 'home_gpg': 0.0190907261213392,
 'home_win_per': 0.39936093206214096,
 'home_ppg_per': 0.15736157763683936,
 'home_shots_pg': 0.0398070527806266,
 'home_hits_pg': 0.016065334911097517,
 'home_pim_pg': 0.0022031429069358744,
 'home_fo_per': -0.025743741272660665,
 'home_giveaways_pg': 0.001738253698391103,
 'home_takeaways_pg': -0.0038693748182243295,
 'away_streak': -0.031868382921486166,
 'away_gpg': 0.04600526091668979,
 'away_win_per': -0.0008062300799518117,
 'away_ppg_per': -0.3931470014804633,
 'away_shots_pg': -0.15590151083575682,
 'away_hits_pg': -0.03263389133153694,
 'away_pim_pg': -0.0416356634103675,
 'away_fo_per': -0.0019002757876353685,
 'away_giveaways_pg': 0.03713117244567162,
 'away_takeaways_pg': 0.0035227028533132304}

In [46]:
# log regression kfold on data with no time travel

kf = KFold(n_splits=5, shuffle=True)  # almost always use shuffle=True
logr_scores = []

model_kf = LogisticRegression()
    
for train, test in kf.split(X2):
    ## fit the model to training data
    model_kf.fit(X2.values[train], y2.values[train])
    ## evaluate the model on testing data!!!
    logr_scores.append(model_kf.score(X2.values[test], y2.values[test]))
    
print(np.mean(logr_scores))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

0.5529115266363458


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [47]:
import numpy as np
from scipy.stats import norm

def logit_pvalue(model, x):
    """ Calculate z-scores for scikit-learn LogisticRegression.
    parameters:
        model: fitted sklearn.linear_model.LogisticRegression with intercept and large C
        x:     matrix on which the model was fit
    This function uses asymtptics for maximum likelihood estimates.
    """
    p = model.predict_proba(x)
    n = len(p)
    m = len(model.coef_[0]) + 1
    coefs = np.concatenate([model.intercept_, model.coef_[0]])
    x_full = np.matrix(np.insert(np.array(x), 0, 1, axis = 1))
    ans = np.zeros((m, m))
    for i in range(n):
        ans = ans + np.dot(np.transpose(x_full[i, :]), x_full[i, :]) * p[i,1] * p[i, 0]
    vcov = np.linalg.inv(np.matrix(ans))
    se = np.sqrt(np.diag(vcov))
    t =  coefs/se  
    p = (1 - norm.cdf(abs(t))) * 2
    return p

logit_pvalue(model_kf, X2)

pvals = {}
for feat, pval in zip(list(X2.columns), logit_pvalue(model_kf, X2)):
    pvals[feat] = pval

In [48]:
print(betas2)
pvals

{'away_team_id': 0.0024483241139516237, 'home_team_id': -0.0002123379155126861, 'goal_spread': 0.11778838271147228, 'total_goals': 0.013019577348397223, 'home_streak': -0.04782089532991535, 'home_gpg': 0.0190907261213392, 'home_win_per': 0.39936093206214096, 'home_ppg_per': 0.15736157763683936, 'home_shots_pg': 0.0398070527806266, 'home_hits_pg': 0.016065334911097517, 'home_pim_pg': 0.0022031429069358744, 'home_fo_per': -0.025743741272660665, 'home_giveaways_pg': 0.001738253698391103, 'home_takeaways_pg': -0.0038693748182243295, 'away_streak': -0.031868382921486166, 'away_gpg': 0.04600526091668979, 'away_win_per': -0.0008062300799518117, 'away_ppg_per': -0.3931470014804633, 'away_shots_pg': -0.15590151083575682, 'away_hits_pg': -0.03263389133153694, 'away_pim_pg': -0.0416356634103675, 'away_fo_per': -0.0019002757876353685, 'away_giveaways_pg': 0.03713117244567162, 'away_takeaways_pg': 0.0035227028533132304}


{'away_team_id': 0.9979343230447202,
 'home_team_id': 0.7446289890787154,
 'goal_spread': 0.7938015342052842,
 'total_goals': 1.2294609774698984e-12,
 'home_streak': 0.23853866474356944,
 'home_gpg': 0.7495923148459447,
 'home_win_per': 0.027443429749803183,
 'home_ppg_per': 0.7928012276567542,
 'home_shots_pg': 0.9794955865091595,
 'home_hits_pg': 0.025249473429635794,
 'home_pim_pg': 0.3214367940874465,
 'home_fo_per': 0.05083431852441911,
 'home_giveaways_pg': 0.9406448423977596,
 'home_takeaways_pg': 0.7650942173203656,
 'away_streak': 0.0017547090122920217,
 'away_gpg': 0.019745686695822995,
 'away_win_per': 0.0547404222937129,
 'away_ppg_per': 0.788979569797287,
 'away_shots_pg': 0.9864939739461076,
 'away_hits_pg': 9.023356888526735e-05,
 'away_pim_pg': 0.7565862089893334,
 'away_fo_per': 0.20453849327561002,
 'away_giveaways_pg': 0.31867960767309156,
 'away_takeaways_pg': 0.4649131810066378}

In [49]:
# run lasso regression

from sklearn import linear_model

reg = linear_model.Lasso(alpha=0.1)
reg.fit(X2_train, y2_train)

reg.score(X2_test, y2_test)

-0.002118741157018489

In [50]:
new_feats = ['home_win_per', 'home_hits_pg', 'home_fo_per', 'away_streak', 'away_gpg', 'away_win_per', 'away_hits_pg', 'away_fo_per']
new_df = X2[new_feats]
new_df

Unnamed: 0,home_win_per,home_hits_pg,home_fo_per,away_streak,away_gpg,away_win_per,away_hits_pg,away_fo_per
0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000
1,1.000000,27.000000,55.100000,0.0,3.000000,0.000000,31.000000,44.900000
2,0.500000,31.500000,47.900000,0.0,2.500000,0.500000,25.500000,52.100000
3,0.666667,31.000000,48.333333,0.0,2.666667,0.333333,26.333333,51.666667
4,0.250000,25.500000,54.375000,1.0,3.750000,0.750000,28.000000,45.625000
...,...,...,...,...,...,...,...,...
11429,0.564496,23.432950,50.738186,0.0,2.971499,0.573730,24.770756,52.438290
11430,0.563776,23.447704,50.730102,1.0,2.976485,0.574257,24.775990,52.442203
11431,0.573548,24.796044,52.436341,1.0,2.761783,0.564331,23.473885,50.732102
11432,0.564885,23.487277,50.743130,0.0,2.972840,0.572840,24.818519,52.421728


### **Logistic regression using only features with low p-values**

In [51]:
X3_train, X3_test, y3_train, y3_test = cv.train_test_split(new_df, y2, test_size=0.25, random_state=1)

In [52]:
model3 = LogisticRegression(C=1000)
model3.fit(X3_train, y3_train)
model3.score(X3_test, y3_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.546694648478489

In [53]:
kf = KFold(n_splits=5, shuffle=True)  # almost always use shuffle=True
logr_scores3 = []

model_kf3 = LogisticRegression()
    
for train, test in kf.split(new_df):
    ## fit the model to training data
    model_kf3.fit(new_df.values[train], y2.values[train])
    ## evaluate the model on testing data!!!
    logr_scores3.append(model_kf3.score(new_df.values[test], y2.values[test]))
    
print(np.mean(logr_scores3))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.5555366958666678


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


### **Random forest using all columns except time travel**

In [54]:
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

In [55]:
rf = RandomForestClassifier(n_estimators=1000,
                           max_features='auto',
                           random_state=0,
                           oob_score=True, max_depth=9)
rf.fit(X_train, y_train)
rf.oob_score_

0.556734693877551

In [56]:
rf = RandomForestClassifier(n_estimators=500,
                           max_features='auto',
                           random_state=0,
                           oob_score=True, max_depth=9)
rf.fit(X3_train, y3_train)
rf.oob_score_

0.5528862973760933

In [57]:
rf

RandomForestClassifier(max_depth=9, n_estimators=500, oob_score=True,
                       random_state=0)

In [58]:
feat_scores = pd.Series(rf.feature_importances_,
                           index=X_train.columns)
feat_scores = feat_scores.sort_values()
ax = feat_scores.plot(kind='barh', 
                      figsize=(10,8),
                      color='b')
ax.set_title('Average Gini Importance')
ax.set_xlabel('Average contribution to information gain')

ValueError: Length of passed values is 8, index implies 26.

### **Gradient boost**

In [None]:
gb = GradientBoostingClassifier(loss='deviance',
                                   learning_rate=0.005,
                                   n_estimators=500,
                                   min_samples_leaf=5)
gb.fit(X2_train, y2_train)
gb.score(X2_test, y2_test)

In [None]:
gb3 = GradientBoostingClassifier(loss='deviance',
                                   learning_rate=0.005,
                                   n_estimators=500,
                                   min_samples_leaf=5)
gb3.fit(X3_train, y3_train)
gb3.score(X3_test, y3_test)

### **Breakdown by season**

The previous modeling included stats from all seasons put together.  Will breakdown cumulative stats by season to see if it improves accuracy.

In [None]:
season_df = game_df
season_team = team_game_df

In [None]:
season_team = pd.merge(season_team, season_df[['game_id', 'season']], on='game_id', how='left')

In [None]:
'''
Below is cumulative stats broken down by seasons:
season_20112012
season_20122013
season_20132014
season_20142015
season_20152016
season_20162017
season_20172018
season_20182019
'''

##### **20112012**

In [None]:
# 2 dfs for 2011-2012 season
season_20112012 = season_df[season_df.season == 20112012]
filtered = season_team[season_team.season == 20112012]

In [None]:
# now calc cumulative stats
filtered['gpg'] = (filtered.groupby('team_id')['goals'].cumsum() - filtered['goals']) / filtered.groupby('team_id').cumcount()
filtered['win_per'] = (filtered.groupby('team_id')['won'].cumsum() - filtered['won']) / filtered.groupby('team_id').cumcount()
filtered['shots_pg'] = (filtered.groupby('team_id')['shots'].cumsum() - filtered['shots']) / filtered.groupby('team_id').cumcount()
filtered['hits_pg'] = (filtered.groupby('team_id')['hits'].cumsum() - filtered['hits']) / filtered.groupby('team_id').cumcount()
filtered['pim_pg'] = (filtered.groupby('team_id')['pim'].cumsum() - filtered['pim']) / filtered.groupby('team_id').cumcount()
filtered['hits_pg'] = (filtered.groupby('team_id')['hits'].cumsum() - filtered['hits']) / filtered.groupby('team_id').cumcount()
filtered['fo_per'] = (filtered.groupby('team_id')['faceOffWinPercentage'].cumsum() - filtered['faceOffWinPercentage']) / filtered.groupby('team_id').cumcount()
filtered['giveaways_pg'] = (filtered.groupby('team_id')['giveaways'].cumsum() - filtered['giveaways']) / filtered.groupby('team_id').cumcount()
filtered['takeaways_pg'] = (filtered.groupby('team_id')['takeaways'].cumsum() - filtered['takeaways']) / filtered.groupby('team_id').cumcount()
filtered['ppg_per'] = ((filtered.groupby('team_id')['powerPlayGoals'].cumsum() - filtered['powerPlayGoals'])) / ((filtered.groupby('team_id')['powerPlayOpportunities'].cumsum() - filtered['powerPlayOpportunities']))

In [None]:
# filter home team stats
filtered_h = filtered[filtered.HoA == 'home']
filtered_h = filtered_h.filter(items=['game_id', 'season', 'streak', 'time_travel', 'gpg', 'win_per', 'ppg_per', 'shots_pg', 'hits_pg', 'pim_pg', 'fo_per', 'giveaways_pg', 'takeaways_pg'])
filtered_h = filtered_h.rename(columns={'streak': 'home_streak', 'time_travel': 'home_time_travel', 'gpg': 'home_gpg', "win_per": "home_win_per", "ppg_per": "home_ppg_per", 'shots_pg': 'home_shots_pg', 'hits_pg': 'home_hits_pg', 'pim_pg': 'home_pim_pg', 'fo_per': 'home_fo_per', 'giveaways_pg': 'home_giveaways_pg',
       'takeaways_pg': 'home_takeaways_pg'})
# merge home team stats
season_20112012 = season_20112012.merge(filtered_h, on='game_id', how='left')

In [None]:
# merge home team stats
team_20112012 = filtered
season_20112012 = season_20112012.merge(filtered, on='game_id', how='left')

In [None]:
# filter away team stats
filtered_a = filtered[filtered.HoA == 'away']
filtered_a = filtered_a.filter(items=['game_id', 'season', 'streak', 'time_travel', 'gpg', 'win_per', 'ppg_per', 'shots_pg', 'hits_pg', 'pim_pg', 'fo_per', 'giveaways_pg', 'takeaways_pg'])
filtered_a = filtered_a.rename(columns={'streak': 'away_streak', 'time_travel': 'away_time_travel', 'gpg': 'away_gpg', "win_per": "away_win_per", "ppg_per": "away_ppg_per", 'shots_pg': 'away_shots_pg', 'hits_pg': 'away_hits_pg', 'pim_pg': 'away_pim_pg', 'fo_per': 'away_fo_per', 'giveaways_pg': 'away_giveaways_pg',
       'takeaways_pg': 'away_takeaways_pg'})

# merge away team stats
season_20112012 = season_20112012.merge(filtered_a, on='game_id', how='left')

In [None]:
season_20112012

In [None]:
season_20112012 = season_20112012.drop(columns=['season_x', 'season_y', 'game_id', 'type', 'date_time', 'date_time_GMT'])


In [None]:
season_20112012 = season_20112012.replace({'home_streak': False}, 0)
season_20112012 = season_20112012.replace({'home_streak': True}, 1)
season_20112012 = season_20112012.replace({'away_streak': False}, 0)
season_20112012 = season_20112012.replace({'away_streak': True}, 1)
season_20112012 = season_20112012.replace({'outcome': 'loss'}, 0)
season_20112012 = season_20112012.replace({'outcome': 'win'}, 1)

In [None]:
season_20112012 = season_20112012.fillna(0)

In [None]:
season_20112012 = season_20112012.drop(labels=['team_id', 'HoA', 'won', 'settled_in', 'head_coach', 'goals', 'shots', 'hits', 'pim', 'powerPlayOpportunities', 'powerPlayGoals',
       'faceOffWinPercentage', 'giveaways', 'takeaways', 'streak', 'gpg',
       'win_per', 'shots_pg', 'hits_pg', 'pim_pg', 'fo_per', 'giveaways_pg',
       'takeaways_pg', 'ppg_per'], axis=1)

##### **2012-2013**

In [None]:
season_20122013 = season_df[season_df.season == 20122013]
filtered = season_team[season_team.season == 20122013]

# now calc cumulative stats
filtered['gpg'] = (filtered.groupby('team_id')['goals'].cumsum() - filtered['goals']) / filtered.groupby('team_id').cumcount()
filtered['win_per'] = (filtered.groupby('team_id')['won'].cumsum() - filtered['won']) / filtered.groupby('team_id').cumcount()
filtered['shots_pg'] = (filtered.groupby('team_id')['shots'].cumsum() - filtered['shots']) / filtered.groupby('team_id').cumcount()
filtered['hits_pg'] = (filtered.groupby('team_id')['hits'].cumsum() - filtered['hits']) / filtered.groupby('team_id').cumcount()
filtered['pim_pg'] = (filtered.groupby('team_id')['pim'].cumsum() - filtered['pim']) / filtered.groupby('team_id').cumcount()
filtered['hits_pg'] = (filtered.groupby('team_id')['hits'].cumsum() - filtered['hits']) / filtered.groupby('team_id').cumcount()
filtered['fo_per'] = (filtered.groupby('team_id')['faceOffWinPercentage'].cumsum() - filtered['faceOffWinPercentage']) / filtered.groupby('team_id').cumcount()
filtered['giveaways_pg'] = (filtered.groupby('team_id')['giveaways'].cumsum() - filtered['giveaways']) / filtered.groupby('team_id').cumcount()
filtered['takeaways_pg'] = (filtered.groupby('team_id')['takeaways'].cumsum() - filtered['takeaways']) / filtered.groupby('team_id').cumcount()
filtered['ppg_per'] = ((filtered.groupby('team_id')['powerPlayGoals'].cumsum() - filtered['powerPlayGoals'])) / ((filtered.groupby('team_id')['powerPlayOpportunities'].cumsum() - filtered['powerPlayOpportunities']))

# filter home team stats
filtered_h = filtered[filtered.HoA == 'home']
filtered_h = filtered_h.filter(items=['game_id', 'season', 'streak', 'time_travel', 'gpg', 'win_per', 'ppg_per', 'shots_pg', 'hits_pg', 'pim_pg', 'fo_per', 'giveaways_pg', 'takeaways_pg'])
filtered_h = filtered_h.rename(columns={'streak': 'home_streak', 'time_travel': 'home_time_travel', 'gpg': 'home_gpg', "win_per": "home_win_per", "ppg_per": "home_ppg_per", 'shots_pg': 'home_shots_pg', 'hits_pg': 'home_hits_pg', 'pim_pg': 'home_pim_pg', 'fo_per': 'home_fo_per', 'giveaways_pg': 'home_giveaways_pg',
       'takeaways_pg': 'home_takeaways_pg'})
# merge home team stats
season_20122013 = season_20122013.merge(filtered_h, on='game_id', how='left')

# filter away team stats
filtered_a = filtered[filtered.HoA == 'away']
filtered_a = filtered_a.filter(items=['game_id', 'season', 'streak', 'time_travel', 'gpg', 'win_per', 'ppg_per', 'shots_pg', 'hits_pg', 'pim_pg', 'fo_per', 'giveaways_pg', 'takeaways_pg'])
filtered_a = filtered_a.rename(columns={'streak': 'away_streak', 'time_travel': 'away_time_travel', 'gpg': 'away_gpg', "win_per": "away_win_per", "ppg_per": "away_ppg_per", 'shots_pg': 'away_shots_pg', 'hits_pg': 'away_hits_pg', 'pim_pg': 'away_pim_pg', 'fo_per': 'away_fo_per', 'giveaways_pg': 'away_giveaways_pg',
       'takeaways_pg': 'away_takeaways_pg'})

# merge away team stats
season_20122013 = season_20122013.merge(filtered_a, on='game_id', how='left')

In [None]:
season_20122013 = season_20122013.drop(columns=['season_x', 'season_y', 'game_id', 'type', 'date_time', 'date_time_GMT'])
season_20122013 = season_20122013.replace({'home_streak': False}, 0)
season_20122013 = season_20122013.replace({'home_streak': True}, 1)
season_20122013 = season_20122013.replace({'away_streak': False}, 0)
season_20122013 = season_20122013.replace({'away_streak': True}, 1)
season_20122013 = season_20122013.replace({'outcome': 'loss'}, 0)
season_20122013 = season_20122013.replace({'outcome': 'win'}, 1)
season_20122013.fillna(0)

In [None]:
season_20122013 = season_20122013.fillna(0)

In [None]:
season_20122013.columns

In [None]:
season_20122013 = season_20122013.drop(labels=['season'], axis=1)

##### **2013-2014**

In [None]:
season_20132014 = season_df[season_df.season == 20132014]
filtered = season_team[season_team.season == 20132014]

# now calc cumulative stats
filtered['gpg'] = (filtered.groupby('team_id')['goals'].cumsum() - filtered['goals']) / filtered.groupby('team_id').cumcount()
filtered['win_per'] = (filtered.groupby('team_id')['won'].cumsum() - filtered['won']) / filtered.groupby('team_id').cumcount()
filtered['shots_pg'] = (filtered.groupby('team_id')['shots'].cumsum() - filtered['shots']) / filtered.groupby('team_id').cumcount()
filtered['hits_pg'] = (filtered.groupby('team_id')['hits'].cumsum() - filtered['hits']) / filtered.groupby('team_id').cumcount()
filtered['pim_pg'] = (filtered.groupby('team_id')['pim'].cumsum() - filtered['pim']) / filtered.groupby('team_id').cumcount()
filtered['hits_pg'] = (filtered.groupby('team_id')['hits'].cumsum() - filtered['hits']) / filtered.groupby('team_id').cumcount()
filtered['fo_per'] = (filtered.groupby('team_id')['faceOffWinPercentage'].cumsum() - filtered['faceOffWinPercentage']) / filtered.groupby('team_id').cumcount()
filtered['giveaways_pg'] = (filtered.groupby('team_id')['giveaways'].cumsum() - filtered['giveaways']) / filtered.groupby('team_id').cumcount()
filtered['takeaways_pg'] = (filtered.groupby('team_id')['takeaways'].cumsum() - filtered['takeaways']) / filtered.groupby('team_id').cumcount()
filtered['ppg_per'] = ((filtered.groupby('team_id')['powerPlayGoals'].cumsum() - filtered['powerPlayGoals'])) / ((filtered.groupby('team_id')['powerPlayOpportunities'].cumsum() - filtered['powerPlayOpportunities']))

# filter home team stats
filtered_h = filtered[filtered.HoA == 'home']
filtered_h = filtered_h.filter(items=['game_id', 'season', 'streak', 'time_travel', 'gpg', 'win_per', 'ppg_per', 'shots_pg', 'hits_pg', 'pim_pg', 'fo_per', 'giveaways_pg', 'takeaways_pg'])
filtered_h = filtered_h.rename(columns={'streak': 'home_streak', 'time_travel': 'home_time_travel', 'gpg': 'home_gpg', "win_per": "home_win_per", "ppg_per": "home_ppg_per", 'shots_pg': 'home_shots_pg', 'hits_pg': 'home_hits_pg', 'pim_pg': 'home_pim_pg', 'fo_per': 'home_fo_per', 'giveaways_pg': 'home_giveaways_pg',
       'takeaways_pg': 'home_takeaways_pg'})
# merge home team stats
season_20132014 = season_20132014.merge(filtered_h, on='game_id', how='left')

# filter away team stats
filtered_a = filtered[filtered.HoA == 'away']
filtered_a = filtered_a.filter(items=['game_id', 'season', 'streak', 'time_travel', 'gpg', 'win_per', 'ppg_per', 'shots_pg', 'hits_pg', 'pim_pg', 'fo_per', 'giveaways_pg', 'takeaways_pg'])
filtered_a = filtered_a.rename(columns={'streak': 'away_streak', 'time_travel': 'away_time_travel', 'gpg': 'away_gpg', "win_per": "away_win_per", "ppg_per": "away_ppg_per", 'shots_pg': 'away_shots_pg', 'hits_pg': 'away_hits_pg', 'pim_pg': 'away_pim_pg', 'fo_per': 'away_fo_per', 'giveaways_pg': 'away_giveaways_pg',
       'takeaways_pg': 'away_takeaways_pg'})

# merge away team stats
season_20132014 = season_20132014.merge(filtered_a, on='game_id', how='left')

In [None]:
season_20132014 = season_20132014.drop(columns=['season_x', 'season_y', 'game_id', 'type', 'date_time', 'date_time_GMT'])
season_20132014 = season_20132014.replace({'home_streak': False}, 0)
season_20132014 = season_20132014.replace({'home_streak': True}, 1)
season_20132014 = season_20132014.replace({'away_streak': False}, 0)
season_20132014 = season_20132014.replace({'away_streak': True}, 1)
season_20132014 = season_20132014.replace({'outcome': 'loss'}, 0)
season_20132014 = season_20132014.replace({'outcome': 'win'}, 1)
season_20132014.fillna(0)

In [None]:
season_20132014 = season_20132014.fillna(0)

In [None]:
season_20132014 = season_20132014.drop(labels=['season'], axis=1)

##### **2014-2015**

In [None]:
season_20142015 = season_df[season_df.season == 20142015]
filtered = season_team[season_team.season == 20142015]

# now calc cumulative stats
filtered['gpg'] = (filtered.groupby('team_id')['goals'].cumsum() - filtered['goals']) / filtered.groupby('team_id').cumcount()
filtered['win_per'] = (filtered.groupby('team_id')['won'].cumsum() - filtered['won']) / filtered.groupby('team_id').cumcount()
filtered['shots_pg'] = (filtered.groupby('team_id')['shots'].cumsum() - filtered['shots']) / filtered.groupby('team_id').cumcount()
filtered['hits_pg'] = (filtered.groupby('team_id')['hits'].cumsum() - filtered['hits']) / filtered.groupby('team_id').cumcount()
filtered['pim_pg'] = (filtered.groupby('team_id')['pim'].cumsum() - filtered['pim']) / filtered.groupby('team_id').cumcount()
filtered['hits_pg'] = (filtered.groupby('team_id')['hits'].cumsum() - filtered['hits']) / filtered.groupby('team_id').cumcount()
filtered['fo_per'] = (filtered.groupby('team_id')['faceOffWinPercentage'].cumsum() - filtered['faceOffWinPercentage']) / filtered.groupby('team_id').cumcount()
filtered['giveaways_pg'] = (filtered.groupby('team_id')['giveaways'].cumsum() - filtered['giveaways']) / filtered.groupby('team_id').cumcount()
filtered['takeaways_pg'] = (filtered.groupby('team_id')['takeaways'].cumsum() - filtered['takeaways']) / filtered.groupby('team_id').cumcount()
filtered['ppg_per'] = ((filtered.groupby('team_id')['powerPlayGoals'].cumsum() - filtered['powerPlayGoals'])) / ((filtered.groupby('team_id')['powerPlayOpportunities'].cumsum() - filtered['powerPlayOpportunities']))

# filter home team stats
filtered_h = filtered[filtered.HoA == 'home']
filtered_h = filtered_h.filter(items=['game_id', 'season', 'streak', 'time_travel', 'gpg', 'win_per', 'ppg_per', 'shots_pg', 'hits_pg', 'pim_pg', 'fo_per', 'giveaways_pg', 'takeaways_pg'])
filtered_h = filtered_h.rename(columns={'streak': 'home_streak', 'time_travel': 'home_time_travel', 'gpg': 'home_gpg', "win_per": "home_win_per", "ppg_per": "home_ppg_per", 'shots_pg': 'home_shots_pg', 'hits_pg': 'home_hits_pg', 'pim_pg': 'home_pim_pg', 'fo_per': 'home_fo_per', 'giveaways_pg': 'home_giveaways_pg',
       'takeaways_pg': 'home_takeaways_pg'})
# merge home team stats
season_20142015 = season_20142015.merge(filtered_h, on='game_id', how='left')

# filter away team stats
filtered_a = filtered[filtered.HoA == 'away']
filtered_a = filtered_a.filter(items=['game_id', 'season', 'streak', 'time_travel', 'gpg', 'win_per', 'ppg_per', 'shots_pg', 'hits_pg', 'pim_pg', 'fo_per', 'giveaways_pg', 'takeaways_pg'])
filtered_a = filtered_a.rename(columns={'streak': 'away_streak', 'time_travel': 'away_time_travel', 'gpg': 'away_gpg', "win_per": "away_win_per", "ppg_per": "away_ppg_per", 'shots_pg': 'away_shots_pg', 'hits_pg': 'away_hits_pg', 'pim_pg': 'away_pim_pg', 'fo_per': 'away_fo_per', 'giveaways_pg': 'away_giveaways_pg',
       'takeaways_pg': 'away_takeaways_pg'})

# merge away team stats
season_20142015 = season_20142015.merge(filtered_a, on='game_id', how='left')

In [None]:
season_20142015 = season_20142015.drop(columns=['season_x', 'season_y', 'game_id', 'type', 'date_time', 'date_time_GMT'])
season_20142015 = season_20142015.replace({'home_streak': False}, 0)
season_20142015 = season_20142015.replace({'home_streak': True}, 1)
season_20142015 = season_20142015.replace({'away_streak': False}, 0)
season_20142015 = season_20142015.replace({'away_streak': True}, 1)
season_20142015 = season_20142015.replace({'outcome': 'loss'}, 0)
season_20142015 = season_20142015.replace({'outcome': 'win'}, 1)
season_20142015.fillna(0)

In [None]:
season_20142015 = season_20142015.fillna(0)

In [None]:
season_20142015 = season_20142015.drop(labels=['season'], axis=1)

##### **2015-2016**

In [None]:
season_20152016 = season_df[season_df.season == 20152016]
filtered = season_team[season_team.season == 20152016]

# now calc cumulative stats
filtered['gpg'] = (filtered.groupby('team_id')['goals'].cumsum() - filtered['goals']) / filtered.groupby('team_id').cumcount()
filtered['win_per'] = (filtered.groupby('team_id')['won'].cumsum() - filtered['won']) / filtered.groupby('team_id').cumcount()
filtered['shots_pg'] = (filtered.groupby('team_id')['shots'].cumsum() - filtered['shots']) / filtered.groupby('team_id').cumcount()
filtered['hits_pg'] = (filtered.groupby('team_id')['hits'].cumsum() - filtered['hits']) / filtered.groupby('team_id').cumcount()
filtered['pim_pg'] = (filtered.groupby('team_id')['pim'].cumsum() - filtered['pim']) / filtered.groupby('team_id').cumcount()
filtered['hits_pg'] = (filtered.groupby('team_id')['hits'].cumsum() - filtered['hits']) / filtered.groupby('team_id').cumcount()
filtered['fo_per'] = (filtered.groupby('team_id')['faceOffWinPercentage'].cumsum() - filtered['faceOffWinPercentage']) / filtered.groupby('team_id').cumcount()
filtered['giveaways_pg'] = (filtered.groupby('team_id')['giveaways'].cumsum() - filtered['giveaways']) / filtered.groupby('team_id').cumcount()
filtered['takeaways_pg'] = (filtered.groupby('team_id')['takeaways'].cumsum() - filtered['takeaways']) / filtered.groupby('team_id').cumcount()
filtered['ppg_per'] = ((filtered.groupby('team_id')['powerPlayGoals'].cumsum() - filtered['powerPlayGoals'])) / ((filtered.groupby('team_id')['powerPlayOpportunities'].cumsum() - filtered['powerPlayOpportunities']))

# filter home team stats
filtered_h = filtered[filtered.HoA == 'home']
filtered_h = filtered_h.filter(items=['game_id', 'season', 'streak', 'time_travel', 'gpg', 'win_per', 'ppg_per', 'shots_pg', 'hits_pg', 'pim_pg', 'fo_per', 'giveaways_pg', 'takeaways_pg'])
filtered_h = filtered_h.rename(columns={'streak': 'home_streak', 'time_travel': 'home_time_travel', 'gpg': 'home_gpg', "win_per": "home_win_per", "ppg_per": "home_ppg_per", 'shots_pg': 'home_shots_pg', 'hits_pg': 'home_hits_pg', 'pim_pg': 'home_pim_pg', 'fo_per': 'home_fo_per', 'giveaways_pg': 'home_giveaways_pg',
       'takeaways_pg': 'home_takeaways_pg'})
# merge home team stats
season_20152016 = season_20152016.merge(filtered_h, on='game_id', how='left')

# filter away team stats
filtered_a = filtered[filtered.HoA == 'away']
filtered_a = filtered_a.filter(items=['game_id', 'season', 'streak', 'time_travel', 'gpg', 'win_per', 'ppg_per', 'shots_pg', 'hits_pg', 'pim_pg', 'fo_per', 'giveaways_pg', 'takeaways_pg'])
filtered_a = filtered_a.rename(columns={'streak': 'away_streak', 'time_travel': 'away_time_travel', 'gpg': 'away_gpg', "win_per": "away_win_per", "ppg_per": "away_ppg_per", 'shots_pg': 'away_shots_pg', 'hits_pg': 'away_hits_pg', 'pim_pg': 'away_pim_pg', 'fo_per': 'away_fo_per', 'giveaways_pg': 'away_giveaways_pg',
       'takeaways_pg': 'away_takeaways_pg'})

# merge away team stats
season_20152016 = season_20152016.merge(filtered_a, on='game_id', how='left')

In [None]:
season_20152016 = season_20152016.drop(columns=['season_x', 'season_y', 'game_id', 'type', 'date_time', 'date_time_GMT'])
season_20152016 = season_20152016.replace({'home_streak': False}, 0)
season_20152016 = season_20152016.replace({'home_streak': True}, 1)
season_20152016 = season_20152016.replace({'away_streak': False}, 0)
season_20152016 = season_20152016.replace({'away_streak': True}, 1)
season_20152016 = season_20152016.replace({'outcome': 'loss'}, 0)
season_20152016 = season_20152016.replace({'outcome': 'win'}, 1)
season_20152016.fillna(0)

In [None]:
season_20152016 = season_20152016.fillna(0)

In [None]:
season_20152016 = season_20152016.drop(labels=['season'], axis=1)

##### **2016-2017**

In [None]:
season_20162017 = season_df[season_df.season == 20162017]
filtered = season_team[season_team.season == 20162017]

# now calc cumulative stats
filtered['gpg'] = (filtered.groupby('team_id')['goals'].cumsum() - filtered['goals']) / filtered.groupby('team_id').cumcount()
filtered['win_per'] = (filtered.groupby('team_id')['won'].cumsum() - filtered['won']) / filtered.groupby('team_id').cumcount()
filtered['shots_pg'] = (filtered.groupby('team_id')['shots'].cumsum() - filtered['shots']) / filtered.groupby('team_id').cumcount()
filtered['hits_pg'] = (filtered.groupby('team_id')['hits'].cumsum() - filtered['hits']) / filtered.groupby('team_id').cumcount()
filtered['pim_pg'] = (filtered.groupby('team_id')['pim'].cumsum() - filtered['pim']) / filtered.groupby('team_id').cumcount()
filtered['hits_pg'] = (filtered.groupby('team_id')['hits'].cumsum() - filtered['hits']) / filtered.groupby('team_id').cumcount()
filtered['fo_per'] = (filtered.groupby('team_id')['faceOffWinPercentage'].cumsum() - filtered['faceOffWinPercentage']) / filtered.groupby('team_id').cumcount()
filtered['giveaways_pg'] = (filtered.groupby('team_id')['giveaways'].cumsum() - filtered['giveaways']) / filtered.groupby('team_id').cumcount()
filtered['takeaways_pg'] = (filtered.groupby('team_id')['takeaways'].cumsum() - filtered['takeaways']) / filtered.groupby('team_id').cumcount()
filtered['ppg_per'] = ((filtered.groupby('team_id')['powerPlayGoals'].cumsum() - filtered['powerPlayGoals'])) / ((filtered.groupby('team_id')['powerPlayOpportunities'].cumsum() - filtered['powerPlayOpportunities']))

# filter home team stats
filtered_h = filtered[filtered.HoA == 'home']
filtered_h = filtered_h.filter(items=['game_id', 'season', 'streak', 'time_travel', 'gpg', 'win_per', 'ppg_per', 'shots_pg', 'hits_pg', 'pim_pg', 'fo_per', 'giveaways_pg', 'takeaways_pg'])
filtered_h = filtered_h.rename(columns={'streak': 'home_streak', 'time_travel': 'home_time_travel', 'gpg': 'home_gpg', "win_per": "home_win_per", "ppg_per": "home_ppg_per", 'shots_pg': 'home_shots_pg', 'hits_pg': 'home_hits_pg', 'pim_pg': 'home_pim_pg', 'fo_per': 'home_fo_per', 'giveaways_pg': 'home_giveaways_pg',
       'takeaways_pg': 'home_takeaways_pg'})
# merge home team stats
season_20162017 = season_20162017.merge(filtered_h, on='game_id', how='left')

# filter away team stats
filtered_a = filtered[filtered.HoA == 'away']
filtered_a = filtered_a.filter(items=['game_id', 'season', 'streak', 'time_travel', 'gpg', 'win_per', 'ppg_per', 'shots_pg', 'hits_pg', 'pim_pg', 'fo_per', 'giveaways_pg', 'takeaways_pg'])
filtered_a = filtered_a.rename(columns={'streak': 'away_streak', 'time_travel': 'away_time_travel', 'gpg': 'away_gpg', "win_per": "away_win_per", "ppg_per": "away_ppg_per", 'shots_pg': 'away_shots_pg', 'hits_pg': 'away_hits_pg', 'pim_pg': 'away_pim_pg', 'fo_per': 'away_fo_per', 'giveaways_pg': 'away_giveaways_pg',
       'takeaways_pg': 'away_takeaways_pg'})

# merge away team stats
season_20162017 = season_20162017.merge(filtered_a, on='game_id', how='left')

In [None]:
season_20162017 = season_20162017.drop(columns=['season_x', 'season_y', 'game_id', 'type', 'date_time', 'date_time_GMT'])
season_20162017 = season_20162017.replace({'home_streak': False}, 0)
season_20162017 = season_20162017.replace({'home_streak': True}, 1)
season_20162017 = season_20162017.replace({'away_streak': False}, 0)
season_20162017 = season_20162017.replace({'away_streak': True}, 1)
season_20162017 = season_20162017.replace({'outcome': 'loss'}, 0)
season_20162017 = season_20162017.replace({'outcome': 'win'}, 1)
season_20162017.fillna(0)

In [None]:
season_20162017 = season_20162017.fillna(0)

In [None]:
season_20162017 = season_20162017.drop(labels=['season'], axis=1)

##### **2017-2018**

In [None]:
season_20172018 = season_df[season_df.season == 20172018]
filtered = season_team[season_team.season == 20172018]

# now calc cumulative stats
filtered['gpg'] = (filtered.groupby('team_id')['goals'].cumsum() - filtered['goals']) / filtered.groupby('team_id').cumcount()
filtered['win_per'] = (filtered.groupby('team_id')['won'].cumsum() - filtered['won']) / filtered.groupby('team_id').cumcount()
filtered['shots_pg'] = (filtered.groupby('team_id')['shots'].cumsum() - filtered['shots']) / filtered.groupby('team_id').cumcount()
filtered['hits_pg'] = (filtered.groupby('team_id')['hits'].cumsum() - filtered['hits']) / filtered.groupby('team_id').cumcount()
filtered['pim_pg'] = (filtered.groupby('team_id')['pim'].cumsum() - filtered['pim']) / filtered.groupby('team_id').cumcount()
filtered['hits_pg'] = (filtered.groupby('team_id')['hits'].cumsum() - filtered['hits']) / filtered.groupby('team_id').cumcount()
filtered['fo_per'] = (filtered.groupby('team_id')['faceOffWinPercentage'].cumsum() - filtered['faceOffWinPercentage']) / filtered.groupby('team_id').cumcount()
filtered['giveaways_pg'] = (filtered.groupby('team_id')['giveaways'].cumsum() - filtered['giveaways']) / filtered.groupby('team_id').cumcount()
filtered['takeaways_pg'] = (filtered.groupby('team_id')['takeaways'].cumsum() - filtered['takeaways']) / filtered.groupby('team_id').cumcount()
filtered['ppg_per'] = ((filtered.groupby('team_id')['powerPlayGoals'].cumsum() - filtered['powerPlayGoals'])) / ((filtered.groupby('team_id')['powerPlayOpportunities'].cumsum() - filtered['powerPlayOpportunities']))

# filter home team stats
filtered_h = filtered[filtered.HoA == 'home']
filtered_h = filtered_h.filter(items=['game_id', 'season', 'streak', 'time_travel', 'gpg', 'win_per', 'ppg_per', 'shots_pg', 'hits_pg', 'pim_pg', 'fo_per', 'giveaways_pg', 'takeaways_pg'])
filtered_h = filtered_h.rename(columns={'streak': 'home_streak', 'time_travel': 'home_time_travel', 'gpg': 'home_gpg', "win_per": "home_win_per", "ppg_per": "home_ppg_per", 'shots_pg': 'home_shots_pg', 'hits_pg': 'home_hits_pg', 'pim_pg': 'home_pim_pg', 'fo_per': 'home_fo_per', 'giveaways_pg': 'home_giveaways_pg',
       'takeaways_pg': 'home_takeaways_pg'})
# merge home team stats
season_20172018 = season_20172018.merge(filtered_h, on='game_id', how='left')

# filter away team stats
filtered_a = filtered[filtered.HoA == 'away']
filtered_a = filtered_a.filter(items=['game_id', 'season', 'streak', 'time_travel', 'gpg', 'win_per', 'ppg_per', 'shots_pg', 'hits_pg', 'pim_pg', 'fo_per', 'giveaways_pg', 'takeaways_pg'])
filtered_a = filtered_a.rename(columns={'streak': 'away_streak', 'time_travel': 'away_time_travel', 'gpg': 'away_gpg', "win_per": "away_win_per", "ppg_per": "away_ppg_per", 'shots_pg': 'away_shots_pg', 'hits_pg': 'away_hits_pg', 'pim_pg': 'away_pim_pg', 'fo_per': 'away_fo_per', 'giveaways_pg': 'away_giveaways_pg',
       'takeaways_pg': 'away_takeaways_pg'})

# merge away team stats
season_20172018 = season_20172018.merge(filtered_a, on='game_id', how='left')

In [None]:
season_20172018 = season_20172018.drop(columns=['season_x', 'season_y', 'game_id', 'type', 'date_time', 'date_time_GMT'])
season_20172018 = season_20172018.replace({'home_streak': False}, 0)
season_20172018 = season_20172018.replace({'home_streak': True}, 1)
season_20172018 = season_20172018.replace({'away_streak': False}, 0)
season_20172018 = season_20172018.replace({'away_streak': True}, 1)
season_20172018 = season_20172018.replace({'outcome': 'loss'}, 0)
season_20172018 = season_20172018.replace({'outcome': 'win'}, 1)

In [None]:
season_20172018 = season_20172018.fillna(0)

In [None]:
season_20172018 = season_20172018.drop(labels=['season'], axis=1)

##### **2018-2019**

In [None]:
season_20182019 = season_df[season_df.season == 20182019]
filtered = season_team[season_team.season == 20182019]

# now calc cumulative stats
filtered['gpg'] = (filtered.groupby('team_id')['goals'].cumsum() - filtered['goals']) / filtered.groupby('team_id').cumcount()
filtered['win_per'] = (filtered.groupby('team_id')['won'].cumsum() - filtered['won']) / filtered.groupby('team_id').cumcount()
filtered['shots_pg'] = (filtered.groupby('team_id')['shots'].cumsum() - filtered['shots']) / filtered.groupby('team_id').cumcount()
filtered['hits_pg'] = (filtered.groupby('team_id')['hits'].cumsum() - filtered['hits']) / filtered.groupby('team_id').cumcount()
filtered['pim_pg'] = (filtered.groupby('team_id')['pim'].cumsum() - filtered['pim']) / filtered.groupby('team_id').cumcount()
filtered['hits_pg'] = (filtered.groupby('team_id')['hits'].cumsum() - filtered['hits']) / filtered.groupby('team_id').cumcount()
filtered['fo_per'] = (filtered.groupby('team_id')['faceOffWinPercentage'].cumsum() - filtered['faceOffWinPercentage']) / filtered.groupby('team_id').cumcount()
filtered['giveaways_pg'] = (filtered.groupby('team_id')['giveaways'].cumsum() - filtered['giveaways']) / filtered.groupby('team_id').cumcount()
filtered['takeaways_pg'] = (filtered.groupby('team_id')['takeaways'].cumsum() - filtered['takeaways']) / filtered.groupby('team_id').cumcount()
filtered['ppg_per'] = ((filtered.groupby('team_id')['powerPlayGoals'].cumsum() - filtered['powerPlayGoals'])) / ((filtered.groupby('team_id')['powerPlayOpportunities'].cumsum() - filtered['powerPlayOpportunities']))

# filter home team stats
filtered_h = filtered[filtered.HoA == 'home']
filtered_h = filtered_h.filter(items=['game_id', 'season', 'streak', 'time_travel', 'gpg', 'win_per', 'ppg_per', 'shots_pg', 'hits_pg', 'pim_pg', 'fo_per', 'giveaways_pg', 'takeaways_pg'])
filtered_h = filtered_h.rename(columns={'streak': 'home_streak', 'time_travel': 'home_time_travel', 'gpg': 'home_gpg', "win_per": "home_win_per", "ppg_per": "home_ppg_per", 'shots_pg': 'home_shots_pg', 'hits_pg': 'home_hits_pg', 'pim_pg': 'home_pim_pg', 'fo_per': 'home_fo_per', 'giveaways_pg': 'home_giveaways_pg',
       'takeaways_pg': 'home_takeaways_pg'})
# merge home team stats
season_20182019 = season_20182019.merge(filtered_h, on='game_id', how='left')

# filter away team stats
filtered_a = filtered[filtered.HoA == 'away']
filtered_a = filtered_a.filter(items=['game_id', 'season', 'streak', 'time_travel', 'gpg', 'win_per', 'ppg_per', 'shots_pg', 'hits_pg', 'pim_pg', 'fo_per', 'giveaways_pg', 'takeaways_pg'])
filtered_a = filtered_a.rename(columns={'streak': 'away_streak', 'time_travel': 'away_time_travel', 'gpg': 'away_gpg', "win_per": "away_win_per", "ppg_per": "away_ppg_per", 'shots_pg': 'away_shots_pg', 'hits_pg': 'away_hits_pg', 'pim_pg': 'away_pim_pg', 'fo_per': 'away_fo_per', 'giveaways_pg': 'away_giveaways_pg',
       'takeaways_pg': 'away_takeaways_pg'})

# merge away team stats
season_20182019 = season_20182019.merge(filtered_a, on='game_id', how='left')

In [None]:
season_20182019 = season_20182019.drop(columns=['season_x', 'season_y', 'game_id', 'type', 'date_time', 'date_time_GMT'])
season_20182019 = season_20182019.replace({'home_streak': False}, 0)
season_20182019 = season_20182019.replace({'home_streak': True}, 1)
season_20182019 = season_20182019.replace({'away_streak': False}, 0)
season_20182019 = season_20182019.replace({'away_streak': True}, 1)
season_20182019 = season_20182019.replace({'outcome': 'loss'}, 0)
season_20182019 = season_20182019.replace({'outcome': 'win'}, 1)
season_20182019 = season_20182019.fillna(0)

In [None]:
season_20182019 = season_20182019.drop(labels=['season'], axis=1)

* Below are the dataframes containing data only for each season

In [None]:
df_list = [season_20112012,
           season_20122013,
           season_20132014,
           season_20142015,
           season_20152016,
           season_20162017,
           season_20172018,
           season_20182019]

In [None]:
season_names = ['s_20112012',
                's_20122013',
                's_20132014',
                's_20142015',
                's_20152016',
                's_20162017',
                's_20172018',
                's_20182019']

## **Models**

##### **Logistic Regression**

In [None]:
log_reg = []
for df in df_list:
    X = df.drop(labels=['outcome'], axis=1)
    y = df.outcome
    
    X_train, X_test, y_train, y_test = cv.train_test_split(X, y, test_size=0.25, random_state=1)
    
    kf = KFold(n_splits=5, shuffle=True)  # almost always use shuffle=True
    logr_scores = []

    model_kf = LogisticRegression()
    
    for train, test in kf.split(X_train):
        ## fit the model to training data
        model_kf.fit(X_train.values[train], y_train.values[train])
        ## evaluate the model on testing data!!!
        logr_scores.append(model_kf.score(X_train.values[test], y_train.values[test]))
    
    log_reg.append(np.mean(logr_scores))

In [None]:
log_reg_acc = {}
for k, v in zip(season_names, log_reg):
    log_reg_acc[k] = v
    
log_reg_acc

In [None]:
logit_pvalue(model_kf, X_train)

pvals = {}
for feat, pval in zip(list(X_train.columns), logit_pvalue(model_kf, X_train)):
    pvals[feat] = pval

print(betas2)
pvals

##### **Random Forest**

In [None]:
rand_for = []
for df in df_list: 
    X = df.drop(labels=['outcome'], axis=1)
    y = df.outcome
    
    X_train, X_test, y_train, y_test = cv.train_test_split(X, y, test_size=0.25, random_state=1)
    
    kf = KFold(n_splits=5, shuffle=True)  # almost always use shuffle=True
    rf_scores = []

    model_rf = RandomForestClassifier(n_estimators=1000,
                           max_features='auto',
                           random_state=0,
                           oob_score=True, max_depth=9)
    
    for train, test in kf.split(X_train):
        ## fit the model to training data
        model_rf.fit(X_train.values[train], y_train.values[train])
        ## evaluate the model on testing data!!!
        rf_scores.append(model_rf.score(X_train.values[test], y_train.values[test]))
    
    rand_for.append(np.mean(rf_scores))

In [None]:
rf_acc = {}
for k, v in zip(season_names, rand_for):
    rf_acc[k] = v
    
rf_acc

In [None]:
rf = RandomForestClassifier(n_estimators=1000,
                           max_features='auto',
                           random_state=0,
                           oob_score=True, max_depth=9)
rf_oob = []

for df in df_list:
    X = df.drop(labels=['outcome'], axis=1)
    y = df.outcome
    
    X_train, X_test, y_train, y_test = cv.train_test_split(X, y, test_size=0.25, random_state=1)
    
    rf.fit(X_train, y_train)
    rf_oob.append(rf.oob_score_)

oob_scores = {}
for k, v in zip(season_names, rf_oob):
    oob_scores[k] = v
oob_scores

* Season 2011-2012 is really high so maybe the sample isn't large enough.  Will try training a model for one season and use it to predict others.  

##### **Gradient Boosting**

In [None]:
grad_boo = []
for df in df_list: 
    X = df.drop(labels=['outcome'], axis=1)
    y = df.outcome
    
    X_train, X_test, y_train, y_test = cv.train_test_split(X, y, test_size=0.25, random_state=1)
    
    kf = KFold(n_splits=5, shuffle=True)  # almost always use shuffle=True
    gb_scores = []

    model_gb = GradientBoostingClassifier(loss='deviance',
                                   learning_rate=0.005,
                                   n_estimators=500,
                                   min_samples_leaf=5)
    
    for train, test in kf.split(X_train):
        ## fit the model to training data
        model_gb.fit(X_train.values[train], y_train.values[train])
        ## evaluate the model on testing data!!!
        gb_scores.append(model_gb.score(X_train.values[test], y_train.values[test]))
    
    grad_boo.append(np.mean(gb_scores))

In [None]:
gb_acc = {}
for k, v in zip(season_names, grad_boo):
    gb_acc[k] = v
    
gb_acc

In [None]:
gb = GradientBoostingClassifier(loss='deviance',
                                   learning_rate=0.005,
                                   n_estimators=500,
                                   min_samples_leaf=5)

gb_score = []

for df in df_list:
    X = df.drop(labels=['outcome'], axis=1)
    y = df.outcome
    
    X_train, X_test, y_train, y_test = cv.train_test_split(X, y, test_size=0.25, random_state=1)
    
    gb.fit(X_train, y_train)
    gb_score.append(gb.score(X_test, y_test))

gb_scores_nocv = {}
for k, v in zip(season_names, gb_score):
    gb_scores_nocv[k] = v
gb_scores_nocv

In [None]:
'''
Cross-val log reg:      log_reg_acc
Cross-val rand forest:  rf_acc
Cross-val grad boost:   gb_acc

Rand forest oob:        oob_scores
Grad boos no cross val: gb_scores_nocv
'''

In [None]:
plt.plot(season_names, log_reg, label='Log Reg') 
plt.plot(season_names, rand_for, label='Random Forest')
plt.plot(season_names, grad_boo, label='Gradient Boost')
plt.legend()
plt.xlabel

In [None]:
fig = plt.figure(figsize=(20,10))
plt.plot(season_names[1:], log_reg[1:], linewidth=4, label=f'Logistic Reg: {np.mean(log_reg[1:]):.4f}') 
plt.plot(season_names[1:], rand_for[1:], linewidth=4, color='green', label=f'Random Forest: {np.mean(rand_for[1:]):.4f}')
plt.plot(season_names[1:], grad_boo[1:], linewidth=4, label=f'Gradient Boost: {np.mean(grad_boo[1:]):.4f}')
plt.legend(prop={'size': 21})
plt.xlabel('Seasons')
plt.ylabel('Scores')
plt.title('Cross-Validation Scores')
plt.hlines((np.mean(log_reg[1:])), season_names[1:], season_names[-1:], colors='r', linestyles='dashed')
plt.hlines((np.mean(rand_for[1:])), season_names[1:], season_names[-1:], colors='green', linestyles='dashed')
plt.hlines((np.mean(grad_boo[1:])), season_names[1:], season_names[-1:], colors='blue', linestyles='dashed')
# plt.savefig('Cross_val_all_season_train')

* Eliminated the first season for now.  Need to look into why its accuracy was unusually high.
* According to the chart, GB performed best in 2 seasons, RF best in 2 seasons, LR best in 2 seasons.

## **Train Model on 1 season**

In [None]:
'''
Objectives:
    -Use 2016-2017 season since all 3 models were fairly close for that one.  Maybe 2014-2015.
        'season_20162017'
    -Random Forest - with random search
    -Gradient Boost - with random search?
    -Deep Learning
    
Later:
    -Regression for goal spread?
    -Add in player stats?
'''

In [None]:
X = season_20162017.drop(labels=['outcome'], axis=1)
y = season_20162017.outcome

In [None]:
X_train, X_test, y_train, y_test = cv.train_test_split(X, y, test_size=0.2, random_state=1)
X_train, X_val, y_train, y_val = cv.train_test_split(X_train, y_train, test_size=0.2, random_state=1)

##### **Random Forest with Grid Search**

In [None]:
rf2 = RandomForestClassifier(n_estimators=1000,
                           max_features='auto',
                           random_state=0,
                           oob_score=True, max_depth=10)

parameters = {'n_estimators': [100, 250, 500, 750, 1000],
              #'criterion': ['mse'],
              'max_depth': [5, 10, 15], 
              'min_samples_split': [2, 5, 7, 10],
              'min_samples_leaf': [1, 3, 5]
             }

grid_obj = GridSearchCV(rf2, parameters,
                        cv=5,
                        n_jobs=-1,
                        verbose=1)

grid_obj = grid_obj.fit(X_train, y_train)

In [None]:
rf2 = grid_obj.best_estimator_
rf2.fit(X_train, y_train)

In [None]:
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

predictions = rf2.predict(X_val)
y_validation_RF = y_val.reset_index()['outcome']

print('R2 score = ',r2_score(y_validation_RF, predictions), '/ 1.0')
print('MSE score = ',mean_squared_error(y_validation_RF, predictions), '/ 0.0')

In [None]:
# test data predictions
predictions = rf2.predict(X_test)

y_test_RF = y_test.reset_index()['outcome']

print('R2 score = ',r2_score(y_test_RF, predictions), '/ 1.0')
print('MSE score = ',mean_squared_error(y_test_RF, predictions), '/ 0.0')

In [None]:
rf2.score(X_test, y_test)

In [None]:
plt.plot(y_validation_RF[0:50], '+', color ='blue', alpha=0.7)
plt.plot(predictions[0:50], 'ro', color ='red', alpha=0.5)
plt.title('Prediction vs Real values')
plt.show()

* Improvement of about .6%

##### **Gradient Boost with Grid Search**

In [None]:
gb2 = GradientBoostingClassifier(loss='deviance',
                                   learning_rate=0.005,
                                   n_estimators=500,
                                   min_samples_leaf=5)

parameters_gb = {'learning_rate': [.001, .005, .01, .015, .02],
              #'criterion': ['mse'],
              'n_estimators': [10, 50, 100, 250, 500, 1000],
              'max_depth': [3, 5, 10], 
              'min_samples_split': [2, 5, 7, 10],
              'min_samples_leaf': [1, 3, 5]
             }

grid_obj_gb = GridSearchCV(gb2, parameters_gb,
                        cv=5,
                        n_jobs=-1,
                        verbose=1)

grid_obj_gb = grid_obj_gb.fit(X_train, y_train)

In [None]:
gb2 = grid_obj_gb.best_estimator_
gb2.fit(X_train, y_train)

In [None]:
predictions_gb = gb2.predict(X_val)
y_validation_gb = y_val.reset_index()['outcome']

print('R2 score = ',r2_score(y_validation_gb, predictions_gb), '/ 1.0')
print('MSE score = ',mean_squared_error(y_validation_gb, predictions_gb), '/ 0.0')

In [None]:
# test data predictions
predictions_gb = gb2.predict(X_test)

y_test_gb = y_test.reset_index()['outcome']

print('R2 score = ',r2_score(y_test_gb, predictions_gb), '/ 1.0')
print('MSE score = ',mean_squared_error(y_test_gb, predictions_gb), '/ 0.0')

In [None]:
gb2.score(X_test, y_test)

##### **Use the RF and GB models to predict other seasons**

In [None]:
rand_for2 = []
rf_scores2 = []
for df in df_list: 
    X = df.drop(labels=['outcome'], axis=1)
    y = df.outcome
    
    X_train, X_test, y_train, y_test = cv.train_test_split(X, y, test_size=0.25, random_state=1)
    
    kf = KFold(n_splits=5, shuffle=True)  # almost always use shuffle=True
    rf_scores = []
    
    for train, test in kf.split(X_train):
        ## fit the model to training data
        rf2.fit(X_train.values[train], y_train.values[train])
        ## evaluate the model on testing data!!!
        rf_scores.append(rf2.score(X_train.values[test], y_train.values[test]))
    rf_scores2.append(rf2.score(X_test, y_test))
    rand_for2.append(np.mean(rf_scores))

In [None]:
rf_acc2 = {}
for k, v in zip(season_names, rand_for2):
    rf_acc2[k] = v
    
rf_acc2

In [None]:
predict_scores_rf = {}
for k, v in zip(season_names, rf_scores2):
    predict_scores_rf[k] = v
    
predict_scores_rf

In [None]:
grad_boo2 = []
gb_scores2 = []
for df in df_list: 
    X = df.drop(labels=['outcome'], axis=1)
    y = df.outcome
    
    X_train, X_test, y_train, y_test = cv.train_test_split(X, y, test_size=0.25, random_state=1)
    
    kf = KFold(n_splits=5, shuffle=True)  # almost always use shuffle=True
    gb_scores = []
    
    for train, test in kf.split(X_train):
        ## fit the model to training data
        gb2.fit(X_train.values[train], y_train.values[train])
        ## evaluate the model on testing data!!!
        gb_scores.append(gb2.score(X_train.values[test], y_train.values[test]))
    gb_scores2.append(gb2.score(X_test, y_test))
    grad_boo2.append(np.mean(gb_scores))

In [None]:
gb_acc2 = {}
for k, v in zip(season_names, grad_boo2):
    gb_acc2[k] = v
    
gb_acc2

In [None]:
predict_scores_gb = {}
for k, v in zip(season_names, gb_scores2):
    predict_scores_gb[k] = v
    
predict_scores_gb

In [None]:
fig = plt.figure(figsize=(20,10))
plt.plot(season_names[1:], rand_for2[1:], linewidth=4, color='green', label=f'Random Forest: {np.mean(rand_for2[1:]):.4f}')
plt.plot(season_names[1:], grad_boo2[1:], linewidth=4, label=f'Gradient Boost: {np.mean(grad_boo2[1:]):.4f}')
plt.legend(prop={'size': 21})
plt.xlabel('Seasons')
plt.ylabel('Scores')
plt.title('Cross-Validation Scores')
plt.hlines((np.mean(rand_for2[1:])), season_names[1:], season_names[-1:], colors='green', linestyles='dashed')
plt.hlines((np.mean(grad_boo2[1:])), season_names[1:], season_names[-1:], colors='red', linestyles='dashed')
# plt.savefig('one_season_training')

##### **Tensorflow if time**
* from: https://www.kaggle.com/kyubii/nba-deep-learning-2-2

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation

In [None]:
model = Sequential()

In [None]:
# Building the model
model.add(Dense(200, input_dim=50, kernel_initializer='normal', activation='relu'))
model.add(Dense(100, kernel_initializer='normal', activation='relu'))
model.add(Dense(50, kernel_initializer='normal', activation='relu'))
model.add(Dense(25, kernel_initializer='normal', activation='relu'))
model.add(Dense(1, kernel_initializer='normal'))

model.compile(loss = 'mse', optimizer='adam', metrics=['mse']) #mse: mean_square_error
model.summary()

In [None]:
# Convert data as np.array
features = np.array(X_train)
targets = np.array(y_train)

features_validation= np.array(X_val)
targets_validation = np.array(y_val)

features_test= np.array(X_test)
targets_test = np.array(y_test)

In [None]:
print(features.shape)
print(targets.shape)

In [None]:
model.fit(features, targets, epochs=10, batch_size=24)

In [None]:
# Training the model
epochs_tot = 1000
epochs_step = 100
epochs_ratio = int(epochs_tot / epochs_step)
hist =np.array([])

for i in range(epochs_ratio):
    history = model.fit(features, targets, epochs=epochs_step, batch_size=100, verbose=0)
    
    # Evaluating the model on the training and testing set
    print("Step : " , i * epochs_step, "/", epochs_tot)
    score = model.evaluate(features, targets)
    print("Training MSE:", score[1])
    score = model.evaluate(features_validation, targets_validation)
    print("Validation MSE:", score[1], "\n")
    hist = np.concatenate((hist, np.array(history.history['mse'])), axis = 0)
    
# plot metrics
plt.plot(hist)
plt.show()

## **Betting**

-Correct: 55%
-https://www.vegasinsider.com/nhl/odds/las-vegas/
  EDMONTON -160 (BET $100 TO WIN $62.50)
  ARIZONA +145 (BET $100 TO WIN $145)
  
Favorite:               Underdog:
-If correct: +62.50     -If correct: +145
-If wrong: -100         -If wrong: -100

In [None]:
print('Gain/Loss for Favorite: ', (55 * 62.50) - (45 * 100))
print('Gain/Loss for Underdog: ', (45 * 145) - (55 * 100))

* According to https://www.oddsshark.com/sports-betting/which-sport-do-betting-underdogs-win-most-often,
  NHL underdogs win approximately 41% of the time.

In [None]:
print('Gain/Loss for Favorite: ', (61.5 * 62.50) - (38.5 * 100))
print('Gain/Loss for Underdog: ', (41 * 145) - (59 * 100))

#### *To break even, you need to be right about 61.5% of the time.*

In [None]:
from random import choices

In [None]:
population = [0, 1]
weights = [0.45, 0.55]
num_bets = 100

def bet_sim(population, weights, num_bets):
    money = 0
    wins = []
    for i in range(num_bets):
        wins.append(choices(population, weights))
    
    for _ in wins:
        for win in _:
            if win == 0:
                money -= 100
            else:
                money += 62.5
    return money

bet_sim(population, weights, num_bets)

In [None]:
winnings = []
for i in range(100):
    winnings.append(bet_sim(population, weights, num_bets))
np.mean(winnings)

In [None]:
nums = [.5900350, .5917832, .5847902, .5900350, .5839161, .5821678, .5865385, .6031469, .5970280, .5900350, .5882867, .5882867, .5743007, .5952797, .5847902, .5926573, .5900350]
np.mean(nums)