<h1>Implement Random Forest for Arbetrary Goalies</h1>
<p>This notebook implements a random forest for the selected goalie and calculates the increased risk factor based on the returned probability.  It will be ported over for implementation into statgen.py </p>

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime as dt, timedelta
from dateutil.relativedelta import *
import re
import sqlite3
pd.set_option('display.max_rows', 500)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from joblib import dump, load

<h3>SQLite Handling functions</h3>
<p>Use Pandas to import dataframe into SQLlite</p>

In [2]:
DB = "../assests/hockey_data_goalies.db"

#SQL Handle functions
def run_query(q):
    with sqlite3.connect(DB) as conn:
        x = pd.read_sql(q,conn)
    return x
   
#send command
def run_command(c):
    with sqlite3.connect(DB) as conn:
        conn.isolation_level = None
        return conn.execute(c)
    
#show tables
def show_tables():
    r = """
    Select name,type
    FROM sqlite_master
    WHERE type in ("table","view");
        """
    return run_query(r)

In [3]:
ran_for = load('ranforest_regression.joblib')
pipe = load('pipeline.joblib')

In [88]:
def riskfunc(x):
    """Estimate the increased risk factor by player playing
    """
    y = x**3+0.37*(x**2)-0.2572*x+0.0118
    return np.round(10**y,1)

#inputs:

team_value = 'TOR'
year_value = 2016
game_date = '2017-01-17'

#get unique ids
q = ("""SELECT * 
    FROM player_log 
    WHERE team_id=\"{0}\"
    AND (CAST(SUBSTR(date_game,1,4) AS FLOAT)+CAST(SUBSTR(date_game,6,7) AS FLOAT)/12) > {1}
    AND date_game < \"{2}\"
    GROUP BY player_id
    """.format(team_value,int(year_value) + .66,game_date))

ids = run_query(q)

#now generate data for season

prodf = pd.DataFrame(columns=['player_id','team_id','opp_id','date_game','age','rest_days','min_season',
                                'shots_against','save_pct','min3W','sa3W','svepct3W','future_save_pct','injured'])
row = 0
for each_id in ids['player_id']:
    print(each_id)
    q = ("""SELECT * 
    FROM player_log 
    WHERE (CAST(SUBSTR(date_game,1,4) AS FLOAT)+CAST(SUBSTR(date_game,6,7) AS FLOAT)/12) > {0}
    AND date_game < \"{1}\"
    AND player_id = \"{2}\"
    """.format(int(year_value) + .66,game_date,each_id))
    season_logs = run_query(q)
    season_logs['date_game'] = season_logs['date_game'].astype('datetime64') #convert to datetime
    season_logs['time_on_ice'] = season_logs['time_on_ice'].str.extract(r'(\d*)\:\d*')[0].astype(int)+season_logs['time_on_ice'].str.extract(r'\d*\:(\d*)')[0].astype(int)/60
    
    for r,game in season_logs[-1:].iterrows():
        prodf.loc[row,'player_id'] = season_logs.loc[r,'player_id']
        prodf.loc[row,'team_id'] = season_logs.loc[r,'team_id']
        prodf.loc[row,'opp_id'] = season_logs.loc[r,'opp_id']
        prodf.loc[row,'date_game'] = season_logs.loc[r,'date_game']
        prodf.loc[row,'age'] = season_logs.loc[r,'age']
        prodf.loc[row,'rest_days'] = (season_logs.loc[r,'date_game']-season_logs.loc[r-1,'date_game']).days
        prodf.loc[row,'min_season'] = season_logs.loc[:(r-1),'time_on_ice'].sum()
        prodf.loc[row,'shots_against'] = season_logs.loc[:(r-1),'shots_against'].sum()
        prodf.loc[row,'save_pct'] = np.round(season_logs.loc[:(r-1),'saves'].sum()/season_logs.loc[:(r-1),'shots_against'].sum(),3)

        window = (season_logs.loc[r,'date_game']>season_logs['date_game'])&(season_logs['date_game']>(season_logs.loc[r,'date_game']-timedelta(21)))

        prodf.loc[row,'min3W'] = season_logs.loc[window,'time_on_ice'].sum()
        prodf.loc[row,'sa3W'] = season_logs.loc[window ,'shots_against'].sum()
        prodf.loc[row,'svepct3W'] = season_logs.loc[window,'saves'].sum()/season_logs.loc[window,'shots_against'].sum()
        prodf.loc[row,'future_save_pct'] = np.round(season_logs.loc[r,'saves'].sum()/season_logs.loc[r,'shots_against'].sum(),3)
        prodf.loc[row,'injured'] = season_logs.loc[r,'injured']    
        prodf.loc[row,'pre_inj'] = season_logs.loc[r,'pre_inj']
    row+=1
    
columns = ['age','min_season','rest_days','shots_against','save_pct','min3W','sa3W','svepct3W','pre_inj']
probs = ran_for.predict_proba(pipe.fit_transform(prodf[columns]))[:,1]
riskfunc(probs)
dict(zip(ids['player_id'].to_list(),riskfunc(probs)))

anderfr01
enrotjh01
mcelhcu01


{'anderfr01': 2.9, 'enrotjh01': 1.0, 'mcelhcu01': 1.2}

In [91]:
data = ['Injury Risk Factor']
data.extend(riskfunc(probs))
data

['Injury Risk Factor', 2.9, 1.0, 1.2]

In [92]:
winz = ['Wins',1,-1,-5]

In [93]:
[i-max(winz[1:]) if type(i) == int else i for i in winz]

['Wins', 0, -2, -6]