In [1]:
import pandas as pd
import numpy as np
import os
from datetime import datetime as dt, timedelta
from dateutil.relativedelta import *
import re
from matplotlib import pyplot as plt
import seaborn as sns
import sqlite3

<h3>SQLite Handling functions</h3>
<p>Use pandas to read from and write dataframes to the SQLite database.</p>

In [6]:
#path to the SQLite database, relative to the notebook's working directory
#(built with os.path.join so it resolves on POSIX as well as on Windows)
DB = os.path.join("..", "Assets", "hockey_data_goalies.db")

#SQL Handle functions
def run_query(q, params=None):
    """
        Runs a SQL query against the database at DB and returns the rows as a DataFrame
        INPUT:
            q: the SQL text to run (a statement that returns rows)
            params: optional values bound to the placeholders in q (e.g. ?);
                    passed through unchanged to pandas.read_sql
        RETURN:
            DataFrame holding the rows produced by the query
    """
    conn = sqlite3.connect(DB)
    try:
        #"with conn" only commits / rolls back the transaction, it does not
        #close the connection, so close it explicitly once the rows are read
        with conn:
            return pd.read_sql(q, conn, params=params)
    finally:
        conn.close()
   
#send command
def run_command(c):
    """
        Executes one SQL statement against the database at DB
        INPUT:
            c: the SQL text to execute
        RETURN:
            the sqlite3 cursor produced by the statement
    """
    connection = sqlite3.connect(DB)
    #the connection is left open on purpose: the cursor handed back to the
    #caller reads through it
    with connection:
        connection.isolation_level = None
        cursor = connection.execute(c)
    return cursor
    
#show tables
def show_tables():
    """
        Lists the tables and views defined in the database
        RETURN:
            DataFrame with one row per table/view and the columns name, type
    """
    #string values are single-quoted: double quotes mark identifiers in SQL and
    #SQLite only reads them as strings through a legacy fallback
    r = """
    SELECT name,type
    FROM sqlite_master
    WHERE type IN ('table','view');
        """
    return run_query(r)

In [3]:
def sos_calc(year,game_no,nme,wind=None):
    """
        CALCULATES THE STRENGTH OF SCHEDULE (SOS) FOR THE TEAM entered, using
        only league games played before the selected game
    INPUT:
        year: the year the season starts in (YYYY of the YYYY-YYYY+1 season)
        game_no: position of the current game in the team's season log;
                 previous games are [0, game_no-1]
        nme: the team abbr to calculate the SOS for
        wind: optional selector for rows of the team's season log (used with
              .loc, e.g. a boolean mask). When omitted, or when it holds
              exactly one element (the former [0] default), every game before
              the current one is used instead
    RETURN:
        sos: sum(no_times_faced*(goalsfor-goalsagainst)/no_games)/no_games,
             where the inner term is each opponent's goal differential per
             game to date and the outer no_games is game_no (or wind.sum()
             when a window is given); 0 when game_no is 0
    RAISES:
        IndexError: an opponent name has no matching row in team_list
    """

    season_start = int(year) + .66
    season_end = int(year) + 1.66

    #get the team's games for the season
    q = ("""SELECT * 
            FROM team_log 
            WHERE team_id=\"{0}\"
            AND (CAST(SUBSTR(date_game,1,4) AS FLOAT)+CAST(SUBSTR(date_game,6,7) AS FLOAT)/12) > {1}
            AND (CAST(SUBSTR(date_game,1,4) AS FLOAT)+CAST(SUBSTR(date_game,6,7) AS FLOAT)/12) < {2}
            """.format(nme,season_start,season_end))
    games = run_query(q)
    #pd.to_datetime instead of astype('datetime64'): pandas 2 rejects the unit-less cast
    games['date_game'] = pd.to_datetime(games['date_game'])

    gamedate = games.at[game_no,'date_game']

    #no earlier games -> nothing to rate the schedule on
    if game_no <= 0:
        return 0

    use_window = wind is not None and len(wind) != 1

    #list of teams faced to date (and number of times)
    if use_window:
        teams_faced = games.loc[wind,'opp_name'].value_counts()
    else:
        teams_faced = games.loc[games['date_game']<gamedate,'opp_name'].value_counts()

    #get list of games played in the league that season
    q = ("""SELECT * 
            FROM team_log 
            WHERE (CAST(SUBSTR(date_game,1,4) AS FLOAT)+CAST(SUBSTR(date_game,6,7) AS FLOAT)/12) > {1}
            AND (CAST(SUBSTR(date_game,1,4) AS FLOAT)+CAST(SUBSTR(date_game,6,7) AS FLOAT)/12) < {2}
            """.format(nme,season_start,season_end))
    league_games = run_query(q)
    league_games['date_game'] = pd.to_datetime(league_games['date_game'])

    #goal totals and games played per team up until the gameday; only the two
    #goal columns are aggregated so text/date columns are never summed
    prior = league_games[league_games['date_game']<gamedate]
    totals = prior.groupby('team_id')[['goals','opp_goals']].agg(['sum','count'])

    #goal differential per game for every team to date
    diff = (totals['goals']['sum']-totals['opp_goals']['sum'])/totals['goals']['count']

    sos = 0
    for opp_name,times_faced in teams_faced.items():
        #team_log stores the opponent's full name; look up its abbr
        q = '''SELECT team_abbr FROM team_list WHERE team_name=\"{0}\" AND CAST(SUBSTR(years_active,6,9) AS INT)>{1}'''.format(opp_name,int(year))
        abbr_rows = run_query(q)
        if abbr_rows.empty:
            raise IndexError("no team_list entry for {0} in season {1}".format(opp_name,year))
        opp_abbr = abbr_rows['team_abbr'].values[0]
        sos = sos + times_faced*diff[opp_abbr]

    if use_window:
        #np.asarray lets a plain list be used as the window as well
        return sos/np.asarray(wind).sum()
    return sos/game_no
    
def team_stat_gen(team_value,date):
    """
        Generates the team's simple rating system (SRS) value from the games
        it played in the season before the selected game
        INPUT:
            team_value: The team abbr
            date: The date of the selected game (anything with .year, .month
                  and .strftime, e.g. datetime or pandas Timestamp); months
                  after August count towards the season starting that year
        OUTPUT:
            srs: goal differential per game to date plus strength of schedule
        RAISES:
            KeyError: the game_number stored for that date is not a row of
                      the team's season log
            ValueError: the selected game is the first of the season, so
                        there are no earlier games to rate
    """

    if date.month > 8:
        year_value = date.year
    else:
        year_value = date.year-1

    #position of the selected game in the team's season
    q = """SELECT game_number FROM team_log WHERE date_game=\"{0}\" AND team_id=\"{1}\"""".format(date.strftime("%Y-%m-%d"),team_value)
    game_value=run_query(q)['game_number'][0]

    #get the team's games for the season
    q = ("""SELECT * 
            FROM team_log 
            WHERE team_id=\"{0}\"
            AND (CAST(SUBSTR(date_game,1,4) AS FLOAT)+CAST(SUBSTR(date_game,6,7) AS FLOAT)/12) > {1}
            AND (CAST(SUBSTR(date_game,1,4) AS FLOAT)+CAST(SUBSTR(date_game,6,7) AS FLOAT)/12) < {2}
            """.format(team_value,int(year_value) + .66,int(year_value)+1.66))
    game_details = run_query(q)

    #fail with context instead of printing the frame and carrying on
    if game_value not in game_details.index:
        raise KeyError("game {0} not found in the {1} season log of {2}".format(game_value,year_value,team_value))
    if game_value < 1:
        raise ValueError("game {0} of {1} has no earlier games in season {2}".format(game_value,team_value,year_value))

    #team totals up to this point of the season
    played = game_details.index<game_value
    cum_total = game_details.loc[played,['goals','opp_goals']].sum()
    games = played.sum()

    sos = sos_calc(year_value,game_value,team_value) #strength of schedule higher is tougher
    srs = (cum_total['goals']-cum_total['opp_goals'])/games+sos #simple rating system
    return srs

<h2>Stat Functions</h2>
<p>Given the player statistics, calculate:</p>
<ul>
    <li>Age</li>
    <li>Total minutes this season</li>
    <li>Total minutes in the last 3 weeks</li>
    <li>Shots against this season</li>
    <li>Shots against in the last 3 weeks</li>
    <li>Save percentage this season</li>
    <li>Save percentage in the last 3 weeks</li>
    <li>Rest days</li>
</ul>

In [4]:
def season_finder(date):
    """
    Given a game date, return the year its season started in:
    months after August belong to that year's season, earlier months
    to the season that started the year before
    """
    starts_this_year = date.month > 8
    return date.year if starts_this_year else date.year-1
def season_cumul(player,league_avg_save_pct=.905,window_days=21):
    """ 
    Given the player, generate the career_log_data: one row of pre-game
    features for every game after the player's first game of each season

    INPUT:
        player: the player_id whose rows are read from player_log
        league_avg_save_pct: save percentage used when no shots were faced
                             in the span being summarised
        window_days: length in days of the recent-form window behind the
                     min3W / sa3W / svepct3W columns (21 = 3 weeks)
    OUTPUT:
        DataFrame with the columns player_id, team_id, opp_id, date_game, age,
        rest_days, min_season, shots_against, save_pct, min3W, sa3W, svepct3W,
        future_save_pct, pre_inj, injured. It is built in one step from the
        collected rows, so column dtypes are inferred from the values
    """
    columns = ['player_id','team_id','opp_id','date_game','age','rest_days','min_season',
               'shots_against','save_pct','min3W','sa3W','svepct3W','future_save_pct','pre_inj','injured']

    #get the gamelogs for that player
    q = """SELECT * FROM player_log WHERE player_id=\"{0}\" """.format(player)
    player_logs = run_query(q)

    #unit conversions
    #pd.to_datetime instead of astype('datetime64'): pandas 2 rejects the unit-less cast
    player_logs['date_game'] = pd.to_datetime(player_logs['date_game'])
    #time_on_ice is text of the form minutes:seconds -> decimal minutes
    toi_parts = player_logs['time_on_ice'].str.extract(r'(\d*)\:(\d*)')
    player_logs['time_on_ice'] = toi_parts[0].astype(int)+toi_parts[1].astype(int)/60
    player_logs['season'] = player_logs['date_game'].map(season_finder)

    #rows are collected as dicts and turned into a frame once at the end;
    #writing cell by cell into a growing DataFrame re-allocates it every row
    records = []
    #for each season
    for season in player_logs['season'].unique():
        #NOTE(review): rows are used in the order the query returns them;
        #rest_days and pre_inj assume that order is chronological - confirm
        season_logs = player_logs[player_logs['season']==season].reset_index(drop=True)
        dates = season_logs['date_game']

        #the first game of a season has no earlier games, so start at 1
        for r in range(1,len(season_logs)):
            game_day = dates.at[r]

            #everything played earlier this season
            prior = season_logs.iloc[:r]
            prior_shots = prior['shots_against'].sum()
            if prior_shots>0:
                season_save_pct = np.round(prior['saves'].sum()/prior_shots,3)
            else:
                season_save_pct = league_avg_save_pct

            #games strictly before this one and inside the recent window
            recent = season_logs[(dates<game_day)&(dates>(game_day-timedelta(window_days)))]
            recent_shots = recent['shots_against'].sum()
            if recent_shots>0:
                recent_save_pct = recent['saves'].sum()/recent_shots
            else:
                recent_save_pct = league_avg_save_pct

            records.append({
                'player_id': season_logs.at[r,'player_id'],
                'team_id': season_logs.at[r,'team_id'],
                'opp_id': season_logs.at[r,'opp_id'],
                'date_game': game_day,
                'age': season_logs.at[r,'age'],
                'rest_days': (game_day-dates.at[r-1]).days,
                'min_season': prior['time_on_ice'].sum(),
                'shots_against': prior_shots,
                'save_pct': season_save_pct,
                'min3W': recent['time_on_ice'].sum(),
                'sa3W': recent_shots,
                'svepct3W': recent_save_pct,
                #the value to predict: save_pct of the game itself
                'future_save_pct': np.round(season_logs.at[r,'save_pct'],3),
                #pre_inj is read from the previous game, injured from this one
                'pre_inj': season_logs.at[r-1,'pre_inj'],
                'injured': season_logs.at[r,'injured'],
            })

    return pd.DataFrame(records,columns=columns)

In [25]:
#for each player in the database make the stat calculations
players = run_query('Select unique_id FROM player_list')
feature_columns = ['player_id','team_id','opp_id','date_game','age','rest_days','min_season',
                   'shots_against','save_pct','min3W','sa3W','svepct3W','future_save_pct','pre_inj','injured']

#collect the per-player frames and concatenate once at the end:
#DataFrame.append was removed in pandas 2.0 and copied the whole frame on every call
season_frames = []
for i,player in enumerate(players['unique_id']):
    season_data = season_cumul(player)
    #players without any usable games contribute no rows
    if not season_data.empty:
        season_frames.append(season_data)
    if i%25 == 0:
        print("Finished %d of %d" % (i,players.shape[0]))

if season_frames:
    pdf = pd.concat(season_frames)
else:
    pdf = pd.DataFrame(columns=feature_columns)

#NOTE(review): this fills every missing value in every column with the mean
#save_pct, not only the save percentage columns - confirm that is intended
pdf = pdf.fillna(pdf['save_pct'].mean())

Finished 0 of 315
Finished 25 of 315
Finished 50 of 315
Finished 75 of 315
Finished 100 of 315
Finished 125 of 315
Finished 150 of 315
Finished 175 of 315
Finished 200 of 315
Finished 225 of 315
Finished 250 of 315
Finished 275 of 315
Finished 300 of 315


In [26]:
#save the combined per-player feature table (pdf) as a CSV file in the working directory
pdf.to_csv('advancedgoaliestats_withadditionalgoalies.csv')

In [7]:
#preview the first 10 rows of team_log to check its columns
run_query("""SELECT * FROM team_log LIMIT 10""")

Unnamed: 0,season_id,team_id,date_game,game_number,game_location,opp_name,goals,opp_goals,game_outcome,overtimes,shots,shots_against
0,ANA20061006,ANA,2006-10-06,0,,Los Angeles Kings,4,3,W,,32,44
1,ANA20061007,ANA,2006-10-07,1,@,Phoenix Coyotes,2,1,W,,25,35
2,ANA20061009,ANA,2006-10-09,2,,St. Louis Blues,2,0,W,,34,34
3,ANA20061011,ANA,2006-10-11,3,,New York Islanders,4,5,L,SO,50,24
4,ANA20061015,ANA,2006-10-15,4,,Dallas Stars,3,4,L,SO,39,35
5,ANA20061018,ANA,2006-10-18,5,,Detroit Red Wings,4,1,W,,31,22
6,ANA20061020,ANA,2006-10-20,6,,Minnesota Wild,2,1,W,,29,28
7,ANA20061022,ANA,2006-10-22,7,@,Los Angeles Kings,3,2,W,SO,21,31
8,ANA20061025,ANA,2006-10-25,8,,Edmonton Oilers,6,2,W,,20,31
9,ANA20061027,ANA,2006-10-27,9,@,Minnesota Wild,2,3,L,SO,34,33


In [9]:
#import the first team game log found in the folder into a dataframe
#NOTE: os.listdir gives no ordering guarantee, so which file comes first can vary
team_log_dir = 'C:\\Users\\jesse\\Documents\\Projects\\takeaseat\\Data\\team_gamelogs\\'
for files in os.listdir(team_log_dir)[:1]:
    #the separator is passed by keyword; pandas 2 no longer accepts it positionally
    season_log = pd.read_csv(team_log_dir+files,sep='\t')

In [10]:
#display the team game log read in the previous cell
season_log

Unnamed: 0.1,Unnamed: 0,date_game,game_location,opp_name,goals,opp_goals,game_outcome,overtimes,empty1,shots,pen_min,goals_pp,chances_pp,goals_sh,empty2,shots_against,pen_min_opp,goals_against_pp,opp_chances_pp,goals_against_sh
0,0,2006-10-06,,Los Angeles Kings,4,3,W,,,32,21,1,,0,,44,17,1,,0
1,1,2006-10-07,@,Phoenix Coyotes,2,1,W,,,25,24,2,,0,,35,22,1,,0
2,2,2006-10-09,,St. Louis Blues,2,0,W,,,34,10,2,,0,,34,14,0,,0
3,3,2006-10-11,,New York Islanders,4,5,L,SO,,50,6,2,,0,,24,12,1,,0
4,4,2006-10-15,,Dallas Stars,3,4,L,SO,,39,27,0,,1,,35,15,2,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77,77,2007-03-29,@,Columbus Blue Jackets,5,2,W,,,31,12,3,,0,,25,16,0,,0
78,78,2007-03-31,@,St. Louis Blues,3,2,W,OT,,27,13,2,,0,,22,13,0,,0
79,79,2007-04-04,,San Jose Sharks,2,3,L,SO,,39,13,1,,0,,31,15,0,,0
80,80,2007-04-06,@,Dallas Stars,1,2,L,SO,,22,10,0,,0,,30,6,0,,0
