<h1>MVP Training Data</h1>
<p>I need to get an inital training set for use with the MVP, so I'll attempt to do that here.  For my feature set, I would like to have data for each forecast day of: 
<ul>
    <li>Position</li>
    <li>Weight</li>
    <li>Age</li>
    <li>Minutes Total (Season)</li>
    <li>Minutes Total (Last 3 Wks)</li>
    <li>Travel Last 3 Wks</li>
    <li>Time changes last 3 weeks</li>
    <li>Days until next game</li>
</ul>
Plus whether or not the player was injured the following game.

Once the .csv player log is analysed it's piped into the SQLdatabase <img rc="schemaMVP.png" alt="Schema for Database">
</p>

In [230]:
import pandas as pd
import numpy as np
import os
from datetime import datetime as dt, timedelta
from dateutil.relativedelta import *
import re
from matplotlib import pyplot as plt
import sqlite3

<h3>SQLite Handling functions</h3>
<p>Use Pandas to import dataframe into SQlite.  Clear table at start to remove any existing gamelog data.</p>

In [232]:
DB = 'hockey_datamvp.db'

#SQL Handle functions
def run_query(q):
    with sqlite3.connect(DB) as conn:
        x = pd.read_sql(q,conn)
    return x
   
#send command
def run_command(c):
    with sqlite3.connect(DB) as conn:
        conn.isolation_level = None
        return conn.execute(c)
    
#show tables
def show_tables():
    r = """
    Select name,type
    FROM sqlite_master
    WHERE type in ("table","view");
        """
    return run_query(r)

In [233]:
run_command("DROP TABLE IF EXISTS {}".format("player_log")) #clear DB of existing tables

#make a new table with the appropriate keys
c1 = """
    CREATE TABLE IF NOT EXISTS player_log(
    gamelog_id TEXT PRIMARY KEY,
    player_id TEXT,
    team_id TEXT,
    date_game TEXT,
    age REAL,
    min_season REAL,
    min_3w REAL,
    days_to_next_g INTEGER,
    timec REAL,
    venuec REAL,
    injured INT,
    FOREIGN KEY (player_id) REFERENCES player_list(unique_id)
    FOREIGN KEY (team_id) REFERENCES team_list(team_id)
);"""
            
run_command(c1)
show_tables()

OperationalError: database is locked

<h3>Data Handling functions</h3>
<p><ul>
    <li><b>Cumul</b> - Used to calculate the cumulative season totals</li>
    <li><b>Injury_Match</b> -  matches the log injury data with the player logs</li>
</ul></p>

In [218]:
def cumul(df,team):
    """ 
    Calculates Several Cumulative Statistics for the season and the 3 Week window following the last game
    df = the player log to analyse
    team = the team info
    """
    prodf = pd.DataFrame(columns=['player_id','team_id','date_game','age','min_season','min_3w','days_to_next_g'])
    game_season = 10
    #
    for row in np.arange(plog.shape[0]-1): #exclude last entry for now as there's no next game
        prodf.loc[row,'player_id'] = df.loc[row,'unique_id']
        prodf.loc[row,'team_id'] = df.loc[row,'team_id']
        prodf.loc[row,'date_game'] = df.loc[row,'date_game']
        prodf.loc[row,'age'] = df.loc[row,'age']
        prodf.loc[row,'days_to_next_g'] = df.loc[row+1,'date_game']-df.loc[row,'date_game'] #days until next game (convert to int)
        
        #the cumulative stuff
        if df.loc[row,'game_season'] > game_season:
            prodf.loc[row,'min_season'] = prodf.loc[row-1,'min_season'] + df.loc[row,'time_on_ice'] #minutes to date in season
            game_season = df.loc[row,'game_season'] 
        else:
            prodf.loc[row,'min_season'] = df.loc[row,'time_on_ice']  #first game of season
            game_season = df.loc[row,'game_season'] 

        #the three week window stuff
        window = df[(df['date_game']>=(df.loc[row,'date_game']-timedelta(21)))&(df['date_game']<=(df.loc[row,'date_game']))].copy()
    
        prodf.loc[row,'min_3w'] = window['time_on_ice'].sum() #recent iceteam
        
        #time changes
        window['venue'] = window['team_id']
        window.loc[window['game_location']=="@",'venue'] = window.loc[window['game_location']=="@",'opp_id']
        prodf.loc[row,'timec'] = window['venue'].replace(dict(zip(team['team_abbr'],team['timezone']))).var()
        #number of venue changes / number of games
        gc = 0
        for i,v in enumerate(window['venue'][:-2]):
            if window.iloc[i+1,-1] != v :
                gc += 1
        prodf.loc[row,'venuec'] = gc/window.shape[0]
    prodf['timec'] = prodf['timec'].fillna(0)
    prodf['days_to_next_g'] = prodf['days_to_next_g'].map(lambda x: x.days) #convert to int
    return prodf

def injury_match(pdf,inj_dat):
    """
        INPUTS: pdf - the player log to analyse
                inj_data - the injury log to analyse for that player
        OUTPUTS: the injury dataframe to analyse
    """
    for i,i_day in inj_dat.iterrows():
        injury_reported = i_day['Date']

        #find minimum number of dates between game and injury report
        dateoffset = pdf['date_game'].map(lambda x:np.abs(x-injury_reported))
        #closest entry in the player logs
        closest_entry = pdf[dateoffset==dateoffset.min()].iloc[[0]]

        #if the injury date happened before the closest game... 
        if closest_entry['date_game'].values>injury_reported:
            closest_entry = pdf.loc[closest_entry['date_game'].index-1]

        #game before the injury game
        pregame = pdf.loc[closest_entry['date_game'].index-1]
        #make sure it makes sense aka did it happen recently...
        if (((pregame['date_game'].map(lambda x:np.abs(x-injury_reported).days))<15).values):
            pdf.loc[pregame.index,'injured'] = 1   
            #print('Last Game: %s, Injury Report %s'%(pregame['date_game'].values,i_day['Date']))
    return pdf

In [219]:
#for each player in the database
players = run_query('Select * FROM player_list')
#also load the teams
teams = run_query('Select * FROM team_list')

for i,player in players.iterrows():

    #load player log
    plog = pd.read_csv('..\\Data\\player_gamelogs\\'+player['unique_id']+'.txt')
    plog['unique_id'] = player['unique_id']
    plog['age'] = plog['age'].str.extract(r'([\d]*)-([\d]*)').astype(int)[0]+plog['age'].str.extract(r'([\d]*)-([\d]*)').astype(int)[1]/365
    plog.drop(columns=['Unnamed: 0'],inplace=True)
    plog['date_game'] = plog['date_game'].astype('datetime64') #convert to datetime
    
    #Time on Ice wasn't recorded before the 1997-1998 season
    plog['time_on_ice'] = plog['time_on_ice'].fillna("0:0") 
    plog['time_on_ice'] = plog['time_on_ice'].str.extract(r'([\d]*):([\d]*)').astype(int)[0]+plog['time_on_ice'].str.extract(r'([\d]*):([\d]*)').astype(int)[1]/60
    plogprocessed = pd.DataFrame() 

    #get the injury report from the db
    player_logs = cumul(plog,teams).copy()
    player_logs['injured'] = 0
    
    #load the injury report
    try:
        inj= pd.read_csv('..\\Data\\player_injurylist\\'+player['unique_id']+'.txt','\t')
        try:
            inj.drop(columns=['Unnamed: 0'],inplace=True)
            inj.columns = inj.columns.str.strip()
            inj['Date'] = inj['Date'].astype('datetime64') #convert to datetime
            injured = inj[inj['Relinquished']!=' '] #all the times put onto the injured list
        except  AssertionError as error:
            print(error)
        
        #now note injuries
        player_logs = injury_match(player_logs,injured)
    except:
        #no reported injury
        pass
    #create unique gamelog id
    player_logs['gamelog_id'] = player_logs['player_id']+player_logs['date_game'].map(lambda x:x.strftime('%Y%m%d'))
    
    #rejig
    player_logs = player_logs[['gamelog_id','player_id','team_id','date_game','age','min_season','min_3w',
                                  'days_to_next_g', 'timec', 'venuec', 'injured']].set_index('gamelog_id')
    player_logs['date_game'] = player_logs['date_game'].astype(str)
    #inject into SQL database
    player_logs.to_sql('player_log',con=sqlite3.connect(DB), if_exists='append')

KeyboardInterrupt: 

In [223]:
#load player log
plog = pd.read_csv('..\\Data\\player_gamelogs\\'+player['unique_id']+'.txt')
plog['unique_id'] = player['unique_id']
plog['age'] = plog['age'].str.extract(r'([\d]*)-([\d]*)').astype(int)[0]+plog['age'].str.extract(r'([\d]*)-([\d]*)').astype(int)[1]/365
plog.drop(columns=['Unnamed: 0'],inplace=True)
plog['date_game'] = plog['date_game'].astype('datetime64') #convert to datetime

#Time on Ice wasn't recorded before the 1997-1998 season
plog['time_on_ice'] = plog['time_on_ice'].fillna("0:0") 
plog['time_on_ice'] = plog['time_on_ice'].str.extract(r'([\d]*):([\d]*)').astype(int)[0]+plog['time_on_ice'].str.extract(r'([\d]*):([\d]*)').astype(int)[1]/60
plogprocessed = pd.DataFrame() 

#get the injury report from the db
player_logs = cumul(plog,teams).copy()
player_logs['injured'] = 0

#load the injury report
try:
    inj= pd.read_csv('..\\Data\\player_injurylist\\'+player['unique_id']+'.txt','\t')
    try:
        inj.drop(columns=['Unnamed: 0'],inplace=True)
        inj.columns = inj.columns.str.strip()
        inj['Date'] = inj['Date'].astype('datetime64') #convert to datetime
        injured = inj[inj['Relinquished']!=' '] #all the times put onto the injured list
    except  AssertionError as error:
        print(error)

    #now note injuries
    player_logs = injury_match(player_logs,injured)
except:
    #no reported injury
    pass
#create unique gamelog id
player_logs['gamelog_id'] = player_logs['player_id']+player_logs['date_game'].map(lambda x:x.strftime('%Y%m%d'))

#rejig
player_logs = player_logs[['gamelog_id','player_id','team_id','date_game','age','min_season','min_3w',
                              'days_to_next_g', 'timec', 'venuec', 'injured']].set_index('gamelog_id')
player_logs['date_game'] = player_logs['date_game'].astype(str)
#inject into SQL database
player_logs.to_sql('player_log',con=sqlite3.connect(DB), if_exists='append')

In [229]:
run_query("""SELECT * FROM player_log ORDER BY player_id DESC LIMIT 5""")

Unnamed: 0,gamelog_id,player_id,team_id,date_game,age,min_season,min_3w,days_to_next_g,timec,venuec,injured
0,bourqra0119901004,bourqra01,BOS,1990-10-04 00:00:00,29.767123,0.0,0.0,2,0.0,0.0,0
1,bourqra0119901006,bourqra01,BOS,1990-10-06 00:00:00,29.772603,0.0,0.0,1,0.0,0.0,0
2,bourqra0119901007,bourqra01,BOS,1990-10-07 00:00:00,29.775342,0.0,0.0,3,0.0,0.0,0
3,bourqra0119901010,bourqra01,BOS,1990-10-10 00:00:00,29.783562,0.0,0.0,1,0.25,0.25,0
4,bourqra0119901011,bourqra01,BOS,1990-10-11 00:00:00,29.786301,0.0,0.0,2,0.3,0.4,0
