In [1]:
import pandas as pd
import numpy as np
import os
from datetime import datetime as dt, timedelta
from dateutil.relativedelta import *
import re
from matplotlib import pyplot as plt
import sqlite3

<h3>SQLite Handling functions</h3>
<p>Use Pandas to import dataframes into SQLite</p>

In [7]:
# Path of the SQLite database file opened by run_query and run_command.
DB = "hockey_dataMVP.db"

#SQL Handle functions
def run_query(q, db=None):
    """Run a SQL query and return the result as a pandas DataFrame.

    Parameters
    ----------
    q : str
        SQL statement handed to ``pd.read_sql``.
    db : str, optional
        Path of the SQLite database to open. When omitted, the module-level
        ``DB`` constant is used, so existing ``run_query(q)`` calls are
        unaffected.

    Returns
    -------
    pandas.DataFrame
        The rows produced by ``q``.
    """
    conn = sqlite3.connect(DB if db is None else db)
    try:
        # The connection's context manager only commits (or rolls back on
        # error); it does NOT close the connection, hence the finally below.
        with conn:
            return pd.read_sql(q, conn)
    finally:
        conn.close()
   
#send command
def run_command(c):
    """Execute a single SQL statement against DB and return its cursor.

    The connection is switched to autocommit (``isolation_level = None``)
    before the statement runs.

    NOTE(review): the connection is intentionally left open here, because
    the returned cursor can only be read while its connection is alive.
    """
    connection = sqlite3.connect(DB)
    with connection:
        # isolation_level None => sqlite3 does not open implicit transactions.
        connection.isolation_level = None
        cursor = connection.execute(c)
    return cursor
    
#show tables
def show_tables():
    """List the tables and views stored in the database.

    Returns
    -------
    The result of ``run_query``: one row per ``sqlite_master`` entry whose
    type is 'table' or 'view', with the columns ``name`` and ``type``.
    """
    # SQL string literals take single quotes. Double quotes denote
    # identifiers and are only accepted as strings through a legacy SQLite
    # fallback that can be disabled at build time.
    q = """
    SELECT name, type
    FROM sqlite_master
    WHERE type IN ('table', 'view');
    """
    return run_query(q)

In [32]:
# Count player_log rows for each value of the injured flag.
# `.values` yields one-element arrays; both names stay arrays as before.
not_injured = run_query("""SELECT COUNT(pl.player_id) as injuries From player_log pl WHERE pl.injured =0""")['injuries'].values
injured = run_query("""SELECT COUNT(pl.player_id) as injuries From player_log pl WHERE pl.injured =1""")['injuries'].values

# Extract plain ints for formatting instead of relying on implicit
# array-to-scalar conversion.
n_injured = int(injured[0])
n_entries = n_injured + int(not_injured[0])

# The previous format string '%entries' was parsed as '%e' + 'ntries' and
# printed the total in scientific notation with a mangled word.
print('There are %d injuries in %d entries' % (n_injured, n_entries))

There are 11529 injuries in 1.133894e+06ntries


In [34]:
# Show the column definitions of the player_log table via SQLite's table_info pragma.
run_query("""PRAGMA table_info(player_log);""")

Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,gamelog_id,TEXT,0,,1
1,1,player_id,TEXT,0,,0
2,2,date_game,TEXT,0,,0
3,3,age,REAL,0,,0
4,4,min_season,REAL,0,,0
5,5,min_3w,REAL,0,,0
6,6,days_to_next_g,INTEGER,0,,0
7,7,timec,REAL,0,,0
8,8,venuec,REAL,0,,0
9,9,injured,INT,0,,0


In [92]:
def _sample_player_log(injured_flag, n_rows):
    """Draw a random sample of player_log rows joined to player_list.

    Parameters
    ----------
    injured_flag : int
        Value the ``injured`` column must equal (0 or 1 in this notebook).
    n_rows : int
        Maximum number of rows to return (SQL ``LIMIT``).

    Returns
    -------
    The result of ``run_query`` with the columns position, weight_kg, age,
    min_season, min_3w, days_to_next_g, timec, venuec and injured.

    Notes
    -----
    Rows are picked with ``ORDER BY RANDOM()``, so every execution returns a
    different sample.
    NOTE(review): the filter is ``YEAR > 1997``, which also drops rows from
    1997 itself — confirm that matches the "before 1997" intent.
    """
    # Both values are forced to int before being placed in the SQL text, so
    # nothing but a number can end up in the statement.
    query = """WITH 
            /*Add year*/
            add_year AS (SELECT pl.*, CAST(SUBSTR(pl.date_game,1,4) AS INT) as YEAR 
                          FROM player_log pl),
            /*REMOVE Record Before 1997 as Time on Ice wasn't recorded*/
            post_1997 AS (SELECT ady.*
                          FROM add_year ady
                          WHERE ady.YEAR > 1997)
            /*Randomly sample rows with the requested injured flag*/
            SELECT pl.position,pl.weight_kg,po.age,po.min_season,po.min_3w,po.days_to_next_g,po.timec,po.venuec,po.injured
            FROM post_1997 po
            LEFT JOIN player_list pl ON pl.unique_id=po.player_id
            WHERE po.injured = {flag} AND po.days_to_next_g < 20
            ORDER BY RANDOM()
            LIMIT {limit}
            """.format(flag=int(injured_flag), limit=int(n_rows))
    return run_query(query)


# 20000 non-injury rows and 10000 injury rows (a 2:1 class ratio).
no_injury = _sample_player_log(injured_flag=0, n_rows=20000)
injury = _sample_player_log(injured_flag=1, n_rows=10000)

In [93]:
# Stack the injury and no-injury samples, then shuffle the rows.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0;
# like append, it keeps each frame's original index labels.
dataset = pd.concat([injury, no_injury]).sample(frac=1)
dataset.head()

Unnamed: 0,position,weight_kg,age,min_season,min_3w,days_to_next_g,timec,venuec,injured
173,D,92,25.438356,1027.866667,235.816667,2,0.0,0.7,0
19864,F,104,27.052055,1282.583333,108.683333,3,0.0,0.4,0
1300,D,86,26.876712,482.466667,200.75,1,1.218182,0.636364,1
9556,D,98,33.958904,1088.733333,183.833333,2,0.0,0.666667,0
18206,F,84,31.819178,269.983333,143.716667,2,0.090909,0.454545,0


In [94]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
dataset.describe()

Unnamed: 0,weight_kg,age,min_season,min_3w,days_to_next_g,timec,venuec,injured
count,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0
mean,92.175667,28.28495,629.17126,158.326961,2.518767,0.595558,0.516251,0.333333
std,7.061634,4.520951,503.893454,88.129012,1.737128,0.679212,0.199489,0.471412
min,72.0,18.230137,0.0,0.0,1.0,0.0,0.0,0.0
25%,87.0,24.865753,245.05,104.466667,2.0,0.0,0.428571,0.0
50%,92.0,27.89589,532.758333,154.358333,2.0,0.290909,0.555556,0.0
75%,97.0,31.356164,900.891667,197.716667,3.0,1.0,0.666667,1.0
max,120.0,48.161644,4252.733333,709.533333,19.0,4.5,0.846154,1.0
