This notebook unpickles files and merges them to create the master dataset used for modeling

In [1]:
import pandas as pd
import numpy as np
import pickle
import glob

pd.set_option('display.max_columns', 500)

In [2]:
# Functions for Pickling and Unpickling Files
def read_pickle(file):
    with open(file, 'rb') as picklefile:
        data = pickle.load(picklefile)
    return(data)

def write_pickle(file, data):
    with open(file, 'wb') as picklefile:
        pickle.dump(data, picklefile)

In [3]:
# Generate list of filenames containing player data and game data
player_file_list = glob.glob('/Users/JacKuo14/Documents/Metis/course_work/project03_baseball/sportstradar_data2/*player*.pkl')
gamefile_list = glob.glob('/Users/JacKuo14/Documents/Metis/course_work/project03_baseball/sportstradar_data2/*game*.pkl')

In [4]:
# Create "Master" Dataframes with all game and player data
df_game = pd.DataFrame()
for file in gamefile_list:
    df_game = df_game.append(read_pickle(file))
    
df_play = pd.DataFrame()
for file in player_file_list:
    df_play = df_play.append(read_pickle(file))

In [5]:
# Dataframe containing statistics at a player level
df_play.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 366846 entries, 0 to 274
Data columns (total 23 columns):
game_id                  366846 non-null object
home_away                366846 non-null object
preferred_name           366846 non-null object
first_name               366846 non-null object
last_name                366846 non-null object
plyr_id                  366846 non-null object
plyr_position            366846 non-null object
plyr_primary_position    366827 non-null object
era                      106713 non-null object
whip                     106713 non-null float64
k9                       106713 non-null float64
pitch_count              106713 non-null float64
obp                      192171 non-null float64
ops                      314578 non-null float64
slg                      314578 non-null float64
rbi                      314578 non-null float64
avg                      314578 non-null object
av_risp                  251584 non-null float64
hit_risp            

In [6]:
df_play.head()

Unnamed: 0,game_id,home_away,preferred_name,first_name,last_name,plyr_id,plyr_position,plyr_primary_position,era,whip,k9,pitch_count,obp,ops,slg,rbi,avg,av_risp,hit_risp,runs,error,fpct,assists
0,083801cc-cf2c-43f1-b78a-27a5d270403d,home,Alex,Alexander,Guerrero,03e46dc5-9e2e-4493-8614-e8b1c11a0c21,OF,LF,,,,,,5.0,4.0,2.0,1.0,,,1.0,,,
1,083801cc-cf2c-43f1-b78a-27a5d270403d,home,Juan,Juan,Uribe,081677e0-4134-4f69-a4a4-05acf060ead0,IF,3B,,,,,,1.0,0.0,0.0,0.0,,,0.0,,,
2,083801cc-cf2c-43f1-b78a-27a5d270403d,home,Adrián,Adrián,González,10154eef-8834-48e0-97e7-d7436367534c,IF,1B,,,,,,0.8,0.4,1.0,0.4,,,0.0,0.0,1.0,
3,083801cc-cf2c-43f1-b78a-27a5d270403d,home,A.J.,Andrew,Ellis,1b59c902-b26d-464d-895c-a08b971347c2,C,C,,,,,,0.25,0.0,0.0,0.0,,,1.0,0.0,1.0,
4,083801cc-cf2c-43f1-b78a-27a5d270403d,home,Howie,Howie,Kendrick,1da8f60d-0741-4b61-811c-1acca32b4393,OF,LF,,,,,,0.933,0.333,0.0,0.333,,,0.0,0.0,1.0,


In [8]:
# Dataframe containing all game statistics
# It's pretty massive...
df_game.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12596 entries, 0 to 14
Columns: 809 entries, game.attendance to game.weather.forecast.wind.speed_mph
dtypes: float64(698), int64(7), object(104)
memory usage: 77.8+ MB


In [9]:
df_game.head()

Unnamed: 0,game.attendance,game.away.abbr,game.away.errors,game.away.hits,game.away.id,game.away.lineup,game.away.loss,game.away.market,game.away.name,game.away.players,game.away.probable_pitcher.era,game.away.probable_pitcher.first_name,game.away.probable_pitcher.id,game.away.probable_pitcher.jersey_number,game.away.probable_pitcher.last_name,game.away.probable_pitcher.loss,game.away.probable_pitcher.preferred_name,game.away.probable_pitcher.win,game.away.roster,game.away.runs,game.away.scoring,game.away.starting_pitcher.era,game.away.starting_pitcher.first_name,game.away.starting_pitcher.id,game.away.starting_pitcher.jersey_number,game.away.starting_pitcher.last_name,game.away.starting_pitcher.loss,game.away.starting_pitcher.preferred_name,game.away.starting_pitcher.win,game.away.statistics.fielding.overall.a,game.away.statistics.fielding.overall.assists.outfield,game.away.statistics.fielding.overall.assists.total,game.away.statistics.fielding.overall.c_wp,game.away.statistics.fielding.overall.dp,game.away.statistics.fielding.overall.error,game.away.statistics.fielding.overall.errors.fielding,game.away.statistics.fielding.overall.errors.interference,game.away.statistics.fielding.overall.errors.throwing,game.away.statistics.fielding.overall.errors.total,game.away.statistics.fielding.overall.fpct,game.away.statistics.fielding.overall.pb,game.away.statistics.fielding.overall.po,game.away.statistics.fielding.overall.steal.caught,game.away.statistics.fielding.overall.steal.pct,game.away.statistics.fielding.overall.steal.pickoff,game.away.statistics.fielding.overall.steal.stolen,game.away.statistics.fielding.overall.tc,game.away.statistics.fielding.overall.tp,game.away.statistics.hitting.overall.ab,game.away.statistics.hitting.overall.ab_risp,game.away.statistics.hitting.overall.abhr,game.away.statistics.hitting.overall.abk,game.away.statistics.hitting.overall.ap,game.away.statistics.hitting.overall.avg,game.away.statistics.hitting.overall.babip,game.away.statistics.hitting.overall.bbk,game.away.statistics.hitting.overall.bbpa,game.away.statistics.hitting.overall.bip,game.away.statistics.hitting.overall.flyball,game.away.statistics.hitting.overall.gofo,game.away.statistics.hitting.overall.groundball,game.away.statistics.hitting.overall.hit_risp,game.away.statistics.hitting.overall.iso,game.away.statistics.hitting.overall.linedrive,game.away.statistics.hitting.overall.lob,game.away.statistics.hitting.overall.lob_risp_2out,game.away.statistics.hitting.overall.obp,game.away.statistics.hitting.overall.onbase.bb,game.away.statistics.hitting.overall.onbase.cycle,game.away.statistics.hitting.overall.onbase.d,game.away.statistics.hitting.overall.onbase.fc,game.away.statistics.hitting.overall.onbase.h,game.away.statistics.hitting.overall.onbase.hbp,game.away.statistics.hitting.overall.onbase.hr,game.away.statistics.hitting.overall.onbase.ibb,game.away.statistics.hitting.overall.onbase.roe,game.away.statistics.hitting.overall.onbase.s,game.away.statistics.hitting.overall.onbase.t,game.away.statistics.hitting.overall.onbase.tb,game.away.statistics.hitting.overall.ops,game.away.statistics.hitting.overall.outcome.ball,game.away.statistics.hitting.overall.outcome.dirtball,game.away.statistics.hitting.overall.outcome.foul,game.away.statistics.hitting.overall.outcome.iball,game.away.statistics.hitting.overall.outcome.klook,game.away.statistics.hitting.overall.outcome.kswing,game.away.statistics.hitting.overall.outcome.ktotal,game.away.statistics.hitting.overall.outs.fidp,game.away.statistics.hitting.overall.outs.fo,game.away.statistics.hitting.overall.outs.gidp,game.away.statistics.hitting.overall.outs.go,game.away.statistics.hitting.overall.outs.klook,game.away.statistics.hitting.overall.outs.kswing,game.away.statistics.hitting.overall.outs.ktotal,game.away.statistics.hitting.overall.outs.lidp,game.away.statistics.hitting.overall.outs.lo,game.away.statistics.hitting.overall.outs.po,game.away.statistics.hitting.overall.outs.sacfly,game.away.statistics.hitting.overall.outs.sachit,game.away.statistics.hitting.overall.pitch_count,game.away.statistics.hitting.overall.pitches.btotal,game.away.statistics.hitting.overall.pitches.count,game.away.statistics.hitting.overall.pitches.ktotal,game.away.statistics.hitting.overall.popup,game.away.statistics.hitting.overall.rbi,game.away.statistics.hitting.overall.rbi_2out,game.away.statistics.hitting.overall.runs.total,game.away.statistics.hitting.overall.seca,game.away.statistics.hitting.overall.slg,game.away.statistics.hitting.overall.steal.caught,game.away.statistics.hitting.overall.steal.pct,game.away.statistics.hitting.overall.steal.pickoff,game.away.statistics.hitting.overall.steal.stolen,game.away.statistics.hitting.overall.team_lob,game.away.statistics.hitting.overall.xbh,game.away.statistics.pitching.bullpen.babip,game.away.statistics.pitching.bullpen.bf,game.away.statistics.pitching.bullpen.bf_ip,game.away.statistics.pitching.bullpen.bk,game.away.statistics.pitching.bullpen.era,game.away.statistics.pitching.bullpen.games.blown_save,game.away.statistics.pitching.bullpen.games.complete,game.away.statistics.pitching.bullpen.games.hold,game.away.statistics.pitching.bullpen.games.loss,game.away.statistics.pitching.bullpen.games.qstart,game.away.statistics.pitching.bullpen.games.save,game.away.statistics.pitching.bullpen.games.shutout,game.away.statistics.pitching.bullpen.games.svo,game.away.statistics.pitching.bullpen.games.team_shutout,game.away.statistics.pitching.bullpen.games.win,game.away.statistics.pitching.bullpen.gbfb,game.away.statistics.pitching.bullpen.gofo,game.away.statistics.pitching.bullpen.in_play.flyball,game.away.statistics.pitching.bullpen.in_play.groundball,game.away.statistics.pitching.bullpen.in_play.linedrive,game.away.statistics.pitching.bullpen.in_play.popup,game.away.statistics.pitching.bullpen.ip_1,game.away.statistics.pitching.bullpen.ip_2,game.away.statistics.pitching.bullpen.k9,game.away.statistics.pitching.bullpen.kbb,game.away.statistics.pitching.bullpen.lob,game.away.statistics.pitching.bullpen.oab,game.away.statistics.pitching.bullpen.oba,game.away.statistics.pitching.bullpen.obp,game.away.statistics.pitching.bullpen.onbase.bb,game.away.statistics.pitching.bullpen.onbase.d,game.away.statistics.pitching.bullpen.onbase.fc,game.away.statistics.pitching.bullpen.onbase.h,game.away.statistics.pitching.bullpen.onbase.h9,game.away.statistics.pitching.bullpen.onbase.hbp,game.away.statistics.pitching.bullpen.onbase.hr,game.away.statistics.pitching.bullpen.onbase.hr9,game.away.statistics.pitching.bullpen.onbase.ibb,game.away.statistics.pitching.bullpen.onbase.roe,game.away.statistics.pitching.bullpen.onbase.s,game.away.statistics.pitching.bullpen.onbase.t,game.away.statistics.pitching.bullpen.onbase.tb,game.away.statistics.pitching.bullpen.outcome.ball,game.away.statistics.pitching.bullpen.outcome.dirtball,game.away.statistics.pitching.bullpen.outcome.foul,game.away.statistics.pitching.bullpen.outcome.iball,game.away.statistics.pitching.bullpen.outcome.klook,game.away.statistics.pitching.bullpen.outcome.kswing,game.away.statistics.pitching.bullpen.outcome.ktotal,game.away.statistics.pitching.bullpen.outs.fidp,game.away.statistics.pitching.bullpen.outs.fo,game.away.statistics.pitching.bullpen.outs.gidp,game.away.statistics.pitching.bullpen.outs.go,game.away.statistics.pitching.bullpen.outs.klook,game.away.statistics.pitching.bullpen.outs.kswing,game.away.statistics.pitching.bullpen.outs.ktotal,game.away.statistics.pitching.bullpen.outs.lidp,game.away.statistics.pitching.bullpen.outs.lo,game.away.statistics.pitching.bullpen.outs.po,game.away.statistics.pitching.bullpen.outs.sacfly,game.away.statistics.pitching.bullpen.outs.sachit,game.away.statistics.pitching.bullpen.pitch_count,game.away.statistics.pitching.bullpen.pitches.btotal,game.away.statistics.pitching.bullpen.pitches.count,game.away.statistics.pitching.bullpen.pitches.ktotal,game.away.statistics.pitching.bullpen.pitches.per_bf,game.away.statistics.pitching.bullpen.pitches.per_ip,game.away.statistics.pitching.bullpen.runs.bqr,game.away.statistics.pitching.bullpen.runs.bqra,game.away.statistics.pitching.bullpen.runs.earned,game.away.statistics.pitching.bullpen.runs.ir,game.away.statistics.pitching.bullpen.runs.ira,game.away.statistics.pitching.bullpen.runs.total,game.away.statistics.pitching.bullpen.runs.unearned,game.away.statistics.pitching.bullpen.slg,game.away.statistics.pitching.bullpen.steal.caught,game.away.statistics.pitching.bullpen.steal.pickoff,game.away.statistics.pitching.bullpen.steal.stolen,game.away.statistics.pitching.bullpen.whip,game.away.statistics.pitching.bullpen.wp,game.away.statistics.pitching.overall.babip,game.away.statistics.pitching.overall.bf,game.away.statistics.pitching.overall.bf_ip,game.away.statistics.pitching.overall.bf_start,game.away.statistics.pitching.overall.bk,game.away.statistics.pitching.overall.era,game.away.statistics.pitching.overall.games.blown_save,game.away.statistics.pitching.overall.games.complete,game.away.statistics.pitching.overall.games.hold,game.away.statistics.pitching.overall.games.loss,game.away.statistics.pitching.overall.games.qstart,game.away.statistics.pitching.overall.games.save,game.away.statistics.pitching.overall.games.shutout,game.away.statistics.pitching.overall.games.svo,game.away.statistics.pitching.overall.games.team_shutout,game.away.statistics.pitching.overall.games.win,game.away.statistics.pitching.overall.gbfb,game.away.statistics.pitching.overall.gofo,game.away.statistics.pitching.overall.in_play.flyball,game.away.statistics.pitching.overall.in_play.groundball,game.away.statistics.pitching.overall.in_play.linedrive,game.away.statistics.pitching.overall.in_play.popup,game.away.statistics.pitching.overall.ip_1,game.away.statistics.pitching.overall.ip_2,game.away.statistics.pitching.overall.k9,game.away.statistics.pitching.overall.kbb,game.away.statistics.pitching.overall.lob,game.away.statistics.pitching.overall.oab,game.away.statistics.pitching.overall.oba,game.away.statistics.pitching.overall.obp,game.away.statistics.pitching.overall.onbase.bb,game.away.statistics.pitching.overall.onbase.d,game.away.statistics.pitching.overall.onbase.fc,game.away.statistics.pitching.overall.onbase.h,game.away.statistics.pitching.overall.onbase.h9,game.away.statistics.pitching.overall.onbase.hbp,game.away.statistics.pitching.overall.onbase.hr,game.away.statistics.pitching.overall.onbase.hr9,game.away.statistics.pitching.overall.onbase.ibb,game.away.statistics.pitching.overall.onbase.roe,game.away.statistics.pitching.overall.onbase.s,game.away.statistics.pitching.overall.onbase.t,game.away.statistics.pitching.overall.onbase.tb,game.away.statistics.pitching.overall.outcome.ball,game.away.statistics.pitching.overall.outcome.dirtball,game.away.statistics.pitching.overall.outcome.foul,game.away.statistics.pitching.overall.outcome.iball,game.away.statistics.pitching.overall.outcome.klook,game.away.statistics.pitching.overall.outcome.kswing,game.away.statistics.pitching.overall.outcome.ktotal,game.away.statistics.pitching.overall.outs.fidp,game.away.statistics.pitching.overall.outs.fo,game.away.statistics.pitching.overall.outs.gidp,game.away.statistics.pitching.overall.outs.go,game.away.statistics.pitching.overall.outs.klook,...,game.home.statistics.pitching.bullpen.steal.caught,game.home.statistics.pitching.bullpen.steal.pickoff,game.home.statistics.pitching.bullpen.steal.stolen,game.home.statistics.pitching.bullpen.whip,game.home.statistics.pitching.bullpen.wp,game.home.statistics.pitching.overall.babip,game.home.statistics.pitching.overall.bf,game.home.statistics.pitching.overall.bf_ip,game.home.statistics.pitching.overall.bf_start,game.home.statistics.pitching.overall.bk,game.home.statistics.pitching.overall.era,game.home.statistics.pitching.overall.games.blown_save,game.home.statistics.pitching.overall.games.complete,game.home.statistics.pitching.overall.games.hold,game.home.statistics.pitching.overall.games.loss,game.home.statistics.pitching.overall.games.qstart,game.home.statistics.pitching.overall.games.save,game.home.statistics.pitching.overall.games.shutout,game.home.statistics.pitching.overall.games.svo,game.home.statistics.pitching.overall.games.team_shutout,game.home.statistics.pitching.overall.games.win,game.home.statistics.pitching.overall.gbfb,game.home.statistics.pitching.overall.gofo,game.home.statistics.pitching.overall.in_play.flyball,game.home.statistics.pitching.overall.in_play.groundball,game.home.statistics.pitching.overall.in_play.linedrive,game.home.statistics.pitching.overall.in_play.popup,game.home.statistics.pitching.overall.ip_1,game.home.statistics.pitching.overall.ip_2,game.home.statistics.pitching.overall.k9,game.home.statistics.pitching.overall.kbb,game.home.statistics.pitching.overall.lob,game.home.statistics.pitching.overall.oab,game.home.statistics.pitching.overall.oba,game.home.statistics.pitching.overall.obp,game.home.statistics.pitching.overall.onbase.bb,game.home.statistics.pitching.overall.onbase.d,game.home.statistics.pitching.overall.onbase.fc,game.home.statistics.pitching.overall.onbase.h,game.home.statistics.pitching.overall.onbase.h9,game.home.statistics.pitching.overall.onbase.hbp,game.home.statistics.pitching.overall.onbase.hr,game.home.statistics.pitching.overall.onbase.hr9,game.home.statistics.pitching.overall.onbase.ibb,game.home.statistics.pitching.overall.onbase.roe,game.home.statistics.pitching.overall.onbase.s,game.home.statistics.pitching.overall.onbase.t,game.home.statistics.pitching.overall.onbase.tb,game.home.statistics.pitching.overall.outcome.ball,game.home.statistics.pitching.overall.outcome.dirtball,game.home.statistics.pitching.overall.outcome.foul,game.home.statistics.pitching.overall.outcome.iball,game.home.statistics.pitching.overall.outcome.klook,game.home.statistics.pitching.overall.outcome.kswing,game.home.statistics.pitching.overall.outcome.ktotal,game.home.statistics.pitching.overall.outs.fidp,game.home.statistics.pitching.overall.outs.fo,game.home.statistics.pitching.overall.outs.gidp,game.home.statistics.pitching.overall.outs.go,game.home.statistics.pitching.overall.outs.klook,game.home.statistics.pitching.overall.outs.kswing,game.home.statistics.pitching.overall.outs.ktotal,game.home.statistics.pitching.overall.outs.lidp,game.home.statistics.pitching.overall.outs.lo,game.home.statistics.pitching.overall.outs.po,game.home.statistics.pitching.overall.outs.sacfly,game.home.statistics.pitching.overall.outs.sachit,game.home.statistics.pitching.overall.pitch_count,game.home.statistics.pitching.overall.pitches.btotal,game.home.statistics.pitching.overall.pitches.count,game.home.statistics.pitching.overall.pitches.ktotal,game.home.statistics.pitching.overall.pitches.per_bf,game.home.statistics.pitching.overall.pitches.per_ip,game.home.statistics.pitching.overall.pitches.per_start,game.home.statistics.pitching.overall.runs.bqr,game.home.statistics.pitching.overall.runs.bqra,game.home.statistics.pitching.overall.runs.earned,game.home.statistics.pitching.overall.runs.ir,game.home.statistics.pitching.overall.runs.ira,game.home.statistics.pitching.overall.runs.total,game.home.statistics.pitching.overall.runs.unearned,game.home.statistics.pitching.overall.slg,game.home.statistics.pitching.overall.steal.caught,game.home.statistics.pitching.overall.steal.pickoff,game.home.statistics.pitching.overall.steal.stolen,game.home.statistics.pitching.overall.whip,game.home.statistics.pitching.overall.wp,game.home.statistics.pitching.starters.babip,game.home.statistics.pitching.starters.bf,game.home.statistics.pitching.starters.bf_ip,game.home.statistics.pitching.starters.bf_start,game.home.statistics.pitching.starters.bk,game.home.statistics.pitching.starters.era,game.home.statistics.pitching.starters.games.blown_save,game.home.statistics.pitching.starters.games.complete,game.home.statistics.pitching.starters.games.hold,game.home.statistics.pitching.starters.games.loss,game.home.statistics.pitching.starters.games.qstart,game.home.statistics.pitching.starters.games.save,game.home.statistics.pitching.starters.games.shutout,game.home.statistics.pitching.starters.games.svo,game.home.statistics.pitching.starters.games.team_shutout,game.home.statistics.pitching.starters.games.win,game.home.statistics.pitching.starters.gbfb,game.home.statistics.pitching.starters.gofo,game.home.statistics.pitching.starters.in_play.flyball,game.home.statistics.pitching.starters.in_play.groundball,game.home.statistics.pitching.starters.in_play.linedrive,game.home.statistics.pitching.starters.in_play.popup,game.home.statistics.pitching.starters.ip_1,game.home.statistics.pitching.starters.ip_2,game.home.statistics.pitching.starters.k9,game.home.statistics.pitching.starters.kbb,game.home.statistics.pitching.starters.lob,game.home.statistics.pitching.starters.oab,game.home.statistics.pitching.starters.oba,game.home.statistics.pitching.starters.obp,game.home.statistics.pitching.starters.onbase.bb,game.home.statistics.pitching.starters.onbase.d,game.home.statistics.pitching.starters.onbase.fc,game.home.statistics.pitching.starters.onbase.h,game.home.statistics.pitching.starters.onbase.h9,game.home.statistics.pitching.starters.onbase.hbp,game.home.statistics.pitching.starters.onbase.hr,game.home.statistics.pitching.starters.onbase.hr9,game.home.statistics.pitching.starters.onbase.ibb,game.home.statistics.pitching.starters.onbase.roe,game.home.statistics.pitching.starters.onbase.s,game.home.statistics.pitching.starters.onbase.t,game.home.statistics.pitching.starters.onbase.tb,game.home.statistics.pitching.starters.outcome.ball,game.home.statistics.pitching.starters.outcome.dirtball,game.home.statistics.pitching.starters.outcome.foul,game.home.statistics.pitching.starters.outcome.iball,game.home.statistics.pitching.starters.outcome.klook,game.home.statistics.pitching.starters.outcome.kswing,game.home.statistics.pitching.starters.outcome.ktotal,game.home.statistics.pitching.starters.outs.fidp,game.home.statistics.pitching.starters.outs.fo,game.home.statistics.pitching.starters.outs.gidp,game.home.statistics.pitching.starters.outs.go,game.home.statistics.pitching.starters.outs.klook,game.home.statistics.pitching.starters.outs.kswing,game.home.statistics.pitching.starters.outs.ktotal,game.home.statistics.pitching.starters.outs.lidp,game.home.statistics.pitching.starters.outs.lo,game.home.statistics.pitching.starters.outs.po,game.home.statistics.pitching.starters.outs.sacfly,game.home.statistics.pitching.starters.outs.sachit,game.home.statistics.pitching.starters.pitch_count,game.home.statistics.pitching.starters.pitches.btotal,game.home.statistics.pitching.starters.pitches.count,game.home.statistics.pitching.starters.pitches.ktotal,game.home.statistics.pitching.starters.pitches.per_bf,game.home.statistics.pitching.starters.pitches.per_ip,game.home.statistics.pitching.starters.pitches.per_start,game.home.statistics.pitching.starters.runs.bqr,game.home.statistics.pitching.starters.runs.bqra,game.home.statistics.pitching.starters.runs.earned,game.home.statistics.pitching.starters.runs.ir,game.home.statistics.pitching.starters.runs.ira,game.home.statistics.pitching.starters.runs.total,game.home.statistics.pitching.starters.runs.unearned,game.home.statistics.pitching.starters.slg,game.home.statistics.pitching.starters.steal.caught,game.home.statistics.pitching.starters.steal.pickoff,game.home.statistics.pitching.starters.steal.stolen,game.home.statistics.pitching.starters.whip,game.home.statistics.pitching.starters.wp,game.home.win,game.home_team,game.id,game.officials,game.pitching.blown_save,game.pitching.hold,game.pitching.loss.blown_save,game.pitching.loss.first_name,game.pitching.loss.hold,game.pitching.loss.id,game.pitching.loss.jersey_number,game.pitching.loss.last_name,game.pitching.loss.loss,game.pitching.loss.position,game.pitching.loss.preferred_name,game.pitching.loss.primary_position,game.pitching.loss.save,game.pitching.loss.status,game.pitching.loss.win,game.pitching.save.blown_save,game.pitching.save.first_name,game.pitching.save.hold,game.pitching.save.id,game.pitching.save.jersey_number,game.pitching.save.last_name,game.pitching.save.loss,game.pitching.save.position,game.pitching.save.preferred_name,game.pitching.save.primary_position,game.pitching.save.save,game.pitching.save.status,game.pitching.save.win,game.pitching.win.blown_save,game.pitching.win.first_name,game.pitching.win.hold,game.pitching.win.id,game.pitching.win.jersey_number,game.pitching.win.last_name,game.pitching.win.loss,game.pitching.win.position,game.pitching.win.preferred_name,game.pitching.win.primary_position,game.pitching.win.save,game.pitching.win.status,game.pitching.win.win,game.reference,game.rescheduled,game.scheduled,game.split_squad,game.status,game.tbd,game.venue.address,game.venue.capacity,game.venue.city,game.venue.country,game.venue.field_orientation,game.venue.id,game.venue.location.lat,game.venue.location.lng,game.venue.market,game.venue.name,game.venue.stadium_type,game.venue.state,game.venue.surface,game.venue.zip,game.weather.current_conditions.cloud_cover,game.weather.current_conditions.condition,game.weather.current_conditions.dew_point_f,game.weather.current_conditions.humidity,game.weather.current_conditions.obs_time,game.weather.current_conditions.temp_f,game.weather.current_conditions.wind.direction,game.weather.current_conditions.wind.speed_mph,game.weather.forecast.cloud_cover,game.weather.forecast.condition,game.weather.forecast.dew_point_f,game.weather.forecast.humidity,game.weather.forecast.obs_time,game.weather.forecast.temp_f,game.weather.forecast.wind.direction,game.weather.forecast.wind.speed_mph
0,44680.0,ATL,0,9,12079497-e414-450a-8bf2-29f91de646bf,[{'id': 'e43136cd-762f-4b3f-8595-4f07e83be3b2'...,21.0,Atlanta,Braves,"[{'preferred_name': 'Pedro', 'first_name': 'Pe...",6.136,Williams,49f71aa5-fdd7-467b-b1be-fc9b347568b8,,Perez,0.0,Williams,0.0,"[{'preferred_name': 'Pedro', 'first_name': 'Pe...",3,"[{'number': 1, 'sequence': 1, 'runs': 0, 'hits...",6.136,Williams,49f71aa5-fdd7-467b-b1be-fc9b347568b8,,Perez,0.0,Williams,0.0,8.0,,,,1.0,0.0,0.0,0.0,0.0,0.0,1.0,,24.0,0.0,0.0,0.0,0.0,32.0,0.0,34.0,,0.0,8.5,36.0,0.265,0.3,0.5,0.056,30.0,0.0,2.67,0.0,,0.029,0.0,11.0,,,1.0,0.0,1.0,0.0,9.0,0.0,0.0,1.0,0.0,8.0,0.0,10.0,0.6,42.0,2.0,17.0,4.0,23.0,11.0,34.0,0.0,2.0,1.0,16.0,0.0,4.0,4.0,0.0,3.0,1.0,0.0,0.0,129.0,,,,0.0,3.0,0.0,3.0,0.059,0.294,1.0,0.0,1.0,0.0,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.364,40.0,5.0,,,6.75,1.0,,0.0,1.0,1.0,0.0,0.0,1.0,,0.0,0.0,2.5,,,,,24.0,8.0,10.125,,18.0,34.0,0.324,0.425,5.0,1.0,0.0,11.0,,1.0,3.0,,0.0,0.0,7.0,0.0,21.0,73.0,0.0,25.0,0.0,30.0,10.0,40.0,0.0,1.0,1.0,10.0,4.0,...,,,,,,0.3,36.0,4.0,,,3.0,0.0,,0.0,0.0,1.0,1.0,0.0,1.0,,1.0,0.0,2.5,,,,,27.0,9.0,3.996,,11.0,34.0,0.265,0.306,1.0,1.0,0.0,9.0,,0.0,0.0,,1.0,0.0,8.0,0.0,10.0,42.0,2.0,17.0,4.0,23.0,11.0,34.0,0.0,2.0,1.0,15.0,0.0,4.0,4.0,0.0,3.0,1.0,0.0,0.0,129.0,,,,,,,,,3.0,,,3.0,0.0,0.294,1.0,1.0,0.0,1.2222,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,26.0,ef64da7f-cfaf-4300-87b0-9313386b977c,083801cc-cf2c-43f1-b78a-27a5d270403d,[{'id': 'c4bf2d8b-ec0a-4938-b3c4-cdfd9efd2cc4'...,"[{'preferred_name': 'Luis', 'first_name': 'Lui...",,0.0,Nicholas,1.0,765872c5-d99e-409c-8f4a-d24891d4582d,51,Masset,1.0,P,Nick,RP,0.0,A,0.0,0.0,Kenley,1.0,9e798b78-22d9-44df-a4c3-fc9db6c0133d,74.0,Jansen,0.0,P,Kenley,RP,4.0,A,0.0,0.0,Adam,3.0,79ad0ac3-8b77-4989-b50f-a7ca99c87db6,36,Liberatore,1.0,P,Adam,RP,0.0,A,1.0,,,2015-05-26T00:10:00+00:00,,closed,,1000 Vin Scully Avenue,56000.0,Los Angeles,USA,NE,66a19c3d-24fe-477d-bee7-c6ef1b98352f,34.0745409,-118.2408881,Los Angeles,Dodger Stadium,outdoor,CA,grass,90012,,,,,,,,,,,,,,,,
1,30946.0,PHI,0,6,2142e1ba-3b40-445c-b8bb-f1f8b1054220,[{'id': 'f4b89e5f-baae-4dd4-87b2-1cd75889b48b'...,27.0,Philadelphia,Phillies,"[{'preferred_name': 'Maikel', 'first_name': 'M...",7.105,Severino,9e2ccda3-1cb4-4da7-8453-9987ff95dbbd,,Gonzalez,1.0,Severino,2.0,"[{'preferred_name': 'Maikel', 'first_name': 'M...",3,"[{'number': 1, 'sequence': 1, 'runs': 0, 'hits...",7.105,Severino,9e2ccda3-1cb4-4da7-8453-9987ff95dbbd,,Gonzalez,1.0,Severino,2.0,6.0,,,,2.0,0.0,0.0,0.0,0.0,0.0,1.0,,24.0,1.0,0.5,0.0,1.0,30.0,0.0,33.0,,0.0,3.3,37.0,0.182,0.25,0.3,0.081,24.0,0.0,0.39,0.0,,0.03,0.0,10.0,,,3.0,0.0,1.0,0.0,6.0,0.0,0.0,0.0,0.0,5.0,0.0,7.0,0.455,45.0,3.0,34.0,0.0,33.0,9.0,42.0,0.0,4.0,0.0,5.0,6.0,4.0,10.0,0.0,7.0,2.0,1.0,0.0,148.0,,,,0.0,3.0,0.0,3.0,0.121,0.212,0.0,0.0,0.0,0.0,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.278,34.0,4.25,,,6.75,0.0,,0.0,1.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.857,,,,,24.0,8.0,10.125,,12.0,30.0,0.267,0.353,3.0,0.0,0.0,8.0,,1.0,3.0,,0.0,0.0,5.0,0.0,17.0,47.0,2.0,23.0,0.0,22.0,16.0,38.0,0.0,3.0,1.0,6.0,2.0,...,,,,,,0.25,37.0,4.111,,,3.0,0.0,,2.0,0.0,1.0,1.0,0.0,1.0,,1.0,0.0,0.385,,,,,27.0,9.0,9.999,,10.0,33.0,0.182,0.243,3.0,1.0,0.0,6.0,,0.0,0.0,,0.0,0.0,5.0,0.0,7.0,45.0,3.0,34.0,0.0,33.0,9.0,42.0,0.0,4.0,0.0,5.0,6.0,4.0,10.0,0.0,7.0,2.0,1.0,0.0,148.0,,,,,,,,,3.0,,,3.0,0.0,0.212,0.0,0.0,0.0,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,24.0,f246a5e5-afdb-479c-9aaa-c68beeda7af6,35233f95-2d40-4859-9b59-6736a26b0b69,[{'id': 'b6506928-3c9b-4eae-a3c3-5ea443f4bfe2'...,,"[{'preferred_name': 'Alex', 'first_name': 'Ale...",0.0,Elvis,0.0,f2e016ca-66e9-4ed5-8e01-39bc35e644e6,59,Araujo,1.0,P,Elvis,RP,0.0,A,1.0,1.0,Jeurys,1.0,4f82d295-a42a-4520-9844-35362a25d037,27.0,Familia,0.0,P,Jeurys,RP,14.0,A,0.0,0.0,Bartolo,0.0,1e4a62ff-7ae0-40b5-8f56-7c4a6d40a705,40,Colón,3.0,P,Bartolo,SP,0.0,A,7.0,,,2015-05-25T17:10:00+00:00,,closed,,123-01 Roosevelt Avenue,41922.0,"Flushing, Queens",USA,E,265c7d6c-427a-4b8a-8def-392c41954bec,40.7564124,-73.84589369999999,New York,Citi Field,outdoor,NY,grass,11368,,,,,,,,,,,,,,,,
2,15168.0,CWS,0,4,47f490cd-2f58-4ef7-9dfd-2ad6ba6c1ae8,[{'id': 'aa614a82-e4c7-4129-ba6d-5205b1c9b63d'...,22.0,Chicago,White Sox,"[{'preferred_name': 'J.B.', 'first_name': 'Jac...",5.604,Hector,aa614a82-e4c7-4129-ba6d-5205b1c9b63d,48.0,Noesi,3.0,Hector,0.0,"[{'preferred_name': 'J.B.', 'first_name': 'Jac...",0,"[{'number': 1, 'sequence': 1, 'runs': 0, 'hits...",5.604,Hector,aa614a82-e4c7-4129-ba6d-5205b1c9b63d,48.0,Noesi,3.0,Hector,0.0,15.0,,,,1.0,0.0,0.0,0.0,0.0,0.0,1.0,,24.0,0.0,0.0,0.0,0.0,39.0,0.0,29.0,,0.0,3.625,29.0,0.138,0.19,0.0,0.0,21.0,0.0,1.38,0.0,,0.0,0.0,7.0,,,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0,0.276,26.0,0.0,17.0,0.0,24.0,8.0,32.0,0.0,4.0,2.0,11.0,2.0,6.0,8.0,0.0,1.0,3.0,0.0,0.0,96.0,,,,0.0,0.0,0.0,0.0,0.0,0.138,0.0,0.0,0.0,0.0,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.208,34.0,4.25,,,6.75,0.0,,0.0,1.0,0.0,0.0,0.0,0.0,,0.0,0.0,2.167,,,,,24.0,8.0,4.5,,13.0,30.0,0.233,0.324,4.0,1.0,0.0,7.0,,0.0,2.0,,0.0,0.0,4.0,0.0,14.0,52.0,3.0,31.0,0.0,19.0,11.0,30.0,0.0,2.0,1.0,13.0,1.0,...,,,,,,0.19,29.0,3.222,,,0.0,0.0,,0.0,0.0,1.0,0.0,1.0,0.0,,1.0,0.0,1.125,,,,,27.0,9.0,8.001,,7.0,29.0,0.138,0.138,0.0,0.0,0.0,4.0,,0.0,0.0,,0.0,0.0,4.0,0.0,4.0,26.0,0.0,17.0,0.0,24.0,8.0,32.0,0.0,4.0,2.0,9.0,2.0,6.0,8.0,0.0,1.0,3.0,0.0,0.0,96.0,,,,,,,,,0.0,,,0.0,0.0,0.138,0.0,0.0,0.0,0.4444,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,20.0,1d678440-b4b1-4954-9b39-70afb3ebbcfa,0991650b-dfeb-42f1-9a83-ffc52c51d690,[{'id': 'cd5f1e16-45c1-4bbf-90a5-c825475c1b52'...,,,0.0,Hector,0.0,aa614a82-e4c7-4129-ba6d-5205b1c9b63d,48,Noesi,4.0,P,Hector,RP,0.0,A,0.0,,,,,,,,,,,,,,0.0,Andrew,0.0,f11efc76-62f5-4396-b145-e03839fd4d1c,36,Hutchison,1.0,P,Drew,SP,0.0,A,4.0,,,2015-05-25T23:07:00+00:00,,closed,,One Blue Jays Way,49282.0,Toronto,CANADA,N,84d72338-2173-4a90-9d25-99adc6c86f4b,43.6417388,-79.3892547,Toronto,Rogers Centre,retractable,ON,turf,M5V1J3,,,,,,,,,,,,,,,,
3,13614.0,TEX,1,9,d99f919b-1534-4516-8e8a-9cd106c6d8cd,[{'id': '79fc6097-04ab-442b-9d5f-8495b19fb3c5'...,23.0,Texas,Rangers,"[{'preferred_name': 'Delino', 'first_name': 'D...",4.5,Phil,79fc6097-04ab-442b-9d5f-8495b19fb3c5,43.0,Klein,0.0,Phil,1.0,"[{'preferred_name': 'Delino', 'first_name': 'D...",10,"[{'number': 1, 'sequence': 1, 'runs': 3, 'hits...",4.5,Phil,79fc6097-04ab-442b-9d5f-8495b19fb3c5,43.0,Klein,0.0,Phil,1.0,11.0,,,,1.0,1.0,0.0,0.0,0.0,1.0,0.974,,27.0,1.0,1.0,0.0,0.0,39.0,0.0,37.0,,12.333,3.364,40.0,0.243,0.261,0.273,0.075,23.0,0.0,1.43,0.0,,0.298,0.0,5.0,,,2.0,0.0,2.0,0.0,9.0,0.0,3.0,1.0,1.0,4.0,0.0,20.0,0.841,45.0,2.0,31.0,1.0,29.0,19.0,48.0,0.0,4.0,0.0,10.0,4.0,7.0,11.0,0.0,2.0,1.0,0.0,0.0,153.0,,,,0.0,9.0,0.0,10.0,0.405,0.541,0.0,1.0,0.0,1.0,,5.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.25,42.0,4.667,,,7.0,0.0,,1.0,0.0,0.0,1.0,0.0,1.0,,1.0,0.0,1.571,,,,,27.0,9.0,9.0,,17.0,34.0,0.235,0.357,6.0,4.0,0.0,8.0,,0.0,2.0,,1.0,0.0,2.0,0.0,18.0,62.0,7.0,44.0,4.0,31.0,9.0,40.0,0.0,4.0,0.0,11.0,6.0,...,,,,,,0.261,40.0,4.444,,,8.0,1.0,,0.0,1.0,0.0,0.0,0.0,1.0,,0.0,0.0,1.429,,,,,27.0,9.0,10.998,,5.0,37.0,0.243,0.3,2.0,2.0,0.0,9.0,,0.0,3.0,,1.0,0.0,4.0,0.0,20.0,45.0,2.0,31.0,1.0,29.0,19.0,48.0,0.0,4.0,0.0,10.0,4.0,7.0,11.0,0.0,2.0,1.0,0.0,0.0,153.0,,,,,,,,,8.0,,,10.0,2.0,0.541,0.0,0.0,1.0,1.3333,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,20.0,80715d0d-0d2a-450f-a970-1b9a3b18c7e7,6be1fab3-cb64-4bb4-bd31-7f69ef5a4773,[{'id': 'd5cba620-1fcf-432a-aa74-e261b4927141'...,"[{'preferred_name': 'Marc', 'first_name': 'Mar...","[{'preferred_name': 'Sam', 'first_name': 'Samu...",2.0,Marc,5.0,d64cbbbf-b48a-483c-b382-bb4c4aaa966c,35,Rzepczynski,2.0,P,Marc,RP,0.0,A,1.0,0.0,Shawn,6.0,d710df18-7a1f-4d3e-a614-60cdc4563371,37.0,Tolleson,0.0,P,Shawn,RP,4.0,A,1.0,0.0,Tanner,4.0,74893884-a7a8-4abb-9449-2ccac55151f0,52,Scheppers,0.0,P,Tanner,RP,0.0,A,1.0,,,2015-05-25T20:10:00+00:00,,closed,,2401 Ontario Street,35051.0,Cleveland,USA,N,2b0ccd49-4d87-4996-ac4d-27ffc7ee4c16,41.4957048,-81.6852732,Cleveland,Progressive Field,outdoor,OH,grass,44115,,,,,,,,,,,,,,,,
4,20046.0,MIA,0,9,03556285-bdbb-4576-a06d-42f71f46ddc5,[{'id': '65de4cd1-ca86-468c-9346-1e68d6279a8e'...,27.0,Miami,Marlins,"[{'preferred_name': 'Vin', 'first_name': 'Vinc...",3.214,David,fc6848bc-5f1a-491d-89a8-1270c6b9b5b7,35.0,Phelps,1.0,David,2.0,"[{'preferred_name': 'Vin', 'first_name': 'Vinc...",2,"[{'number': 1, 'sequence': 1, 'runs': 1, 'hits...",3.214,David,fc6848bc-5f1a-491d-89a8-1270c6b9b5b7,35.0,Phelps,1.0,David,2.0,13.0,,,,2.0,0.0,0.0,0.0,0.0,0.0,1.0,,24.0,1.0,1.0,0.0,0.0,37.0,0.0,34.0,,34.0,11.333,34.0,0.265,0.267,0.0,0.0,30.0,0.0,11.0,0.0,,0.117,0.0,11.0,,,0.0,0.0,1.0,0.0,9.0,0.0,1.0,0.0,0.0,7.0,0.0,13.0,0.647,31.0,0.0,17.0,0.0,24.0,6.0,30.0,0.0,2.0,2.0,22.0,2.0,1.0,3.0,0.0,0.0,0.0,0.0,0.0,109.0,,,,0.0,2.0,0.0,2.0,0.147,0.382,0.0,1.0,0.0,1.0,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.364,33.0,4.125,,,4.5,0.0,,0.0,1.0,0.0,0.0,0.0,0.0,,0.0,0.0,1.5,,,,,24.0,8.0,6.75,,7.0,30.0,0.333,0.375,2.0,1.0,0.0,10.0,,0.0,2.0,,0.0,0.0,7.0,0.0,17.0,49.0,0.0,16.0,0.0,26.0,9.0,35.0,0.0,4.0,1.0,9.0,4.0,...,,,,,,0.267,34.0,3.778,,,2.0,0.0,,1.0,0.0,1.0,1.0,0.0,1.0,,1.0,0.0,10.0,,,,,27.0,9.0,2.997,,11.0,34.0,0.265,0.265,0.0,1.0,0.0,9.0,,0.0,1.0,,0.0,0.0,7.0,0.0,13.0,31.0,0.0,17.0,0.0,24.0,6.0,30.0,0.0,2.0,2.0,20.0,2.0,1.0,3.0,0.0,0.0,0.0,0.0,0.0,109.0,,,,,,,,,2.0,,,2.0,0.0,0.382,0.0,0.0,1.0,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,21.0,481dfe7e-5dab-46ab-a49f-9dcc2b6e2cfd,ad8ee97e-aedf-483b-bd31-b94e5983daee,[{'id': '75e48ea1-67ec-40eb-9832-bf258c680cb0'...,,"[{'preferred_name': 'Tony', 'first_name': 'Ant...",0.0,David,0.0,fc6848bc-5f1a-491d-89a8-1270c6b9b5b7,41,Phelps,2.0,P,David,RP,0.0,A,2.0,1.0,Mark,0.0,01982903-301b-41eb-8a8d-b88a7b3070d0,35.0,Melancon,1.0,P,Mark,RP,11.0,A,0.0,0.0,Charles,0.0,ff772241-8fdd-488c-a81e-49b44ce600fc,50,Morton,0.0,P,Charlie,SP,0.0,A,1.0,,,2015-05-25T23:05:00+00:00,,closed,,115 Federal Street,38362.0,Pittsburgh,USA,E,61314394-c8b8-411e-b891-ca41285d5362,40.4471507,-80.0064087,Pittsburgh,PNC Park,outdoor,PA,grass,15212,,,,,,,,,,,,,,,,


# Data Cleaning and Feature Engineering

In [10]:
### Adding MLB League ###
# Get list of team names
team_df = df_game[['game.away.id', 'game.away.name']].drop_duplicates()[:30].reset_index(drop = True).rename(columns = {'game.away.id': 'team.id', 'game.away.name': 'team.name'})
team_list = team_df['team.name'].to_list()

# Remove games played by teams not in the MLB
mask = ((df_game['game.away.name'].isin(team_list)) 
        & (df_game['game.home.name'].isin(team_list)))
df_game = df_game[mask]

# Assign the MLB League to each home and away team
league_dict = {
    'Braves': 'National',
    'Phillies': 'National',
    'White Sox': 'American',
    'Rangers': 'American',
    'Marlins': 'National',
    'Red Sox': 'American',
    'Rockies': 'National',
    'Nationals': 'National',
    'Royals': 'American',
    'Tigers': 'American',
    'Mariners': 'American',
    'Padres': 'National',
    'Giants': 'National',
    'Diamondbacks': 'National',
    'Astros': 'American',
    'Indians': 'American',
    'Pirates': 'National',
    'Yankees': 'American',
    'Blue Jays': 'American',
    'Dodgers': 'National',
    'Rays': 'American',
    'Orioles': 'American',
    'Brewers': 'National',
    'Mets': 'National',
    'Cubs': 'National',
    'Twins': 'American',
    'Athletics': 'American',
    'Reds': 'National',
    'Cardinals': 'National',
    'Angels': 'American'
}

# Create DataFrame from dictionary
df_league = pd.DataFrame.from_dict(league_dict, orient = 'index').reset_index().rename(columns = {'index': 'team_name', 0: 'team_league'})

In [11]:
# Convert date to datetime format
df_game['game.date'] = pd.to_datetime(df_game['game.scheduled']).dt.date

In [12]:
# Create a "sub" dataframe containing only the stats we want to use in model from df_game
df_game_sub = df_game[['game.id',
                       'game.scheduled', 
                       'game.date',
                       'game.home.runs',
                       'game.away.runs',
                       'game.away.lineup', 
                       'game.home.lineup', 
                       'game.home_team', 
                       'game.home.name',
                       'game.away_team', 
                       'game.away.name', 
                       'game.attendance',
                       'game.away.starting_pitcher.era', 
                       'game.home.starting_pitcher.era', 
                       'game.away.starting_pitcher.win', 
                       'game.away.starting_pitcher.loss', 
                       'game.home.starting_pitcher.loss', 
                       'game.home.starting_pitcher.win']]
# Reset Index
df_game_sub = df_game_sub.reset_index(drop = True)

# Addd
df_game_sub = pd.merge(df_game_sub, df_league, left_on = 'game.away.name', right_on = 'team_name').rename(columns = {'team_league': 'away.team.league'}).drop('team_name', axis = 1)
df_game_sub = pd.merge(df_game_sub, df_league, left_on = 'game.home.name', right_on = 'team_name').rename(columns = {'team_league': 'home.team.league'}).drop('team_name', axis = 1)

In [13]:
# Create Binary Winning Team Column (Our model target)
# 1 = home team won
# 0 = away team won
df_game_sub['game.winning.team'] = np.where(df_game_sub['game.home.runs'] - df_game_sub['game.away.runs'] > 0, '1', '0')

# Checking to see target distribution
df_game_sub['game.winning.team'].value_counts()

1    6728
0    5856
Name: game.winning.team, dtype: int64

In [14]:
# Create Column for Pitcher's Win Percentage leading up to game
df_game_sub['game.away.pitching.winpercentage'] = df_game_sub['game.away.starting_pitcher.win'] / (df_game_sub['game.away.starting_pitcher.win'] + df_game_sub['game.away.starting_pitcher.loss'])
df_game_sub['game.home.pitching.winpercentage'] = df_game_sub['game.home.starting_pitcher.win'] / (df_game_sub['game.home.starting_pitcher.win'] + df_game_sub['game.home.starting_pitcher.loss'])

In [15]:
df_game_sub.head()

Unnamed: 0,game.id,game.scheduled,game.date,game.home.runs,game.away.runs,game.away.lineup,game.home.lineup,game.home_team,game.home.name,game.away_team,game.away.name,game.attendance,game.away.starting_pitcher.era,game.home.starting_pitcher.era,game.away.starting_pitcher.win,game.away.starting_pitcher.loss,game.home.starting_pitcher.loss,game.home.starting_pitcher.win,game.winning.team,game.away.pitching.winpercentage,game.home.pitching.winpercentage
0,083801cc-cf2c-43f1-b78a-27a5d270403d,2015-05-26T00:10:00+00:00,2015-05-26,6,3,[{'id': 'e43136cd-762f-4b3f-8595-4f07e83be3b2'...,[{'id': '43a9d631-5673-4059-9b25-d59290bc32c3'...,ef64da7f-cfaf-4300-87b0-9313386b977c,Dodgers,12079497-e414-450a-8bf2-29f91de646bf,Braves,44680.0,6.136,3.614,0.0,0.0,2.0,2.0,1,,0.5
1,35233f95-2d40-4859-9b59-6736a26b0b69,2015-05-25T17:10:00+00:00,2015-05-25,6,3,[{'id': 'f4b89e5f-baae-4dd4-87b2-1cd75889b48b'...,[{'id': '39212be3-ce0d-4f7f-ab1a-c687ca189edd'...,f246a5e5-afdb-479c-9aaa-c68beeda7af6,Mets,2142e1ba-3b40-445c-b8bb-f1f8b1054220,Phillies,30946.0,7.105,4.85,2.0,1.0,3.0,6.0,1,0.666667,0.666667
2,0991650b-dfeb-42f1-9a83-ffc52c51d690,2015-05-25T23:07:00+00:00,2015-05-25,6,0,[{'id': 'aa614a82-e4c7-4129-ba6d-5205b1c9b63d'...,[{'id': 'f11efc76-62f5-4396-b145-e03839fd4d1c'...,1d678440-b4b1-4954-9b39-70afb3ebbcfa,Blue Jays,47f490cd-2f58-4ef7-9dfd-2ad6ba6c1ae8,White Sox,15168.0,5.604,6.061,0.0,3.0,1.0,3.0,1,0.0,0.75
3,6be1fab3-cb64-4bb4-bd31-7f69ef5a4773,2015-05-25T20:10:00+00:00,2015-05-25,8,10,[{'id': '79fc6097-04ab-442b-9d5f-8495b19fb3c5'...,[{'id': '792a2e07-49cb-4ad0-a1bc-a901ca6a7cf8'...,80715d0d-0d2a-450f-a970-1b9a3b18c7e7,Indians,d99f919b-1534-4516-8e8a-9cd106c6d8cd,Rangers,13614.0,4.5,2.314,1.0,0.0,0.0,1.0,0,1.0,1.0
4,ad8ee97e-aedf-483b-bd31-b94e5983daee,2015-05-25T23:05:00+00:00,2015-05-25,4,2,[{'id': '65de4cd1-ca86-468c-9346-1e68d6279a8e'...,[{'id': '0e762dec-639f-497f-a65b-af0eaa8be39f'...,481dfe7e-5dab-46ab-a49f-9dcc2b6e2cfd,Pirates,03556285-bdbb-4576-a06d-42f71f46ddc5,Marlins,20046.0,3.214,,2.0,1.0,,,1,0.666667,


In [16]:
### Aggregate player performance statistics ###

# Grab all filenames for each type of average player performance statistics
path = '/Users/JacKuo14/Documents/Metis/course_work/project03_baseball/sportstradar_data2/player_stats/'
away1 = glob.glob(path + '*away1*.pkl') # last game
away7 = glob.glob(path + '*away7*.pkl') # last 7 games
away30 = glob.glob(path + '*away30*.pkl') # last 30 games

home1 = glob.glob(path + '*home1*.pkl')
home7 = glob.glob(path + '*home7*.pkl')
home30 = glob.glob(path + '*home30*.pkl')

# -----------------------------------
## Unpickle each file, put in dataframe, and groupby game_id to get game-level averages

# AWAY - Avg last game
df_away1 = pd.DataFrame()
for file in away1:
    df_away1 = df_away1.append(read_pickle(file))
df_away1_group = df_away1.groupby('game_id').mean().reset_index() # groupby game_id to find mean of values for each game
df_away1_group.columns = [str(col) + '_away' for col in df_away1_group.columns]
df_away1_group = df_away1_group.rename(columns = {'game_id_away': 'game.id'})

# AWAY - Avg last 7 games
df_away7 = pd.DataFrame()
for file in away7:
    df_away7 = df_away7.append(read_pickle(file))
df_away7_group = df_away7.groupby('game_id').mean().reset_index()
df_away7_group.columns = [str(col) + '_away' for col in df_away7_group.columns]
df_away7_group = df_away7_group.rename(columns = {'game_id_away': 'game.id'})

                                       
# AWAY - Avg last 30 game
df_away30 = pd.DataFrame()
for file in away30:
    df_away30 = df_away30.append(read_pickle(file))
df_away30_group = df_away30.groupby('game_id').mean().reset_index()
df_away30_group.columns = [str(col) + '_away' for col in df_away30_group.columns]
df_away30_group = df_away30_group.rename(columns = {'game_id_away': 'game.id'})                                   

# HOME - Avg last game
df_home1 = pd.DataFrame()
for file in away1:
    df_home1 = df_home1.append(read_pickle(file))
df_home1_group = df_home1.groupby('game_id').mean().reset_index() # groupby game_id to find mean of values for each game
df_home1_group.columns = [str(col) + '_home' for col in df_home1_group.columns]
df_home1_group = df_home1_group.rename(columns = {'game_id_home': 'game.id'})

# HOME - Avg last 7 games
df_home7 = pd.DataFrame()
for file in away7:
    df_home7 = df_home7.append(read_pickle(file))
df_home7_group = df_home7.groupby('game_id').mean().reset_index()
df_home7_group.columns = [str(col) + '_home' for col in df_home7_group.columns]
df_home7_group = df_home7_group.rename(columns = {'game_id_home': 'game.id'})
                                       
# HOME - Avg last 30 game
df_home30 = pd.DataFrame()
for file in away30:
    df_home30 = df_home30.append(read_pickle(file))
df_home30_group = df_home30.groupby('game_id').mean().reset_index()
df_home30_group.columns = [str(col) + '_home' for col in df_home30_group.columns]
df_home30_group = df_home30_group.rename(columns = {'game_id_home': 'game.id'})

In [17]:
df_home30_group.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10239 entries, 0 to 10238
Data columns (total 16 columns):
game.id                10239 non-null object
assists_30_home        4791 non-null float64
av_risp_30_home        8453 non-null float64
avg_30_home            0 non-null float64
era_30_home            9235 non-null float64
error_30_home          10239 non-null float64
fpct_30_home           10239 non-null float64
hit_risp_30_home       8453 non-null float64
k9_30_home             9628 non-null float64
obp_30_home            8796 non-null float64
ops_30_home            10239 non-null float64
pitch_count_30_home    9628 non-null float64
rbi_30_home            10239 non-null float64
runs_30_home           10239 non-null float64
slg_30_home            10239 non-null float64
whip_30_home           9628 non-null float64
dtypes: float64(15), object(1)
memory usage: 1.2+ MB


In [18]:
# Each row is a game, and contains the 30-day average statistics for each player playing the game
df_home30_group.head()

Unnamed: 0,game.id,assists_30_home,av_risp_30_home,avg_30_home,era_30_home,error_30_home,fpct_30_home,hit_risp_30_home,k9_30_home,obp_30_home,ops_30_home,pitch_count_30_home,rbi_30_home,runs_30_home,slg_30_home,whip_30_home
0,0008b6f5-e325-4217-b8a2-e2e3a7ef0994,0.720107,0.454762,,6.220556,0.048711,0.663694,0.119048,9.04668,0.20594,0.476669,30.073333,0.245238,0.266667,0.270729,1.544976
1,0017e242-0d2f-44fb-a244-d2e74fa5f1f8,0.710022,0.520918,,6.088067,0.027281,0.659109,0.140476,9.29934,0.224532,0.500957,31.586667,0.305102,0.298129,0.276897,1.474491
2,001abb29-00d1-4d14-8410-fd9965886060,0.982114,0.558333,,6.266278,0.053746,0.785953,0.097222,6.4161,0.236142,0.511128,69.733333,0.191667,0.205556,0.274986,1.835227
3,001f0b21-1418-4adc-81e3-73aed23e472a,0.592481,0.52381,,5.17,0.057639,0.660236,0.159524,10.4229,0.237707,0.592864,17.0,0.42381,0.45,0.355157,1.209405
4,0023c134-e992-4110-86a5-d3d1f89326df,0.676822,0.616776,,4.976195,0.059254,0.778717,0.187146,10.604817,0.267577,0.643056,48.128736,0.38366,0.382789,0.375479,1.401345


In [19]:
# Merge all dataframes
group_df_list = [df_away1_group, df_away7_group, df_away30_group, df_home1_group, df_home7_group, df_home30_group]
df_game_sub_merged = df_game_sub
for df in group_df_list:
    df_game_sub_merged = pd.merge(df_game_sub_merged, df, how = 'left', on = 'game.id')

In [21]:
# Master Dataframe
df_game_sub_merged.head()

Unnamed: 0,game.id,game.scheduled,game.date,game.home.runs,game.away.runs,game.away.lineup,game.home.lineup,game.home_team,game.home.name,game.away_team,game.away.name,game.attendance,game.away.starting_pitcher.era,game.home.starting_pitcher.era,game.away.starting_pitcher.win,game.away.starting_pitcher.loss,game.home.starting_pitcher.loss,game.home.starting_pitcher.win,game.winning.team,game.away.pitching.winpercentage,game.home.pitching.winpercentage,assists_1_away,av_risp_1_away,avg_1_away,era_1_away,error_1_away,fpct_1_away,hit_risp_1_away,k9_1_away,obp_1_away,ops_1_away,pitch_count_1_away,rbi_1_away,runs_1_away,slg_1_away,whip_1_away,assists_7_away,av_risp_7_away,avg_7_away,era_7_away,error_7_away,fpct_7_away,hit_risp_7_away,k9_7_away,obp_7_away,ops_7_away,pitch_count_7_away,rbi_7_away,runs_7_away,slg_7_away,whip_7_away,assists_30_away,av_risp_30_away,avg_30_away,era_30_away,error_30_away,fpct_30_away,hit_risp_30_away,k9_30_away,obp_30_away,ops_30_away,pitch_count_30_away,rbi_30_away,runs_30_away,slg_30_away,whip_30_away,assists_1_home,av_risp_1_home,avg_1_home,era_1_home,error_1_home,fpct_1_home,hit_risp_1_home,k9_1_home,obp_1_home,ops_1_home,pitch_count_1_home,rbi_1_home,runs_1_home,slg_1_home,whip_1_home,assists_7_home,av_risp_7_home,avg_7_home,era_7_home,error_7_home,fpct_7_home,hit_risp_7_home,k9_7_home,obp_7_home,ops_7_home,pitch_count_7_home,rbi_7_home,runs_7_home,slg_7_home,whip_7_home,assists_30_home,av_risp_30_home,avg_30_home,era_30_home,error_30_home,fpct_30_home,hit_risp_30_home,k9_30_home,obp_30_home,ops_30_home,pitch_count_30_home,rbi_30_home,runs_30_home,slg_30_home,whip_30_home
0,083801cc-cf2c-43f1-b78a-27a5d270403d,2015-05-26T00:10:00+00:00,2015-05-26,6,3,[{'id': 'e43136cd-762f-4b3f-8595-4f07e83be3b2'...,[{'id': '43a9d631-5673-4059-9b25-d59290bc32c3'...,ef64da7f-cfaf-4300-87b0-9313386b977c,Dodgers,12079497-e414-450a-8bf2-29f91de646bf,Braves,44680.0,6.136,3.614,0.0,0.0,2.0,2.0,1,,0.5,,,0.210643,5.5125,0.076923,0.839154,,18.3375,,0.522571,31.0,0.071429,0.142857,0.2285,2.975,,,,2.035714,0.010989,0.697168,,6.75,,0.455286,12.952381,0.252747,0.252747,0.213176,1.314286,,,,,0.038889,0.93089,,,,0.670067,,0.383333,0.466667,0.350911,,,,0.210643,5.5125,0.076923,0.839154,,18.3375,,0.522571,31.0,0.071429,0.142857,0.2285,2.975,,,,2.035714,0.010989,0.697168,,6.75,,0.455286,12.952381,0.252747,0.252747,0.213176,1.314286,,,,,0.038889,0.93089,,,,0.670067,,0.383333,0.466667,0.350911,
1,35233f95-2d40-4859-9b59-6736a26b0b69,2015-05-25T17:10:00+00:00,2015-05-25,6,3,[{'id': 'f4b89e5f-baae-4dd4-87b2-1cd75889b48b'...,[{'id': '39212be3-ce0d-4f7f-ab1a-c687ca189edd'...,f246a5e5-afdb-479c-9aaa-c68beeda7af6,Mets,2142e1ba-3b40-445c-b8bb-f1f8b1054220,Phillies,30946.0,7.105,4.85,2.0,1.0,3.0,6.0,1,0.666667,0.666667,,,0.2178,9.36,0.0,0.615385,,1.8,,0.5556,25.8,0.2,0.2,0.3378,2.2,,,,5.571429,0.054945,0.767407,,9.15,,0.569253,15.904762,0.274725,0.340659,0.312286,1.890476,,,,,0.05904,0.909896,,,,0.675481,,0.32963,0.348148,0.358763,,,,0.2178,9.36,0.0,0.615385,,1.8,,0.5556,25.8,0.2,0.2,0.3378,2.2,,,,5.571429,0.054945,0.767407,,9.15,,0.569253,15.904762,0.274725,0.340659,0.312286,1.890476,,,,,0.05904,0.909896,,,,0.675481,,0.32963,0.348148,0.358763,
2,0991650b-dfeb-42f1-9a83-ffc52c51d690,2015-05-25T23:07:00+00:00,2015-05-25,6,0,[{'id': 'aa614a82-e4c7-4129-ba6d-5205b1c9b63d'...,[{'id': 'f11efc76-62f5-4396-b145-e03839fd4d1c'...,1d678440-b4b1-4954-9b39-70afb3ebbcfa,Blue Jays,47f490cd-2f58-4ef7-9dfd-2ad6ba6c1ae8,White Sox,15168.0,5.604,6.061,0.0,3.0,1.0,3.0,1,0.0,0.75,,,0.125,4.5,0.111111,0.666667,,0.0,,0.425,15.5,0.1,0.1,0.2,2.5,,,,3.765286,0.04329,0.778621,,4.186286,,0.531914,36.714286,0.257143,0.285714,0.275957,1.839286,,,,,0.054919,0.865585,,,,0.692386,,0.390476,0.428571,0.371748,,,,0.125,4.5,0.111111,0.666667,,0.0,,0.425,15.5,0.1,0.1,0.2,2.5,,,,3.765286,0.04329,0.778621,,4.186286,,0.531914,36.714286,0.257143,0.285714,0.275957,1.839286,,,,,0.054919,0.865585,,,,0.692386,,0.390476,0.428571,0.371748,
3,6be1fab3-cb64-4bb4-bd31-7f69ef5a4773,2015-05-25T20:10:00+00:00,2015-05-25,8,10,[{'id': '79fc6097-04ab-442b-9d5f-8495b19fb3c5'...,[{'id': '792a2e07-49cb-4ad0-a1bc-a901ca6a7cf8'...,80715d0d-0d2a-450f-a970-1b9a3b18c7e7,Indians,d99f919b-1534-4516-8e8a-9cd106c6d8cd,Rangers,13614.0,4.5,2.314,1.0,0.0,0.0,1.0,0,1.0,1.0,,,0.30275,8.4376,0.142857,0.654786,,9.45,,0.956917,28.0,1.083333,1.25,0.448583,1.625,,,,6.937524,0.069841,0.705784,,8.276786,,0.723967,18.428571,0.43956,0.516484,0.421407,1.549107,,,,,0.178161,0.909983,,,,0.761583,,0.466667,0.477778,0.429078,,,,0.30275,8.4376,0.142857,0.654786,,9.45,,0.956917,28.0,1.083333,1.25,0.448583,1.625,,,,6.937524,0.069841,0.705784,,8.276786,,0.723967,18.428571,0.43956,0.516484,0.421407,1.549107,,,,,0.178161,0.909983,,,,0.761583,,0.466667,0.477778,0.429078,
4,ad8ee97e-aedf-483b-bd31-b94e5983daee,2015-05-25T23:05:00+00:00,2015-05-25,4,2,[{'id': '65de4cd1-ca86-468c-9346-1e68d6279a8e'...,[{'id': '0e762dec-639f-497f-a65b-af0eaa8be39f'...,481dfe7e-5dab-46ab-a49f-9dcc2b6e2cfd,Pirates,03556285-bdbb-4576-a06d-42f71f46ddc5,Marlins,20046.0,3.214,,2.0,1.0,,,1,0.666667,,,,0.3783,7.2,0.0,0.888889,,5.4,,0.9217,92.0,0.5,0.4,0.4867,1.6,,,,2.567286,0.028571,0.841662,,6.267857,,0.584486,92.0,0.257143,0.3,0.288086,1.2037,,,,,0.033333,0.877403,,,,0.722676,,0.428571,0.471429,0.393638,,,,0.3783,7.2,0.0,0.888889,,5.4,,0.9217,92.0,0.5,0.4,0.4867,1.6,,,,2.567286,0.028571,0.841662,,6.267857,,0.584486,92.0,0.257143,0.3,0.288086,1.2037,,,,,0.033333,0.877403,,,,0.722676,,0.428571,0.471429,0.393638,


In [23]:
# Save
df_game_sub_merged.to_csv("all_mlb_data_for_model.csv", index=False)