In [5]:
import pandas as pd
import numpy as np
import pickle
import os
import math
import warnings
import re
from pandas.core.common import SettingWithCopyWarning
warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

baseball_path = r"C:\Users\james\Documents\MLB\Data"

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression

In [7]:
df = pd.read_csv(os.path.join(baseball_path, "Inputs", "Sample.csv"))
df.head()

Unnamed: 0.1,Unnamed: 0,index,atBatIndex,inning,halfInning,outs,type,id,event,eventType,...,slg_p_long,obp_p_long,hard_hit_p_long,to_left_p_long,to_middle_p_long,to_right_p_long,totalDistance_p_long,maxSpeed_p_long,maxSpin_p_long,launchSpeed_p_long
0,0,11736,51,6,bottom,1,atBat,493114,Groundout,field_out,...,0.619048,0.452381,0.047619,0.214286,0.380952,0.333333,418.0,87.7,2505.0,105.2
1,1,11737,52,6,bottom,2,atBat,605412,Groundout,field_out,...,0.604651,0.44186,0.046512,0.209302,0.395349,0.325581,418.0,87.7,2505.0,105.2
2,2,11738,53,6,bottom,3,atBat,434636,Groundout,field_out,...,0.590909,0.431818,0.045455,0.204545,0.386364,0.340909,418.0,87.7,2505.0,105.2
3,3,11763,78,9,bottom,1,atBat,493114,Groundout,field_out,...,0.541667,0.395833,0.041667,0.1875,0.375,0.375,418.0,87.7,2505.0,105.2
4,4,11764,79,9,bottom,1,atBat,605412,Double,double,...,0.530612,0.387755,0.040816,0.183673,0.367347,0.387755,418.0,87.7,2505.0,105.2


In [11]:
df2 = df.copy()
# Remove duplicate events within a PA
df2 = df2.drop_duplicates(subset=['gamePk', 'atBatIndex'], keep='last')
# Get dummies of events in the model
events = pd.get_dummies(df2['eventsModel'])
# Create counter of batters faced
events['faced'] = 1
# Create list of events
events_list = events.columns.tolist()
# Add game, pitcher, and RBI
events_list.append('gamePk')
events_list.append('pitcher')
events_list.append('rbi')
# Add dataframe of event dummies to main dataframe
df2 = pd.concat([df2, events], axis=1)

# Sum up events
sums = df2[events_list].groupby(['gamePk', 'pitcher']).cumsum()
sums = sums.add_suffix("_sum")
# Add list of columns to list
sums_list = sums.columns.tolist()
# sums_list.remove("Cut_sum")
# Add sum of events to main dataframe
df2 = pd.concat([df2, sums], axis=1)
# Create dataframe with the number of batters faced per game (max of cumulative running counter)
df_max = df2.groupby(['gamePk', 'pitcher'])['faced_sum'].max().reset_index()
# Add this to main dataframe
df2 = df2.merge(df_max, on=['gamePk', 'pitcher'], how='inner', suffixes=("", "_max"))

# Create dummy equal to one on a pitcher's last PA
df2['pulled'] = np.where((df2['faced_sum'] == df2['faced_sum_max']), 1, 0)
# Calculate scores bot batter and pitcher teams
df2['pitcher_score'] = np.where(df2['halfInning'] == "top", df2['homeScore'], df2['awayScore'])
df2['batter_score'] = np.where(df2['halfInning'] == "top", df2['awayScore'], df2['homeScore'])

# Create dataframe of first PAs in games
starter = df2.groupby(['gamePk', 'halfInning'])['pitcher'].first().reset_index()
# These pitchers are starters
starter.rename(columns={'pitcher':'sp'}, inplace=True)

# Merge these dataframes of starters onto main dataframe
df2 = df2.merge(starter, on=['gamePk', 'halfInning'], how='inner')
# Pitcher is a starter if they were the first one to pitch in that game
df2['starter'] = (df2['pitcher'] == df2['sp']).astype('int')



In [12]:
df2

Unnamed: 0.1,Unnamed: 0,index,atBatIndex,inning,halfInning,outs,type,id,event,eventType,...,so_sum,so_sum.1,faced_sum,rbi_sum,faced_sum_max,pulled,pitcher_score,batter_score,sp,starter
0,0,11736,51,6,bottom,1,atBat,493114,Groundout,field_out,...,0,0,1,0,5,0,6,0,518567,1
1,1,11737,52,6,bottom,2,atBat,605412,Groundout,field_out,...,0,0,2,0,5,0,6,0,518567,1
2,2,11738,53,6,bottom,3,atBat,434636,Groundout,field_out,...,0,0,3,0,5,0,6,0,518567,1
3,3,11763,78,9,bottom,1,atBat,493114,Groundout,field_out,...,0,0,4,0,5,0,9,0,518567,1
4,4,11764,79,9,bottom,1,atBat,605412,Double,double,...,0,0,5,0,5,1,9,0,518567,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1128778,1128771,1364440,23,3,bottom,3,atBat,543068,Groundout,field_out,...,3,3,12,0,16,0,0,0,680686,1
1128779,1128776,1364445,28,4,bottom,1,atBat,641857,Strikeout,strikeout,...,4,4,13,0,16,0,0,0,680686,1
1128780,1128777,1364446,29,4,bottom,2,atBat,553869,Pop Out,field_out,...,4,4,14,0,16,0,0,0,680686,1
1128781,1128778,1364447,30,4,bottom,2,atBat,660707,Single,single,...,4,4,15,0,16,0,0,0,680686,1
