# 4. Dataset
Source: <br>
1. "3. MLB API" <br>

Description: This creates usable datasets from the MLB API data <br>
Main outputs include batter and pitcher model inputs, neural network PA inputs, and a complete dataset

### Imports

In [1]:
import pandas as pd
import numpy as np
import os
import math
import warnings
import re
import datetime
import ast

from sklearn.preprocessing import StandardScaler

import sys
sys.path.append(r"C:\Users\james\Documents\MLB\Code")
from Utilities import *

# import import_ipynb
# from Utilities import *

# SettingWithCopyWarning has been exposed from different pandas modules over time
# (pandas.errors in newer releases, pandas.core.common in older ones). Try both and
# simply skip the filter if neither location provides the class.
try:
    from pandas.errors import SettingWithCopyWarning
except ImportError:
    try:
        from pandas.core.common import SettingWithCopyWarning
    except ImportError:
        SettingWithCopyWarning = None

if SettingWithCopyWarning is not None:
    warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

baseball_path = r"C:\Users\james\Documents\MLB\Data2"

In [2]:
# Today's date, rendered once in every format used in this notebook
todaysdate_dt = datetime.date.today()                    # datetime.date object
todaysdate_dash = todaysdate_dt.isoformat()              # YYYY-MM-DD (string)
todaysdate_slash = todaysdate_dt.strftime("%m/%d/%Y")    # MM/DD/YYYY
todaysdate = todaysdate_dt.strftime("%Y%m%d")            # YYYYMMDD

In [3]:
# Load the Chadwick register (player codes), keeping only the ID and name columns.
# TODO: move this to Utilities and either make it compatible with name_clean or
# write a separate helper (mostly done in chadwick now).
keep_list = ['key_mlbam', 'key_fangraphs', 'name_first', 'name_last']
chadwick = read_chadwick(keep_list)

# Lower-cased name prefixes: first 2 characters of the first name
# and first 5 characters of the last name
chadwick['First2'] = chadwick['name_first'].str[:2].str.lower()
chadwick['Last5'] = chadwick['name_last'].str[:5].str.lower()

### Wind

In [4]:
# Wind vectors
# Convention: the y component is positive blowing out to centerfield and negative
# blowing in from centerfield; the x component is positive left-to-right and
# negative right-to-left.
# Wind is assumed to blow in one of 8 directions, so diagonal winds are split with
# a right isosceles triangle (each leg = speed / 2 * sqrt(2)).
def y_vect(df):
    """Return the centerfield-axis (y) wind component for one row.

    `df` is a single row (anything indexable by 'windSpeed' and 'windDirection').
    Unrecognised directions, and pure crosswinds, yield 0.
    """
    speed = df['windSpeed']
    diagonal = df['windSpeed'] / 2 * math.sqrt(2)
    direction = df['windDirection']

    # Straight out / straight in
    if direction == "Out To CF":
        return speed
    if direction == "In From CF":
        return speed * - 1

    # Diagonals: blowing out is positive, blowing in is negative
    if direction == "Out To RF" or direction == "Out To LF":
        return diagonal
    if direction == "In From LF" or direction == "In From RF":
        return diagonal * -1

    # "L To R", "R To L" and anything unknown have no y component
    return 0

def x_vect(df):
    """Return the left-to-right (x) wind component for one row.

    `df` is a single row (anything indexable by 'windSpeed' and 'windDirection').
    Positive values blow from left to right. Winds straight to or from
    centerfield, and unrecognised directions, yield 0.
    """
    speed = df['windSpeed']
    diagonal = df['windSpeed'] / 2 * math.sqrt(2)
    direction = df['windDirection']

    # Pure crosswinds
    if direction == "L To R":
        return speed
    if direction == "R To L":
        return speed * - 1

    # Diagonals: anything drifting toward right field is positive
    if direction == "In From LF" or direction == "Out To RF":
        return diagonal
    if direction == "In From RF" or direction == "Out To LF":
        return diagonal * -1

    # "In From CF", "Out To CF" and anything unknown have no x component
    return 0

### Weather

In [5]:
def clean_weather(df):
    """Split the raw 'weather' and 'wind' text columns into usable features.

    Adds/overwrites these columns on `df` (the frame is modified in place and
    also returned):
      - temperature: integer parsed from the part of 'weather' before ", "
      - weather: the part of 'weather' after ", "
      - windSpeed: numeric speed parsed from 'wind' (0 when missing/unparseable)
      - windDirection: direction text from 'wind' with periods removed
      - x_vect, y_vect: wind components from x_vect() / y_vect()

    Raises whatever pandas raises if a temperature cannot be cast to int.
    """
    # Separate weather into temperature and weather type.
    # n=1 + reindex guarantees exactly two parts even if the text has extra commas.
    conditions = df['weather'].str.split(", ", n=1, expand=True).reindex(columns=[0, 1])
    df['temperature'] = conditions[0].str.replace(" degrees", "", regex=False).astype('int')
    df['weather'] = conditions[1]

    # Separate wind into speed and direction
    wind = df['wind'].str.split(", ", n=1, expand=True).reindex(columns=[0, 1])

    # Plain assignment instead of chained `df[col].fillna(..., inplace=True)`,
    # which is not guaranteed to write back to `df` under pandas copy-on-write.
    speed_text = wind[0].fillna("0 mph").str.replace(" mph", "", regex=False)
    df['windSpeed'] = pd.to_numeric(speed_text, errors='coerce').fillna(0)

    # NOTE(review): the fill value 'L to R' (lower-case "to") does not match the
    # "L To R" label tested in x_vect, so filled rows get a zero wind vector.
    # Kept as-is to preserve existing outputs -- confirm whether that is intended.
    # regex=False so "." is a literal period on every pandas version.
    df['windDirection'] = wind[1].fillna('L to R').str.replace(".", "", regex=False)

    # Calculate vectors
    df['x_vect'] = df.apply(x_vect, axis=1)
    df['y_vect'] = df.apply(y_vect, axis=1)

    return df

### Events

In [6]:
def create_events(df):
    """Add an 'eventsModel' column that buckets raw 'event' text into model outcomes.

    Outcomes: so, go, lo, fo, po, hbp, bb, b1, b2, b3, hr. Any event not listed
    below (including missing values) is labelled "Cut". The frame is modified in
    place and returned.
    """
    outcome_by_event = {
        # Strikeouts
        'Strikeout': "so",
        'Strikeout Double Play': "so",
        # Ground outs (and plays treated like them)
        'Groundout': "go",
        'Fielders Choice': "go",
        'Double Play': "go",
        'Grounded Into DP': "go",
        'Triple Play': "go",
        'Field Error': "go",
        'Forceout': "go",
        # Line outs
        'Lineout': "lo",
        'Bunt Lineout': "lo",
        # Fly outs
        'Flyout': "fo",
        'Sac Fly': "fo",
        'Sac Fly Double Play': "fo",
        # Pop outs
        'Pop Out': "po",
        'Bunt Pop Out': "po",
        # Free passes
        'Hit By Pitch': "hbp",
        'Walk': "bb",
        'Intent Walk': "bb",
        # Hits
        'Single': "b1",
        'Double': "b2",
        'Triple': "b3",
        'Home Run': "hr",
    }

    # Unmapped events come back as NaN from the lookup and become "Cut"
    df['eventsModel'] = df['event'].map(outcome_by_event).fillna("Cut")

    return df

### Base Running
(Perhaps move elsewhere)
Calculate SBA2B%, SBA3B%, SB2B%, SB3B%
Calculate SB and CS totals
Derive new stats: 
    SBrate = SB / (SB + CS)
    SBArate = (SB + CS) / (BB + HBP + 1B)

Use actual data for these derived stats (from API) to project the four base running stats
    Observations should be player-seasons, but only use full seasons
Calculate derived stats in fangraphs projections
Use model to predict four base running stats in the fangraphs projections

In [7]:
# This turns several variables, including events, venues, hands, and bases into dummies
def create_dummies(df):
    """Append dummy columns and base/inning/PA indicators to `df`.

    Adds one 0/1 column per value of eventsModel (unprefixed), venue_id
    ('venue_'), pitchHand ('p_'), batSide ('b_') and year ('year_'), plus:
      - preOnFirst/Second/Third: previous row's postOn* within the same
        (gamePk, inning, halfInning); onFirst/Second/Third: 1 where that is non-null
      - top: 1 when halfInning == "top"
      - pa: 1 unless eventsModel is "Cut"; ab: pa minus the 'hbp' dummy

    Returns (df, dummy_list) where dummy_list holds the venue and year dummy
    column names. Raises KeyError if no row has eventsModel == 'hbp'.
    """
    # Pin the dummy dtype to uint8 (the pandas 1.x default). pandas >= 2.0 would
    # otherwise return booleans, which later arithmetic and rolling sums on these
    # columns are not written for.
    event_dummies = pd.get_dummies(df['eventsModel'], dtype=np.uint8)
    venue_dummies = pd.get_dummies(df['venue_id'], prefix='venue', dtype=np.uint8)
    pitcher_dummies = pd.get_dummies(df['pitchHand'], prefix='p', dtype=np.uint8)
    batter_dummies = pd.get_dummies(df['batSide'], prefix='b', dtype=np.uint8)
    year_dummies = pd.get_dummies(df['year'], prefix='year', dtype=np.uint8)

    dummy_list = venue_dummies.columns.tolist() + year_dummies.columns.tolist()

    df = pd.concat([df, event_dummies, venue_dummies, pitcher_dummies, batter_dummies, year_dummies], axis=1)

    # Base state before the PA = base state after the previous PA of the same half-inning
    half_inning = ['gamePk', 'inning', 'halfInning']
    for base in ('First', 'Second', 'Third'):
        df['preOn' + base] = df.groupby(half_inning)['postOn' + base].shift(1)
    for base in ('First', 'Second', 'Third'):
        df['on' + base] = df['preOn' + base].notnull().astype('int')

    df['top'] = np.where(df['halfInning'] == "top", 1, 0)

    df['pa'] = np.where(df['eventsModel'] != "Cut", 1, 0)
    df['ab'] = df['pa'] - df['hbp']

    return df, dummy_list

### Rolling Averages

In [8]:
# This will return a dataframe that can eventually be used as the model input. Has pitcher vs hitter stats, specific to hand
def rolling_pas(df, pa_num):
    """Attach rolling batter (_b) and pitcher (_p) stats over the previous `pa_num` rows.

    Batter stats are grouped by (batter, pitchHand) and pitcher stats by
    (pitcher, batSide). Each window excludes the current row (values are shifted
    by one first). Count stats are summed and then, except pa/ab, divided by the
    rolling PA count; the max_list stats take the rolling maximum. `df` is
    modified in place.

    Returns (df, batter_stats, pitcher_stats) where the two lists hold the
    names of the summed and max columns for each side.
    """
    count_stats = ['so', 'b1', 'b2', 'b3', 'hr', 'bb', 'hbp', 'lo', 'po', 'go', 'fo', 'hard_hit', 'to_left', 'to_middle', 'to_right', 'pa', 'ab']
    peak_stats = ['totalDistance', 'maxSpeed', 'maxSpin', 'launchSpeed']

    # Remember the incoming row order so it can be restored afterwards
    df['pa_num'] = df.index

    batter_stats = [name + "_b" for name in count_stats]
    pitcher_stats = [name + "_p" for name in count_stats]
    batter_peaks = [name + "_b" for name in peak_stats]
    pitcher_peaks = [name + "_p" for name in peak_stats]

    def prior_sum(values):
        # Sum over the window ending at the previous row
        return values.shift().rolling(pa_num, min_periods=1).sum()

    def prior_max(values):
        # Maximum over the window ending at the previous row
        return values.shift().rolling(pa_num, min_periods=1).max()

    batter_keys = ['batter', 'pitchHand']
    pitcher_keys = ['pitcher', 'batSide']

    df[batter_stats] = df.groupby(batter_keys)[count_stats].transform(prior_sum)
    df[batter_peaks] = df.groupby(batter_keys)[peak_stats].transform(prior_max)

    df[pitcher_stats] = df.groupby(pitcher_keys)[count_stats].transform(prior_sum)
    df[pitcher_peaks] = df.groupby(pitcher_keys)[peak_stats].transform(prior_max)

    df.sort_values(['pa_num'], axis=0, ascending=True, inplace=True)

    sides = ("_b", "_p")

    # wOBA numerator - using 2022 values throughout
    for side in sides:
        df['woba' + side] = (
            (0.690 * df['bb' + side]) + (0.721 * df['hbp' + side]) + (0.885 * df['b1' + side])
            + (1.262 * df['b2' + side]) + (1.601 * df['b3' + side]) + (2.070 * df['hr' + side])
        )

    # Slugging: total bases per AB
    for side in sides:
        total_bases = (1 * df['b1' + side]) + (2 * df['b2' + side]) + (3 * df['b3' + side]) + (4 * df['hr' + side])
        df['slg' + side] = total_bases / df['ab' + side]

    # OBP numerator: times on base
    for side in sides:
        on_base_cols = [name + side for name in ('b1', 'b2', 'b3', 'hr', 'bb', 'hbp')]
        df['obp' + side] = df[on_base_cols].sum(axis=1)

    # ISO numerator: extra bases
    for side in sides:
        df['iso' + side] = df['b2' + side] * 1 + df['b3' + side] * 2 + df['hr' + side] * 3

    # Convert counts (and the numerators above) into per-PA rates
    rate_stats = ['so', 'b1', 'b2', 'b3', 'hr', 'bb', 'hbp', 'lo', 'po', 'go', 'fo', 'woba', 'obp', 'iso', 'hard_hit', 'to_left', 'to_middle', 'to_right']
    for name in rate_stats:
        df[name + "_b"] = df[name + "_b"] / df['pa_b']
        df[name + "_p"] = df[name + "_p"] / df['pa_p']

    df.sort_values('pa_num', inplace=True)

    return df, batter_stats + batter_peaks, pitcher_stats + pitcher_peaks

### Import Data

In [9]:
# This reads in raw API data
def import_data(start_year, end_year):
    """Load the per-year play files and return one regular-season dataframe.

    Reads "Play<year>.csv" from the "3. MLB API" folder under `baseball_path`
    for every year from `start_year` to `end_year` inclusive, keeps one row per
    (gamePk, atBatIndex), tags each row with its 'year', parses 'date' as
    datetime and keeps only rows whose game_type is "R".

    Raises FileNotFoundError if a year's file is missing and ValueError (from
    pd.concat) if the year range is empty.
    """
    year_frames = []
    for year in range(start_year, end_year + 1):
        # Choose file and read it in
        filename = "Play" + str(year) + ".csv"
        year_df = pd.read_csv(os.path.join(baseball_path, "3. MLB API", filename))

        # Only keep one observation per PA (don't keep each runner)
        year_df = year_df.drop_duplicates(['gamePk', 'atBatIndex'], keep='first', ignore_index=True)

        # Create year variable
        year_df['year'] = year

        year_frames.append(year_df)

    df = pd.concat(year_frames, axis=0)

    df['date'] = pd.to_datetime(df['date'])

    df = df[df['game_type'] == "R"].reset_index()
    # reset_index names the old index 'level_0' only when the files already carry
    # an 'index' column, and 'index' otherwise. Drop whichever of the two exist
    # instead of failing when one is absent.
    df = df.drop(columns=['level_0', 'index'], errors='ignore')

    return df

### Model Inputs

In [10]:
def statcast(df):
    """Return a copy of `df` with pitch and batted-ball features added.

    Adds:
      - hard_hit: 1 where 'hardness' contains "hard" (0 when missing)
      - maxSpeed / maxSpin: max of the list literals in 'startSpeeds' / 'spinRates'
        (0 for an empty list); the two list columns are dropped
      - launchSpeed, launchAngle, totalDistance: numeric value parsed from the
        bracketed text columns (which are left in place with brackets removed)
      - x, y: batted-ball coordinates parsed from 'coord'
      - spray_angle and the to_left / to_middle / to_right dummies

    Raises ValueError if a bracket-stripped value cannot be parsed as a number
    (for example a list holding more than one value).
    """
    out = df.copy()

    # Hard hit dummy. na=False keeps a missing hardness from breaking the int cast.
    out['hard_hit'] = out['hardness'].str.contains('hard', na=False).astype('int')

    def find_max(lst):
        # Empty list -> 0 so the column stays numeric
        return max(lst) if lst else 0

    def strip_brackets(column):
        # regex=False: "[" and "]" are literal characters, not a regex pattern
        text = out[column].str.replace("[", "", regex=False)
        return text.str.replace("]", "", regex=False)

    # Max pitch speed and max spin rate. The list columns are dropped afterwards
    # because later steps can't take lists.
    for list_col, max_col in (('startSpeeds', 'maxSpeed'), ('spinRates', 'maxSpin')):
        out[max_col] = out[list_col].apply(ast.literal_eval).apply(find_max)
        out = out.drop(columns=[list_col])

    # Launch speed, launch angle and total distance. pd.to_numeric alone does the
    # conversion; the previous astype('float', errors='ignore') step was redundant
    # and that option is deprecated in recent pandas.
    for list_col, value_col in (('launchSpeeds', 'launchSpeed'),
                                ('launchAngles', 'launchAngle'),
                                ('totalDistances', 'totalDistance')):
        out[list_col] = strip_brackets(list_col)
        out[value_col] = pd.to_numeric(out[list_col])

    # Coordinates of batted ball
    out['coord'] = strip_brackets('coord')
    out[['x', 'y']] = out['coord'].str.split(",", expand=True)
    out['x'] = pd.to_numeric(out['x'])
    out['y'] = pd.to_numeric(out['y'])

    # Spray angle in degrees, measured from the (125.42, 198.27) reference point
    # and scaled by 0.75; rows with missing coordinates get 0 in all three dummies.
    out['spray_angle'] = np.arctan((out['x'] - 125.42) / (198.27 - out['y'])) * 180 / np.pi * 0.75
    out['to_left'] = (out['spray_angle'] < -15).astype('int')
    out['to_middle'] = ((out['spray_angle'] >= -15) & (out['spray_angle'] <= 15)).astype('int')
    out['to_right'] = (out['spray_angle'] > 15).astype('int')

    return out

In [11]:
# This converts raw data to clean model input
def create_model_input(df, date):
    """Build the per-PA model input table from raw play data.

    Steps: clean weather, label events, add dummies and statcast features, keep
    rows dated before `date`, compute the pre-PA score differential, drop "Cut"
    events, attach 100-PA rolling stats plus 300-PA rolling stats (suffixed
    "_long"), and z-score the rolling max columns.

    `date` is a YYYYMMDD string. Returns
    (sample, df, batter_stats, pitcher_stats, dummy_list), where `sample` is a
    copy of `df`.
    """
    base_stats = ['so', 'b1', 'b2', 'b3', 'hr', 'bb', 'hbp', 'lo', 'po', 'go', 'fo', 'pa', 'ab',
                  'woba', 'slg', 'obp', 'iso', 'hard_hit', 'to_left', 'to_middle', 'to_right',
                  'totalDistance', 'maxSpeed', 'maxSpin', 'launchSpeed']
    # Rolling columns carried over from the long window: batter side, then pitcher side
    keep_list = [stat + "_b" for stat in base_stats] + [stat + "_p" for stat in base_stats]

    # Clean weather
    df = clean_weather(df)
    # Create events
    df = create_events(df)
    # Make dummies
    df, dummy_list = create_dummies(df)
    # Statcast
    df = statcast(df)

    # Turn date into datetime object
    date = datetime.datetime.strptime(date, "%Y%m%d")

    # Only keep if date is before selected date
    df = df[df['date'] < date]

    # Determine score before PA: the score after the previous PA of the same
    # half-inning. Assigned directly rather than via chained
    # `df[col].fillna(..., inplace=True)`, which is not guaranteed to write back
    # to `df` under pandas copy-on-write.
    # NOTE(review): the first PA of a half-inning falls back to its own post-PA
    # score, so runs scored on that PA are counted as already on the board.
    half_inning = ['gamePk', 'inning', 'halfInning']
    df['preAwayScore'] = df.groupby(half_inning)['awayScore'].shift(1).fillna(df['awayScore'])
    df['preHomeScore'] = df.groupby(half_inning)['homeScore'].shift(1).fillna(df['homeScore'])

    # Calculate score differential from the batting team's point of view
    df['score_diff'] = np.where(df['top'] == 1, df['preAwayScore'] - df['preHomeScore'], df['preHomeScore'] - df['preAwayScore'])

    # Cut if event isn't one we care about (usually these are weird base running things)
    df = df[df['Cut'] != 1]

    # Calculate short time frame rolling stats (100 PAs for now).
    # rolling_pas writes into `df`, so snapshot the result before the second pass.
    dfshort, batter_stats, pitcher_stats = rolling_pas(df, 100)
    dfmain = dfshort.copy()
    # Calculate long time frame rolling stats (300 PAs for now)
    dflong, batter_stats, pitcher_stats = rolling_pas(df, 300)
    dflong = dflong[keep_list]
    dflong = dflong.add_suffix("_long")
    # Concat them together
    df = pd.concat([dfmain, dflong], axis=1)

    # Delete intermediate DFs
    del dfmain, dfshort, dflong

    ### Maybe pickle this?/save these means/stds?
    # Standardize the rolling max columns for both sides and both windows
    for side in ("_b", "_p"):
        for stat in ('totalDistance', 'maxSpeed', 'maxSpin', 'launchSpeed'):
            for window in ("", "_long"):
                column = stat + side + window
                df[column] = (df[column] - df[column].mean()) / df[column].std()

    # Clean this later, maybe only keep df
    sample = df.copy()

    return sample, df, batter_stats, pitcher_stats, dummy_list

In [12]:
# Create batter inputs
def create_batter_df(df, date):
    """Write one row per batter with their latest rolling stats vs LHP and vs RHP.

    Takes each batter's last row per pitcher hand (p_L), places the vs-left
    ("_l") and vs-right ("_r") stats side by side, merges in the Chadwick
    register and saves "Batters<date>.csv" under "4. Dataset/Batters".
    Returns nothing.
    """
    # Stats of interest: short-window columns followed by their "_long" versions
    stat_names = ['so', 'b1', 'b2', 'b3', 'hr', 'bb', 'hbp', 'lo', 'po', 'go', 'fo',
                  'pa', 'ab', 'woba', 'slg', 'obp', 'iso', 'hard_hit', 'to_left', 'to_middle', 'to_right',
                  'totalDistance', 'maxSpeed', 'maxSpin', 'launchSpeed']
    short_cols = [name + "_b" for name in stat_names]
    long_cols = [col + "_long" for col in short_cols]
    batter_list = ['batter', 'batterName', 'batSide', 'p_L'] + short_cols + long_cols

    # Only care about most recent stats of each batter before PA
    latest = df[batter_list].drop_duplicates(subset=['batter', 'p_L'], keep='last')

    # Separate rows for vs RHP and vs LHP, then put them side by side
    against_right = latest[latest['p_L'] == 0]
    against_left = latest[latest['p_L'] == 1]
    batters = against_left.merge(against_right, on='batter', how='outer', suffixes=('_l', '_r'))

    # Drop duplicate columns; the name is only needed once
    batters = batters.drop(columns=['batterName_r', 'p_L_l', 'p_L_r'])
    batters = batters.rename(columns={'batterName_l': 'batterName'})

    # Merge with Chadwick
    batters = batters.merge(chadwick, left_on='batter', right_on='key_mlbam', how='left')

    # Export
    out_path = os.path.join(baseball_path, "4. Dataset", "Batters", "Batters" + date + ".csv")
    batters.to_csv(out_path)

In [13]:
# Create pitcher inputs
def create_pitcher_df(df, date):
    """Write one row per pitcher with their latest rolling stats vs LHB and vs RHB.

    Output columns: the latest rolling stats per batter side ("_l" / "_r"),
    mean 'outs' and number of starts since 2019-04-01, Chadwick IDs, and
    'avgFaced' (mean batters faced over up to 30 earlier games). The rolling max
    columns are z-scored across pitchers. Saves "Pitchers<date>.csv" under
    "4. Dataset/Pitchers". Returns nothing.
    """
    # Stats of interest: short-window columns followed by their "_long" versions
    stat_names = ['b1', 'b2', 'b3', 'hr', 'bb', 'hbp',
                  'so', 'lo', 'po', 'go', 'fo',
                  'woba', 'slg', 'obp', 'iso',
                  'to_left', 'to_middle', 'to_right',
                  'hard_hit', 'maxSpeed', 'maxSpin', 'totalDistance', 'launchSpeed',
                  'pa', 'ab']
    short_cols = [name + "_p" for name in stat_names]
    long_cols = [col + "_long" for col in short_cols]
    game_cols = ['inning', 'outs', 'gamePk', 'eventsModel', 'game_date']
    pitcher_list = ['pitcher', 'pitcherName', 'pitchHand', 'b_L'] + short_cols + long_cols + game_cols

    # Only keep relevant stats
    selected = df[pitcher_list]

    # Calculate average outs: only look at PAs since 2019, last row per pitcher-inning
    pitchers_cut = selected[selected['game_date'] > '2019-04-01']
    pitchers_cut = pitchers_cut.drop_duplicates(subset=['pitcher', 'gamePk', 'inning'], keep='last').copy()
    # Identify if they're a starter (they pitched in the 1st inning)
    pitchers_cut['starter'] = (pitchers_cut['inning'] == 1).astype('int')
    # Add up outs and starts per game. Columns are selected with a list: the tuple
    # form groupby(...)['outs', 'starter'] is rejected by pandas >= 2.0.
    pitchers_cut = pitchers_cut.groupby(['pitcher', 'gamePk'])[['outs', 'starter']].sum().reset_index()
    # Calculate mean outs and sum of starts
    pitchers_cut = pitchers_cut.groupby('pitcher').agg({'outs': 'mean', 'starter': 'sum'}).reset_index()

    # Only care about most recent stats of each pitcher before PA
    latest = selected.drop_duplicates(subset=['pitcher', 'b_L'], keep='last')

    # Create separate dataframes for vs RHB and LHB
    vs_r = latest[latest['b_L'] == 0]
    vs_l = latest[latest['b_L'] == 1]

    # Merge them together
    pitchers = vs_l.merge(vs_r, on='pitcher', how='outer', suffixes=('_l', '_r'))
    # And add outs/starts
    pitchers = pitchers.merge(pitchers_cut, on='pitcher', how='left')

    # Drop duplicate columns
    duplicate_cols = ['pitcherName_r', 'b_L_l', 'b_L_r']
    duplicate_cols += [col + side for side in ('_r', '_l') for col in game_cols]
    pitchers = pitchers.drop(columns=duplicate_cols)
    # Only need this once
    pitchers = pitchers.rename(columns={'pitcherName_l': 'pitcherName'})

    # Merge with Chadwick
    pitchers = pitchers.merge(chadwick, left_on='pitcher', right_on='key_mlbam', how='left')

    # Take average number of batters faced over each pitcher's previous 30 games.
    # The shift happens inside each pitcher's group: shifting the concatenated
    # result instead would hand a pitcher's first game the previous pitcher's
    # average. A pitcher with a single game therefore gets NaN.
    games = df.groupby(['pitcher', 'gamePk']).size().reset_index(name='faced')
    games['avgFaced'] = games.groupby('pitcher')['faced'].transform(
        lambda faced: faced.shift().rolling(30, min_periods=1).mean()
    )
    games = games.drop_duplicates(subset=['pitcher'], keep='last')
    games = games[['pitcher', 'avgFaced']]

    # Standardize the rolling max columns (both windows, both batter sides)
    for stat in ('totalDistance', 'maxSpeed', 'maxSpin', 'launchSpeed'):
        for suffix in ('_p_l', '_p_r', '_p_long_l', '_p_long_r'):
            column = stat + suffix
            pitchers[column] = (pitchers[column] - pitchers[column].mean()) / pitchers[column].std()

    pitchers = pitchers.merge(games, on='pitcher', how='inner')

    # Export
    pitchers.to_csv(os.path.join(baseball_path, "4. Dataset", "Pitchers", "Pitchers" + date + ".csv"))

In [14]:
# This creates inputs on a given date
def create_datasets(df, date):
    """Build the model input for `date`, export the batter and pitcher files, and return the sample.

    `date` is passed through unchanged to create_model_input, create_batter_df
    and create_pitcher_df. Only the first two outputs of create_model_input are
    used here (once the model is built, the sample won't really be needed).
    """
    sample, inputs, _batter_stats, _pitcher_stats, _dummy_list = create_model_input(df, date)

    # Write the batter and pitcher CSV files from the model inputs
    create_batter_df(inputs, date)
    create_pitcher_df(inputs, date)

    return sample

### Run One

In [15]:
# Load 2019-2023 for the daily inputs (start year changed to 2019; use the full
# 2015-onward range when building the training set below), then build today's files.
df = import_data(start_year=2019, end_year=2023)
sample = create_datasets(df=df, date=todaysdate)

### Run All

In [16]:
# for filename in os.listdir(r"C:\Users\james\Documents\MLB\Data\A7. Matchups - 1. Salaries"): 
#     # 2023 
#     if filename.endswith(".csv") and filename.startswith("DKSalaries_2023"):
#         # Pull out date
#         date = filename[11:19]
#         print(date)
#         df = import_data(2019, 2023) # Changed to 2020 - keep full 2015- for training set below
#         sample = create_datasets(df, date)

### Create Inputs 

In [17]:
# # Sample used for training PA model (import_date from 2015 onward)
# sample.to_csv(os.path.join(baseball_path, "Inputs", "Sample100.csv"))

In [18]:
# # Can we get rid of this?
# # You probably don't need to run it again until a new park is created, but it does need to exist
# # Park list (all park dummies)
# parks = df[['home_name', 'venue_id']]
# parks = parks.drop_duplicates().sort_values('home_name')
# parks.to_csv(os.path.join(baseball_path, "Inputs", "All Parks.csv"))

In [19]:
# Stamp the run with a single timestamp so the date and time always agree
_run_time = datetime.datetime.now()
print("Code was last run on: {} at {}.".format(_run_time.date(), _run_time.strftime("%H:%M:%S")))

Code was last run on: 2023-06-29 at 18:17:41.
