In [None]:
# Read in boxscore for weather
def create_box(gamePk):
    """Pull the weather, wind, venue and date strings out of a game's boxscore.

    Parameters
    ----------
    gamePk
        Game identifier, passed straight through to ``statsapi.boxscore_data``.

    Returns
    -------
    tuple
        ``(weather, wind, venue, date, missing_weather)``. ``weather``, ``wind``
        and ``venue`` are the ``value`` entries whose ``label`` is "Weather",
        "Wind" and "Venue"; each falls back to a placeholder default when its
        label is absent. ``date`` is the label of the last boxscore entry.
        ``missing_weather`` is True when no "Weather" entry was found.
    """
    # Read in boxscore as json
    box = pd.json_normalize(statsapi.boxscore_data(gamePk, timecode=None), record_path='gameBoxInfo')

    # Define default values
    default_weather = "75 degrees, Clear."
    default_wind = "0 mph, L To R."
    default_venue = "Missing Park."
    default_date = "November 30, 1993"

    # An empty gameBoxInfo list normalizes to a frame with no 'label' column,
    # so there is nothing to look up: return every default and flag the weather.
    if box.empty or 'label' not in box.columns:
        return default_weather, default_wind, default_venue, default_date, True

    labels = box['label']

    def _value_for(label, default):
        # First 'value' recorded under `label`, or `default` when it is absent
        if 'value' not in box.columns:
            return default
        matches = box.loc[labels == label, 'value']
        return default if matches.empty else matches.iloc[0]

    # Extract weather, wind, venue, and date
    weather = _value_for("Weather", default_weather)
    wind = _value_for("Wind", default_wind)
    venue = _value_for("Venue", default_venue)

    # The date is carried as the label of the final boxscore entry
    date = labels.iloc[-1]

    missing_weather = not bool((labels == "Weather").any())

    return weather, wind, venue, date, missing_weather

##### Weather

In [10]:
# Positive to centerfield, negative from centerfield
def y_vect(df):
    """Wind component along the home plate / centerfield axis for one row.

    Reads ``windSpeed`` and ``windDirection`` from ``df`` (a row or any
    mapping). Positive values blow toward centerfield, negative values blow in
    from it. Winds toward or from a corner outfield spot are scaled by
    sqrt(2)/2; cross winds and unrecognized directions contribute 0.
    """
    full_speed = df['windSpeed']
    diagonal = df['windSpeed'] / 2 * math.sqrt(2)
    direction = df['windDirection']

    if direction == "Out To CF":
        return full_speed
    if direction == "Out To RF" or direction == "Out To LF":
        return diagonal
    if direction == "In From LF" or direction == "In From RF":
        return diagonal * -1
    if direction == "In From CF":
        return full_speed * - 1

    # "L To R", "R To L" and anything unrecognized have no component on this axis
    return 0

# Positive from left to right, negative from right to left
def x_vect(df):
    """Left-to-right wind component for one row.

    Reads ``windSpeed`` and ``windDirection`` from ``df`` (a row or any
    mapping). Positive values blow from left to right, negative values from
    right to left. Corner directions are scaled by sqrt(2)/2; winds straight
    to or from centerfield and unrecognized directions contribute 0.
    """
    full_speed = df['windSpeed']
    diagonal = df['windSpeed'] / 2 * math.sqrt(2)
    direction = df['windDirection']

    if direction == "L To R":
        return full_speed
    if direction == "In From LF" or direction == "Out To RF":
        return diagonal
    if direction == "In From RF" or direction == "Out To LF":
        return diagonal * -1
    if direction == "R To L":
        return full_speed * - 1

    # "In From CF", "Out To CF" and anything unrecognized have no sideways component
    return 0

# 2 is to centerfield, 6 is from centerfield, clockwise
# Assumption is wind is blowing in 8 cardinal directions, so we can use simple right isosceles triangles

In [11]:
def clean_weather(df):
    """Split the raw ``weather`` / ``wind`` strings into model-ready columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``weather`` column shaped like "72 degrees, Sunny." and
        a ``wind`` column shaped like "5 mph, Out To CF.". Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same frame with ``temperature`` (int), ``weather`` (type only),
        ``windSpeed`` (numeric, 0 when missing or unparseable),
        ``windDirection`` (periods removed), ``x_vect`` and ``y_vect`` set.
    """
    # Separate weather into temperature and weather type
    df[['temperature', 'weather']] = df['weather'].str.split(", ", expand=True)
    df['temperature'] = df['temperature'].str.replace(" degrees", "", regex=False).astype('int')

    # Separate wind into speed and direction
    df[['windSpeed', 'windDirection']] = df['wind'].str.split(", ", expand=True)

    # Assign results back instead of calling inplace methods on a selected
    # column: chained inplace calls do not reach the frame under Copy-on-Write.
    wind_speed = df['windSpeed'].fillna("0 mph").str.replace(" mph", "", regex=False)
    df['windSpeed'] = pd.to_numeric(wind_speed, errors='coerce').fillna(0)

    # NOTE(review): the fill value is 'L to R' (lower-case "to"), which does not
    # match the "L To R" branch in x_vect, so a row with a speed but no
    # direction gets a zero vector. Confirm that this is intended.
    # regex=False: with regex matching, "." would match (and delete) every character.
    df['windDirection'] = df['windDirection'].fillna('L to R').str.replace(".", "", regex=False)

    # Calculate vectors
    df['x_vect'] = df.apply(x_vect, axis=1)
    df['y_vect'] = df.apply(y_vect, axis=1)

    ### TESTING:
    # Set temperature to 70 degrees if it's a dome or the roof is closed.
    # A missing weather type counts as outdoors rather than raising.
    indoors = df['weather'].str.contains('Roof|Dome', regex=True, na=False)
    df['temperature'] = df['temperature'].mask(indoors, 70)

    return df

##### Model Inputs

In [12]:
# Assign play categories to full descriptions
def create_events(df):
    """Collapse the full play description in ``event`` into a model category.

    Adds an ``eventsModel`` column to ``df`` (in place) holding one of
    so / go / lo / fo / po / hbp / bb / b1 / b2 / b3 / hr, or 'Cut' for any
    event that is not listed (including missing values). Returns ``df``.
    """
    events_by_category = {
        'so': ['Strikeout', 'Strikeout Double Play'],
        'go': [
            'Groundout', 'Fielders Choice', 'Fielders Choice Out', 'Double Play',
            'Grounded Into DP', 'Triple Play', 'Field Error', 'Forceout',
            'Sac Bunt', 'Sac Bunt Double Play', 'Bunt Groundout',
        ],
        'lo': ['Lineout', 'Bunt Lineout'],
        'fo': ['Flyout', 'Sac Fly', 'Sac Fly Double Play'],
        'po': ['Pop Out', 'Bunt Pop Out'],
        'hbp': ['Hit By Pitch'],
        'bb': ['Walk', 'Intent Walk'],
        'b1': ['Single'],
        'b2': ['Double'],
        'b3': ['Triple'],
        'hr': ['Home Run'],
    }

    # Invert to description -> category for the lookup
    category_of = {
        description: category
        for category, descriptions in events_by_category.items()
        for description in descriptions
    }

    df['eventsModel'] = df['event'].map(category_of).fillna('Cut')

    return df

In [13]:
# This turns several variables, including events, venues, hands, and bases into dummies
def create_dummies(df):    
    """Add dummy columns and game-state features for each plate appearance.

    NOTE(review): a later cell defines ``create_dummies`` again (without the
    venue and year dummies). When the cells run in order, that later
    definition replaces this one.

    Adds, per the code below: one-hot columns for ``eventsModel``,
    ``venue_id`` (prefix ``venue``), ``pitchHand`` (prefix ``p``), ``batSide``
    (prefix ``b``) and ``year`` (prefix ``year``); ``startingPitcher`` /
    ``starter``; an integer ``date``; runner flags ``onFirst`` / ``onSecond``
    / ``onThird``; ``top``; pre-PA scores and ``score_diff``;
    ``batterScore`` / ``pitcherScore``; and ``pa`` / ``ab``.

    ``venue_id`` and ``year`` are assigned on the frame passed in; everything
    after the ``pd.concat`` is written to a new frame, which is returned
    sorted by date, gamePk and atBatIndex.
    """
    # Events
    event_dummies = pd.get_dummies(df['eventsModel'])
    # Venues
    df['venue_id'] = df['venue_id'].astype('str')
    venue_dummies = pd.get_dummies(df['venue_id'], prefix='venue')
    # Hands
    pitcher_dummies = pd.get_dummies(df['pitchHand'], prefix='p')
    batter_dummies = pd.get_dummies(df['batSide'], prefix='b')
    # Years (first four characters of game_date)
    df['year'] = df['game_date'].str[:4]
    year_dummies = pd.get_dummies(df['year'], prefix='year')
    
    # # Create lists of dummies
    # venue_list = venue_dummies.columns.tolist()
    # year_list = year_dummies.columns.tolist()
    
    # Add dummies to dataframe
    df = pd.concat([df, event_dummies, venue_dummies, pitcher_dummies, batter_dummies, year_dummies], axis=1)

    # Identify starting pitcher: the first pitcherName seen in each game half.
    # This runs before the sort below, so it relies on the incoming row order.
    df['startingPitcher'] = df.groupby(['gamePk', 'halfInning'])['pitcherName'].transform('first')
    df['starter'] = (df['startingPitcher'] == df['pitcherName']).astype('int')
    
    # Create compatible date variable
    df['date'] = df['game_date'].str.replace('-', '')
    
    # Convert to numeric for sorting
    df['date'] = df['date'].astype('int')
    df['gamePk'] = df['gamePk'].astype('int')
    df['atBatIndex'] = df['atBatIndex'].astype('int')
    
    # Sort
    df.sort_values(['date', 'gamePk', 'atBatIndex'], ascending=True, inplace=True)
    
    # Create dummy for runners on base: the base state before a PA is the
    # post-PA state of the previous PA in the same half inning
    df['preOnFirst'] = df.groupby(['gamePk', 'inning', 'halfInning'])['postOnFirst'].shift(1)
    df['preOnSecond'] = df.groupby(['gamePk', 'inning', 'halfInning'])['postOnSecond'].shift(1)
    df['preOnThird'] = df.groupby(['gamePk', 'inning', 'halfInning'])['postOnThird'].shift(1)
    
    # Flag is 1 only when the shifted value is a string containing 'id'
    # (presumably a stringified runner record - confirm against the source data)
    df['onFirst'] = df['preOnFirst'].apply(lambda x: 1 if isinstance(x, str) and 'id' in x else 0)
    df['onSecond'] = df['preOnSecond'].apply(lambda x: 1 if isinstance(x, str) and 'id' in x else 0)
    df['onThird'] = df['preOnThird'].apply(lambda x: 1 if isinstance(x, str) and 'id' in x else 0)
    
    # Top of the inning dummy
    df['top'] = np.where(df['halfInning'] == "top", 1, 0)
    
    # Convert to numeric
    df['awayScore'] = df['awayScore'].astype('int')
    df['homeScore'] = df['homeScore'].astype('int')
    
    # Determine score before PA (previous row's score within the game)
    df['preAwayScore'] = df.groupby(['gamePk'])['awayScore'].shift(1)
    df['preHomeScore'] = df.groupby(['gamePk'])['homeScore'].shift(1)
    
    # If it's the first PA, it'll be missing. 
    df['preAwayScore'] = df['preAwayScore'].fillna(0)
    df['preHomeScore'] = df['preHomeScore'].fillna(0)
    
    # Calculate differential from the batting team's side
    # (away bats in the top half, home in the bottom half)
    df['score_diff'] = np.where(df['top'] == 1, df['preAwayScore'] - df['preHomeScore'], df['preHomeScore'] - df['preAwayScore'])
    
    # Determine hitter and pitcher scores
    df['batterScore'] = np.where(df['halfInning'] == 'top', df['awayScore'], df['homeScore'])
    df['pitcherScore'] = np.where(df['halfInning'] == 'top', df['homeScore'], df['awayScore'])
    
    # Calculate PAs and ABs
    # 'hbp' and 'bb' come from the event dummies above (assuming the input did
    # not already carry them), so this raises KeyError when the data has no
    # such events.
    df['pa'] = np.where(df['eventsModel'] != "Cut", 1, 0)
    df['ab'] = df['pa'] - df['hbp'] - df['bb']           
            
    # Sort
    df.sort_values(['date', 'gamePk', 'atBatIndex'], inplace=True)

    
    return df

In [None]:
# This turns several variables, including events, venues, hands, and bases into dummies
def create_dummies(df):
    """Add dummy columns and game-state features for each plate appearance.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per plate appearance. Read here: ``eventsModel``, ``venue_id``,
        ``pitchHand``, ``batSide``, ``game_date``, ``gamePk``, ``atBatIndex``,
        ``inning``, ``halfInning``, ``pitcherName``, ``postOnFirst`` /
        ``postOnSecond`` / ``postOnThird``, ``awayScore`` and ``homeScore``.
        ``venue_id`` (as str) and ``year`` are written onto this frame.

    Returns
    -------
    pandas.DataFrame
        A new frame sorted by date, gamePk and atBatIndex with one-hot columns
        for ``eventsModel``, ``pitchHand`` (prefix ``p``) and ``batSide``
        (prefix ``b``), plus ``startingPitcher`` / ``starter``, integer
        ``date``, runner flags, ``top``, pre-PA scores, ``score_diff``,
        batter/pitcher scores, and ``pa`` / ``ab``.
    """
    # Events
    event_dummies = pd.get_dummies(df['eventsModel'])
    # Venues (kept as a string id; venue dummies are not generated)
    df['venue_id'] = df['venue_id'].astype('str')
    # Hands
    pitcher_dummies = pd.get_dummies(df['pitchHand'], prefix='p')
    batter_dummies = pd.get_dummies(df['batSide'], prefix='b')
    # Years (first four characters of game_date; year dummies are not generated)
    df['year'] = df['game_date'].str[:4]

    # Add dummies to dataframe
    df = pd.concat([df, event_dummies, pitcher_dummies, batter_dummies], axis=1)

    # Create compatible date variable and convert keys to numeric for sorting
    df['date'] = df['game_date'].str.replace('-', '', regex=False).astype('int')
    df['gamePk'] = df['gamePk'].astype('int')
    df['atBatIndex'] = df['atBatIndex'].astype('int')

    # Sort chronologically; every order-dependent step below relies on this
    df.sort_values(['date', 'gamePk', 'atBatIndex'], ascending=True, inplace=True)

    # Identify starting pitcher: first pitcher seen in each game half. Done
    # after the sort so the result does not depend on the incoming row order.
    df['startingPitcher'] = df.groupby(['gamePk', 'halfInning'])['pitcherName'].transform('first')
    df['starter'] = (df['startingPitcher'] == df['pitcherName']).astype('int')

    # Base state before a PA is the post-PA state of the previous PA in the
    # same half inning
    bases = ('First', 'Second', 'Third')
    for base in bases:
        df[f'preOn{base}'] = df.groupby(['gamePk', 'inning', 'halfInning'])[f'postOn{base}'].shift(1)
    for base in bases:
        # Flag is 1 only when the shifted value is a string containing 'id'
        df[f'on{base}'] = df[f'preOn{base}'].apply(lambda x: 1 if isinstance(x, str) and 'id' in x else 0)

    # Top of the inning dummy
    df['top'] = np.where(df['halfInning'] == "top", 1, 0)

    # Convert to numeric
    df['awayScore'] = df['awayScore'].astype('int')
    df['homeScore'] = df['homeScore'].astype('int')

    # Score before the PA is the previous row's score within the game;
    # the first PA of a game has no previous row, so it starts at 0
    df['preAwayScore'] = df.groupby(['gamePk'])['awayScore'].shift(1).fillna(0)
    df['preHomeScore'] = df.groupby(['gamePk'])['homeScore'].shift(1).fillna(0)

    # Differential from the batting team's side (away bats in the top half)
    df['score_diff'] = np.where(df['top'] == 1, df['preAwayScore'] - df['preHomeScore'], df['preHomeScore'] - df['preAwayScore'])

    # Determine hitter and pitcher scores
    df['batterScore'] = np.where(df['halfInning'] == 'top', df['awayScore'], df['homeScore'])
    df['pitcherScore'] = np.where(df['halfInning'] == 'top', df['homeScore'], df['awayScore'])

    # Same, but before the PA
    df['preBatterScore'] = np.where(df['halfInning'] == 'top', df['preAwayScore'], df['preHomeScore'])
    df['prePitcherScore'] = np.where(df['halfInning'] == 'top', df['preHomeScore'], df['preAwayScore'])

    # Calculate PAs and ABs. An event dummy only exists when that event occurs
    # in the data, so treat an absent 'hbp' / 'bb' column as all zeros.
    df['pa'] = np.where(df['eventsModel'] != "Cut", 1, 0)
    df['ab'] = df['pa'] - df.get('hbp', 0) - df.get('bb', 0)

    return df

In [14]:
# Create useful Statcast variables
def clean_statcast(df):
    """Derive contact-quality and spray-direction flags from Statcast columns.

    Coerces ``launch_speed``, ``launch_speed_angle``, ``hc_x`` and ``hc_y`` to
    numeric (unparseable values become NaN), then adds ``hard_hit``,
    ``barrel``, ``spray_angle`` and the ``to_left`` / ``to_middle`` /
    ``to_right`` dummies. Works on ``df`` in place and returns it.
    """
    # Convert variables to numeric
    for column in ('launch_speed', 'launch_speed_angle', 'hc_x', 'hc_y'):
        df[column] = pd.to_numeric(df[column], errors='coerce')

    # Hard hit dummy: exit velocity of at least 95
    df['hard_hit'] = df['launch_speed'].ge(95).astype('int')

    # Barrel dummy: launch_speed_angle category 6
    df['barrel'] = df['launch_speed_angle'].eq(6).astype('int')

    # Spray angle in degrees from the hit coordinates, scaled by 0.75
    sideways = df['hc_x'] - 125.42
    depth = 198.27 - df['hc_y']
    df['spray_angle'] = np.arctan(sideways / depth) * 180 / np.pi * 0.75

    # Split the field into thirds at +/- 15 degrees (NaN angles get 0 everywhere)
    angle = df['spray_angle']
    df['to_left'] = angle.lt(-15).astype('int')
    df['to_middle'] = angle.between(-15, 15).astype('int')
    df['to_right'] = angle.gt(15).astype('int')

    return df

In [None]:
# Adjust for park factors
def park_adjustments(df):
    """Divide each event column by its handedness-specific park multiplier.

    Reads "Multiplier Dataset.csv" from ``baseball_path``, left-merges its
    multiplier ("mult") and league-average ("league") columns onto ``df`` by
    ``gamePk``, and rescales every column named in ``events_list``: rows with
    ``batSide == "L"`` use ``<event>_mult_l``, all others ``<event>_mult_r``.

    Returns the merged frame. It still carries the ``_merge`` indicator column
    added by the merge. ``gamePk`` on the input frame is cast to int.
    """
    # Read in park factors
    park_factors = pd.read_csv(os.path.join(baseball_path, "Multiplier Dataset.csv"))

    # Convert to numeric for merging
    park_factors['gamePk'] = park_factors['gamePk'].astype(int)
    df['gamePk'] = df['gamePk'].astype('int')

    # Merge with park factors
    mult_cols = [name for name in park_factors.columns if "mult" in name]
    league_cols = [name for name in park_factors.columns if "league" in name]
    wanted = ["gamePk"] + mult_cols + league_cols
    df = df.merge(park_factors[wanted], on=['gamePk'], how='left', indicator=True)

    # Games with no park factors (old parks, other parks):
    # neutral multiplier of 1, and the most recent league averages carried forward
    df[mult_cols] = df[mult_cols].fillna(1)
    df[league_cols] = df[league_cols].ffill()

    # Adjust every event by the multiplier matching the batter's side
    bats_left = df['batSide'] == "L"
    for event in events_list:
        raw = df[event].astype(float)
        as_lefty = raw / df[f'{event}_mult_l'].astype(float)
        as_righty = raw / df[f'{event}_mult_r'].astype(float)
        df[event] = np.where(bats_left, as_lefty, as_righty)

    return df

In [91]:
# This will return a dataframe that can eventually be used as the model input. Has pitcher vs hitter stats, specific to hand
def rolling_pas(df, pa_num):
    """Build rolling batter and pitcher stats over the previous ``pa_num`` rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Plate-appearance rows. Not modified; the work is done on a copy.
    pa_num : int
        Rolling window length, counted in rows per group.

    Returns
    -------
    pandas.DataFrame
        The copy, sorted by date, gamePk and atBatIndex, with the rolling
        columns plus wOBA / SLG / OBP / ISO columns (``_b`` batter, ``_p``
        pitcher) added.

    Notes
    -----
    Reads these names, which are defined outside this function: ``avg_list``,
    ``max_list``, ``batter_avg_short``, ``batter_max_short``,
    ``pitcher_avg_short`` and ``pitcher_max_short``. Each output list is
    assigned from the matching input list, so the two must line up
    position by position.
    """
    # Copy dataframe
    df_copy = df.copy()
    
    # Note: batter_avg_short will work even when pa_num refers to the "long" period. Suffix will be added in post.
    # Rename for compatibility purposes
    df_copy.rename(columns={'hit_distance_sc':'totalDistance', 'launch_speed':'launchSpeed'}, inplace=True)          
            
    # Convert to numeric and fill with 0s
    combined_list = avg_list + max_list
    for col in combined_list:
        # Check if the column is not numeric
        if not pd.api.types.is_numeric_dtype(df_copy[col]):
            # Convert the non-numeric column to numeric and fill missing values with 0
            # (columns that are already numeric are left as they are, NaNs included)
            df_copy[col] = pd.to_numeric(df_copy[col], errors='coerce')
            df_copy[col] = df_copy[col].fillna(0)

    # Sort
    df_copy.sort_values(['date', 'gamePk', 'atBatIndex'], ascending=True, inplace=True)
            
    # Data types may vary. This makes grouping impossible. 
    df_copy['batter'] = df_copy['batter'].astype('int')
    df_copy['pitcher'] = df_copy['pitcher'].astype('int')
        
    ### Batter stats (grouped by batter and the pitcher's throwing hand)
    # shift() drops the current row from its own window, so each row only
    # sees earlier rows of its group; min_periods=1 yields a value as soon as
    # one earlier row exists.
    # Stats for which you want the average 
    df_copy[batter_avg_short] = df_copy.groupby(['batter', 'pitchHand'])[avg_list].transform(lambda x: x.shift().rolling(pa_num, min_periods=1).mean())
    # Stats for which you want the maximum
    df_copy[batter_max_short] = df_copy.groupby(['batter', 'pitchHand'])[max_list].transform(lambda x: x.shift().rolling(pa_num, min_periods=1).max())
    # Stats for which you just want the sum 
    df_copy[['ab_b', 'pa_b']] = df_copy.groupby(['batter', 'pitchHand'])[['ab', 'pa']].transform(lambda x: x.shift().rolling(pa_num, min_periods=1).sum())
                
    ### Pitcher stats (grouped by pitcher and the batter's side)
    # Stats for which you want the average
    df_copy[pitcher_avg_short] = df_copy.groupby(['pitcher', 'batSide'])[avg_list].transform(lambda x: x.shift().rolling(pa_num, min_periods=1).mean())
    # Stats for which you want the maximum
    df_copy[pitcher_max_short] = df_copy.groupby(['pitcher', 'batSide'])[max_list].transform(lambda x: x.shift().rolling(pa_num, min_periods=1).max())
    # Stats for which you just want the sum 
    df_copy[['ab_p', 'pa_p']] = df_copy.groupby(['pitcher', 'batSide'])[['ab', 'pa']].transform(lambda x: x.shift().rolling(pa_num, min_periods=1).sum())
                
    # Create imputation flags (these observations will have imputed inputs)
    # Flagged when fewer than 40 PAs fall inside the window
    df_copy['imp_b'] = (df_copy['pa_b'] < 40).astype('int')
    df_copy['imp_p'] = (df_copy['pa_p'] < 40).astype('int')

    # Create compatible date variable
    # (rebuilds 'date' from game_date; the sort above already needed a 'date' column)
    df_copy['date'] = df_copy['game_date'].str.replace('-', '')
    
    # Convert to numeric for sorting
    df_copy['date'] = df_copy['date'].astype('int')
    df_copy['gamePk'] = df_copy['gamePk'].astype('int')
    df_copy['atBatIndex'] = df_copy['atBatIndex'].astype('int')
    df_copy['batter'] = df_copy['batter'].astype('int')
    df_copy['pitcher'] = df_copy['pitcher'].astype('int')
    
    # Sort
    df_copy.sort_values(['date', 'gamePk', 'atBatIndex'], ascending=True, inplace=True)

    ### Advanced stats
    # The *_b / *_p inputs are presumably the per-PA rolling rates produced
    # above via batter_avg_short / pitcher_avg_short - confirm against those lists.
    # wOBA - using 2022 values throughout
    df_copy['woba_b'] = (0.690 * df_copy['bb_b']) + (0.721 * df_copy['hbp_b']) + (0.885 * df_copy['b1_b']) + (1.262 * df_copy['b2_b']) + (1.601 * df_copy['b3_b']) + (2.070 * df_copy['hr_b'])
    df_copy['woba_p'] = (0.690 * df_copy['bb_p']) + (0.721 * df_copy['hbp_p']) + (0.885 * df_copy['b1_p']) + (1.262 * df_copy['b2_p']) + (1.601 * df_copy['b3_p']) + (2.070 * df_copy['hr_p'])
    
    # Slugging
    # Total bases rescaled by 1 / (1 - (bb + hbp)); the result is infinite
    # when bb + hbp equals 1
    df_copy['slg_b'] = ((1 * df_copy['b1_b']) + (2 * df_copy['b2_b']) + (3 * df_copy['b3_b']) + (4 * df_copy['hr_b'])) * (1 / (1-(df_copy['bb_b'] + df_copy['hbp_b'])))
    df_copy['slg_p'] = ((1 * df_copy['b1_p']) + (2 * df_copy['b2_p']) + (3 * df_copy['b3_p']) + (4 * df_copy['hr_p'])) * (1 / (1-(df_copy['bb_p'] + df_copy['hbp_p'])))
    # OBP    
    df_copy['obp_b'] = df_copy[['b1_b', 'b2_b', 'b3_b', 'hr_b', 'bb_b', 'hbp_b']].sum(axis=1)
    df_copy['obp_p'] = df_copy[['b1_p', 'b2_p', 'b3_p', 'hr_p', 'bb_p', 'hbp_p']].sum(axis=1)
    
    # ISO (extra bases only, same rescaling as slugging)
    df_copy['iso_b'] = (df_copy['b2_b'] * 1 + df_copy['b3_b'] * 2 + df_copy['hr_b'] * 3) * (1 / (1-(df_copy['bb_b'] + df_copy['hbp_b'])))
    df_copy['iso_p'] = (df_copy['b2_p'] * 1 + df_copy['b3_p'] * 2 + df_copy['hr_p'] * 3) * (1 / (1-(df_copy['bb_p'] + df_copy['hbp_p'])))

    
    return df_copy

In [None]:
# This creates information about starts for use in pulling pitchers
def start_data(df):
    """Add per-start running totals and keep only the starting pitchers' rows.

    NOTE(review): a later cell defines ``start_data`` again with the
    starter-only filter commented out. When the cells run in order, that
    later definition replaces this one.

    Reads ``events_list``, which is defined outside this function. Several
    columns (``br``, ``br_inning``, ``inning``, ``outs``, ``rbi``, ``faced``)
    are written onto the frame passed in before a new frame is built by
    ``pd.concat``. Returns the filtered frame.
    """
    # Calculate the sum for each group
    # (base runners on the PA: any hit, walk or hit-by-pitch)
    df['br'] = df[['b1', 'b2', 'b3', 'hr', 'bb', 'hbp']].astype('float').sum(axis=1)

    # Calculate the cumulative sum within each group
    df['br_inning'] = df.groupby(['gamePk', 'inning', 'halfInning'])['br'].cumsum()
    
    # Convert to numeric
    df['inning'] = pd.to_numeric(df['inning'])
    df['outs'] = pd.to_numeric(df['outs'])
    df['rbi'] = df['rbi'].astype('int')
    
    # Number of batters faced (will be used to calculate rolling sum)
    df['faced'] = 1
    
    # Cumulative counts
    # Stats to sum
    sums_list = ['gamePk', 'pitcher'] + events_list + ['rbi', 'faced']
    # Calculate (running totals per pitcher per game, in the current row order)
    sums = df[sums_list].groupby(['gamePk', 'pitcher']).cumsum()
    # Add suffix
    sums = sums.add_suffix("_sum")
    
    # Add rolling sums
    df = pd.concat([df, sums], axis=1)
    
    # Identify if it's the bottom of the inning (a little more helpful than "top" as it's sortable)
    df['bottom'] = (df['top'] == 0).astype('int')
    
    # Sort to identify starting pitchers
    df = df.sort_values(by=['date', 'gamePk', 'bottom', 'atBatIndex'])
    
    # The starter has the lowest atBatIndex
    df['atBatIndex_min'] = df.groupby(['gamePk', 'bottom'])['atBatIndex'].transform('min')
    df['start'] = (df['atBatIndex'] == df['atBatIndex_min']).astype('int')
        
    # Identify starter throughout
    # 'start' is 1 only on the first PA of each game half, so its running sum
    # per pitcher and game stays 1 for the starter and 0 for everyone else
    df['starter'] = df.groupby(['pitcher', 'gamePk'])['start'].cumsum()
    
    # Keep only starters
    # (the columns assigned below are written onto this filtered result)
    df = df.query('starter == 1')
    
    # The starter is pulled at their highest atBatIndex
    df['atBatIndex_max'] = df.groupby(['gamePk', 'bottom'])['atBatIndex'].transform('max')
    df['pulled'] = (df['atBatIndex'] == df['atBatIndex_max']).astype('int')
    
    # Batters faced that inning
    # (reset to 0 on the PA that records the third out)
    df['faced_inning'] = df.groupby(['gamePk', 'inning', 'bottom']).cumcount()+1
    df['faced_inning'] = np.where(df['outs'] == 3, 0, df['faced_inning'])
    
    # Rolling sums stats (post-rolling sum)
    # NOTE: this list is built but not used anywhere else in this function
    rolled_sums_list = [f'{stat}_sum' for stat in events_list] + ['rbi_sum', 'faced_sum']
    
    # Outs recorded by starting pitcher 
    df['OUT'] = ((df['inning'] - 1) * 3) + df['outs']
    
    # This adjusts timing to better reflect when pitchers are pulled
    # If a pitcher is pulled after 6 innings in the data, that's the same as pulling at the top of the 7th, which more closely reflects how the sim works
    df['inning_adj'] = df['inning'] + (df['outs'] == 3).astype('int')
    df['outs_adj'] = np.where(df['outs'] == 3, 0, df['outs'])

    
    return df

In [None]:
# This creates information about starts for use in pulling pitchers
def start_data(df):
    """Add per-pitcher running totals and start/pull markers to every PA.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per plate appearance. Read here: ``b1``, ``b2``, ``b3``,
        ``hr``, ``bb``, ``hbp``, ``gamePk``, ``inning``, ``halfInning``,
        ``outs``, ``rbi``, ``pitcher``, ``top``, ``date``, ``atBatIndex`` and
        every column named in the module-level ``events_list``. ``br``,
        ``br_inning``, ``inning``, ``outs``, ``rbi`` and ``faced`` are written
        onto this frame.

    Returns
    -------
    pandas.DataFrame
        A new frame sorted by date, gamePk, bottom and atBatIndex, with
        ``<stat>_sum`` running totals per pitcher and game, ``bottom``,
        ``start``, ``starter``, ``pulled``, ``faced_inning``, ``OUT``,
        ``inning_adj`` and ``outs_adj`` added. Rows for all pitchers are kept.
    """
    # Base runners allowed on the PA (any hit, walk or hit-by-pitch),
    # and the running count within the half inning
    reached_base = ['b1', 'b2', 'b3', 'hr', 'bb', 'hbp']
    df['br'] = df[reached_base].astype('float').sum(axis=1)
    df['br_inning'] = df.groupby(['gamePk', 'inning', 'halfInning'])['br'].cumsum()

    # Convert to numeric
    for column in ('inning', 'outs'):
        df[column] = pd.to_numeric(df[column])
    df['rbi'] = df['rbi'].astype('int')

    # Number of batters faced (will be used to calculate rolling sum)
    df['faced'] = 1

    # Running totals per pitcher per game, in the current row order
    summed_stats = events_list + ['rbi', 'faced']
    running = df[['gamePk', 'pitcher'] + summed_stats].groupby(['gamePk', 'pitcher']).cumsum()
    df = pd.concat([df, running.add_suffix("_sum")], axis=1)

    # Identify if it's the bottom of the inning (a little more helpful than "top" as it's sortable)
    df['bottom'] = (df['top'] == 0).astype('int')

    # Sort to identify starting pitchers
    df = df.sort_values(by=['date', 'gamePk', 'bottom', 'atBatIndex'])

    # The starter has the lowest atBatIndex of the game half
    df['atBatIndex_min'] = df.groupby(['gamePk', 'bottom'])['atBatIndex'].transform('min')
    df['start'] = (df['atBatIndex'] == df['atBatIndex_min']).astype('int')

    # 'start' is 1 only on the first PA of each game half, so its running sum
    # per pitcher and game stays 1 for the starter and 0 for everyone else
    df['starter'] = df.groupby(['pitcher', 'gamePk'])['start'].cumsum()

    # NOTE(review): rows are not filtered to starters here, so 'pulled' marks
    # the last PA of each game half, whoever is pitching - confirm that this
    # is what downstream code expects.
    df['atBatIndex_max'] = df.groupby(['gamePk', 'bottom'])['atBatIndex'].transform('max')
    df['pulled'] = (df['atBatIndex'] == df['atBatIndex_max']).astype('int')

    inning_over = df['outs'] == 3

    # Batters faced that inning (reset to 0 on the PA that records the third out)
    df['faced_inning'] = df.groupby(['gamePk', 'inning', 'bottom']).cumcount()+1
    df['faced_inning'] = np.where(inning_over, 0, df['faced_inning'])

    # Outs recorded so far, counting three per completed inning
    df['OUT'] = ((df['inning'] - 1) * 3) + df['outs']

    # This adjusts timing to better reflect when pitchers are pulled
    # If a pitcher is pulled after 6 innings in the data, that's the same as pulling at the top of the 7th, which more closely reflects how the sim works
    df['inning_adj'] = df['inning'] + inning_over.astype('int')
    df['outs_adj'] = np.where(inning_over, 0, df['outs'])

    return df

In [None]:
def merge_datasets(start_year=2015, end_year=2024):
    """Load and combine the yearly Stats API and Statcast files.

    For every year in ``start_year..end_year`` (inclusive) the Stats API file
    is left-merged with the Statcast file on ``gamePk`` and ``atBatIndex``,
    keeping one row per at bat. Files are read from the "A02. MLB API" folder
    under the module-level ``baseball_path``.

    Parameters
    ----------
    start_year, end_year : int
        First and last season to load.

    Returns
    -------
    pandas.DataFrame
        All seasons stacked, sorted by game_date, gamePk and atBatIndex, with
        an integer ``date`` (YYYYMMDD) and ``outs_pre`` (outs coming into the
        PA) added. Index labels are unique across seasons.
    """
    api_dir = os.path.join(baseball_path, "A02. MLB API")

    # List of merged datasets
    df_list = []
    # Read in datasets
    for year in range(start_year, end_year+1):
        statsapi_df = pd.read_csv(os.path.join(api_dir, "1. Stats API", f"Stats API {year}.csv"), encoding='iso-8859-1')
        statcast_df = pd.read_csv(os.path.join(api_dir, "2. Statcast", f"Statcast {year}.csv"), encoding='iso-8859-1')

        # Merge them together
        merged_df = pd.merge(statsapi_df, statcast_df, on=['gamePk', 'atBatIndex'], how='left')

        # Drop duplicate observations
        merged_df.drop_duplicates(['gamePk', 'atBatIndex'], keep='first', inplace=True)

        # Add them to a list
        df_list.append(merged_df)

    # Create raw dataset. ignore_index gives every row its own label; without
    # it each season restarts at 0 and the combined index repeats labels,
    # which breaks any later step that aligns on the index.
    df = pd.concat(df_list, axis=0, ignore_index=True)

    # Create date variable (without dashes)
    df['date'] = df['game_date'].str.replace('-', '', regex=False)

    # Convert to numeric for sorting
    df['date'] = df['date'].astype('int')
    df['gamePk'] = df['gamePk'].astype('int')
    df['atBatIndex'] = df['atBatIndex'].astype('int')

    # Sort
    df.sort_values(['game_date', 'gamePk', 'atBatIndex'], ascending=True, inplace=True)

    # Only keep one observation per at bat
    df.drop_duplicates(['gamePk', 'atBatIndex'], keep='first', inplace=True)

    # Determine outs coming into PA: the previous PA's outs within the half
    # inning, 0 for the first PA of the half inning
    df['outs_pre'] = df.groupby(['gamePk', 'inning', 'halfInning'])['outs'].shift(fill_value=0)

    return df

In [None]:
# Creates model inputs
def create_pa_inputs(park_factors, team_map, start_year, end_year, short=50, long=300, adjust=True):
    """Build the plate-appearance model input dataset.

    Runs the raw data for ``start_year..end_year`` through weather cleaning,
    event coding, dummy creation and Statcast cleaning, optionally applies the
    park adjustments, adds start data, and attaches rolling stats over the
    ``short`` and ``long`` windows (long-window columns carry a ``_long``
    suffix). Only regular-season rows (``game_type_x == "R"``) are returned,
    sorted by date, gamePk and atBatIndex.

    ``park_factors`` and ``team_map`` are accepted but not used in this
    function. ``batter_stats_long`` and ``pitcher_stats_long`` are read from
    module level.
    """
    # Merge raw Stats API and Statcast data, then run the cleaning stages in order:
    # weather -> PA events -> dummy variables -> Statcast variables
    frame = merge_datasets(start_year, end_year)
    for stage in (clean_weather, create_events, create_dummies, clean_statcast):
        frame = stage(frame)

    # Adjust for park factors
    if adjust == True:
        frame = park_adjustments(frame)
        frame.drop(columns={'_merge'}, inplace=True)
    else:
        frame = frame.copy()

    ### TESTING
    # Add start data
    frame = start_data(frame)
    # Clean up
    frame.fillna(0, inplace=True)

    ### Rolling stats
    short_window = rolling_pas(frame, short)
    long_window = rolling_pas(frame, long).add_suffix("_long")

    # We only need the rolling stats from the long window
    long_window = long_window[batter_stats_long + pitcher_stats_long]

    # Dataset
    complete_dataset = pd.concat([short_window, long_window], axis=1)

    # Fix Guardians name to make uniform
    for team_column in ('away_name', 'home_name'):
        complete_dataset[team_column] = np.where(complete_dataset[team_column] == "Cleveland Indians", "Cleveland Guardians", complete_dataset[team_column])

    # Only keep regular season
    is_regular_season = complete_dataset['game_type_x'] == "R"
    complete_dataset = complete_dataset[is_regular_season]

    # Reset index
    complete_dataset.reset_index(drop=True, inplace=True)

    # Sort
    complete_dataset.sort_values(['date', 'gamePk', 'atBatIndex'], ascending=True, inplace=True)

    return complete_dataset

In [None]:
# Creates model inputs
def create_pull_inputs(park_factors, team_map, start_year, end_year, short=50, long=300, adjust=True):
    """Build the dataset used for modelling when pitchers are pulled.

    Runs the raw data for ``start_year..end_year`` through weather cleaning,
    event coding, dummy creation and Statcast cleaning, optionally applies the
    park adjustments, then adds start data. Missing values are filled with 0
    and the result is sorted by date, gamePk and atBatIndex.

    ``park_factors``, ``team_map``, ``short`` and ``long`` are accepted but
    not used in this function.
    """
    # Merge raw Stats API and Statcast data, then run the cleaning stages in order:
    # weather -> PA events -> dummy variables -> Statcast variables
    frame = merge_datasets(start_year, end_year)
    for stage in (clean_weather, create_events, create_dummies, clean_statcast):
        frame = stage(frame)

    # Adjust for park factors
    if adjust == True:
        frame = park_adjustments(frame)
        frame.drop(columns={'_merge'}, inplace=True)
    else:
        frame = frame.copy()

    # Add start data
    complete_dataset = start_data(frame)

    # Clean up
    complete_dataset.fillna(0, inplace=True)

    # Sort
    complete_dataset.sort_values(['date', 'gamePk', 'atBatIndex'], ascending=True, inplace=True)

    return complete_dataset

### Steamer

##### 1. Hitters

In [1]:
def clean_steamer_hitters(df):
    """Turn Steamer hitter projections into per-PA rates and steal rates.

    Parameters
    ----------
    df : pandas.DataFrame
        Steamer projections. Read here: ``1B``, ``2B``, ``3B``, ``HR``,
        ``BB``, ``HBP``, ``K``, ``PA``, ``OBP``, ``SLG``, ``wOBA``, ``SB``,
        ``CS``, ``proj_date``, ``firstname``, ``lastname``, ``mlbamid`` and
        ``steamerid``. The intermediate rate columns are written onto this
        frame.

    Returns
    -------
    pandas.DataFrame
        A new frame with lower-cased column names: identifiers, ``date``,
        ``sba_imp``, ``sbr``, ``obp`` / ``slg`` / ``woba``, the ``*_rate``
        columns (``b1_rate``, ``b2_rate``, ``b3_rate``, ``so_rate``, ...) and
        the model outputs ``sba_2b``, ``sba_3b``, ``sb_2b``, ``sb_3b``. Rows
        with any missing value and duplicate (steamerid, date) rows are dropped.
    """
    ### Hitting
    # Basic stats
    hit_list = ['1B', '2B', '3B', 'HR', 'BB', 'HBP', 'K']

    # Advanced stats: one per-PA rate for each basic stat
    rate_list = ['OBP', 'SLG', 'wOBA']
    for stat in hit_list:
        rate = stat + "_rate"
        rate_list.append(rate)
        df[rate] = df[stat] / df['PA']

    ### Base running
    # Stolen base attempts
    df['SBA'] = df['SB'] + df['CS']
    # Stolen base opportunities (times on first)
    df['SBO'] = df['1B'] + df['BB'] + df['HBP']
    # Implied stolen base attempt rate
    df['sba_imp'] = df['SBA'] / df['SBO']

    # Cap implied stolen base attempt rate
    df['sba_imp'] = np.where(df['sba_imp'] > 0.15, 0.15, df['sba_imp'])

    # Determine stolen base success rate
    df['sbr'] = df['SB'] / df['SBA']

    # Fill in missings (0 / 0 above). Assigned back rather than filled with a
    # chained inplace call, which does not reach the frame under Copy-on-Write.
    df['sbr'] = df['sbr'].fillna(0.6) # assume 25th percentile
    df['sba_imp'] = df['sba_imp'].fillna(0.05) # assume low probability

    # Date
    df['date'] = df['proj_date'].str.replace("-", "", regex=False)
    df['date'] = df['date'].astype('int')

    # Keep relevant variables. Copy so the edits below act on an independent
    # frame instead of a slice of the input.
    keep_list = ['date', 'firstname', 'lastname', 'mlbamid', 'steamerid', 'sba_imp', 'sbr'] + rate_list
    df = df[keep_list].copy()

    # Clean up
    df.columns = df.columns.str.lower()
    df.rename(columns={'1b_rate': 'b1_rate', '2b_rate': 'b2_rate', '3b_rate': 'b3_rate', 'k_rate':'so_rate'}, inplace=True)
    df.dropna(inplace=True)

    # Drop duplicates
    df.drop_duplicates(subset=['steamerid', 'date'], inplace=True)

    def _load_model(filename):
        # Read a saved model from model_path and close the file afterwards.
        # NOTE: unpickling runs arbitrary code; only load trusted model files.
        with open(os.path.join(model_path, filename), 'rb') as handle:
            return pickle.load(handle)

    # Calculate stolen base attempt and success rates by base
    df['sba_2b'] = _load_model('sba_2b_20220901.sav').predict(df[['sba_imp']])
    df['sba_3b'] = _load_model('sba_3b_20220901.sav').predict(df[['sba_imp']])
    df['sb_2b'] = _load_model('sb_2b_20220901.sav').predict(df[['sbr']])
    df['sb_3b'] = _load_model('sb_3b_20220901.sav').predict(df[['sbr']])

    return df

##### 2. Pitchers

In [None]:
def clean_steamer_pitchers(df):
    """Prepare Steamer pitcher projections for the model.

    Parameters
    ----------
    df : pandas.DataFrame
        Steamer projections. Read here: ``H``, ``IP``, ``start_IP``, ``GS``,
        ``proj_date``, ``firstname``, ``lastname``, ``mlbamid``, ``steamerid``
        and every column named in the module-level ``pitcher_stats_fg2``.
        ``H9``, ``IP_start`` and ``date`` are written onto this frame.

    Returns
    -------
    pandas.DataFrame
        A new frame with the identifier columns, an integer ``date``
        (YYYYMMDD) and the ``pitcher_stats_fg2`` columns, one row per
        (steamerid, date).
    """
    # Hits per 9 innings
    df['H9'] = df['H'] / df['IP'] * 9

    # Calculate average innings per game started. Results are assigned back
    # rather than patched with chained inplace calls, which do not reach the
    # frame under Copy-on-Write.
    innings_per_start = (df['start_IP'] / df['GS']).fillna(0)
    # Replace infinites (innings recorded with zero games started) with 3
    df['IP_start'] = innings_per_start.replace([np.inf, -np.inf], 3)

    # Date
    df['date'] = df['proj_date'].str.replace("-", "", regex=False)
    df['date'] = df['date'].astype('int')

    # Keep relevant variables. Copy so the de-duplication below acts on an
    # independent frame instead of a slice of the input.
    keep_list = ['date', 'firstname', 'lastname', 'mlbamid', 'steamerid'] + pitcher_stats_fg2
    df = df[keep_list].copy()

    # Drop duplicates
    df.drop_duplicates(subset=['steamerid', 'date'], inplace=True)

    return df