# A02. MLB API
This extracts play-by-play data from the MLB Stats API and Statcast and saves the results
- Type: Data
- Run Frequency: Once daily
- Sources:
    - MLB Stats API
    - Statcast (via pybaseball package)
- Dates:
    - Created: 9/23/2023
    - Updated: 4/21/2024

### Imports

In [1]:
if "running_pipeline" not in globals():
    %run "C:\Users\james\Documents\MLB\Code\U1. Imports.ipynb"
    %run "C:\Users\james\Documents\MLB\Code\U2. Utilities.ipynb"
    %run "C:\Users\james\Documents\MLB\Code\U3. Classes.ipynb"

### Settings

If this is not a pipeline run or a run from another notebook that sets one of the `running_*` flags (e.g. A06B. Park and Weather Factors), assign a value to year and actually create the datasets

In [1]:
# Standalone run: none of the "running_*" flags has been defined by a driver notebook,
# so pick the season here and switch dataset creation on.
if not any(flag in globals() for flag in ("running_pipeline", "running_weather", "running_base_runners", "running_steals")):
    year = 2024
    run_datasets = True

SyntaxError: invalid syntax (983997220.py, line 1)

### Functions

##### 1. MLB Stats API

Extract game information from boxscore

In [None]:
def create_box(gamePk):
    """Pull weather, wind, venue and date strings out of a game's boxscore info.

    Args:
        gamePk: Game identifier passed to ``statsapi.boxscore_data``.

    Returns:
        Tuple ``(weather, wind, venue, date, missing_weather)``. The first three
        are the ``value`` of the "Weather", "Wind" and "Venue" records (or a
        default when the record is absent); ``date`` is the label of the last
        record; ``missing_weather`` is True when no "Weather" record exists.
    """
    # Read in the boxscore's "gameBoxInfo" records as a dataframe
    box = pd.json_normalize(statsapi.boxscore_data(gamePk, timecode=None), record_path='gameBoxInfo')

    # Define default values
    default_weather = "75 degrees, Clear."
    default_wind = "0 mph, L To R."
    default_venue = "Missing Park."
    default_date = "November 30, 1993"

    # With no records at all there is no 'label' column to search, so fall back
    # to the defaults instead of failing on the column lookup.
    if 'label' not in box.columns:
        return default_weather, default_wind, default_venue, default_date, True

    labels = box['label']

    def _value_for(label, default):
        # First matching record wins, so a duplicated label cannot raise.
        matches = labels == label
        if not matches.any():
            return default
        return box.loc[matches, 'value'].iloc[0]

    # Extract weather, wind, venue, and date
    weather = _value_for("Weather", default_weather)
    wind = _value_for("Wind", default_wind)
    venue = _value_for("Venue", default_venue)

    # The last record's label is used as the date
    try:
        date = labels.iloc[-1]
    except IndexError:
        date = default_date

    missing_weather = not (labels == "Weather").any()

    return weather, wind, venue, date, missing_weather

Extract relevant data or provide default (helper function)

In [None]:
def extract_field(data, field, default=None):
    """Return ``data[field]``, or ``default`` when the lookup is not possible.

    Args:
        data: Any subscriptable object (typically a dict from the API response).
        field: Key or index to look up.
        default: Value returned when the key/index is missing or ``data``
            cannot be subscripted (e.g. it is None).

    Returns:
        The looked-up value, or ``default``.
    """
    try:
        return data[field]
    # Only lookup failures mean "field missing"; a bare except would also hide
    # KeyboardInterrupt/SystemExit.
    except (LookupError, TypeError):
        return default

Extract play-by-play data

In [None]:
def create_game(gamePk):
    """Build a play-by-play dataframe for one game, one row per runner movement.

    Each play in the ``game_playByPlay`` response contributes one row for every
    entry in its ``runners`` list, so a play with an empty ``runners`` list
    contributes no rows. Boxscore weather, wind, venue and date from
    ``create_box`` are added as constant columns.

    Args:
        gamePk: Game identifier used for both API calls.

    Returns:
        DataFrame with the play, matchup and runner columns listed below plus
        ``gamePk``, ``weather``, ``wind``, ``venue`` and ``date``.
    """
    game = statsapi.get('game_playByPlay', {'gamePk': gamePk})

    # Create list with relevant variables
    game_data = []
    for play in game['allPlays']:
        about = play['about']
        count = play['count']
        result = play['result']
        matchup = play['matchup']
        runners = play['runners']

        atBatIndex = about['atBatIndex']
        inning = about['inning']
        halfInning = about['halfInning']
        outs = count['outs']

        # Local is named play_type so the builtin `type` is not shadowed;
        # the output column is still called 'type'.
        play_type = extract_field(result, 'type')
        event = extract_field(result, 'event')
        eventType = extract_field(result, 'eventType')
        description = extract_field(result, 'description')
        rbi = extract_field(result, 'rbi', 0)
        awayScore = extract_field(result, 'awayScore', 0)
        homeScore = extract_field(result, 'homeScore', 0)

        batter = extract_field(matchup['batter'], 'id', 999999)
        batterName = extract_field(matchup['batter'], 'fullName', 'Missing Name')
        batSide = extract_field(matchup['batSide'], 'code', 'R')
        pitcher = extract_field(matchup['pitcher'], 'id', 999999)
        pitcherName = extract_field(matchup['pitcher'], 'fullName', 'Missing Name')
        pitchHand = extract_field(matchup['pitchHand'], 'code', 'R')

        # Baserunner on base at the end of the play
        postOnFirst = extract_field(matchup, 'postOnFirst', None)
        postOnSecond = extract_field(matchup, 'postOnSecond', None)
        postOnThird = extract_field(matchup, 'postOnThird', None)

        # Extract base runner information
        for runner in runners:
            details = runner['details']
            movement = runner['movement']

            runner_id = details['runner']['id']
            start = movement['start']
            end = movement['end']
            movementReason = details['movementReason']
            isScoringEvent = details['isScoringEvent']
            earned = details['earned']

            # The runner id is stored twice: under 'id' and under 'runner_id'
            game_data.append([atBatIndex, inning, halfInning, outs, play_type, runner_id, event, eventType, description,
                              rbi, awayScore, homeScore, batter, batterName, batSide, pitcher, pitcherName, pitchHand,
                              postOnFirst, postOnSecond, postOnThird, runner_id, start, end, movementReason, isScoringEvent, earned])

    # Create dataframe
    df = pd.DataFrame(game_data, columns=['atBatIndex', 'inning', 'halfInning', 'outs', 'type', 'id', 'event', 'eventType', 'description',
                                          'rbi', 'awayScore', 'homeScore', 'batter', 'batterName', 'batSide', 'pitcher',
                                          'pitcherName', 'pitchHand', 'postOnFirst', 'postOnSecond', 'postOnThird', 'runner_id', 'start', 'end', 'movementReason', 'isScoringEvent', 'earned'])

    # Create weather variables (the missing-weather flag is not stored)
    weather, wind, venue, date, _ = create_box(gamePk)
    df['gamePk'] = gamePk
    df['weather'] = weather
    df['wind'] = wind
    df['venue'] = venue
    df['date'] = date

    return df

Extract API data

In [None]:
def plays_statsapi(start_date, end_date):
    """Download play-by-play data for every scheduled game in a date range.

    Args:
        start_date: First schedule date, passed through to ``statsapi.schedule``.
        end_date: Last schedule date, passed through to ``statsapi.schedule``.

    Returns:
        One DataFrame with the ``create_game`` output of every game stacked
        together, plus ``away_name``, ``home_name``, ``game_date``,
        ``game_type`` and ``venue_id`` copied from the schedule entry.

    Note:
        ``pd.concat`` is called on the list of per-game frames, so an empty
        schedule is expected to raise rather than return an empty frame.
    """
    # Read in schedule
    games = statsapi.schedule(start_date=start_date, end_date=end_date)

    # Run all in parallel; results come back in the same order as `games`
    df_list = Parallel(n_jobs=-1, verbose=0)(delayed(create_game)(gamePk=game['game_id']) for game in games)

    # Add additional information from schedule
    schedule_fields = ['away_name', 'home_name', 'game_date', 'game_type', 'venue_id']
    for game_df, game in zip(df_list, games):
        for field in schedule_fields:
            game_df[field] = game[field]

    # Append all dataframes together
    return pd.concat(df_list, axis=0)

##### 2. Statcast

Extract Statcast data

In [None]:
def plays_statcast(start_date, end_date):
    """Download Statcast pitches and reduce them to one row per at bat.

    Args:
        start_date: First date, passed through to ``statcast``.
        end_date: Last date, passed through to ``statcast``.

    Returns:
        DataFrame holding the last pitch (highest ``pitch_number``) of every
        ``gamePk``/``atBatIndex`` pair, restricted to the columns in
        ``keep_list``. ``maxSpeed`` and ``maxSpin`` are maxima over all
        pitches of the at bat.
    """
    # Use pybaseball to read in Statcast data
    data = statcast(start_date, end_date)

    # Create atBatIndex compatible with Statsapi (at_bat_number shifted down by one)
    data['atBatIndex'] = data['at_bat_number'] - 1

    # Highest level during the at bat. The string "max" replaces the builtin
    # callable, which pandas deprecates as a transform argument.
    data['maxSpeed'] = data.groupby(['game_pk', 'atBatIndex'])['effective_speed'].transform('max')
    data['maxSpin'] = data.groupby(['game_pk', 'atBatIndex'])['release_spin_rate'].transform('max')

    # Convert to numeric for sorting
    for column in ('game_pk', 'atBatIndex', 'pitch_number'):
        data[column] = data[column].astype('int')

    # Only want the deciding (last) pitch
    data = data.sort_values(['game_pk', 'atBatIndex', 'pitch_number'])
    data = data.drop_duplicates(['game_pk', 'atBatIndex'], keep='last')

    data = data.rename(columns={'game_pk':'gamePk'})

    # Keep relevant variables
    keep_list = ['gamePk', 'atBatIndex', 'pitch_number', 'pitch_name', 'game_type',
                 'hc_x', 'hc_y', 'hit_location', 'hit_distance_sc', 'launch_speed', 'launch_angle', 'launch_speed_angle',
                 'woba_value', 'woba_denom', 'estimated_ba_using_speedangle', 'estimated_woba_using_speedangle',
                 'iso_value', 'babip_value',
                 'maxSpeed', 'maxSpin']

    return data[keep_list]

##### 3. Dataset 

##### Weather

Note: 2 is to centerfield, 6 is from centerfield, clockwise (may not be relevant here)

Calculate wind vectors

y-vector: positive to centerfield, negative from centerfield

In [None]:
def y_vect(df):
    """Return the wind component along the home-plate/centerfield axis.

    Positive values blow out toward centerfield, negative values blow in.
    ``df`` is a single row (anything indexable by 'windSpeed' and
    'windDirection'). Corner-field directions contribute speed * sqrt(2) / 2;
    cross winds and unrecognised directions contribute 0.
    """
    speed = df['windSpeed']
    diagonal = df['windSpeed'] / 2 * math.sqrt(2)

    # Component for every direction that has a non-zero y part
    component_by_direction = {
        "Out To CF": speed,
        "Out To RF": diagonal,
        "Out To LF": diagonal,
        "In From CF": speed * - 1,
        "In From LF": diagonal * -1,
        "In From RF": diagonal * -1,
    }

    # "L To R", "R To L" and anything unknown have no y component
    return component_by_direction.get(df['windDirection'], 0)

x-vector: positive to right, negative to left

In [None]:
def x_vect(df):
    """Return the wind component across the field.

    Positive values blow from left to right, negative values from right to
    left. ``df`` is a single row (anything indexable by 'windSpeed' and
    'windDirection'). Corner-field directions contribute speed * sqrt(2) / 2;
    straight in/out winds and unrecognised directions contribute 0.
    """
    speed = df['windSpeed']
    diagonal = df['windSpeed'] / 2 * math.sqrt(2)

    # Component for every direction that has a non-zero x part
    component_by_direction = {
        "L To R": speed,
        "In From LF": diagonal,
        "Out To RF": diagonal,
        "R To L": speed * - 1,
        "In From RF": diagonal * -1,
        "Out To LF": diagonal * -1,
    }

    # "In From CF", "Out To CF" and anything unknown have no x component
    return component_by_direction.get(df['windDirection'], 0)

Create clean weather dataframe

In [None]:
def clean_weather(df):
    """Split the raw weather and wind strings into numeric/categorical columns.

    The frame is modified in place and also returned.

    Args:
        df: DataFrame with string columns ``weather`` (e.g. "75 degrees, Clear.")
            and ``wind`` (e.g. "0 mph, L To R.").

    Returns:
        The same frame with ``temperature`` (int), ``weather`` (text after the
        first ", "), ``windSpeed`` (numeric, 0 when unparseable),
        ``windDirection`` (trailing "." removed), ``x_vect`` and ``y_vect``.
    """
    # Separate weather into temperature and weather type. n=1 plus reindex
    # guarantees exactly two parts even if the separator is missing or repeated.
    weather_parts = df['weather'].str.split(", ", n=1, expand=True).reindex(columns=[0, 1])
    df['temperature'] = weather_parts[0].str.replace(" degrees", "", regex=False).astype('int')
    df['weather'] = weather_parts[1]

    # Separate wind into speed and direction
    wind_parts = df['wind'].str.split(", ", n=1, expand=True).reindex(columns=[0, 1])
    wind_speed = wind_parts[0].fillna("0 mph").str.replace(" mph", "", regex=False)
    df['windSpeed'] = pd.to_numeric(wind_speed, errors='coerce').fillna(0)
    # Fill value is spelled "L To R" so it matches the label x_vect/y_vect test for.
    # regex=False makes "." a literal period on every pandas version.
    df['windDirection'] = wind_parts[1].fillna('L To R').str.replace(".", "", regex=False)

    # Calculate vectors
    df['x_vect'] = df.apply(x_vect, axis=1)
    df['y_vect'] = df.apply(y_vect, axis=1)

    ### TESTING:
    # Set temperature to 70 degrees if it's a dome or the roof is closed.
    # Missing weather text counts as "not indoors" instead of raising.
    indoors = df['weather'].fillna("").astype(str).str.contains('Roof|Dome')
    df.loc[indoors, 'temperature'] = 70

    return df

##### Model Inputs

Categorize API events into model events

In [None]:
def create_events(df):
    """Add ``eventsModel``: the model's event code for each API ``event``.

    API events that belong to no model category (including missing values)
    are labelled 'Cut'. The frame is modified in place and returned.
    """
    # Model code -> API event names that collapse into it
    api_events_by_code = {
        'so': ('Strikeout', 'Strikeout Double Play'),
        'go': ('Groundout', 'Fielders Choice', 'Fielders Choice Out', 'Double Play',
               'Grounded Into DP', 'Triple Play', 'Field Error', 'Forceout',
               'Sac Bunt', 'Sac Bunt Double Play', 'Bunt Groundout'),
        'lo': ('Lineout', 'Bunt Lineout'),
        'fo': ('Flyout', 'Sac Fly', 'Sac Fly Double Play'),
        'po': ('Pop Out', 'Bunt Pop Out'),
        'hbp': ('Hit By Pitch',),
        'bb': ('Walk', 'Intent Walk'),
        'b1': ('Single',),
        'b2': ('Double',),
        'b3': ('Triple',),
        'hr': ('Home Run',),
    }

    # Invert into the API event -> model code lookup used by map()
    code_by_api_event = {
        api_event: code
        for code, api_events in api_events_by_code.items()
        for api_event in api_events
    }

    # Anything unmapped is marked to be cut
    model_events = df['event'].map(code_by_api_event)
    df['eventsModel'] = model_events.fillna('Cut')

    return df

Create dummy variables from events, venues, handedness, and bases (also generates and cleans some other simple variables)

In [None]:
def create_dummies(df):
    """Add dummy and game-state columns used as model inputs.

    Adds event/handedness dummies, ``year``, the starting-pitcher flag, a
    numeric ``date``, runners-on-base flags taken from the previous play of
    the same half inning, pre-play scores, ``pa`` and ``ab``. ``year`` is
    written onto the frame that was passed in; everything else lives on the
    returned (concatenated) frame, sorted by date, game and at bat.
    """
    # Years
    df['year'] = df['game_date'].str[:4].astype(int)

    # Events and hands (pitcher dummies prefixed "p", batter dummies "b")
    dummy_frames = [
        pd.get_dummies(df['eventsModel']),
        pd.get_dummies(df['pitchHand'], prefix='p'),
        pd.get_dummies(df['batSide'], prefix='b'),
    ]
    df = pd.concat([df] + dummy_frames, axis=1)

    # Starting pitcher = first pitcher listed for the game's half inning side
    first_pitcher = df.groupby(['gamePk', 'halfInning'])['pitcherName'].transform('first')
    df['startingPitcher'] = first_pitcher
    df['starter'] = (df['startingPitcher'] == df['pitcherName']).astype('int')

    # Compatible date variable and numeric sort keys
    df['date'] = df['game_date'].str.replace('-', '').astype('int')
    for key in ('gamePk', 'atBatIndex'):
        df[key] = df[key].astype('int')

    sort_keys = ['date', 'gamePk', 'atBatIndex']
    df = df.sort_values(sort_keys, ascending=True)

    # Runners on base coming into the play = runners left on by the previous play
    half_inning_keys = ['gamePk', 'inning', 'halfInning']
    bases = ('First', 'Second', 'Third')
    for base in bases:
        df[f'preOn{base}'] = df.groupby(half_inning_keys)[f'postOn{base}'].shift(1)

    def occupied(value):
        # Occupied bases are stored as text containing 'id'
        return 1 if isinstance(value, str) and 'id' in value else 0

    for base in bases:
        df[f'on{base}'] = df[f'preOn{base}'].apply(occupied)

    # Top of the inning dummy
    is_top = df['halfInning'] == "top"
    df['top'] = np.where(is_top, 1, 0)

    # Scores after the play (numeric) and before it (0 for a game's first row)
    for side in ('away', 'home'):
        df[f'{side}Score'] = df[f'{side}Score'].astype('int')
    for side in ('Away', 'Home'):
        previous = df.groupby(['gamePk'])[f'{side.lower()}Score'].shift(1)
        df[f'pre{side}Score'] = previous
    for side in ('Away', 'Home'):
        df[f'pre{side}Score'] = df[f'pre{side}Score'].fillna(0)

    # Differential from the batting team's point of view
    away_lead = df['preAwayScore'] - df['preHomeScore']
    home_lead = df['preHomeScore'] - df['preAwayScore']
    df['score_diff'] = np.where(is_top, away_lead, home_lead)

    # Hitter and pitcher scores: the away team bats in the top half
    df['batterScore'] = np.where(is_top, df['awayScore'], df['homeScore'])
    df['pitcherScore'] = np.where(is_top, df['homeScore'], df['awayScore'])
    df['preBatterScore'] = np.where(is_top, df['preAwayScore'], df['preHomeScore'])
    df['prePitcherScore'] = np.where(is_top, df['preHomeScore'], df['preAwayScore'])

    # PAs are every non-cut event; ABs exclude hit-by-pitches and walks
    df['pa'] = np.where(df['eventsModel'] != "Cut", 1, 0)
    df['ab'] = df['pa'] - df['hbp'] - df['bb']

    # Sort
    df = df.sort_values(sort_keys)

    return df

Create variables from Statcast data

In [None]:
def clean_statcast(df):
    """Add batted-ball flags derived from the Statcast columns.

    Converts ``launch_speed``, ``launch_speed_angle``, ``hc_x`` and ``hc_y``
    to numeric (unparseable values become NaN), then adds ``hard_hit``,
    ``barrel``, ``spray_angle`` and the three spray-direction dummies. The
    frame is modified in place and returned.
    """
    # Convert variables to numeric
    for column in ('launch_speed', 'launch_speed_angle', 'hc_x', 'hc_y'):
        df[column] = pd.to_numeric(df[column], errors='coerce')

    # Hard hit: launch speed of at least 95
    df['hard_hit'] = df['launch_speed'].ge(95).astype('int')

    # Barrel: launch_speed_angle category 6
    df['barrel'] = df['launch_speed_angle'].eq(6).astype('int')

    # Spray angle from hit coordinates, relative to the point (125.42, 198.27)
    horizontal = df['hc_x'] - 125.42
    depth = 198.27 - df['hc_y']
    spray = np.arctan(horizontal / depth) * 180 / np.pi * 0.75
    df['spray_angle'] = spray

    # Split the field at +/- 15 (both bounds count as middle)
    df['to_left'] = spray.lt(-15).astype('int')
    df['to_middle'] = spray.between(-15, 15).astype('int')
    df['to_right'] = spray.gt(15).astype('int')

    return df

Adjust for park factors

In [None]:
def park_adjustments(df):
    """Divide each event column by its park multiplier for the batter's side.

    Multipliers are read from "Multiplier Dataset.csv" and joined on
    ``gamePk``; the merge indicator column ``_merge`` is left on the result.
    Games without multipliers use 1, and missing league-average columns are
    forward filled.

    NOTE(review): a later cell defines another ``park_adjustments`` that takes
    a second argument; when both cells run, that one replaces this function.
    """
    # Read in park factors
    factors = pd.read_csv(os.path.join(baseball_path, "Multiplier Dataset.csv"))

    # Convert to numeric for merging
    factors['gamePk'] = factors['gamePk'].astype(int)
    df['gamePk'] = df['gamePk'].astype('int')

    # Merge with park factors
    multiplier_columns = [name for name in factors.columns if "mult" in name]
    league_avg_columns = [name for name in factors.columns if "league" in name]
    wanted = ["gamePk"] + multiplier_columns + league_avg_columns
    df = df.merge(factors[wanted], on=['gamePk'], how='left', indicator=True)

    # Missings (old parks, other parks): neutral multipliers, most recent league averages
    df[multiplier_columns] = df[multiplier_columns].fillna(1)
    df[league_avg_columns] = df[league_avg_columns].ffill()

    # Adjust every event by the multiplier matching the batter's side
    bats_left = df['batSide'] == "L"
    for event in events_list:
        raw = df[event].astype(float)
        adjusted_l = raw / df[f'{event}_mult_l'].astype(float)
        adjusted_r = raw / df[f'{event}_mult_r'].astype(float)
        df[event] = np.where(bats_left, adjusted_l, adjusted_r)

    return df

In [None]:
# New park adjustments
def park_adjustments(df, multiplier_df):
    """Divide each event column by its "pfx" park factor for the batter's side.

    Args:
        df: Plate-appearance frame with ``gamePk``, ``batSide`` and one column
            per entry of ``events_list``.
        multiplier_df: Frame with ``gamePk`` and, for every event, the columns
            ``{event}_pfx_l`` and ``{event}_pfx_r``.

    Returns:
        ``df`` left-merged with the pfx columns, with every event column
        divided by the factor matching ``batSide`` ("L" uses ``_pfx_l``,
        anything else ``_pfx_r``). Missing factors are not filled, so games
        absent from ``multiplier_df`` end up with NaN event values.

    NOTE(review): this definition replaces the earlier one-argument
    ``park_adjustments`` and does not add a ``_merge`` column. Callers that
    still pass a single argument need updating.
    """
    # Merge with park factors (filter on the comprehension variable itself;
    # the previous version tested an undefined name)
    pfx_columns = [col for col in multiplier_df.columns if "pfx" in col]
    df = df.merge(multiplier_df[['gamePk'] + pfx_columns], on=['gamePk'], how='left')

    # Loop over events
    bats_left = df['batSide'] == "L"
    for event in events_list:
        # Adjust based on calculated multiplier
        raw = df[event].astype(float)
        df[event] = np.where(bats_left, raw / df[f'{event}_pfx_l'].astype(float), raw / df[f'{event}_pfx_r'].astype(float))

    return df

In [None]:
# This will return a dataframe that can eventually be used as the model input. Has pitcher vs hitter stats, specific to hand
def rolling_pas(df, pa_num):
    """Add rolling batter and pitcher stats over the previous ``pa_num`` PAs.

    Works on a copy of ``df``. Batter stats are grouped by batter and pitcher
    hand, pitcher stats by pitcher and batter side; every window is shifted by
    one row so the current plate appearance is excluded. Also adds the
    imputation flags (fewer than 40 prior PAs in the window) and wOBA, SLG,
    OBP and ISO built from the rolling rates.
    """
    df_copy = df.copy()

    # Note: batter_avg_short will work even when pa_num refers to the "long" period. Suffix will be added in post.
    # Rename for compatibility purposes
    df_copy = df_copy.rename(columns={'hit_distance_sc':'totalDistance', 'launch_speed':'launchSpeed'})

    # Non-numeric stat columns are coerced to numbers, with failures set to 0
    for col in avg_list + max_list:
        if pd.api.types.is_numeric_dtype(df_copy[col]):
            continue
        df_copy[col] = pd.to_numeric(df_copy[col], errors='coerce')
        df_copy[col] = df_copy[col].fillna(0)

    # Sort
    sort_keys = ['date', 'gamePk', 'atBatIndex']
    df_copy = df_copy.sort_values(sort_keys, ascending=True)

    # Data types may vary. This makes grouping impossible.
    for player in ('batter', 'pitcher'):
        df_copy[player] = df_copy[player].astype('int')

    def prior_window(keys, columns, reducer):
        # Rolling summary of the rows before the current one within each group
        def summarize(series):
            window = series.shift().rolling(pa_num, min_periods=1)
            return getattr(window, reducer)()
        return df_copy.groupby(keys)[columns].transform(summarize)

    ### Batter stats (average, maximum, sum)
    batter_keys = ['batter', 'pitchHand']
    df_copy[batter_avg_short] = prior_window(batter_keys, avg_list, 'mean')
    df_copy[batter_max_short] = prior_window(batter_keys, max_list, 'max')
    df_copy[['ab_b', 'pa_b']] = prior_window(batter_keys, ['ab', 'pa'], 'sum')

    ### Pitcher stats (average, maximum, sum)
    pitcher_keys = ['pitcher', 'batSide']
    df_copy[pitcher_avg_short] = prior_window(pitcher_keys, avg_list, 'mean')
    df_copy[pitcher_max_short] = prior_window(pitcher_keys, max_list, 'max')
    df_copy[['ab_p', 'pa_p']] = prior_window(pitcher_keys, ['ab', 'pa'], 'sum')

    # Create imputation flags (these observations will have imputed inputs)
    for side in ('b', 'p'):
        df_copy[f'imp_{side}'] = (df_copy[f'pa_{side}'] < 40).astype('int')

    # Create compatible date variable and numeric sort keys
    df_copy['date'] = df_copy['game_date'].str.replace('-', '').astype('int')
    for key in ('gamePk', 'atBatIndex', 'batter', 'pitcher'):
        df_copy[key] = df_copy[key].astype('int')

    # Sort
    df_copy = df_copy.sort_values(sort_keys, ascending=True)

    ### Advanced stats
    sides = ('b', 'p')

    def rate(event, side):
        # Rolling rate column for one event and one side ("b" batter, "p" pitcher)
        return df_copy[f'{event}_{side}']

    # wOBA - using 2022 values throughout
    for side in sides:
        df_copy[f'woba_{side}'] = (0.690 * rate('bb', side)) + (0.721 * rate('hbp', side)) + (0.885 * rate('b1', side)) + (1.262 * rate('b2', side)) + (1.601 * rate('b3', side)) + (2.070 * rate('hr', side))

    # Slugging (rescaled from per-PA to per-AB by removing walks and HBPs)
    for side in sides:
        total_bases = (1 * rate('b1', side)) + (2 * rate('b2', side)) + (3 * rate('b3', side)) + (4 * rate('hr', side))
        df_copy[f'slg_{side}'] = total_bases * (1 / (1-(rate('bb', side) + rate('hbp', side))))

    # OBP
    for side in sides:
        on_base_columns = [f'{event}_{side}' for event in ('b1', 'b2', 'b3', 'hr', 'bb', 'hbp')]
        df_copy[f'obp_{side}'] = df_copy[on_base_columns].sum(axis=1)

    # ISO
    for side in sides:
        extra_bases = rate('b2', side) * 1 + rate('b3', side) * 2 + rate('hr', side) * 3
        df_copy[f'iso_{side}'] = extra_bases * (1 / (1-(rate('bb', side) + rate('hbp', side))))

    return df_copy

In [None]:
# This creates information about starts for use in pulling pitchers
def start_data(df):
    """Build per-plate-appearance rows describing each starting pitcher's outing.

    Adds base-runner counts, cumulative per-pitcher totals (``*_sum``) and
    inning/out bookkeeping, then keeps only rows thrown by the pitcher who
    faced the first batter of each side of a game.

    Args:
        df: Plate-appearance frame with event dummy columns (``events_list``),
            ``gamePk``, ``pitcher``, ``inning``, ``halfInning``, ``outs``,
            ``rbi``, ``top``, ``date`` and ``atBatIndex``. Several helper
            columns are written onto this frame before it is filtered.

    Returns:
        A new frame restricted to starters' rows, with ``pulled`` marking the
        starter's last plate appearance.
    """
    # Base runners allowed on this PA
    df['br'] = df[['b1', 'b2', 'b3', 'hr', 'bb', 'hbp']].astype('float').sum(axis=1)

    # Calculate the cumulative sum within each half inning
    df['br_inning'] = df.groupby(['gamePk', 'inning', 'halfInning'])['br'].cumsum()

    # Convert to numeric
    df['inning'] = pd.to_numeric(df['inning'])
    df['outs'] = pd.to_numeric(df['outs'])
    df['rbi'] = df['rbi'].astype('int')

    # Number of batters faced (will be used to calculate rolling sum)
    df['faced'] = 1

    # Cumulative counts per pitcher and game
    sums_list = ['gamePk', 'pitcher'] + events_list + ['rbi', 'faced']
    sums = df[sums_list].groupby(['gamePk', 'pitcher']).cumsum()
    sums = sums.add_suffix("_sum")

    # Add rolling sums
    df = pd.concat([df, sums], axis=1)

    # Identify if it's the bottom of the inning (a little more helpful than "top" as it's sortable)
    df['bottom'] = (df['top'] == 0).astype('int')

    # Sort to identify starting pitchers
    df = df.sort_values(by=['date', 'gamePk', 'bottom', 'atBatIndex'])

    # The starter has the lowest atBatIndex
    df['atBatIndex_min'] = df.groupby(['gamePk', 'bottom'])['atBatIndex'].transform('min')
    df['start'] = (df['atBatIndex'] == df['atBatIndex_min']).astype('int')

    # Identify starter throughout: the running total is 1 on every row of the
    # pitcher whose first row of the game is the start
    df['starter'] = df.groupby(['pitcher', 'gamePk'])['start'].cumsum()

    # Keep only starters; copy so the assignments below write to an independent
    # frame rather than a slice of the filtered one
    df = df.query('starter == 1').copy()

    # The starter is pulled at their highest atBatIndex
    df['atBatIndex_max'] = df.groupby(['gamePk', 'bottom'])['atBatIndex'].transform('max')
    df['pulled'] = (df['atBatIndex'] == df['atBatIndex_max']).astype('int')

    # Batters faced that inning (reset to 0 once the third out is recorded)
    df['faced_inning'] = df.groupby(['gamePk', 'inning', 'bottom']).cumcount()+1
    df['faced_inning'] = np.where(df['outs'] == 3, 0, df['faced_inning'])

    # Outs recorded by starting pitcher
    df['OUT'] = ((df['inning'] - 1) * 3) + df['outs']

    # This adjusts timing to better reflect when pitchers are pulled
    # If a pitcher is pulled after 6 innings in the data, that's the same as pulling at the top of the 7th, which more closely reflects how the sim works
    df['inning_adj'] = df['inning'] + (df['outs'] == 3).astype('int')
    df['outs_adj'] = np.where(df['outs'] == 3, 0, df['outs'])

    return df

In [None]:
def merge_datasets(start_year=2015, end_year=2024):
    """Combine the saved Stats API and Statcast files for a range of seasons.

    For every year from ``start_year`` through ``end_year`` the two CSVs are
    left-joined on ``gamePk``/``atBatIndex`` and reduced to one row per at
    bat. The yearly frames are stacked, given a numeric ``date``, sorted, and
    given ``outs_pre`` (the previous row's ``outs`` within the half inning, 0
    for its first row).
    """
    at_bat_keys = ['gamePk', 'atBatIndex']

    # One merged frame per season
    yearly_frames = []
    for year in range(start_year, end_year+1):
        statsapi_path = os.path.join(baseball_path, "A02. MLB API", "1. Stats API", f"Stats API {year}.csv")
        statcast_path = os.path.join(baseball_path, "A02. MLB API", "2. Statcast", f"Statcast {year}.csv")
        statsapi_df = pd.read_csv(statsapi_path, encoding='iso-8859-1')
        statcast_df = pd.read_csv(statcast_path, encoding='iso-8859-1')

        # Attach Statcast columns to the Stats API rows, first row per at bat wins
        merged_df = statsapi_df.merge(statcast_df, on=at_bat_keys, how='left')
        merged_df = merged_df.drop_duplicates(at_bat_keys, keep='first')

        yearly_frames.append(merged_df)

    # Create raw dataset
    df = pd.concat(yearly_frames, axis=0)

    # Create date variable (without dashes) and numeric keys for sorting
    df['date'] = df['game_date'].str.replace('-', '').astype('int')
    for key in at_bat_keys:
        df[key] = df[key].astype('int')

    # Sort
    df = df.sort_values(['game_date', 'gamePk', 'atBatIndex'], ascending=True)

    # Only keep one observation per at bat
    df = df.drop_duplicates(at_bat_keys, keep='first')

    # Determine outs coming into PA
    half_inning = df.groupby(['gamePk', 'inning', 'halfInning'])
    df['outs_pre'] = half_inning['outs'].shift(fill_value=0)

    return df

In [None]:
# Creates model inputs
def create_pa_inputs(park_factors, team_map, start_year, end_year, short=50, long=300, adjust=True):
    """Build the plate-appearance model input dataset.

    Args:
        park_factors: Accepted for interface compatibility; not used here.
        team_map: Accepted for interface compatibility; not used here.
        start_year: First season passed to ``merge_datasets``.
        end_year: Last season passed to ``merge_datasets``.
        short: Window (in PAs) for the short rolling stats.
        long: Window (in PAs) for the long rolling stats (suffixed "_long").
        adjust: When true, apply ``park_adjustments`` before rolling.

    Returns:
        Regular-season rows with cleaned inputs, short rolling stats and the
        long rolling stat columns, sorted by date, game and at bat.
    """
    # Merge together raw Stats API and Statcast data
    df = merge_datasets(start_year, end_year)
    # Clean weather
    df = clean_weather(df)
    # Create PA events
    df = create_events(df)
    # Create dummy variables
    df = create_dummies(df)
    # Create Statcast variables
    df = clean_statcast(df)
    # Adjust for park factors
    if adjust:
        # NOTE(review): a later definition of park_adjustments in this file takes
        # (df, multiplier_df); this one-argument call only works with the earlier one.
        df = park_adjustments(df)
        # Only the one-argument version adds the merge indicator
        df = df.drop(columns=['_merge'], errors='ignore')
    else:
        df = df.copy()
    # Clean up (reassign instead of filling a filtered slice in place)
    df = df.query('eventsModel != "Cut"').fillna(0)

    ### Rolling stats
    # Short
    df_short = rolling_pas(df, short)
    # Long
    df_long = rolling_pas(df, long).add_suffix("_long")

    # We only need the rolling stats
    long_stats = batter_stats_long + pitcher_stats_long
    df_long = df_long[long_stats]

    # Dataset
    complete_dataset = pd.concat([df_short, df_long], axis=1)

    # Fix Guardians name to make uniform
    for side in ('away_name', 'home_name'):
        complete_dataset[side] = np.where(complete_dataset[side] == "Cleveland Indians", "Cleveland Guardians", complete_dataset[side])

    # Only keep regular season
    complete_dataset = complete_dataset[complete_dataset['game_type_x'] == "R"]

    # Reset index
    complete_dataset = complete_dataset.reset_index(drop=True)

    # Sort
    complete_dataset = complete_dataset.sort_values(['date', 'gamePk', 'atBatIndex'], ascending=True)

    return complete_dataset

In [None]:
# Creates model inputs
def create_pull_inputs(park_factors, team_map, start_year, end_year, short=50, long=300, adjust=True):
    """Build the dataset used to model when starting pitchers are pulled.

    Args:
        park_factors: Accepted for interface compatibility; not used here.
        team_map: Accepted for interface compatibility; not used here.
        start_year: First season passed to ``merge_datasets``.
        end_year: Last season passed to ``merge_datasets``.
        short: Accepted for interface compatibility; not used here.
        long: Accepted for interface compatibility; not used here.
        adjust: When true, apply ``park_adjustments`` before ``start_data``.

    Returns:
        The ``start_data`` output with missing values set to 0, sorted by
        date, game and at bat.
    """
    # Merge together raw Stats API and Statcast data
    df = merge_datasets(start_year, end_year)
    # Clean weather
    df = clean_weather(df)
    # Create PA events
    df = create_events(df)
    # Create dummy variables
    df = create_dummies(df)
    # Create Statcast variables
    df = clean_statcast(df)
    # Adjust for park factors
    if adjust:
        # NOTE(review): a later definition of park_adjustments in this file takes
        # (df, multiplier_df); this one-argument call only works with the earlier one.
        df = park_adjustments(df)
        # Only the one-argument version adds the merge indicator
        df = df.drop(columns=['_merge'], errors='ignore')
    else:
        df = df.copy()
    # Add start data
    complete_dataset = start_data(df)

    # Clean up (reassign instead of filling a filtered frame in place)
    complete_dataset = complete_dataset.fillna(0)

    # Sort
    complete_dataset = complete_dataset.sort_values(['date', 'gamePk', 'atBatIndex'], ascending=True)

    return complete_dataset

### Run

##### 1. Stats API 

In [None]:
# Pull the season's Stats API play-by-play (March 20 through November 15) and save it
if run_datasets == True:
    statsapi_df = plays_statsapi(f"03/20/{year}", f"11/15/{year}")
    statsapi_df.to_csv(
        os.path.join(baseball_path, "A02. MLB API", "1. Stats API", f"Stats API {year}.csv"),
        index=False,
        encoding='iso-8859-1',
    )

    # Drop the reference once the file is written
    del statsapi_df

##### 2. Statcast

In [None]:
# Pull the season's Statcast data (March 20 through November 15) and save it
if run_datasets == True:
    statcast_df = plays_statcast(f"{year}-03-20", f"{year}-11-15")
    # Name the file after the season being extracted, matching the Stats API cell
    # and the f"Statcast {year}.csv" name merge_datasets reads. It was hard-coded to 2024.
    statcast_df.to_csv(os.path.join(baseball_path, "A02. MLB API", "2. Statcast", f"Statcast {year}.csv"), index=False, encoding='iso-8859-1')

    # Drop the reference once the file is written
    del statcast_df