### Imports

In [1]:
# Run the shared setup notebooks (imports, utilities, classes, datasets, models)
# only when no `running_pipeline` name exists in the global namespace.
# NOTE(review): presumably a driver notebook defines `running_pipeline` after
# running these itself — confirm against the pipeline notebook.
# NOTE(review): the paths below are absolute and user-specific, so this cell
# only works on the original author's machine.
if "running_pipeline" not in globals():
    print("Running imports...")
    %run "C:\Users\james\Documents\MLB\Code\U1. Imports.ipynb"
    %run "C:\Users\james\Documents\MLB\Code\U2. Utilities.ipynb"
    %run "C:\Users\james\Documents\MLB\Code\U3. Classes.ipynb"
    %run "C:\Users\james\Documents\MLB\Code\U4. Datasets.ipynb"
    %run "C:\Users\james\Documents\MLB\Code\U5. Models.ipynb"
    print("Imports in.")
else:
    print("Imports already in.")

Running imports...
Imports in.


### Settings

In [2]:
# Standalone default: when no `running_pipeline` name is defined, do not
# rebuild the complete dataset unless a later cell says otherwise.
if "running_pipeline" not in globals():
    write_complete_dataset = False

In [3]:
# Date range (YYYYMMDD strings) of games to build matchup files for.
start_date = "20240101"
# `todaysdate` is defined outside this notebook (presumably by the %run utility notebooks).
end_date = todaysdate
# NOTE(review): this assignment is unconditional, so it overrides the guarded
# `write_complete_dataset = False` default set in the previous cell — and any
# value a calling pipeline may have set. Confirm this is intentional.
write_complete_dataset = True

### Games

In [4]:
# Build the games table for the configured date range. `create_games` and
# `team_map` are defined outside this notebook (presumably in the %run utility notebooks).
game_df = create_games(start_date, end_date, team_map)

In [5]:
# Discard postponed games and renumber the rows from 0, so the positional
# `row` values used further down line up with the index labels.
game_df = game_df.loc[game_df['status'].ne("Postponed")].reset_index(drop=True)

### Functions

##### Create Matchup File

In [6]:
def _last_rows_for_hand(frame, hand_col, hand, key_col):
    """Return the last row (in frame order) per ``key_col`` among rows where ``hand_col`` equals ``hand``."""
    return frame[frame[hand_col] == hand].drop_duplicates(subset=key_col, keep='last')


def _last_steamer_rows(steamer_df, date):
    """Return the last Steamer row (in frame order) per ``mlbamid`` dated on or before ``date``."""
    return steamer_df[steamer_df['date'] <= date].drop_duplicates(subset='mlbamid', keep='last')


def _attach_stats(players_df, vs_l, vs_r, steamer_last_df, key_col, stat_cols):
    """Left-join the vs-L / vs-R dataset rows and the Steamer rows onto ``players_df``.

    ``key_col`` is the player-id column of the dataset splits ('batter' or
    'pitcher'); ``stat_cols`` are the columns taken from each split (including
    ``key_col``). Overlapping split columns end up suffixed ``_l`` / ``_r`` and
    overlapping Steamer columns ``_fg``.
    """
    # First merge uses pandas' default suffixes on purpose: the split columns keep
    # their plain names here so the second merge can suffix both sides with _l/_r.
    merged = pd.merge(players_df, vs_l[stat_cols], left_on='id', right_on=key_col, how='left')
    merged = pd.merge(merged, vs_r[stat_cols], left_on='id', right_on=key_col, how='left', suffixes=("_l", "_r"))
    merged = pd.merge(merged, steamer_last_df, left_on='id', right_on='mlbamid', how='left', suffixes=("", "_fg"))

    # Remove redundant join keys and helper columns
    merged = merged.drop(columns=[f'{key_col}_l', f'{key_col}_r', 'firstname', 'lastname', 'mlbamid', 'fullName2', 'position2'])

    # Move 'batting_order' to sit immediately after 'order'
    merged.insert(merged.columns.get_loc('order') + 1, 'batting_order', merged.pop('batting_order'))
    return merged


def create_matchup_file(game_df, row, complete_dataset, steamer_hitters_df, steamer_pitchers_df, team_map):
    """Build the batter and pitcher tables for both teams of one game.

    For each team the roster, batting-order and bullpen CSVs for the game date
    are read from under ``baseball_path`` and merged. Weather/wind/park values
    from ``create_box``, the venue id and the probable starters are added, and
    then each player is joined with:

    * the last ``complete_dataset`` row dated strictly before the game, taken
      separately for each opposing handedness ("L" / "R"), and
    * the last Steamer row dated on or before the game.

    Parameters
    ----------
    game_df : DataFrame
        Games table; ``row`` is used as an index label into its columns
        (``game_id``, ``game_date``, ``away_id``, ``home_id``, ``venue_id``,
        ``away_probable_pitcher``, ``home_probable_pitcher``).
    row : int
        Label of the game in ``game_df``.
    complete_dataset : DataFrame
        Dataset with at least ``date``, ``batter``, ``pitcher``, ``pitchHand``,
        ``batSide`` and the stat columns listed in the global ``batter_inputs``
        / ``pitcher_inputs``.
    steamer_hitters_df, steamer_pitchers_df : DataFrame
        Steamer tables with ``date`` and ``mlbamid`` columns.
    team_map : DataFrame
        Maps ``teamId`` to the ``BBREFTEAM`` abbreviation used in file names.

    Returns
    -------
    tuple of DataFrame
        ``(away_batter_df, away_pitcher_df, home_batter_df, home_pitcher_df)``,
        each de-duplicated on ``id`` (last row kept) with a fresh 0..n-1 index.

    Notes
    -----
    Errors raised by ``pd.read_csv`` (e.g. a missing roster file) propagate to
    the caller.
    """
    game_id = game_df['game_id'][row]
    date = int(game_df['game_date'][row].replace("-", ""))
    away_id = game_df['away_id'][row]
    home_id = game_df['home_id'][row]

    # Retrieve Baseball Reference team abbreviation
    team_map_cut = team_map[['teamId', 'BBREFTEAM']].set_index('teamId')
    away_team = team_map_cut.loc[away_id]['BBREFTEAM']
    home_team = team_map_cut.loc[home_id]['BBREFTEAM']

    # Everything below up to the team loop depends only on the game, not on the
    # team, so it is computed once per game instead of once per team.
    box = create_box(game_id)

    history = complete_dataset[complete_dataset['date'] < date]
    batter_cols = ['batter'] + batter_inputs + ['imp_b', 'pa_b', 'pa_b_long']
    pitcher_cols = ['pitcher'] + pitcher_inputs + ['imp_p', 'pa_p', 'pa_p_long']
    batters_vs_l = _last_rows_for_hand(history, 'pitchHand', "L", 'batter')
    batters_vs_r = _last_rows_for_hand(history, 'pitchHand', "R", 'batter')
    pitchers_vs_l = _last_rows_for_hand(history, 'batSide', "L", 'pitcher')
    pitchers_vs_r = _last_rows_for_hand(history, 'batSide', "R", 'pitcher')

    # Keep last Steamer observation on or before the game date
    steamer_hitters_last_df = _last_steamer_rows(steamer_hitters_df, date)
    steamer_pitchers_last_df = _last_steamer_rows(steamer_pitchers_df, date)

    matchup_frames = {}
    for side, team in (("away", away_team), ("home", home_team)):
        # Read in rosters
        roster_df = pd.read_csv(os.path.join(baseball_path, "A05. Rosters", "2. Rosters", f"Rosters {date}", f"Roster {team} {date}.csv"), encoding='iso-8859-1')

        # Read in batting orders
        order_df = pd.read_csv(os.path.join(baseball_path, "A05. Rosters", "1. Batting Orders", f"Batting Orders {date}", f"Batting Order {team} {game_id}.csv"), encoding='iso-8859-1')

        # Read in bullpens
        bullpen_df = pd.read_csv(os.path.join(baseball_path, "A04. Bullpens", f"Bullpens {date}", f"Bullpen {team} {date}.csv"), encoding='iso-8859-1')

        # Merge batting order onto roster (outer: keep players present in only one file)
        team_df = pd.merge(roster_df, order_df[['id', 'fullName', 'position', 'status', 'order']], on='id', how='outer', suffixes=("", "2"))

        # Fill in missings. Assign back instead of chained inplace fillna, which
        # warns in pandas 2.x and has no effect under Copy-on-Write.
        team_df['batSide'] = team_df['batSide'].fillna('Right')
        team_df['pitchHand'] = team_df['pitchHand'].fillna('Right')
        team_df['fullName'] = team_df['fullName'].fillna(team_df['fullName2'])
        team_df['position'] = team_df['position'].fillna(team_df['position2'])

        # Merge pitcher leverage onto roster; names are accent-stripped first
        # because the join to the bullpen file is by name.
        team_df['fullName'] = team_df['fullName'].apply(remove_accents)
        team_df = pd.merge(team_df, bullpen_df[['Name', 'Leverage']], left_on='fullName', right_on='Name', how='left')

        # Add weather
        team_df['weather'] = box[0]
        team_df['wind'] = box[1]
        team_df['park'] = box[2]
        team_df = clean_weather(team_df)

        # Add venue
        team_df['venue_id'] = game_df['venue_id'][row]

        # Add starters (accent-stripped so they compare equal to fullName)
        team_df['away_starter'] = game_df['away_probable_pitcher'][row]
        team_df['home_starter'] = game_df['home_probable_pitcher'][row]
        team_df['away_starter'] = team_df['away_starter'].apply(remove_accents)
        team_df['home_starter'] = team_df['home_starter'].apply(remove_accents)

        # Assign Leverage of 1 to starting pitcher
        is_starter = (team_df['fullName'] == team_df['away_starter']) | (team_df['fullName'] == team_df['home_starter'])
        team_df['Leverage'] = np.where(is_starter, 1, team_df['Leverage'])

        # Determine batting order: order codes 100, 200, ..., 900 map to slots
        # 1..9; every other code (or a non-numeric one) yields NaN.
        team_df['order'] = pd.to_numeric(team_df['order'], errors='coerce')
        slot_codes = [slot * 100 for slot in range(1, 10)]
        team_df['batting_order'] = (team_df['order'] / 100).where(team_df['order'].isin(slot_codes))

        ### Batters (everyone not listed as "Pitcher", so two-way players appear here too)
        batter_df = _attach_stats(
            team_df[team_df['position'] != "Pitcher"],
            batters_vs_l, batters_vs_r, steamer_hitters_last_df,
            'batter', batter_cols,
        ).sort_values('batting_order')

        ### Pitchers
        pitcher_df = _attach_stats(
            team_df[team_df['position'].isin(["Pitcher", "Two-Way Player"])],
            pitchers_vs_l, pitchers_vs_r, steamer_pitchers_last_df,
            'pitcher', pitcher_cols,
        ).sort_values('Leverage')

        # Drop duplicate players (keep the last row after sorting) and reindex
        matchup_frames[side] = tuple(
            frame.drop_duplicates('id', keep='last').reset_index(drop=True)
            for frame in (batter_df, pitcher_df)
        )

    away_batter_df, away_pitcher_df = matchup_frames["away"]
    home_batter_df, home_pitcher_df = matchup_frames["home"]

    return away_batter_df, away_pitcher_df, home_batter_df, home_pitcher_df

##### Create Matchup Files

In [22]:
def create_matchup_files(game_df, row, complete_dataset, steamer_hitters_df, steamer_pitchers_df, team_map):
    """Build the four matchup tables for one game and write them to one Excel workbook.

    The workbook is written to
    ``<baseball_path>/B01. Matchups/Matchups <YYYYMMDD>/<AWAY>@<HOME> <game_id> <HHMM>.xlsx``
    where ``HHMM`` is the scheduled start converted from UTC to US/Eastern.
    It contains the sheets ``AwayBatters``, ``HomeBatters``, ``AwayPitchers``
    and ``HomePitchers`` (in that order). An existing file with the same name
    is overwritten.

    Parameters
    ----------
    game_df : DataFrame
        Games table; ``row`` is used as an index label into its columns.
        ``game_datetime`` must be a string formatted ``%Y-%m-%dT%H:%M:%SZ``.
    row : int
        Label of the game in ``game_df``.
    complete_dataset, steamer_hitters_df, steamer_pitchers_df, team_map
        Passed through unchanged to ``create_matchup_file``.

    Returns
    -------
    None
    """
    # Extract IDs
    game_id = game_df['game_id'][row]
    away_id = game_df['away_id'][row]
    home_id = game_df['home_id'][row]

    # Retrieve Baseball Reference team abbreviation
    team_map_cut = team_map[['teamId', 'BBREFTEAM']].set_index('teamId')
    away_team = team_map_cut.loc[away_id]['BBREFTEAM']
    home_team = team_map_cut.loc[home_id]['BBREFTEAM']

    # Extract date (YYYYMMDD) and scheduled start
    game_date = game_df['game_date'][row].replace("-", "")
    game_datetime = game_df['game_datetime'][row]

    # Parse the UTC timestamp, mark it as UTC, then convert to US/Eastern
    # (which follows daylight saving time, so this is EST or EDT as applicable).
    utc_datetime = datetime.datetime.strptime(game_datetime, "%Y-%m-%dT%H:%M:%SZ")
    utc_datetime = pytz.timezone("UTC").localize(utc_datetime)
    eastern_datetime = utc_datetime.astimezone(pytz.timezone("US/Eastern"))
    formatted_time = eastern_datetime.strftime("%H%M")

    # Create position dfs
    away_batter_df, away_pitcher_df, home_batter_df, home_pitcher_df = create_matchup_file(game_df, row, complete_dataset, steamer_hitters_df, steamer_pitchers_df, team_map)

    # Create folder, if it doesn't exist
    matchup_dir = os.path.join(baseball_path, "B01. Matchups", f'Matchups {game_date}')
    os.makedirs(matchup_dir, exist_ok=True)

    # File name
    matchup_file = f"{away_team}@{home_team} {game_id} {formatted_time}"
    matchup_path = os.path.join(matchup_dir, f'{matchup_file}.xlsx')

    # Write all four sheets through a single writer. Reopening the workbook in
    # append mode for every sheet re-reads and re-saves the whole file each time.
    with pd.ExcelWriter(matchup_path, engine='openpyxl') as writer:
        away_batter_df.to_excel(writer, sheet_name="AwayBatters", index=False)
        home_batter_df.to_excel(writer, sheet_name='HomeBatters', index=False)
        away_pitcher_df.to_excel(writer, sheet_name='AwayPitchers', index=False)
        home_pitcher_df.to_excel(writer, sheet_name='HomePitchers', index=False)

### Run

In [8]:
# Re-run the Datasets notebook. The imports cell at the top already runs it
# when no `running_pipeline` name is defined; this line runs it unconditionally.
# NOTE(review): absolute, user-specific path.
%run "C:\Users\james\Documents\MLB\Code\U4. Datasets.ipynb"

In [9]:
# Read in park factors.
# NOTE(review): within this notebook `multiplier_df` is referenced only by the
# commented-out `create_pa_inputs(...)` call in the next cell.
multiplier_df = pd.read_csv(os.path.join(baseball_path, "Multiplier Dataset.csv"))

In [10]:
%%time
def _load_steamer(log_file, current_file, clean_func):
    """Stack a weekly Steamer log with the current projections file and clean the result.

    Both CSVs are read from ``<baseball_path>/A03. Steamer``. Rows with a
    missing ``proj_year`` / ``proj_date`` are stamped with 2024 and
    ``todaysdate_dash`` respectively before ``clean_func`` is applied.
    """
    steamer_dir = os.path.join(baseball_path, "A03. Steamer")
    log_df = pd.read_csv(os.path.join(steamer_dir, log_file), encoding='iso-8859-1')
    current_df = pd.read_csv(os.path.join(steamer_dir, current_file), encoding='iso-8859-1')
    combined_df = pd.concat([log_df, current_df], axis=0)
    # Assign back instead of chained inplace fillna, which warns in pandas 2.x
    # and has no effect under Copy-on-Write.
    combined_df['proj_year'] = combined_df['proj_year'].fillna(2024)
    combined_df['proj_date'] = combined_df['proj_date'].fillna(todaysdate_dash)
    return clean_func(combined_df)


if write_complete_dataset:
    # Read in dataset. Alternative that rebuilds it from scratch:
    # complete_dataset = create_pa_inputs(multiplier_df, 2015, 2024, 50, 300, True)
    complete_dataset = pd.read_csv(os.path.join(baseball_path, "nn_dataset.csv"))

    # Subset
    complete_dataset = complete_dataset.query('date > 20210301')

    # Read in Steamer hitters and pitchers
    steamer_hitters_df = _load_steamer("steamer_hitters_weekly_log.csv", "steamer_hitters.csv", clean_steamer_hitters)
    steamer_pitchers_df = _load_steamer("steamer_pitchers_weekly_log.csv", "steamer_pitchers.csv", clean_steamer_pitchers)

    # Write to CSV (we'll read these later in B.)
    complete_dataset.to_csv(os.path.join(baseball_path, "Complete Dataset.csv"), index=False)
    steamer_hitters_df.to_csv(os.path.join(baseball_path, "Steamer Hitters.csv"), index=False)
    steamer_pitchers_df.to_csv(os.path.join(baseball_path, "Steamer Pitchers.csv"), index=False)

else:
    # Reuse the files written by a previous run of the branch above
    complete_dataset = pd.read_csv(os.path.join(baseball_path, "Complete Dataset.csv"))
    steamer_hitters_df = pd.read_csv(os.path.join(baseball_path, "Steamer Hitters.csv"))
    steamer_pitchers_df = pd.read_csv(os.path.join(baseball_path, "Steamer Pitchers.csv"))

CPU times: total: 3min 26s
Wall time: 3min 47s


Shrink datasets for faster processing

In [11]:
# Drop descriptive columns that are not used below. Rebinding the name (rather
# than dropping in place) avoids mutating a filtered slice, and errors='ignore'
# makes the cell safe to re-run after the columns are already gone.
complete_dataset = complete_dataset.drop(
    columns=[
        'description', 'batterName', 'pitcherName',
        'postOnFirst', 'postOnSecond', 'postOnThird',
        'preOnFirst', 'preOnSecond', 'preOnThird',
        'pitch_name',
    ],
    errors='ignore',
)

In [12]:
# Shrink datasets for faster processing
# complete_dataset.drop(columns={'description', 'batterName', 'pitcherName', 'postOnFirst', 'postOnSecond', 'postOnThird', 'preOnFirst', 'preOnSecond', 'preOnThird', 'pitch_name'}, inplace=True)
# complete_dataset = complete_dataset[complete_dataset['date'].astype(int) > game_df["date"].astype(int).min()-10000]
# steamer_hitters_df = steamer_hitters_df[steamer_hitters_df['date'].astype(int) > game_df["date"].astype(int).min()-10000]
# steamer_pitchers_df = steamer_pitchers_df[steamer_pitchers_df['date'].astype(int) > game_df["date"].astype(int).min()-10000]

In [23]:
%%time
# Number of games to process
print(len(game_df))
# Write one matchup workbook per game across 4 workers. create_matchup_files
# has no return statement, so `empty_list` only collects None values.
# NOTE(review): every task is handed the full complete_dataset and Steamer
# frames as arguments, and the recorded run of this cell ended in a
# PicklingError after 42 tasks — possibly caused by serializing those large
# frames per task; TODO confirm and consider trimming them first.
empty_list = Parallel(n_jobs=4, verbose=True)(delayed(create_matchup_files)(game_df, row, complete_dataset, steamer_hitters_df, steamer_pitchers_df, team_map) for row in range(len(game_df)))

2472


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:  4.4min


PicklingError: Could not pickle the task to send it to the workers.