# 001. Sim Pipeline
- This runs the pre-contest pipeline, simulations, and optimizations
- Type: Pipeline
- Run Frequency: Daily (Contest)
- Created: 1/1/2025
- Updated: 5/29/2025

### Imports

In [1]:
%run "C:\Users\james\Documents\MLB\Code\U1. Imports.ipynb"
%run "C:\Users\james\Documents\MLB\Code\U2. Utilities.ipynb"
%run "C:\Users\james\Documents\MLB\Code\U3. Classes.ipynb"
%run "C:\Users\james\Documents\MLB\Code\U4. Datasets.ipynb"
%run "C:\Users\james\Documents\MLB\Code\U5. Models.ipynb"

In [2]:
# Presumably holds the notebook until start_time is reached - confirm against pause_code in the utilities notebook.
# NOTE(review): start_time is a hard-coded timestamp. The saved output of this cell reports a negative
# time remaining, i.e. the timestamp was already in the past when the cell last ran - update it per contest.
pause_code(start_time='2025-06-15T12:30:00', timezone='EST')

Time until 12:30PM. -332 hours, 21 minutes, and 24 seconds.


In [3]:
# Toronto - the most up-to-date weather is not always being retrieved

### Settings

In [4]:
# Date window as YYYYMMDD strings. Both ends are inclusive where game_df is filtered below,
# and the same pair is passed to create_contests.
start_date, end_date = "20240101", "20241231"
# Alternative windows kept for quick switching:
# start_date, end_date = "20240522", "20241231"
# start_date, end_date = todaysdate, todaysdate

In [5]:
# historic: when False, the night pipeline, RotoGrinders weather and lineup-info cells take their live-data branches.
# slate_only: when True, only games of the selected contests are simulated; otherwise every game in game_df.
historic, slate_only = True, False
# Live-contest alternative:
# historic, slate_only = False, True

### Games

Read in games

In [6]:
%%time
historic_game_df = pd.read_csv(os.path.join(baseball_path, "game_df.csv"))
recent_game_df = create_games(yesterdaysdate, todaysdate, team_dict)
historic_game_df = historic_game_df[~historic_game_df['date'].astype(str).isin([yesterdaysdate, todaysdate])]
all_game_df = pd.concat([historic_game_df, recent_game_df], axis=0)
all_game_df.to_csv(os.path.join(baseball_path, "game_df.csv"), index=False)

CPU times: total: 203 ms
Wall time: 480 ms


Select games

In [7]:
# Keep games dated inside [start_date, end_date]; both ends are included.
# Dates are compared as strings, so they must share one fixed-width format.
game_dates = all_game_df['date'].astype(str)
game_df = all_game_df[game_dates.between(start_date, end_date)].reset_index(drop=True)

### 002. Night Pipeline

In [None]:
# NOTE(review): this prints the literal text "game_df", not the contents of the game_df dataframe.
print("game_df")

In [None]:
%%time
# The night pipeline notebook is only executed on live (non-historic) runs.
# NOTE(review): the notebook path is relative, unlike the absolute %run paths in the import cell,
# so it presumably depends on the kernel's working directory - confirm.
if historic == False:
    %run "002. Night Pipeline.ipynb"

### Contests

Read in contests

In [None]:
%%time
# Build the contest table for the configured date window.
# NOTE(review): the meaning of name="Four", entryFee=None and the exclusions list is inferred from the
# keyword names only - confirm against create_contests.
contest_df = create_contests(start_date=start_date, end_date=end_date, name="Four", entryFee=None, exclusions=['vs', 'Turbo', '@'])

In [None]:
# Game-level columns dropped from the contest table before joining game_df on game_id
# (presumably game_df supplies the current versions of them - confirm)
stale_game_columns = ['game_type', 'game_num', 'date', 'away_score', 'home_score']
contest_df = (
    contest_df
    .drop(columns=stale_game_columns)
    .merge(game_df, on=['game_id'], how='left')
)

# Preview: one row per contest (its earliest game), first five contests
(
    contest_df
    .sort_values('game_datetime')
    .drop_duplicates('contestKey', keep='first')
    .reset_index(drop=True)
    .head(5)
)

Select contestKeys

In [None]:
# Contest keys to build lineups for; leave the list empty to take every contest in contest_df
selections = [178137794]

# Integer view of the keys, shared by the fallback and the filter
contest_keys = contest_df['contestKey'].astype(int)

if len(selections) == 0:
    selections = contest_keys.unique()

selected_contest_df = contest_df[contest_keys.isin(selections)].reset_index(drop=True)

### Weather

Function

In [8]:
def color_rows_by_tag(row):
    """Return one CSS background style per cell of `row`, chosen by the row's 'Tag' value.

    Tags other than green / yellow / orange / red produce empty style strings,
    so those rows are left unstyled.
    """
    tag_styles = dict(
        green='background-color: lightgreen',
        yellow='background-color: lightyellow',
        orange='background-color: lightsalmon',
        red='background-color: lightcoral',
    )
    style = tag_styles.get(row['Tag'], '')
    # The same style is repeated for every cell in the row
    return [style for _ in range(len(row))]

Display

In [9]:
# RotoGrinders weather is only consulted on live (non-historic) runs, and only once today's file exists
rotogrinders_path = os.path.join(baseball_path, "A06. Weather", "2. RotoGrinders", f"RotoGrinders {todaysdate}.csv")

if not historic and os.path.exists(rotogrinders_path):
    # Read RotoGrinders weather
    rotogrinders_df = pd.read_csv(rotogrinders_path)
    # If it's missing, it's Oakland.
    # The filled column is assigned back: calling fillna(inplace=True) on a column selection is
    # chained assignment, which warns in pandas 2.x and does not update the frame under Copy-on-Write.
    rotogrinders_df['Away'] = rotogrinders_df['Away'].fillna("OAK")
    rotogrinders_df['Home'] = rotogrinders_df['Home'].fillna("OAK")
    # Identify high-risk matchups: both teams of every game flagged red / orange in either tag column
    red_games = rotogrinders_df.query('Tag == "red" or Tag2 == "red"')
    orange_games = rotogrinders_df.query('Tag == "orange" or Tag2 == "orange"')
    red_list = list(red_games['Away']) + list(red_games['Home'])
    orange_list = list(orange_games['Away']) + list(orange_games['Home'])
    # Display RotoGrinders weather, rows coloured by their primary tag
    display(rotogrinders_df[['Tag', 'Tag2', 'Away', 'Home', 'date', 'Description']].style.apply(color_rows_by_tag, axis=1))

else:
    # No weather file (or a historic run): no team is treated as a weather risk
    red_list, orange_list = [], []

### Lineup Info

In [10]:
# NOTE(review): draftGroupId, contestDate and contestTime are only bound in the live branch below.
if historic == False:
    # Might struggle in bulk daily runs: only the first row of selected_contest_df is consulted
    draftGroupId = selected_contest_df['draftGroupId'][0]
    game_info_parts = selected_contest_df['Game Info'][0].split(" ")
    contestDate, contestTime = game_info_parts[1], game_info_parts[2]

    # Openers: pitcher-slot draftables priced at 5500 or less
    draftables_path = os.path.join(baseball_path, "A01. DraftKings", "2. Draftables", f"Draftables {draftGroupId}.csv")
    draftables = pd.read_csv(draftables_path, encoding='iso-8859-1')
    cheap_pitchers = draftables.query('Salary <= 5500 and `Roster Position` == "P"')
    opener_list = list(cheap_pitchers['Name'])

    # Teams with Projected Lineups
    projected_lineups_path = os.path.join(baseball_path, "A05. Rosters", "3. Batting Orders Projected", f"Batting Orders Projected {todaysdate}.csv")
    projected_lineups = pd.read_csv(projected_lineups_path)
else:
    # Historic runs carry no opener or projected-lineup information
    opener_list, projected_lineups = [], None

### B02. Simulations

In [11]:
%run "C:\Users\james\Documents\MLB\Code\B02. Simulations.ipynb"

##### Settings

In [12]:
# Weather-factor variant: the simulation cell copies the `{event}_wfx_{wfx_type}_l/_r` columns
# into the generic `{event}_wfx_l/_r` columns.
wfx_type = "adj"
# One batch per logical CPU, 63 simulated games per batch.
# os.cpu_count() returns None when the CPU count cannot be determined; fall back to a single
# batch rather than failing on `None * 63` below.
num_batches, batch_size = (os.cpu_count() or 1), 63
# num_batches, batch_size = 1, 1
# Total number of simulations per game
num_batches * batch_size

1008

##### Games

In [48]:
# Decide which games get simulated
if slate_only == True:
    # Just the slate(s). Contests with overlapping slates share games, so each game_id is kept
    # once (its earliest-sorted row) and the same game is never simulated twice.
    slate_games = selected_contest_df.sort_values('game_datetime').reset_index(drop=True)
    sim_game_df = slate_games.drop_duplicates('game_id').copy()
else:
    # Every game in the configured date window
    sim_game_df = game_df.copy()

##### Run

In [None]:
%%time
# Print out games to simulate
_ = [print(f"{row['away_team']}@{row['home_team']} {pd.to_datetime(row['game_datetime']).tz_convert('US/Eastern').strftime('%Y-%m-%d %H:%M:%S')}") for index, row in sim_game_df.head(15).iterrows()]

# Loop over dates of interest
for date in list(sim_game_df['date'].astype(str).unique()): 
    print(date)
    
    # Extract matchups
    matchup_list = [os.path.splitext(f)[0] for f in os.listdir(os.path.join(baseball_path, "B01. Matchups", f"Matchups {date}"))]
    
    # Create Game Sim path
    os.makedirs(os.path.join(baseball_path, "B02. Simulations", "1. Game Sims", f"Matchups {date}"), exist_ok=True)  
    
    # Read in weather
    try:
        daily_weather_df = pd.read_csv(os.path.join(baseball_path, "A06. Weather", "3. Park and Weather Factors", f"Park and Weather Factors {date}.csv"))
        # Choose WFX
        for event in events_list:
            daily_weather_df[f'{event}_wfx_l'] = daily_weather_df[f'{event}_wfx_{wfx_type}_l'].copy()
            daily_weather_df[f'{event}_wfx_r'] = daily_weather_df[f'{event}_wfx_{wfx_type}_r'].copy()        
    except:
        print("Weather dataframe not created.")
        continue
        
    # Read in projected lineups
    try:
        # Read in dataframe
        daily_order_bm_df = pd.read_csv(os.path.join(baseball_path, "A05. Rosters", "3. Batting Orders Projected", f"Batting Orders Projected {date}.csv"))
        # Create BBREFTEAM column
        daily_order_bm_df['BBREFTEAM'] = daily_order_bm_df['team code'].map(team_dict)
        # Clean whitespace from column names
        daily_order_bm_df.columns = [col.strip() for col in daily_order_bm_df.columns]
        # Rename
        daily_order_bm_df.rename(columns={'mlb id': 'id', 'batting order': 'batting_order'}, inplace=True)
    except:
        daily_order_bm_df = None
        print("Projected batting orders not found.")
    
    # Loop over games
    daily_df = sim_game_df[sim_game_df['date'].astype(str) == str(date)].reset_index()
    for i in range(len(daily_df)):            
        # Extract info from sim_game_df to look up proper file in matchups path
        away_team = daily_df['away_team'][i]
        home_team = daily_df['home_team'][i]
        game_id = daily_df['game_id'][i]
        game_num = daily_df['game_num'][i]
        
        # Beginning of filename in matchup folder
        lookup = f"{away_team}@{home_team} {game_id}"
        
        # Find the matchup file
        matchup = next((matchup for matchup in matchup_list if matchup.startswith(lookup)), None)
        print(date, matchup)
        
        ### Read in lineups
        ## Away
        # MLB API
        away_order_api_df = create_order_api(date=date, team=away_team, game_id=game_id)
        # Baseball Monster
        away_order_bm_df = create_order_bm(daily_order_bm_df, away_team, game_num=game_num)
        # Choose Baseball Monster if it's complete and API is not
        if away_order_bm_df is not None and away_order_bm_df['batting_order'].sum() == 45 and away_order_api_df['batting_order'].sum() != 45:
            away_order_df = away_order_bm_df.copy()
        else:
            away_order_df = away_order_api_df.copy()
        
        ## Home
        # MLB API
        home_order_api_df = create_order_api(date=date, team=home_team, game_id=game_id)
        # Baseball Monster
        home_order_bm_df = create_order_bm(daily_order_bm_df, home_team, game_num=game_num)
        # Choose Baseball Monster if it's complete and API is not
        if home_order_bm_df is not None and home_order_bm_df['batting_order'].sum() == 45 and home_order_api_df['batting_order'].sum() != 45:
            home_order_df = home_order_bm_df.copy()
        else:
            home_order_df = home_order_api_df.copy()    
            
        ### Player Dataframes
        # Create matchup path
        matchup_path = os.path.join(baseball_path, "B01. Matchups", f"Matchups {date}", f"{matchup}.xlsx")
        
        # Read in Dataframes
        away_batter_df = pd.read_excel(matchup_path, sheet_name="AwayBatters")
        home_batter_df = pd.read_excel(matchup_path, sheet_name="HomeBatters")
        away_pitcher_df = pd.read_excel(matchup_path, sheet_name="AwayPitchers")
        home_pitcher_df = pd.read_excel(matchup_path, sheet_name="HomePitchers")
        
        ####### Remove from matchup files later - then this can be deleted
        away_batter_df.drop(columns={'batting_order'}, inplace=True)
        home_batter_df.drop(columns={'batting_order'}, inplace=True)
        ###### Remove from matchup files later - then this can be deleted
        
        
        ### Player Objects
        AwayBatters = create_batter_objects(away_batter_df, away_order_df, scale_batter_stats, scale_batter_stats_steamer, impute_batter_stats)
        HomeBatters = create_batter_objects(home_batter_df, home_order_df, scale_batter_stats, scale_batter_stats_steamer, impute_batter_stats)
        AwayPitchers = create_pitcher_objects(away_pitcher_df, scale_pitcher_stats, scale_pitcher_stats_steamer, impute_pitcher_stats)
        HomePitchers = create_pitcher_objects(home_pitcher_df, scale_pitcher_stats, scale_pitcher_stats_steamer, impute_pitcher_stats)       
    

        ### Park Object
        # Subset weather
        weather_df = daily_weather_df.query(f'gamePk == {game_id}').reset_index(drop=True)
        # Create object
        try:
            row_data = weather_df.iloc[0].to_dict()  # Convert the single row to a dictionary
            park_object = Park(**row_data)
        except:
            print("Game missing from weather dataframe.")
            continue
            
        ### Scoreboard Object
        game_template = Scoreboard(AwayBatters, HomeBatters, AwayPitchers, HomePitchers, 9)       
        
        ### Sim games
        start = time.time()
        game_list = Parallel(n_jobs=num_batches, verbose=False)(delayed(sim_game_batch)(game_template, predict_pulls, predict_leverage, predict_binary, predict_outs, 
                                                                                        predict_safe, predict_all, predict_all_adjusted, 
                                                                                        opener_list, park_object, innings=9, debug=True, 
                                                                                        batch_size=batch_size) for batches in range(num_batches))
        
        game_list = [game for sublist in game_list for game in sublist]
        print(f"Simming {batch_size*num_batches} games took {round(time.time() - start, 2)} seconds.")
        
        # Create player path
        player_path = os.path.join(baseball_path, "B02. Simulations", "2. Player Sims", f"Matchups {date}", f"{matchup}") 
        # Clear it 
        if os.path.exists(player_path):
            shutil.rmtree(player_path)
        # Make folder
        os.makedirs(player_path, exist_ok=True)
        
        # Create list of game scores
        game_score_df_list = []
        # Save each object in the list to a separate file
        for i, game_object in enumerate(game_list):
            # Delete unnecessary attributes from batter objects
            for batter in game_object.away_batters + game_object.home_batters:
                batter = batter.keep_selected_attributes()
            # Delete unnecessary attributes from pitcher objects
            for pitcher in game_object.away_pitchers + game_object.home_pitchers:
                pitcher = pitcher.keep_selected_attributes()
            # Delete unnecessary attributes from game objects
            game_object.keep_selected_attributes()

            ### Construct DataFrames
            # Game
            game_score_df = pd.DataFrame({
                "away_score": [game_object.away_score],
                "home_score": [game_object.home_score]
            })
            # Append game_scores
            game_score_df_list.append(game_score_df)

            # Batters
            away_batters_df = pd.DataFrame([vars(batter) for batter in game_object.away_batters])
            away_batters_df['team'] = "away"
            home_batters_df = pd.DataFrame([vars(batter) for batter in game_object.home_batters])
            home_batters_df['team'] = "home"
            batters_df = pd.concat([away_batters_df, home_batters_df], axis=0)
            # Save
            batters_df.to_csv(os.path.join(player_path, "batters_{}.csv").format(i), index=False)

            # Pitchers
            away_pitchers_df = pd.DataFrame([vars(pitcher) for pitcher in game_object.away_pitchers])
            away_pitchers_df['team'] = "away"
            home_pitchers_df = pd.DataFrame([vars(pitcher) for pitcher in game_object.home_pitchers])
            home_pitchers_df['team'] = "home"
            pitchers_df = pd.concat([away_pitchers_df, home_pitchers_df], axis=0)
            # Save
            pitchers_df.to_csv(os.path.join(player_path, "pitchers_{}.csv").format(i), index=False)
            
        # Concatenate game scores
        game_scores_df = pd.concat(game_score_df_list, axis=0).reset_index(drop=True)
        # Write to CSV
        game_scores_df.to_csv(os.path.join(baseball_path, "B02. Simulations", "1. Game Sims", f"Matchups {date}", f"game_{game_id}.csv"), index=False)
        # Print average score
        print(f"{away_team}: {round(game_scores_df['away_score'].mean(), 2)}", f"{home_team}: {round(game_scores_df['home_score'].mean(), 2)}")

LAD@SDP 2024-03-20 06:05:00
SDP@LAD 2024-03-21 06:05:00
LAA@BAL 2024-03-28 15:05:00
WSN@CIN 2024-03-28 16:10:00
SFG@SDP 2024-03-28 16:10:00
STL@LAD 2024-03-28 16:10:00
TOR@TBR 2024-03-28 16:10:00
MIN@KCR 2024-03-28 16:10:00
DET@CHW 2024-03-28 16:10:00
PIT@MIA 2024-03-28 16:10:00
NYY@HOU 2024-03-28 16:10:00
CHC@TEX 2024-03-28 19:35:00
CLE@OAK 2024-03-28 22:07:00
COL@ARI 2024-03-28 22:10:00
BOS@SEA 2024-03-28 22:10:00
20240320
Projected batting orders not found.
20240320 LAD@SDP 745444 0605
Missing Baseball Monster order.
Missing Baseball Monster order.
Simming 1008 games took 30.77 seconds.
LAD: 4.94 SDP: 3.91
20240321
Projected batting orders not found.
20240321 SDP@LAD 746175 0605
Missing Baseball Monster order.
Missing Baseball Monster order.


In [None]:
# Big Q: Are individual game scores drawn toward each other or toward the average?

### B03. Lineups

In [None]:
%run "B03. Optimizer.ipynb"

In [None]:
# --- Pareto set: the columns handed to choose_lineups, with one sense ('Max' / 'Min') per column
pareto_set = ['Plus3', 'batter rostership']
sense_list = ['Max', 'Min']

# --- Lineup ranking: sort keys, with one ascending flag per key
sort_by = ['pareto', 'Plus3']
ascending_list = [False, False]

# --- Reference list of the available options
sort_by_list = [
    'P50', 'P75', 'P90', 'P95', 'P99', 'P100',
    'Tail', 'Sim STD', 'Plus2', 'Plus3',
    'Top1%', 'Top5%', 'Top10%', 'Top20%', 'Top50%',
    'rostership', 'pitcher rostership', 'batter rostership',
    'pareto',
]

# --- Maximum ownership by position group
max_exposure_batters = 0.5
max_exposure_pitchers = 0.7

# --- Share of each stack type (the shares add up to 1.00)
stack_dictionary = {
    "5-2-1": 0.37,
    "5-3": 0.23,
    "5-1-1-1": 0.18,
    "4-3-1": 0.11,
    "4-2-1-1": 0.11,
}

# --- Number of lineups to create
num_lineups = 1000

##### Run

##### 1. Players

In [None]:
%%time
for _, row in selected_contest_df.drop_duplicates(subset=['contestKey'])[['contestKey', 'draftGroupId', 'roto_slate']].iterrows():
    contestKey, draftGroupId, roto_slate = row['contestKey'], row['draftGroupId'], row['roto_slate']
    print(contestKey)
    guide = selected_contest_df[selected_contest_df['contestKey'] == contestKey].reset_index(drop=True)
    # Create draftables with sims
    draftables_with_sims = create_player_file(contestKey, guide, draftGroupId, roto_slate, max_exposure_pitchers, max_exposure_batters, 
                                              projections='robot', rostership='roto', ownership_spread=0.25)
    # Write to CSV
    draftables_with_sims.to_csv(os.path.join(baseball_path, "B03. Lineups", "1. Players", f"Players {contestKey}.csv"), index=False, encoding='iso-8859-1')

In [None]:
# Points at the player file of a single contest: contestKey is whatever value the loop above left
# behind, i.e. the last contest processed when several are selected.
excel_button(os.path.join(baseball_path, "B03. Lineups", "1. Players", f"Players {contestKey}.csv"))

##### 2. Lineups

In [None]:
%%time
for _, row in selected_contest_df.drop_duplicates(subset=['contestKey'])[['contestKey', 'slate_size']].iterrows():
    contestKey, slate_size = row['contestKey'], row['slate_size']
    print(contestKey, slate_size)
    
    # Define the constraints
    maximum_constraints = [
        (contestKey, 49000, 4, [5, 2, 1],    red_list + orange_list, 10, None, 0.2, 0.05, math.ceil(stack_dictionary['5-2-1']   * num_lineups), "Max"),
        (contestKey, 49000, 4, [5, 3],       red_list + orange_list, 10, None, 0.2, 0.05, math.ceil(stack_dictionary['5-3']     * num_lineups), "Max"),
        (contestKey, 49000, 4, [5, 1, 1, 1], red_list + orange_list, 10, None, 0.2, 0.05, math.ceil(stack_dictionary['5-1-1-1'] * num_lineups), "Max"),
        (contestKey, 49000, 4, [4, 3, 1],    red_list + orange_list, 10, None, 0.2, 0.05, math.ceil(stack_dictionary['4-3-1']   * num_lineups), "Max"),
        (contestKey, 49000, 4, [4, 2, 1, 1], red_list + orange_list, 10, None, 0.2, 0.05, math.ceil(stack_dictionary['4-2-1-1'] * num_lineups), "Max")
    ]
    
    minimum_constraints = [
        (contestKey, 49000, 1, [5, 2, 1],    red_list, 0, None, 0.2, 0.05, math.ceil(stack_dictionary['5-2-1']   * num_lineups), "Min"),
        (contestKey, 49000, 1, [5, 3],       red_list, 0, None, 0.2, 0.05, math.ceil(stack_dictionary['5-3']     * num_lineups), "Min"),
        (contestKey, 49000, 1, [5, 1, 1, 1], red_list, 0, None, 0.2, 0.05, math.ceil(stack_dictionary['5-1-1-1'] * num_lineups), "Min"),
        (contestKey, 49000, 1, [4, 3, 1],    red_list, 0, None, 0.2, 0.05, math.ceil(stack_dictionary['4-3-1']   * num_lineups), "Min"),
        (contestKey, 49000, 1, [4, 2, 1, 1], red_list, 0, None, 0.2, 0.05, math.ceil(stack_dictionary['4-2-1-1'] * num_lineups), "Min")
    ]
    
    # Track failed constraints
    failed_max_constraints = []
    failed_max_indices = []

    # Create lineups with maximum constraints
    print("Attempting Maximum Constraints.")
    optimizers = Parallel(n_jobs=-1, backend="threading", verbose=0)(delayed(create_lineups2)(params) for params in maximum_constraints)
    
    # Print errors and store failed constraints with their indices
    for i, optimizer in enumerate(optimizers):
        if type(optimizer) == str:
            print(optimizer)
            failed_max_constraints.append(maximum_constraints[i])  # Store failed constraints
            failed_max_indices.append(i)  # Track the index of the failed constraint

    # Combine the optimizers from maximum constraints
    combined_optimizers = optimizers.copy()

    # If there are any failed constraints, attempt to run the corresponding minimum constraints for them
    if failed_max_constraints:
        print("Maximum Constraints Failed. Attempting Minimum Constraints for failed stacks.")
        
        # Create lineups with corresponding minimum constraints for failed stacks
        min_optimizers = Parallel(n_jobs=-1, backend="threading", verbose=0)(
            delayed(create_lineups2)(minimum_constraints[i]) for i in failed_max_indices)
        
        # Print errors for minimum constraints
        for optimizer in min_optimizers:
            if type(optimizer) == str:
                print(optimizer)

        # Combine the optimizers from minimum constraints with the maximum ones
        combined_optimizers.extend(min_optimizers)

    # Write to CSV
    try:
        write_lineups(combined_optimizers)
    except:
        print(f"Couldn't run contest {contestKey}")

##### 3. Lineups Ranked

In [None]:
%%time
for _, row in selected_contest_df.drop_duplicates(subset=['contestKey'])[['contestKey', 'roto_slate']].iterrows():
    contestKey, roto_slate = row['contestKey'], row['roto_slate']
    print(contestKey)
    lineups_ranked = choose_lineups(contestKey, roto_slate, pareto_set, sense_list, sort_by, ascending_list)
    lineups_ranked.to_csv(os.path.join(baseball_path, "B03. Lineups", "3. Lineups Ranked", f"Lineups Ranked {contestKey}.csv"), index=False)
    
    lineups_ranked.reset_index(drop=False, inplace=True)

In [None]:
# Points at the ranked-lineups file of a single contest: contestKey is the value left behind by
# the loop above, i.e. the last contest processed when several are selected.
excel_button(os.path.join(baseball_path, "B03. Lineups", "3. Lineups Ranked", f"Lineups Ranked {contestKey}.csv"))

##### Plot

In [None]:
# Lineup fields shown when hovering over a point
hover_columns = ['index', 'P', 'P.1', '1B', '2B', '3B', 'SS', 'OF', 'OF.1', 'OF.2', 'Wins', 'Top1%']

# Plus3 against batter rostership, coloured by the 'pareto' column
fig = px.scatter(
    lineups_ranked,
    x='batter rostership',
    y='Plus3',  # should this be sort_by?
    color='pareto',
    hover_data=dict.fromkeys(hover_columns, True),
    title='Scatter Plot of Plus3 vs Batter Rostership',
    labels={'Plus3': 'Plus3', 'batter rostership': 'Batter Rostership'},
)

# 1000 x 1000 canvas; the x-axis (batter rostership) is drawn reversed
fig.update_layout(width=1000, height=1000, xaxis={'autorange': 'reversed'})

# Larger markers
fig.update_traces(marker={'size': 10})

fig.show()

##### 4. Uploads

In [None]:
# Build the upload table for contestKey (the value left by the loops above, i.e. the last contest
# processed) and save only its first 50 rows.
upload = create_upload_file(contestKey, sort_by)
upload.head(50).to_csv(os.path.join(baseball_path, "B03. Lineups", "4. Uploads", f"Upload {contestKey}.csv"), index=False)

##### 5. Entries

In [None]:
# Build and save the entry file, named after the draft group.
# draftGroupId and contestKey are the values left behind by earlier cells (last contest processed).
entry = create_entry_file(draftGroupId, contestKey)
entry.to_csv(os.path.join(baseball_path, "B03. Lineups", "5. Entries", f"Entries {draftGroupId}.csv"), index=False, encoding='iso-8859-1')

##### Upload

In [None]:
# Presumably submits the saved entries for this draft group - confirm against upload_entries.
# draftGroupId is the value left behind by earlier cells.
upload_entries(draftGroupId)

##### Email

In [None]:
# NOTE(review): within this notebook contestTime is only assigned in the Lineup Info cell, and only
# when historic == False; on a historic run with a fresh kernel this call has no contestTime to use.
email_upload_file(draftGroupId, contestKey, contestTime)