# 001. Sim Pipeline
- This runs the pre-contest pipeline, simulations, and optimizations
- Type: Pipeline
- Run Frequency: Daily (Contest)
- Created: 1/1/2025
- Updated: 8/20/2025

### Imports

In [None]:
# %run "C:\Users\james\Documents\MLB\Code\U01. Imports.ipynb"
# %run "C:\Users\james\Documents\MLB\Code\U02. Functions.ipynb"
# %run "C:\Users\james\Documents\MLB\Code\U03. Classes.ipynb"
# %run "C:\Users\james\Documents\MLB\Code\U04. Datasets.ipynb"
# %run "C:\Users\james\Documents\MLB\Code\U05. Models.ipynb"

In [None]:
# Presumably holds execution until the given start time; pause_code is defined outside this notebook.
# NOTE(review): confirm how pause_code interprets timezone='EST' (fixed offset vs. US Eastern with DST).
pause_code(start_time='2025-09-28T14:00:00', timezone='EST')

### Settings

In [None]:
# Date window as YYYYMMDD strings; both ends are inclusive when games are selected below.
start_date, end_date = "20240101", "20241231"
# Alternative windows (keep exactly one assignment active):
# start_date, end_date = "20240408", "20241231"
# start_date, end_date = todaysdate, todaysdate

In [None]:
# historic=True skips the Night Pipeline, the RotoGrinders weather flags and the opener lookup.
# slate_only=True simulates only the games of the selected contests instead of every game in the window.
historic, slate_only = True, False
# historic, slate_only = False, True

### Games

Read in games

In [None]:
# %%time
# historic_game_df = pd.read_csv(os.path.join(baseball_path, "game_df.csv"))
# recent_game_df = create_games(yesterdaysdate, todaysdate, team_dict)
# historic_game_df = historic_game_df[~historic_game_df['date'].astype(str).isin([yesterdaysdate, todaysdate])]
# all_game_df = pd.concat([historic_game_df, recent_game_df], axis=0)
# all_game_df.to_csv(os.path.join(baseball_path, "game_df.csv"), index=False)

Offseason

In [None]:
%%time
# Load the stored game table as-is; the refresh of recent games above is commented out.
all_game_df = pd.read_csv(os.path.join(baseball_path, "game_df.csv"))

Select games

In [None]:
# Restrict the full game table to the inclusive [start_date, end_date] window.
# Dates are compared as YYYYMMDD strings, so lexical order equals chronological order.
in_window = all_game_df['date'].astype(str).between(start_date, end_date)
game_df = all_game_df[in_window].reset_index(drop=True)

### 002. Night Pipeline

In [None]:
# NOTE(review): prints the literal text "game_df", not the dataframe — confirm this is meant as a progress marker.
print("game_df")

In [None]:
%%time
# Only non-historic (live) runs execute the Night Pipeline notebook.
if historic == False:
    %run "002. Night Pipeline.ipynb"

### Contests

Read in contests

In [None]:
%%time
# Build the contest table for the date window; create_contests is defined outside this notebook.
# NOTE(review): name="Four" and the exclusions list presumably filter on contest names — confirm in create_contests.
contest_df = create_contests(start_date=start_date, end_date=end_date, name="Four", entryFee=None, exclusions=['vs', 'Turbo', '@'])

In [None]:
# Drop the contest table's own game columns and take them from game_df instead (left join on game_id).
contest_df = contest_df.drop(columns=['game_type', 'game_num', 'date', 'away_score', 'home_score']).merge(game_df, on=['game_id'], how='left')
# Preview (cell output): the earliest-game row of each contestKey, first five contests.
contest_df.sort_values('game_datetime').drop_duplicates('contestKey', keep='first').reset_index(drop=True).head(5)

Select contestKeys

In [None]:
# Contest keys to process; leave the list empty to process every contest in contest_df
selections = [182902615]

# Integer view of the contest keys, used both for the default selection and for filtering
contest_keys = contest_df['contestKey'].astype(int)

if len(selections) == 0:
    selections = contest_keys.unique()

selected_contest_df = contest_df[contest_keys.isin(selections)].reset_index(drop=True)

### Weather

##### Function

In [None]:
def color_rows_by_tag(row):
    """Return one CSS background style per cell of ``row``, chosen by ``row['Tag']``.

    Tags green, yellow, orange and red map to a light background colour;
    any other tag value yields empty style strings. The result has the same
    length as ``row`` so it can be used with ``Styler.apply(..., axis=1)``.
    """
    tag_styles = dict(
        green='background-color: lightgreen',
        yellow='background-color: lightyellow',
        orange='background-color: lightsalmon',
        red='background-color: lightcoral',
    )
    tag = row['Tag']
    style = tag_styles[tag] if tag in tag_styles else ''
    return [style for _ in range(len(row))]

##### Display

In [None]:
# Build red_list / orange_list: the teams playing in games that today's
# RotoGrinders weather export tags as "red" or "orange" (in Tag or Tag2).
# Historic runs, and days without an export file, get two empty lists.
rotogrinders_path = os.path.join(baseball_path, "A06. Weather", "2. RotoGrinders", f"RotoGrinders {todaysdate}.csv")

# If it's not a historic run and the file exists
if not historic and os.path.exists(rotogrinders_path):
    # Read RotoGrinders weather
    rotogrinders_df = pd.read_csv(rotogrinders_path)
    # If it's missing, it's Oakland.
    # Assign the filled column back instead of calling fillna(inplace=True) on a
    # column selection: that chained in-place form warns in pandas 2.x and does
    # not modify rotogrinders_df once Copy-on-Write is active.
    for team_col in ('Away', 'Home'):
        rotogrinders_df[team_col] = rotogrinders_df[team_col].fillna("OAK")
    # Identify high-risk matchups (each filter is evaluated once and reused for both teams)
    red_games = rotogrinders_df.query('Tag == "red" or Tag2 == "red"')
    orange_games = rotogrinders_df.query('Tag == "orange" or Tag2 == "orange"')
    red_list = list(red_games['Away']) + list(red_games['Home'])
    orange_list = list(orange_games['Away']) + list(orange_games['Home'])
    # Display RotoGrinders weather, rows coloured by their Tag
    display(rotogrinders_df[['Tag', 'Tag2', 'Away', 'Home', 'date', 'Description']].style.apply(color_rows_by_tag, axis=1))

else:
    red_list, orange_list = [], []

### Lineup Info

In [None]:
if historic:
    # Historic runs do not look up openers.
    # NOTE(review): draftGroupId, contestDate and contestTime are only assigned on the other branch.
    opener_list = []

else:
    # Might struggle in bulk daily runs
    # Put the earliest game in row 0 so the values below come from the first game
    selected_contest_df.sort_values('game_datetime', ascending=True, inplace=True, ignore_index=True)
    first_game_info = selected_contest_df['Game Info'][0].split(" ")
    draftGroupId = selected_contest_df['draftGroupId'][0]
    contestDate, contestTime = first_game_info[1], first_game_info[2]

    # Openers: every "P" roster slot priced at 5500 or less
    draftables_path = os.path.join(baseball_path, "A01. DraftKings", "2. Draftables", f"Draftables {draftGroupId}.csv")
    draftables = pd.read_csv(draftables_path, encoding='iso-8859-1')
    cheap_pitchers = draftables.query('Salary <= 5500 and `Roster Position` == "P"')
    opener_list = list(cheap_pitchers['Name'])

### B02. Simulations

In [None]:
# %run "C:\Users\james\Documents\MLB\Code\U05. Models.ipynb"
# %run "C:\Users\james\Documents\MLB\Code\B02. Simulations.ipynb"

##### Settings

In [None]:
# Display the two filenames (both defined outside this notebook) as the cell output.
all_filename, all_adjusted_filename

In [None]:
# Weather-factor variant: the Run cell copies the '{event}_wfx_{wfx_type}_l' / '_r' columns into '{event}_wfx_l' / '_r'.
wfx_type = "adj"
# Weather-factor adjustment switch for the simulations.
# NOTE(review): check the Run cell for any re-assignment before relying on the value set here.
wfx_adjustment=True
# Debug switch (off by default).
debug = False
# Number of parallel batches (one per CPU core) and the batch_size handed to each sim_game_batch call.
num_batches, batch_size = os.cpu_count(), 63
# num_batches, batch_size = 1, 1
# num_batches, batch_size = 1, 63

# Cell output: num_batches x batch_size (presumably the number of simulated games per matchup).
num_batches * batch_size

##### Games

In [None]:
if slate_only:
    # Just the slate(s) - Note: if running multiple contestKeys, overlapping slates, we don't need to rerun the same games
    # Sort by start time first so the kept row per game_id is the earliest one
    slate_games = selected_contest_df.sort_values('game_datetime').reset_index(drop=True)
    sim_game_df = slate_games.drop_duplicates('game_id').copy()
else:
    # All games in the selected date window
    sim_game_df = game_df.copy()

##### Run

In [None]:
import os
import shutil
import time
import pandas as pd

# Simulate every game in sim_game_df, date by date.
# For each game this cell: finds its matchup workbook, builds batting orders
# and player objects, runs num_batches parallel batches of simulated games,
# then writes per-simulation batter/pitcher CSVs and one game-score CSV.

# -----------------------------------------------------------
# 0. GLOBAL SETTINGS
# -----------------------------------------------------------
# wfx_type, wfx_adjustment, debug, num_batches and batch_size are taken from
# the Settings cell; they are intentionally not re-assigned here so that the
# Settings cell is the single place to change them.

# -----------------------------------------------------------
# 1. PRINT MATCHUPS
# -----------------------------------------------------------
# Preview of the first 15 games with start times shown in US/Eastern
for _, row in sim_game_df.head(15).iterrows():
    print(f"{row['away_team']}@{row['home_team']} "
          f"{pd.to_datetime(row['game_datetime']).tz_convert('US/Eastern').strftime('%Y-%m-%d %H:%M:%S')}")

# -----------------------------------------------------------
# 2. MAIN DATE LOOP
# -----------------------------------------------------------
for date in sim_game_df['date'].astype(str).unique():
    print(date)
    matchup_dir = os.path.join(baseball_path, "B01. Matchups", f"Matchups {date}")

    # Map game_id (second whitespace-separated token of the file name) -> file stem.
    # File names without a second token are skipped instead of raising IndexError.
    matchup_dict = {}
    for file_name in os.listdir(matchup_dir):
        name_parts = file_name.split()
        if len(name_parts) > 1:
            matchup_dict[name_parts[1]] = os.path.splitext(file_name)[0]

    game_sim_dir = os.path.join(baseball_path, "B02. Simulations", "1. Game Sims", f"Matchups {date}")
    os.makedirs(game_sim_dir, exist_ok=True)

    # Load weather
    try:
        daily_weather_df = pd.read_csv(os.path.join(
            baseball_path, "A06. Weather", "3. Park and Weather Factors",
            f"Park and Weather Factors {date}.csv"
        ))
        # Expose the selected weather-factor variant under the generic column names
        for event in events_list:
            daily_weather_df[f'{event}_wfx_l'] = daily_weather_df[f'{event}_wfx_{wfx_type}_l']
            daily_weather_df[f'{event}_wfx_r'] = daily_weather_df[f'{event}_wfx_{wfx_type}_r']
        weather_lookup = {weather_row['gamePk']: weather_row for _, weather_row in daily_weather_df.iterrows()}
    except Exception as exc:
        # Best-effort: skip the whole date, but report why it was skipped
        print("Weather dataframe not created.")
        print(f"  Reason: {exc!r}")
        continue

    # Load projected lineups for today's date
    if date == todaysdate:
        projected_lineup_df = pd.read_csv(os.path.join(
            baseball_path, "A05. Rosters", "3. Projected Lineups - RotoGrinders",
            f"{date} Projected Lineups - RotoGrinders.csv"
        ))
    else:
        # NOTE(review): create_order_rg receives None for any non-today game whose
        # API batting order is incomplete — confirm it handles that.
        projected_lineup_df = None

    # Filter daily matchups
    daily_df = sim_game_df[sim_game_df['date'].astype(int) == int(date)].reset_index(drop=True)
    for _, game_row in daily_df.iterrows():
        away_team, home_team = game_row['away_team'], game_row['home_team']
        game_id, game_num = game_row['game_id'], game_row['game_num']

        matchup = matchup_dict.get(str(game_id))
        if matchup is None:
            continue
        print(date, matchup)

        # Check the weather row first so a game without one is skipped before
        # the orders, the workbook and the player objects are built.
        row_data = weather_lookup.get(game_id)
        if row_data is None:
            print("Game missing from weather dataframe.")
            continue

        # -------------------------------------------------------
        # CREATE ORDERS AND PLAYER OBJECTS
        # -------------------------------------------------------
        # A complete batting order has slots 1..9, which sum to 45;
        # otherwise fall back to the RotoGrinders projected lineup.
        away_order_df = create_order_api(date, away_team, game_id)
        if away_order_df['batting_order'].sum() != 45:
            away_order_df = create_order_rg(projected_lineup_df, away_team, game_num)

        home_order_df = create_order_api(date, home_team, game_id)
        if home_order_df['batting_order'].sum() != 45:
            home_order_df = create_order_rg(projected_lineup_df, home_team, game_num)

        matchup_path = os.path.join(matchup_dir, f"{matchup}.xlsx")
        matchup_xl = pd.read_excel(matchup_path, sheet_name=None)

        # The batting order comes from the order dataframes, not from the workbook
        away_batter_df = matchup_xl["AwayBatters"].drop(columns="batting_order", errors="ignore")
        home_batter_df = matchup_xl["HomeBatters"].drop(columns="batting_order", errors="ignore")
        away_pitcher_df = matchup_xl["AwayPitchers"]
        home_pitcher_df = matchup_xl["HomePitchers"]

        AwayBatters = create_batter_objects(
            away_batter_df, away_order_df, scale_batter_stats, scale_batter_stats_steamer, impute_batter_stats
        )
        HomeBatters = create_batter_objects(
            home_batter_df, home_order_df, scale_batter_stats, scale_batter_stats_steamer, impute_batter_stats
        )
        AwayPitchers = create_pitcher_objects(
            away_pitcher_df, scale_pitcher_stats, scale_pitcher_stats_steamer, impute_pitcher_stats
        )
        HomePitchers = create_pitcher_objects(
            home_pitcher_df, scale_pitcher_stats, scale_pitcher_stats_steamer, impute_pitcher_stats
        )

        park_object = Park(**row_data.to_dict())
        # NOTE(review): game_template is not used below; kept in case Scoreboard's
        # constructor has side effects — remove if it has none.
        game_template = Scoreboard(AwayBatters, HomeBatters, AwayPitchers, HomePitchers, 9)

        # -------------------------------------------------------
        # SIMULATE GAMES
        # -------------------------------------------------------
        sim_start_time = time.time()

        game_list_batches = Parallel(n_jobs=num_batches, verbose=5, backend='loky')(
            delayed(sim_game_batch)(
                opener_list,
                AwayBatters,
                HomeBatters,
                AwayPitchers,
                HomePitchers,
                park_object,
                innings=9,
                wfx_adjustment=wfx_adjustment,
                debug=debug,
                batch_size=batch_size
            )
            for _ in range(num_batches)
        )

        # Flatten the list of lists
        game_list = [g for batch in game_list_batches for g in batch]
        print("Simulating", time.time() - sim_start_time)

        # -------------------------------------------------------
        # SAVE RESULTS
        # -------------------------------------------------------
        start_time = time.time()
        player_path = os.path.join(baseball_path, "B02. Simulations", "2. Player Sims", f"Matchups {date}", matchup)
        # Start from an empty folder so files from an earlier run cannot be mixed in
        if os.path.exists(player_path):
            shutil.rmtree(player_path)
        os.makedirs(player_path, exist_ok=True)

        # One score record per simulated game; turned into a single dataframe after the loop
        score_rows = []

        for i, game_object in enumerate(game_list):
            # Keep only selected attributes
            for batter in game_object.away_batters + game_object.home_batters:
                batter.keep_selected_attributes()
            for pitcher in game_object.away_pitchers + game_object.home_pitchers:
                pitcher.keep_selected_attributes()
            game_object.keep_selected_attributes()

            # Store game scores
            score_rows.append({
                "away_score": game_object.away_score,
                "home_score": game_object.home_score
            })

            # Save batter stats
            away_batters_df = pd.DataFrame([vars(b) for b in game_object.away_batters])
            away_batters_df['team'] = "away"
            home_batters_df = pd.DataFrame([vars(b) for b in game_object.home_batters])
            home_batters_df['team'] = "home"
            batters_df = pd.concat([away_batters_df, home_batters_df], axis=0)
            batters_df.to_csv(os.path.join(player_path, f"batters_{i}.csv"), index=False)

            # Save pitcher stats
            away_pitchers_df = pd.DataFrame([vars(p) for p in game_object.away_pitchers])
            away_pitchers_df['team'] = "away"
            home_pitchers_df = pd.DataFrame([vars(p) for p in game_object.home_pitchers])
            home_pitchers_df['team'] = "home"
            pitchers_df = pd.concat([away_pitchers_df, home_pitchers_df], axis=0)
            pitchers_df.to_csv(os.path.join(player_path, f"pitchers_{i}.csv"), index=False)

        # Save game scores
        game_scores_df = pd.DataFrame(score_rows, columns=["away_score", "home_score"])
        game_scores_df.to_csv(os.path.join(game_sim_dir, f"game_{game_id}.csv"), index=False)

        print(f"{away_team}: {round(game_scores_df['away_score'].mean(), 2)}",
              f"{home_team}: {round(game_scores_df['home_score'].mean(), 2)}")
        print("Saving", time.time() - start_time)


### B03. Lineups

In [None]:
%run "B03. Optimizer.ipynb"

In [None]:
# Pareto Set
pareto_set = ['Plus3', 'batter rostership']
sense_list = ['Max', 'Min']
# Lineup ranking method
sort_by = ['pareto', 'Plus3']
ascending_list = [False, False]

# Options
sort_by_list = ['P50', 'P75', 'P90', 'P95', 'P99', 'P100', 'Tail', 'Sim STD', 'Plus2', 'Plus3', 
                'Top1%', 'Top5%', 'Top10%', 'Top20%', 'Top50%', 'rostership', 'pitcher rostership', 'batter rostership', 'pareto']


# Set maximum ownership by position group 
max_exposure_batters, max_exposure_pitchers = 0.5, 0.7

# Share of each stack type
stack_dictionary = {"5-2-1":   0.37,
                    "5-3":     0.23,
                    "5-1-1-1": 0.18,
                    "4-3-1":   0.11,
                    "4-2-1-1": 0.11}

# Number of lineups to create
num_lineups = 1000

##### Run

##### 1. Players

In [None]:
# Build and save one player file per selected contest
player_columns = ['contestKey', 'draftGroupId', 'roto_slate']
for _, row in selected_contest_df.drop_duplicates(subset=['contestKey'])[player_columns].iterrows():
    contestKey, draftGroupId, roto_slate = (row[column] for column in player_columns)
    print(contestKey)
    # All rows of the selection that belong to this contest
    guide = selected_contest_df[selected_contest_df['contestKey'] == contestKey].reset_index(drop=True)
    # Create draftables with sims
    draftables_with_sims = create_player_file(
        contestKey, guide, draftGroupId, roto_slate, max_exposure_pitchers, max_exposure_batters,
        projections='robot', rostership='roto', ownership_spread=0.25
    )
    # Write to CSV
    players_csv = os.path.join(baseball_path, "B03. Lineups", "1. Players", f"Players {contestKey}.csv")
    draftables_with_sims.to_csv(players_csv, index=False, encoding='iso-8859-1')

In [None]:
# Presumably renders a shortcut to open the player file in Excel (excel_button is defined outside this notebook).
# contestKey is whatever the loop above left behind, i.e. the last contest processed.
excel_button(os.path.join(baseball_path, "B03. Lineups", "1. Players", f"Players {contestKey}.csv"))

##### 2. Lineups

In [None]:
%%time
# For every selected contest: request lineups per stack type with the "Max"
# constraint set, retry the stack types that failed with the "Min" set,
# then write all results.
for _, row in selected_contest_df.drop_duplicates(subset=['contestKey'])[['contestKey', 'slate_size']].iterrows():
    contestKey, slate_size = row['contestKey'], row['slate_size']
    print(contestKey, slate_size)

    # One (stack shape, lineup count) pair per stack_dictionary entry, in dictionary order.
    # "5-2-1" -> [5, 2, 1]; the count is that stack's share of num_lineups, rounded up.
    # Deriving both from stack_dictionary keeps the constraint sets in sync with it.
    stack_targets = [
        ([int(size) for size in stack_name.split("-")], math.ceil(share * num_lineups))
        for stack_name, share in stack_dictionary.items()
    ]

    # Define the constraints: positional parameter tuples for create_lineups2.
    # The "Max" set passes the red and orange team lists plus the values 4 and 10;
    # the "Min" set passes only the red list and zeros in those positions.
    # list(stack) gives every tuple its own list, as the original literals did.
    maximum_constraints = [
        (contestKey, 49000, 4, list(stack), red_list + orange_list, 10, None, 0.2, 0.05, stack_lineups, "Max")
        for stack, stack_lineups in stack_targets
    ]

    minimum_constraints = [
        (contestKey, 49000, 0, list(stack), red_list, 0, None, 0.2, 0.05, stack_lineups, "Min")
        for stack, stack_lineups in stack_targets
    ]

    # Create lineups with maximum constraints
    print("Attempting Maximum Constraints.")
    optimizers = Parallel(n_jobs=-1, backend="threading", verbose=0)(delayed(create_lineups2)(params) for params in maximum_constraints)

    # A string result is treated as an error message: print it and remember
    # which stack type failed so the matching minimum constraint can be tried
    failed_max_indices = [i for i, optimizer in enumerate(optimizers) if isinstance(optimizer, str)]
    failed_max_constraints = [maximum_constraints[i] for i in failed_max_indices]
    for i in failed_max_indices:
        print(optimizers[i])

    # Combine the optimizers from maximum constraints.
    # NOTE(review): the error strings of failed attempts stay in this list, as before;
    # confirm write_lineups ignores them.
    combined_optimizers = list(optimizers)

    # If there are any failed constraints, attempt to run the corresponding minimum constraints for them
    if failed_max_indices:
        print("Maximum Constraints Failed. Attempting Minimum Constraints for failed stacks.")

        # Create lineups with corresponding minimum constraints for failed stacks
        min_optimizers = Parallel(n_jobs=-1, backend="threading", verbose=0)(
            delayed(create_lineups2)(minimum_constraints[i]) for i in failed_max_indices)

        # Print errors for minimum constraints
        for optimizer in min_optimizers:
            if isinstance(optimizer, str):
                print(optimizer)

        # Combine the optimizers from minimum constraints with the maximum ones
        combined_optimizers.extend(min_optimizers)

    # Write to CSV. Best-effort per contest: keep going with the next contest,
    # but report the cause and let KeyboardInterrupt / SystemExit propagate.
    try:
        write_lineups(combined_optimizers)
    except Exception as exc:
        print(f"Couldn't run contest {contestKey}")
        print(f"  Reason: {exc!r}")

##### 3. Lineups Ranked

In [None]:
%%time
# Rank the lineups of every selected contest and save one CSV per contest
ranked_dir = os.path.join(baseball_path, "B03. Lineups", "3. Lineups Ranked")
for _, row in selected_contest_df.drop_duplicates(subset=['contestKey'])[['contestKey', 'roto_slate']].iterrows():
    contestKey, roto_slate = row['contestKey'], row['roto_slate']
    print(contestKey)
    lineups_ranked = choose_lineups(contestKey, roto_slate, pareto_set, sense_list, sort_by, ascending_list)
    ranked_csv = os.path.join(ranked_dir, f"Lineups Ranked {contestKey}.csv")
    lineups_ranked.to_csv(ranked_csv, index=False)

    # After saving, move the existing index into an 'index' column (read by the plot's hover data)
    lineups_ranked.reset_index(drop=False, inplace=True)

In [None]:
# Presumably renders a shortcut to open the ranked-lineups file in Excel (excel_button is defined outside this notebook).
# contestKey is the last contest processed by the loop above.
excel_button(os.path.join(baseball_path, "B03. Lineups", "3. Lineups Ranked", f"Lineups Ranked {contestKey}.csv"))

##### Plot

In [None]:
# Scatter of the ranked lineups of the last contest processed: Plus3 against batter rostership, coloured by 'pareto'.
fig = px.scatter(
    lineups_ranked,
    y='Plus3', # should this be sort_by?
    x='batter rostership',
    color='pareto',
    hover_data={'index': True,'P': True,'P.1': True,'1B': True,'2B': True,'3B': True,'SS': True,'OF': True,'OF.1': True,'OF.2': True,'Wins': True,'Top1%': True},
    title='Scatter Plot of Plus3 vs Batter Rostership',
    labels={'Plus3': 'Plus3', 'batter rostership': 'Batter Rostership'})

# Update the size of the figure and reverse the x-axis (the y-axis is left as-is)
fig.update_layout(width=1000, height=1000, xaxis=dict(autorange='reversed'))  # Reversed x-axis: lower batter rostership plots further right
fig.update_traces(marker=dict(size=10))

fig.show()

##### 4. Uploads

In [None]:
# Build the upload table for the last contest processed and save its first 50 rows
upload = create_upload_file(contestKey, sort_by)
upload_csv = os.path.join(baseball_path, "B03. Lineups", "4. Uploads", f"Upload {contestKey}.csv")
upload.head(50).to_csv(upload_csv, index=False)

##### 5. Entries

In [None]:
# Build the entry table for the current draft group / contest and save it, one file per draftGroupId
entry = create_entry_file(draftGroupId, contestKey)
entries_csv = os.path.join(baseball_path, "B03. Lineups", "5. Entries", f"Entries {draftGroupId}.csv")
entry.to_csv(entries_csv, index=False, encoding='iso-8859-1')

##### Upload

In [None]:
# Presumably submits the saved entries for this draft group (upload_entries is defined outside this notebook).
# NOTE(review): this cell also runs when historic is True — confirm that is intended.
upload_entries(draftGroupId)

##### Email

In [None]:
# Presumably emails the upload file (email_upload_file is defined outside this notebook).
# NOTE(review): contestTime is only assigned in the Lineup Info cell when historic is False,
# so on a historic run this line raises NameError unless contestTime already exists in the session.
email_upload_file(draftGroupId, contestKey, contestTime)