In [41]:
import pandas as pd
import numpy as np
import pickle as pkl
import warnings
import datetime
import numpy as np
from numpy import random
from datetime import datetime as dt
import datetime

from build_datasets.dataset_builder import DatasetBuilder
from utils import convert_rotowire_weather_to_proference


warnings.simplefilter('ignore')

# Code to Build Data Overnight -- NEED TO INCLUDE PITCH COLLECTION FROM MLB DATA COLLECTION

In [2]:
# Load in the Model
with open('../train_models/data/models/final_model_xgb.pkl', 'rb') as fpath:
    model = pkl.load(fpath)

# Load in the Label Encoder
with open('../train_models/data/y-label_encoder.pkl', 'rb') as fpath:
    encoder = pkl.load(fpath)

## Get the most recent stats for each player

In [None]:
class GameSimulation():
    def __init__(self, date, home_team, lineup_dict, PA_model=model, encoder=encoder, verbose=False):
        self.date = date
        self.year, self.month, self.day = str(self.date.year), str(self.date.month), str(self.date.day)
        self.PA_model = PA_model
        self.encoder = encoder
        self.verbose = verbose
        self.lineup_dict = lineup_dict
        self.home_lineup = self.lineup_dict['home_lineup']
        self.away_lineup = self.lineup_dict['away_lineup']
        self.home_pitcher = self.lineup_dict['home_pitcher']['id']
        self.away_pitcher = self.lineup_dict['away_pitcher']['id']

        # Grab the daily rolled stats
        with open(f'../../../../MLB-Data/daily_stats_dfs/daily_stats_df_updated_{self.year.zfill(2)}-{self.month.zfill(2)}-{self.day.zfill(2)}.pkl', 'rb') as fpath:
            self.daily_dataset = pkl.load(fpath)
            self.daily_dataset = self.daily_dataset.drop(columns = ['play_type', 'is_on_base'])

        # Grab the expected weather data
        with open(f'../../../../MLB-Data/rotowire_weather_data/weather_data_updated_{self.year.zfill(2)}-{self.month.zfill(2)}-{self.day.zfill(2)}', 'rb') as fpath:
            self.expected_weather = pkl.load(fpath)

        # Grab the team name conversions for the weather conversion
        self.name_conversions = pd.read_excel('../build_datasets/data/non_mlb_data/Ballpark Info.xlsx', header=2)

        # Get the individual stats for each batter and pitcher + the league at the given moment, so we can pull them as needed for a given at bat
        batter_stats = self.daily_dataset.groupby(by='batter').last()
        batter_stats = batter_stats[[col for col in batter_stats.columns if 'PA' in col and 'LA' not in col and 'pitcher' not in col]]

        pitcher_stats = self.daily_dataset.groupby(by='pitcher').last()
        pitcher_stats = pitcher_stats[[col for col in pitcher_stats.columns if 'PA' in col and 'LA' not in col and 'pitcher' in col]]

        LA_stats = self.daily_dataset.iloc[-1] #Just pull the final row bc we know that all the LA columns are the same for the given day
        LA_stats = LA_stats[[col for col in LA_stats.index if 'LA' in col]]

        # Turn the stats into dicts for faster access
        self.batter_stats = dict(batter_stats.T)
        self.pitcher_stats = dict(pitcher_stats.T)
        self.LA_stats = pd.Series(LA_stats)

        # Finally, Initialize the game state
        self.home_team = home_team
        self.home_park = self.name_conversions[self.name_conversions['Full Name'] == self.home_team].Stadium.iloc[0]
        self.inning = 1
        self.inning_topbot = 1 # 1 for top 0 for bottom
        self.pitbat = None
        self.on_3b = 0
        self.on_2b = 0
        self.on_1b = 0
        self.outs_when_up = 0
        self.bat_score = 0
        self.field_score = 0
        self.score_tracker = {'home':0, 'away':0}
        
        # Initialize Weather info
        self.weather_row = self.expected_weather[self.expected_weather.game_id.str.contains(self.home_team)].iloc[0] # Just grabs the first game - this 'fails' if 2x header
        self.converted_weather = convert_rotowire_weather_to_proference(self.weather_row)

        # Initialize handedness dictionaries
        self.batter_handedness = self.daily_dataset.groupby('batter')['pitbat'].apply(lambda x: x.iloc[-1][0]).to_dict()
        self.pitcher_handedness = self.daily_dataset.groupby('pitcher')['pitbat'].apply(lambda x: x.iloc[-1][1]).to_dict()
    
    def _get_pitbat(self, batter_id, pitcher_id):
        batter_hand = self.batter_handedness.get(batter_id, "X")
        pitcher_hand = self.pitcher_handedness.get(pitcher_id, "X")
        return batter_hand + pitcher_hand
    
    def make_PA_row(self, batter_id, pitcher_id):
        data = {
            'ballpark': self.home_park,
            'batter': batter_id,
            'pitcher': pitcher_id,
            'pitbat': self._get_pitbat(batter_id, pitcher_id),
            'on_3b': self.on_3b,
            'on_2b': self.on_2b,
            'on_1b': self.on_1b,
            'outs_when_up': self.outs_when_up,
            'inning': self.inning,
            'inning_topbot': self.inning_topbot,
            'bat_score': self.bat_score,
            'fld_score': self.field_score
            }

        series = pd.Series(data)

        # Insert the batting, pitching, and LA stats
        batter_stats = self.batter_stats[batter_id]
        pitcher_stats = self.pitcher_stats[pitcher_id]
        LA_stats = self.LA_stats

        # Insert the weather data
        weather_dict = self.converted_weather
        weather_series = pd.Series(weather_dict)
        series = pd.concat([series, batter_stats, pitcher_stats, LA_stats, weather_series])
        self.current_PA = series.to_frame().T
        
        # # Make sure all the columns are in the original order
        self.current_PA = self.current_PA[self.daily_dataset.columns]
           
    def update_current_PA(self, batter_id, pitcher_id):
        batter_stats = self.batter_stats[batter_id]
        pitcher_stats = self.pitcher_stats[pitcher_id]

        # Create a new temporary series with the batter stats and pitcher stats
        batter_stats.loc[batter_stats.index] = batter_stats
        pitcher_stats.loc[pitcher_stats.index] = pitcher_stats
    
    def predict_PA(self):
        probabilities = self.PA_model.predict_proba(self.current_PA).flatten()
        outcome = np.random.choice(self.encoder.categories_[0], p=probabilities)
        return outcome
       

    def simulate_game(self):
        # Define which batter is currently batting in the lineup
        self.lineup_tracker = {'home':1, 'away':1}
        self.inning = 1
        self.on_1b = 0
        self.on_2b = 0
        self.on_3b = 0
        self.score_tracker = {'home':0, 'away':0}

        # Start with away team batting
        while self.inning <= 9:
            # Start the away team's half inning
            self.batting_team = 'away'
            self.inning_topbot = 1
            self.bat_score = self.score_tracker['away']
            self.field_score = self.score_tracker['home']
            if self.verbose:
                print(f"Inning {self.inning}, Away Team")
            self.simulate_inning(self.away_lineup, self.batting_team)
            
            # If it's not the 9th inning, switch to the home team's turn
            if self.inning < 9:
                self.batting_team = 'home'
                self.inning_topbot = 0
                self.bat_score = self.score_tracker['home']
                self.field_score = self.score_tracker['away']
                if self.verbose:
                    print(f"Inning {self.inning}, Home Team")
                self.simulate_inning(self.home_lineup, self.batting_team)
            
            self.inning += 1
            if self.verbose:
                print(f"Score after inning {self.inning - 1}: Away - {self.score_tracker['away']}, Home - {self.score_tracker['home']}\n")

    def simulate_inning(self, lineup, team_type):
        # Track outs in an inning
        self.outs_when_up = 0

        while self.outs_when_up < 3:  # Three outs in an inning
            batter_id = float(lineup[self.lineup_tracker[team_type]]['id'])
            pitcher_id = float(self.home_pitcher if team_type == 'away' else self.away_pitcher)

            # Create the at-bat row for the batter and pitcher
            self.make_PA_row(batter_id, pitcher_id)
            
            # Predict the outcome of the at-bat
            outcome = self.predict_PA()

            # Handle the outcome accordingly
            self.handle_outcome(outcome, team_type)
            
            # Move to the next batter (circular lineup)
            self.lineup_tracker[team_type] = (self.lineup_tracker[team_type] + 1) % 9
            self.lineup_tracker[team_type] = self.lineup_tracker[team_type] + 1 if self.lineup_tracker[team_type] == 0 else self.lineup_tracker[team_type]

        # Reset the game state
        self.outs_when_up = 0
        self.on_1b = 0
        self.on_2b = 0
        self.on_3b = 0
        
    def handle_outcome(self, outcome, team_type):
        if outcome == 'strikeout':
            self.outs_when_up += 1  # Increment outs when batter is out
        elif outcome == 'field_out':
            self.outs_when_up += 1  # Increment outs when batter is out
        elif outcome == 'walk':
            if self.on_1b == 0: # Bases Empty
                self.on_1b == 1
            elif self.on_2b == 0: # Just man on 1b
                self.on_2b == 1
            elif self.on_3b == 0: # Men on 1b and 2b
                self.on_3b == 1
            else: # Bases loaded
                self.score_tracker[team_type] += 1
        elif outcome == 'single':
            self.handle_base_hit(1)  # Handle single (advance to 1st base)
        elif outcome == 'double':
            self.handle_base_hit(2)  # Handle double (advance to 2nd base)
        elif outcome == 'home_run':
            self.handle_home_run()  # Handle home run (score and reset bases)
        elif outcome == 'error':
            # Handle error (place batter on base without advancing outs)
            error_value = np.random.random()
            if error_value > 0.75:
                self.handle_base_hit(2) # 2 base error with 25% chance
            else:
                self.handle_base_hit(1) # 1 base error with 75% chance
        elif outcome == 'double_play':
            self.outs_when_up += 2  # Double play = two outs
            if self.on_1b and self.on_2b:  # DP with runners on 1st and 2nd
                self.on_1b = 0  # Remove runner on 1st
                self.on_2b = 0  # Remove runner on 2nd
            elif self.on_2b and self.on_3b:  # DP with runners on 2nd and 3rd
                self.on_2b = 0  # Remove runner on 2nd
                self.on_3b = 0  # Remove runner on 3rd
            elif self.on_1b and self.on_3b:  # NOT adjacent, remove 1st base only
                self.on_1b = 0  # Remove runner on 1st
                if self.outs_when_up < 3: # Then the runner on 3b can score
                    self.on_3b = 0
                    self.score_tracker[team_type] += 1
            else:  # If only one runner is on base, remove them
                if self.on_3b:
                    self.on_3b = 0
                elif self.on_2b:
                    self.on_2b = 0
                elif self.on_1b:
                    self.on_1b = 0
        elif outcome == 'sacrifice':
            # Advance all runners by one base
            self.outs_when_up += 1
            if self.on_3b:
                self.on_3b = 0
                self.score_tracker[team_type] += 1
            if self.on_2b:
                self.on_2b = 0
                self.on_3b = 1
            if self.on_1b:
                self.on_1b = 0
                self.on_2b = 1
        elif outcome == 'triple':
            self.handle_base_hit(3)  # Handle triple (advance to 3rd base)

    def handle_base_hit(self, bases):
        # Handle moving runners based on the type of base hit (single, double, triple)
        if bases == 1:
            self.on_1b = 1  # Batter advances to 1st base
        elif bases == 2:
            self.on_2b = 1  # Batter advances to 2nd base
        elif bases == 3:
            self.on_3b = 1  # Batter advances to 3rd base

        # Move any existing runners on base
        self.advance_runners(bases)

    def advance_runners(self, bases):
        # Update base runners based on the type of hit
        # Update base runners based on the type of hit
        if bases == 1:  # Single
            if self.on_3b:
                self.score_tracker[self.batting_team] += 1  # Run scores from 3rd base
                self.on_3b = 0

            runner_on_2b_scores = self.on_2b and random.random() < 0.50  # 50% chance to score
            runner_on_1b_scores = self.on_1b and random.random() < 0.15  # 15% chance to score
            runner_on_1b_advances = self.on_1b and runner_on_2b_scores and random.random() < 0.40  # Can only advance if 2nd scores

            if runner_on_2b_scores:
                self.score_tracker[self.batting_team] += 1  # Score from 2nd
                self.on_2b = 0  # Clear 2nd base

            if runner_on_1b_scores:
                self.score_tracker[self.batting_team] += 1  # Score from 1st
                self.on_1b = 0  # Clear 1st base
            elif runner_on_1b_advances:
                self.on_3b = self.on_1b  # Move to 3rd
                self.on_1b = 0  # Clear 1st base
            else:
                self.on_2b = self.on_1b  # Normal advance to 2nd

            self.on_1b = 1  # Batter takes 1st base

        elif bases == 2:  # Double
            if self.on_3b:
                self.score_tracker[self.batting_team] += 1  # Run scores from 3rd base
                self.on_3b = 0

            if self.on_2b:
                self.score_tracker[self.batting_team] += 1  # Run scores from 2nd base
                self.on_2b = 0  # Clear 2nd base since they scored

            runner_on_1b_scores = self.on_1b and random.random() < 0.50  # 50% chance to score from 1st

            if runner_on_1b_scores:
                self.score_tracker[self.batting_team] += 1  # Runner from 1st scores
                self.on_1b = 0  # Clear 1st base
            else:
                self.on_3b = self.on_1b  # Move runner from 1st to 3rd

            self.on_2b = 1  # Batter takes 2nd base
            self.on_1b = 0  # 1st base remains empty after a double

        elif bases == 3:  # Triple
            if self.on_3b != 0:
                self.score_tracker[self.batting_team] += 1  # Run scores from 3rd base
            if self.on_2b != 0:
                self.score_tracker[self.batting_team] += 1  # Run scores from 2nd base
            if self.on_1b != 0:
                self.score_tracker[self.batting_team] += 1
            self.on_3b = 1
            self.on_2b = 0
            self.on_1b = 0

    def handle_home_run(self):
        # Home run: Score the batter and clear the bases
        self.score_tracker[self.batting_team] += sum([self.on_1b, self.on_2b, self.on_3b, 1])
        self.on_3b = 0
        self.on_2b = 0
        self.on_1b = 0


In [133]:
with open('../../../../MLB-Data/expected_lineups/expected_lineups_2025-03-03', 'rb') as fpath:
    lineups = pkl.load(fpath)
    lineup = lineups['lineups'][0]
    
date = dt.today() - datetime.timedelta(1)


In [134]:
%%capture
home_scores = []
away_scores = []

for i in range(5):
   lineup = lineups['lineups'][i]
   game = GameSimulation(date, 'Astros', lineup)
   for n in range(10):
      # game = GameSimulation(date, 'Astros', lineup)
      game.simulate_game()

      score = game.score_tracker
      home_scores.append(score['home'])
      away_scores.append(score['away'])

   print(lineup['home_team'], lineup['away_team'])
   print(sum(home_scores) / len(home_scores))
   print(sum(away_scores) / len(away_scores))

TypeError: GameSimulation.handle_outcome() takes 3 positional arguments but 5 were given

In [125]:
print(sum(home_scores) / len(home_scores))
print(sum(away_scores) / len(away_scores))

5.43
6.01


In [127]:
lineups

{'games': [('Tigers', 'Orioles'),
  ('Twins', 'Reds'),
  ('Pirates', 'Royals'),
  ('Phillies', 'Mets'),
  ('Nationals', 'Marlins'),
  ('Yankees', 'Red Sox'),
  ('Blue Jays', 'Cardinals'),
  ('Guardians', 'Rays'),
  ('White Sox', 'Athletics'),
  ('Rockies', 'Cubs'),
  ('Giants', 'Padres'),
  ('Angels', 'Astros'),
  ('Mariners', 'Rangers'),
  ('D-backs', 'Brewers'),
  ('Braves', 'Dodgers')],
 'lineups': {0: {'home_team': 'Tigers',
   'away_team': 'Orioles',
   'home_pitcher': {'name': 'Keider Montero', 'id': '672456'},
   'away_pitcher': {'name': 'Cade Povich', 'id': '700249'},
   'stadium': 'Comerica Park',
   'home_lineup': {1: {'position': '3B',
     'player': 'Andy Ibáñez',
     'id': '628451'},
    2: {'position': 'DH', 'player': 'Justyn-Henry Malloy', 'id': '669234'},
    3: {'position': 'LF', 'player': 'Riley Greene', 'id': '682985'},
    4: {'position': 'RF', 'player': 'Matt Vierling', 'id': '663837'},
    5: {'position': '2B', 'player': 'Colt Keith', 'id': '690993'},
    6: {'po

## Scrape Lineups from MLB.com
We will likely need to replace with rotowire or MLB game cards 
or something else with bench players once the season starts

# Simulate a Specific Game

#### Grab just the stats for the specific game to speed up filtering for each batter

need to do the following:
- find a way to not throw an error for players making debut, and then impute stats for them

In [None]:
df = mlb_scrape('2024-09-15')


In [207]:
# Segment the home and away lineups (IDs) #MAKE DYNAMIC
home_lineup = [int(player['id']) for player in lineups['lineups'][0]['home_lineup'].values()]
away_lineup = [int(player['id']) for player in lineups['lineups'][0]['away_lineup'].values()]
total_lineup = home_lineup + away_lineup # THIS DOES NOT ACCOUNT FOR PLAYERS TO COME OFF BENCH. MAKE SURE TO SCRAPE THEIR IDS TOO IN THE GET_LINEUPS.PY

# Segment the home and away Pitchers (IDs)
home_starting_pitchers = int(lineups['lineups'][0]['home_pitcher']['id'])
away_starting_pitchers = int(lineups['lineups'][0]['away_pitcher']['id'])
total_pitchers = [home_starting_pitchers, away_starting_pitchers] # THIS DOES NOT ACCOUNT FOR PLAYERS TO COME OFF BENCH. MAKE SURE TO SCRAPE THEIR IDS TOO IN THE GET_LINEUPS.PY

# Grab the stats rows belonging to each batter
filtered_batter_stats = batter_stats.loc[total_lineup]

# Grab the stats rows belonging to each pitcher
filtered_pitcher_stats = pitcher_stats.loc[total_pitchers]

# Grab the weather line from the game
with open('data/raw_weather/weather_data_2024', 'rb') as fpath:
    weather = pkl.load(fpath)

game_weather = weather[(weather.game_id.str.contains(f'{year}-{month}-{day}')) & ((weather.game_id.str.contains(lineups['games'][0][0])) |
                                                                                 (weather.game_id.str.contains(lineups['games'][0][1])))]

NameError: name 'lineups' is not defined

In [None]:
weather

Unnamed: 0.1,Unnamed: 0,game_id,rain_percentage,temprature,wind_speed,wind_direction,is_dome
0,0,Dodgers @ Blue Jays on 2024-04-27 3:07 PM,"[0.0, 0.0, 0.0, 0.0]","[72.0, 72.0, 72.0, 72.0]","[0.0, 0.0, 0.0, 0.0]","[None, None, None, None]","[True, True, True, True]"
1,1,A's @ Orioles on 2024-04-27 4:05 PM,"[0.0, 35.0, 5.0, 7.0]","[55.0, 59.0, 59.0, 58.0]","[8.0, 10.0, 9.0, 9.0]","['right to left', 'R-L', 'Out', 'Out']","[False, False, False, False]"
2,2,Reds @ Rangers on 2024-04-27 4:05 PM,"[0.0, 0.0, 0.0, 0.0]","[72.0, 72.0, 72.0, 72.0]","[0.0, 0.0, 0.0, 0.0]","[None, None, None, None]","[True, True, True, True]"
3,3,Cardinals @ Mets on 2024-04-27 4:05 PM,"[0.0, 6.0, 12.0, 5.0]","[56.0, 57.0, 55.0, 53.0]","[17.0, 11.0, 13.0, 13.0]","['right to left', 'R-L', 'R-L', 'R-L']","[False, False, False, False]"
4,4,Cubs @ Red Sox on 2024-04-27 4:10 PM,"[0.0, 0.0, 0.0, 0.0]","[58.0, 63.0, 61.0, 60.0]","[17.0, 10.0, 10.0, 10.0]","['right to left', 'R-L', 'R-L', 'Out']","[False, False, False, False]"
...,...,...,...,...,...,...,...
1462,1462,Braves @ Padres on 2024-10-01 8:38 PM,"[0.0, 0.0, 0.0, 0.0]","[73.0, 72.0, 69.0, 67.0]","[8.0, 7.0, 6.0, 5.0]","['left to right', 'L-R', 'L-R', 'L-R']","[False, False, False, False]"
1463,1463,Tigers @ Astros on 2024-10-02 2:32 PM,"[0.0, 0.0, 0.0, 0.0]","[72.0, 72.0, 72.0, 72.0]","[0.0, 0.0, 0.0, 0.0]","[None, None, None, None]","[True, True, True, True]"
1464,1464,Royals @ Orioles on 2024-10-02 4:38 PM,"[18.0, 16.0, 16.0, 16.0]","[70.0, 69.0, 70.0, 68.0]","[2.0, 3.0, 2.0, 2.0]","['right to left', '', '', '']","[False, False, False, False]"
1465,1465,Mets @ Brewers on 2024-10-02 7:38 PM,"[0.0, 0.0, 0.0, 0.0]","[72.0, 72.0, 72.0, 72.0]","[0.0, 0.0, 0.0, 0.0]","[None, None, None, None]","[True, True, True, True]"


In [None]:
weather[(weather.game_id.str.contains(f'{year}-{month}-{day}'))]

Unnamed: 0.1,Unnamed: 0,game_id,rain_percentage,temprature,wind_speed,wind_direction,is_dome


In [None]:
# Predict a PA # IF SLOW CONVERT THE DF TO A DICT

# Combine the rolled batter/pitcher stats
batter = home_lineup[0]
pitcher = away_starting_pitchers

batter_df = filtered_batter_stats.loc[batter]
pitcher_df = filtered_pitcher_stats.loc[pitcher]

prediction_dataset = pd.concat([batter_df, pitcher_df, LA_stats])

### Add in the outside factors like ballpark, weather, etc. ###
prediction_dataset['ballpark'] = 'Target'
prediction_dataset['batter'] = 0
prediction_dataset['pitcher'] = 1
prediction_dataset['pitbat'] = 'RR'
prediction_dataset['on_3b'] = 1
prediction_dataset['on_2b'] = 0
prediction_dataset['on_1b'] = 0
prediction_dataset['outs_when_up'] = 0
prediction_dataset['inning'] = 9
prediction_dataset['inning_topbot'] = 0
prediction_dataset['bat_score'] = 0
prediction_dataset['fld_score'] = 0
prediction_dataset['Left to Right'] = 0
prediction_dataset['Right to Left'] = 0
prediction_dataset['in'] = 0
prediction_dataset['out'] = 0
prediction_dataset['zero'] = 0
prediction_dataset['temprature_sq'] = 4900

NameError: name 'home_lineup' is not defined

In [None]:
predictions = model.predict_proba(prediction_dataset.to_frame().T)

In [None]:
def generate_play_from_prediction_probas(prediction_probas):
    values = range(12)
    play_num = np.random.choice(values, p=prediction_probas.flatten())

    return encoder.inverse_transform(np.array([play_num]).reshape(-1, 1))[0][0]

l = []
for i in range(10000):
    play = generate_play_from_prediction_probas(predictions)
    l.append(play)
x = pd.Series(l)

NameError: name 'predictions' is not defined

In [None]:
x.value_counts()

field_out          2399
sacrifice          2074
single             1582
strikeout          1118
walk                727
fielders_choice     582
double_play         547
double              401
home_run            360
intent_walk          76
error                71
triple               63
Name: count, dtype: int64