In [134]:
import pandas as pd
import numpy as np
import pickle as pkl
import warnings
import datetime

from build_datasets.build_datasets import dataset_builder
from get_lineups import mlb_scrape

warnings.simplefilter('ignore')

# Code to Build Data Overnight -- NEED TO INCLUDE PITCH COLLECTION FROM MLB DATA COLLECTION

In [182]:
# Load in the Model
with open('../train_models/data/models/final_model_xgb.pkl', 'rb') as fpath:
    model = pkl.load(fpath)

# Load in the Label Encoder
with open('../train_models/data/y-label_encoder.pkl', 'rb') as fpath:
    encoder = pkl.load(fpath)

In [186]:
# Code to pull in the raw pitches dynamically
year = '2024'
month = '05'
day = '01'

# today = datetime.datetime.today()
# year, month, day = str(today.year), str(today.month), str(today.day)

def build_raw_pitches_df(year, month, day, years_prior = 3):
    pitches_holder = []

    base_year = int(year) - years_prior
    for n in range(base_year, int(year)+1):
        with open(f'../data/raw_pitches/pitches_{n}.pkl', 'rb') as fpath:
            pitches_df = pkl.load(fpath)
            if n != int(year):
                pitches_holder.append(pitches_df)
            else:
                date = f'{year}-{month}-{day}'
                pitches_df = pitches_df[pitches_df.game_date < date]
                pitches_holder.append(pitches_df)

    raw_pitches = pd.concat([df for df in pitches_holder])
    return raw_pitches

In [187]:
# Define the settings for the dataset builder run
def build_daily_stats_dataset(year, month, day, raw_pitches, windows=(20, 45, 75, 504)):
    dataset = dataset_builder(rolling_windows=[window for window in windows], verbose=False)
    date_suffix = f'{year}-{month}-{day}'
    rolling_window_suffix = '_'.join([str(pa) for pa in dataset.rolling_windows])
    dataset_suffix = f'{date_suffix}_rolling_windows_{rolling_window_suffix}'

    # Turn the Raw Pitches into a rolled dataset
    df = dataset.build_training_dataset(raw_pitches, suffix=dataset_suffix, make_ml=False,
                                save_cleaned=False, save_coefficients=False,
                                save_dataset=True, save_training_dataset=True,
                                local_save=True, online_save=False)
    return df

In [188]:
def nightly_stats_collection(year, month, day):
    raw_pitches = build_raw_pitches_df(year, month, day)
    daily_stats_for_simulation = build_daily_stats_dataset(year, month, day, raw_pitches)

    return daily_stats_for_simulation

In [191]:
year = '2024'
month = '05'
day = '01'

nightly_stats_collection(year, month, day)

## Get the most recent stats for each player

In [194]:
# Read in the dataset for the day
year = '2024'
month = '05'
day = '01'

with open(f'data/processed_data/final_dataset_nonML_{year}-{month}-{day}_rolling_windows_20_40_75_504', 'rb') as fpath:
    daily_dataset = pkl.load(fpath)

# Get the individual stats for each batter and pitcher + the league at the given moment, so we can pull them as needed for a given at bat
batter_stats = daily_dataset.groupby(by='batter').last()
batter_stats = batter_stats[[col for col in batter_stats.columns if 'PA' in col and 'LA' not in col and 'pitcher' not in col]]

pitcher_stats = daily_dataset.groupby(by='pitcher').last()
pitcher_stats = pitcher_stats[[col for col in pitcher_stats.columns if 'PA' in col and 'LA' not in col and 'pitcher' in col]]

LA_stats = daily_dataset.iloc[-1]
LA_stats = LA_stats[[col for col in LA_stats.index if 'LA' in col]]

## Scrape Lineups from MLB.com
We will likely need to replace with rotowire or MLB game cards 
or something else with bench players once the season starts

In [195]:
lineups = mlb_scrape(date='2024-05-01')

# Simulate a Specific Game

#### Grab just the stats for the specific game to speed up filtering for each batter

need to do the following:
- find a way to not throw an error for players making debut, and then impute stats for them

In [196]:
# Segment the home and away lineups (IDs) #MAKE DYNAMIC
home_lineup = [int(player['id']) for player in lineups['lineups'][0]['home_lineup'].values()]
away_lineup = [int(player['id']) for player in lineups['lineups'][0]['away_lineup'].values()]
total_lineup = home_lineup + away_lineup # THIS DOES NOT ACCOUNT FOR PLAYERS TO COME OFF BENCH. MAKE SURE TO SCRAPE THEIR IDS TOO IN THE GET_LINEUPS.PY

# Segment the home and away Pitchers (IDs)
home_starting_pitchers = int(lineups['lineups'][0]['home_pitcher']['id'])
away_starting_pitchers = int(lineups['lineups'][0]['away_pitcher']['id'])
total_pitchers = [home_starting_pitchers, away_starting_pitchers] # THIS DOES NOT ACCOUNT FOR PLAYERS TO COME OFF BENCH. MAKE SURE TO SCRAPE THEIR IDS TOO IN THE GET_LINEUPS.PY

# Grab the stats rows belonging to each batter
filtered_batter_stats = batter_stats.loc[total_lineup]

# Grab the stats rows belonging to each pitcher
filtered_pitcher_stats = pitcher_stats.loc[total_pitchers]

# Grab the weather line from the game
with open('data/raw_weather/weather_data_2024', 'rb') as fpath:
    weather = pkl.load(fpath)

game_weather = weather[(weather.game_id.str.contains(f'{year}-{month}-{day}')) & ((weather.game_id.str.contains(lineups['games'][0][0])) |
                                                                                 (weather.game_id.str.contains(lineups['games'][0][1])))]

KeyError: '[672761, 690993] not in index'

In [175]:
weather

Unnamed: 0.1,Unnamed: 0,game_id,rain_percentage,temprature,wind_speed,wind_direction,is_dome
0,0,Dodgers @ Blue Jays on 2024-04-27 3:07 PM,"[0.0, 0.0, 0.0, 0.0]","[72.0, 72.0, 72.0, 72.0]","[0.0, 0.0, 0.0, 0.0]","[None, None, None, None]","[True, True, True, True]"
1,1,A's @ Orioles on 2024-04-27 4:05 PM,"[0.0, 35.0, 5.0, 7.0]","[55.0, 59.0, 59.0, 58.0]","[8.0, 10.0, 9.0, 9.0]","['right to left', 'R-L', 'Out', 'Out']","[False, False, False, False]"
2,2,Reds @ Rangers on 2024-04-27 4:05 PM,"[0.0, 0.0, 0.0, 0.0]","[72.0, 72.0, 72.0, 72.0]","[0.0, 0.0, 0.0, 0.0]","[None, None, None, None]","[True, True, True, True]"
3,3,Cardinals @ Mets on 2024-04-27 4:05 PM,"[0.0, 6.0, 12.0, 5.0]","[56.0, 57.0, 55.0, 53.0]","[17.0, 11.0, 13.0, 13.0]","['right to left', 'R-L', 'R-L', 'R-L']","[False, False, False, False]"
4,4,Cubs @ Red Sox on 2024-04-27 4:10 PM,"[0.0, 0.0, 0.0, 0.0]","[58.0, 63.0, 61.0, 60.0]","[17.0, 10.0, 10.0, 10.0]","['right to left', 'R-L', 'R-L', 'Out']","[False, False, False, False]"
...,...,...,...,...,...,...,...
1462,1462,Braves @ Padres on 2024-10-01 8:38 PM,"[0.0, 0.0, 0.0, 0.0]","[73.0, 72.0, 69.0, 67.0]","[8.0, 7.0, 6.0, 5.0]","['left to right', 'L-R', 'L-R', 'L-R']","[False, False, False, False]"
1463,1463,Tigers @ Astros on 2024-10-02 2:32 PM,"[0.0, 0.0, 0.0, 0.0]","[72.0, 72.0, 72.0, 72.0]","[0.0, 0.0, 0.0, 0.0]","[None, None, None, None]","[True, True, True, True]"
1464,1464,Royals @ Orioles on 2024-10-02 4:38 PM,"[18.0, 16.0, 16.0, 16.0]","[70.0, 69.0, 70.0, 68.0]","[2.0, 3.0, 2.0, 2.0]","['right to left', '', '', '']","[False, False, False, False]"
1465,1465,Mets @ Brewers on 2024-10-02 7:38 PM,"[0.0, 0.0, 0.0, 0.0]","[72.0, 72.0, 72.0, 72.0]","[0.0, 0.0, 0.0, 0.0]","[None, None, None, None]","[True, True, True, True]"


In [174]:
weather[(weather.game_id.str.contains(f'{year}-{month}-{day}'))]

Unnamed: 0.1,Unnamed: 0,game_id,rain_percentage,temprature,wind_speed,wind_direction,is_dome


In [49]:
# Predict a PA # IF SLOW CONVERT THE DF TO A DICT

# Combine the rolled batter/pitcher stats
batter = home_lineup[0]
pitcher = away_starting_pitchers

batter_df = filtered_batter_stats.loc[batter]
pitcher_df = filtered_pitcher_stats.loc[pitcher]

prediction_dataset = pd.concat([batter_df, pitcher_df, LA_stats])

### Add in the outside factors like ballpark, weather, etc. ###
prediction_dataset['ballpark'] = 'Target'
prediction_dataset['batter'] = 0
prediction_dataset['pitcher'] = 1
prediction_dataset['pitbat'] = 'RR'
prediction_dataset['on_3b'] = 1
prediction_dataset['on_2b'] = 0
prediction_dataset['on_1b'] = 0
prediction_dataset['outs_when_up'] = 0
prediction_dataset['inning'] = 9
prediction_dataset['inning_topbot'] = 0
prediction_dataset['bat_score'] = 0
prediction_dataset['fld_score'] = 0
prediction_dataset['Left to Right'] = 0
prediction_dataset['Right to Left'] = 0
prediction_dataset['in'] = 0
prediction_dataset['out'] = 0
prediction_dataset['zero'] = 0
prediction_dataset['temprature_sq'] = 4900

In [163]:
with open('data/raw_weather/weather_data_2024', 'rb') as fpath:
    weather = pkl.load(fpath)

weather

Unnamed: 0.1,Unnamed: 0,game_id,rain_percentage,temprature,wind_speed,wind_direction,is_dome
0,0,Dodgers @ Blue Jays on 2024-04-27 3:07 PM,"[0.0, 0.0, 0.0, 0.0]","[72.0, 72.0, 72.0, 72.0]","[0.0, 0.0, 0.0, 0.0]","[None, None, None, None]","[True, True, True, True]"
1,1,A's @ Orioles on 2024-04-27 4:05 PM,"[0.0, 35.0, 5.0, 7.0]","[55.0, 59.0, 59.0, 58.0]","[8.0, 10.0, 9.0, 9.0]","['right to left', 'R-L', 'Out', 'Out']","[False, False, False, False]"
2,2,Reds @ Rangers on 2024-04-27 4:05 PM,"[0.0, 0.0, 0.0, 0.0]","[72.0, 72.0, 72.0, 72.0]","[0.0, 0.0, 0.0, 0.0]","[None, None, None, None]","[True, True, True, True]"
3,3,Cardinals @ Mets on 2024-04-27 4:05 PM,"[0.0, 6.0, 12.0, 5.0]","[56.0, 57.0, 55.0, 53.0]","[17.0, 11.0, 13.0, 13.0]","['right to left', 'R-L', 'R-L', 'R-L']","[False, False, False, False]"
4,4,Cubs @ Red Sox on 2024-04-27 4:10 PM,"[0.0, 0.0, 0.0, 0.0]","[58.0, 63.0, 61.0, 60.0]","[17.0, 10.0, 10.0, 10.0]","['right to left', 'R-L', 'R-L', 'Out']","[False, False, False, False]"
...,...,...,...,...,...,...,...
1462,1462,Braves @ Padres on 2024-10-01 8:38 PM,"[0.0, 0.0, 0.0, 0.0]","[73.0, 72.0, 69.0, 67.0]","[8.0, 7.0, 6.0, 5.0]","['left to right', 'L-R', 'L-R', 'L-R']","[False, False, False, False]"
1463,1463,Tigers @ Astros on 2024-10-02 2:32 PM,"[0.0, 0.0, 0.0, 0.0]","[72.0, 72.0, 72.0, 72.0]","[0.0, 0.0, 0.0, 0.0]","[None, None, None, None]","[True, True, True, True]"
1464,1464,Royals @ Orioles on 2024-10-02 4:38 PM,"[18.0, 16.0, 16.0, 16.0]","[70.0, 69.0, 70.0, 68.0]","[2.0, 3.0, 2.0, 2.0]","['right to left', '', '', '']","[False, False, False, False]"
1465,1465,Mets @ Brewers on 2024-10-02 7:38 PM,"[0.0, 0.0, 0.0, 0.0]","[72.0, 72.0, 72.0, 72.0]","[0.0, 0.0, 0.0, 0.0]","[None, None, None, None]","[True, True, True, True]"


In [131]:
predictions = model.predict_proba(prediction_dataset.to_frame().T)

In [132]:
def generate_play_from_prediction_probas(prediction_probas):
    values = range(12)
    play_num = np.random.choice(values, p=prediction_probas.flatten())

    return encoder.inverse_transform(np.array([play_num]).reshape(-1, 1))[0][0]

l = []
for i in range(10000):
    play = generate_play_from_prediction_probas(predictions)
    l.append(play)
x = pd.Series(l)

In [133]:
x.value_counts()

field_out          2399
sacrifice          2074
single             1582
strikeout          1118
walk                727
fielders_choice     582
double_play         547
double              401
home_run            360
intent_walk          76
error                71
triple               63
Name: count, dtype: int64