# M05. Steals
- This predicts stolen base attempt and success rates
- Type: Model
- Run Frequency: Irregular
- Sources:
    - MLB API
    - Steamer
- Created: 12/16/2023
- Updated: 2/3/2025

Warning: This notebook was adapted from M04. Base Running and may contain some vestigial code. That code is harmless, but keep it in mind when reading.

### Imports

In [3]:
%run "U1. Imports.ipynb"
%run "U2. Utilities.ipynb"
%run "U3. Classes.ipynb"
%run "U4. Datasets.ipynb"
%run "U5. Models.ipynb"

In [4]:
# Render floats with four decimal places in displayed tables
pd.options.display.float_format = lambda value: '%.4f' % value

Create directory

In [5]:
# Output folder for today's models. The folder name must match the
# "M05. Steals" folder the fitted models are pickled into further down;
# the previous "M06. Steals" only created a stray, unused directory.
os.makedirs(os.path.join(model_path, "M05. Steals", todaysdate), exist_ok=True)

### Data

Notes: 
- The complete dataset used elsewhere cannot be reused here: this model needs multiple records per plate appearance, and those are typically dropped from that dataset.
- MLB Stats API calls are highly prone to connection errors. Because of this, it's highly discouraged to run them all in parallel. Sadly, the below approach is the best I've got.

Create yearly dataframes

In [6]:
# df2015 = plays_statsapi("04/01/2015", "10/31/2015")
# df2016 = plays_statsapi("04/01/2016", "10/31/2016")
# df2017 = plays_statsapi("04/01/2017", "10/31/2017")
# df2018 = plays_statsapi("04/01/2018", "10/31/2018")
# df2019 = plays_statsapi("04/01/2019", "10/31/2019")
# df2020 = plays_statsapi("04/01/2020", "10/31/2020")
# df2021 = plays_statsapi("04/01/2021", "10/31/2021")
# df2022 = plays_statsapi("04/01/2022", "10/31/2022")
# df2023 = plays_statsapi("04/01/2023", "10/31/2023")
# df2024 = plays_statsapi("04/01/2024", "10/31/2024")

Concatenate yearly dataframes

In [7]:
# running_dataset = pd.concat([df2015, df2016, df2017, df2018, df2019, df2020, df2021, df2022, df2023, df2024], axis=0).query('game_type == "R"')

Write to CSV

In [8]:
# running_dataset.to_csv(os.path.join(baseball_path, "Running Dataset.csv"), index=False)

Read CSV

In [9]:
# Load the previously saved play-level running data
running_dataset_path = os.path.join(baseball_path, "Running Dataset.csv")
running_dataset = pd.read_csv(running_dataset_path)

### Movement Reasons

r_adv_force: advanced on a ball in play because they were forced to <br>
r_adv_play: advanced on a ball in play without being forced to <br>
r_force_out: out on a force play <br>
r_adv_throw: advanced on the throw, not the contact <br>
r_runner_out: out not on a force play <br>
r_thrown_out: out on a hit (base runner) <br>
r_doubled_off: out on a ball caught and thrown to base <br>
r_out_stretching: out on a hit (hitter) <br>

We now have where every runner, including the batter, started and finished. However, we have two problems:
- Some base runners don't move. They are not included in the dataset yet. 
- Some base runners move more than once in a play. They may go from 1B to 2B on a hit and then 2B to 3B on a throw.

### Clean and Create Variables

In [10]:
# Fill in missing play descriptions so the string search below never sees NaN.
# The result is assigned back instead of calling fillna(inplace=True) on the
# column selection: that chained in-place form is deprecated in pandas 2.x and
# does not update the frame under Copy-on-Write.
running_dataset['description'] = running_dataset['description'].fillna("Missing")

# Identify errors: flag any play whose description mentions "error" (case-insensitive)
running_dataset['error'] = running_dataset['description'].str.contains('error', case=False).astype('int')
# Double Play dummy (will be cleaned a bit later)
running_dataset['double_play'] = running_dataset['eventType'].isin(['grounded_into_double_play', 'double_play', 'sac_fly_double_play', 'strikeout_double_play', 'sac_bunt_double_play']).astype(int)

# Create bottom half inning dummy
running_dataset['bottom'] = (running_dataset['halfInning'] == "bottom").astype('int')

# Determine outs before at bat
# 1) take the out count from the previous row of the same half inning
running_dataset['outs_pre'] = running_dataset.groupby(['gamePk', 'inning', 'bottom'])['outs'].shift(1)
# 2) give every row of an at bat the smallest of those values
running_dataset['outs_pre'] = running_dataset.groupby(['gamePk', 'atBatIndex'])['outs_pre'].transform('min')
# 3) recode a carried-over 3 as 0
running_dataset['outs_pre'] = np.where(running_dataset['outs_pre'] == 3, 0, running_dataset['outs_pre'])

# Fill in missings: rows with no earlier row in their half inning start at 0 outs
running_dataset['outs_pre'] = running_dataset['outs_pre'].fillna(0)

### Multiple Movements

Identify where a runner starts and ends in an at bat. Only keep one instance. 

In [11]:
# Translate the base labels into integers:
# 0 is AB, 1 is 1B, 2 is 2B, 3 is 3B, 4 is scored, 5 is out
# A missing or non-numeric start counts as 0 (at bat)
running_dataset['startInt'] = running_dataset['start'].apply(
    lambda base: int(base[0]) if not pd.isna(base) and base[0].isdigit() else 0
)
# A missing end means the runner was out (5); 'score' means the runner scored (4)
running_dataset['endInt'] = running_dataset['end'].apply(
    lambda base: 5 if pd.isna(base)
    else 4 if base.lower() == 'score'
    else int(base[0]) if base[0].isdigit()
    else 0
)

In [12]:
# Per runner and at bat: the lowest starting base and the highest ending base
for summary_col, source_col, reducer in (('minBase', 'startInt', 'min'), ('maxBase', 'endInt', 'max')):
    running_dataset[summary_col] = (
        running_dataset
        .groupby(['gamePk', 'atBatIndex', 'runner_id'])[source_col]
        .transform(reducer)
    )

In [13]:
# Pull the digits that follow 'id': out of each stringified post-play runner record
for occupant_col in ('postOnFirst', 'postOnSecond', 'postOnThird'):
    running_dataset[occupant_col] = running_dataset[occupant_col].str.extract(r"'id': (\d+)")

In [14]:
# Determine where runners were to start PA
# Note that the exact id may be incorrect due to pinch runners, but we don't really care.
# NOTE(review): the shift below is grouped by game and half-inning label, not
# by inning number - confirm occupants cannot carry over between innings.

# Create a mask that is True only on the first row of each (gamePk, atBatIndex)
first_occurrence_mask = ~running_dataset.duplicated(subset=['gamePk', 'atBatIndex'], keep='first')

for post_col, pre_col in (
    ('postOnFirst', 'preOnFirst'),
    ('postOnSecond', 'preOnSecond'),
    ('postOnThird', 'preOnThird'),
):
    # Occupant recorded on the previous row of the same game / half-inning label
    previous_occupant = running_dataset.groupby(['gamePk', 'halfInning'])[post_col].shift(1)
    # Keep that value on the first row of the at bat only
    running_dataset[pre_col] = previous_occupant.where(first_occurrence_mask, other=None)
    # Fill in missings: carry the first row's value down the rest of the at bat
    running_dataset[pre_col] = running_dataset.groupby(['gamePk', 'atBatIndex'])[pre_col].ffill()

### Batters

In [15]:
# Rows whose id equals the batter, reduced to one row per runner per at bat.
# The steps are chained rather than using drop_duplicates(inplace=True) so the
# filtered frame is never mutated as a slice of running_dataset
# (avoids SettingWithCopyWarning).
atBat = (
    running_dataset
    .query('id == batter')
    .drop_duplicates(['gamePk', 'atBatIndex', 'runner_id'], keep='first')
)

### Runners on 1B

In [16]:
# Rows from at bats that began with someone on first.
# .copy() makes on1B an independent frame, so the column assignments below do
# not write to a slice of running_dataset (SettingWithCopyWarning).
on1B = running_dataset[~running_dataset['preOnFirst'].isna()].copy()

# Step 1: Create a dummy column is_runner = 1 if the row records a movement that starts at 1B
on1B['is_runner'] = (on1B['start'] == "1B").astype(int)

# Step 2: Identify at bats with no row for which is_runner = 1 (no recorded movement from 1B)
no_runner_mask = ~on1B.groupby(['gamePk', 'atBatIndex'])['is_runner'].transform('max').astype(bool)

# Step 3: Set id and runner_id = preOnFirst for those at bats
on1B.loc[no_runner_mask, 'id'] = on1B.loc[no_runner_mask, 'preOnFirst']
on1B.loc[no_runner_mask, 'runner_id'] = on1B.loc[no_runner_mask, 'preOnFirst']

# Step 4: Create the added_1b column (1 = runner row added here, not taken from the data)
on1B['added_1b'] = 0
on1B.loc[no_runner_mask, 'added_1b'] = 1

# Step 5: Added runners start and end on 1B: set startInt, endInt, minBase, maxBase for added_1b == 1
on1B.loc[on1B['added_1b'] == 1, ['startInt', 'endInt', 'minBase', 'maxBase']] = 1
on1B.loc[on1B['added_1b'] == 1, ['start', 'end']] = "1B"

# Step 6: Keep one observation per runner
on1B = on1B.drop_duplicates(['gamePk', 'atBatIndex', 'runner_id'], keep='first')

# Step 7: Only keep runners on specified base
on1B = on1B.query('minBase == 1')

### Runners on 2B

In [17]:
# Rows from at bats that began with someone on second.
# .copy() makes on2B an independent frame, so the column assignments below do
# not write to a slice of running_dataset (SettingWithCopyWarning).
on2B = running_dataset[~running_dataset['preOnSecond'].isna()].copy()

# Step 1: Create a dummy column is_runner = 1 if the row records a movement that starts at 2B
on2B['is_runner'] = (on2B['start'] == "2B").astype(int)

# Step 2: Identify at bats with no row for which is_runner = 1 (no recorded movement from 2B)
no_runner_mask = ~on2B.groupby(['gamePk', 'atBatIndex'])['is_runner'].transform('max').astype(bool)

# Step 3: Set id and runner_id = preOnSecond for those at bats
on2B.loc[no_runner_mask, 'id'] = on2B.loc[no_runner_mask, 'preOnSecond']
on2B.loc[no_runner_mask, 'runner_id'] = on2B.loc[no_runner_mask, 'preOnSecond']

# Step 4: Create the added_2b column (1 = runner row added here, not taken from the data)
on2B['added_2b'] = 0
on2B.loc[no_runner_mask, 'added_2b'] = 1

# Step 5: Added runners start and end on 2B: set startInt, endInt, minBase, maxBase for added_2b == 1
on2B.loc[on2B['added_2b'] == 1, ['startInt', 'endInt', 'minBase', 'maxBase']] = 2
on2B.loc[on2B['added_2b'] == 1, ['start', 'end']] = "2B"

# Step 6: Keep one observation per runner
on2B = on2B.drop_duplicates(['gamePk', 'atBatIndex', 'runner_id'], keep='first')

# Step 7: Only keep runners on specified base
on2B = on2B.query('minBase == 2')

### Runners on 3B

In [18]:
# Rows from at bats that began with someone on third.
# .copy() makes on3B an independent frame, so the column assignments below do
# not write to a slice of running_dataset (SettingWithCopyWarning).
on3B = running_dataset[~running_dataset['preOnThird'].isna()].copy()

# Step 1: Create a dummy column is_runner = 1 if the row records a movement that starts at 3B
on3B['is_runner'] = (on3B['start'] == "3B").astype(int)

# Step 2: Identify at bats with no row for which is_runner = 1 (no recorded movement from 3B)
no_runner_mask = ~on3B.groupby(['gamePk', 'atBatIndex'])['is_runner'].transform('max').astype(bool)

# Step 3: Set id and runner_id = preOnThird for those at bats
on3B.loc[no_runner_mask, 'id'] = on3B.loc[no_runner_mask, 'preOnThird']
on3B.loc[no_runner_mask, 'runner_id'] = on3B.loc[no_runner_mask, 'preOnThird']

# Step 4: Create the added_3b column (1 = runner row added here, not taken from the data)
on3B['added_3b'] = 0
on3B.loc[no_runner_mask, 'added_3b'] = 1

# Step 5: Added runners start and end on 3B: set startInt, endInt, minBase, maxBase for added_3b == 1
on3B.loc[on3B['added_3b'] == 1, ['startInt', 'endInt', 'minBase', 'maxBase']] = 3
on3B.loc[on3B['added_3b'] == 1, ['start', 'end']] = "3B"

# Step 6: Keep one observation per runner
on3B = on3B.drop_duplicates(['gamePk', 'atBatIndex', 'runner_id'], keep='first')

# Step 7: Only keep runners on specified base
on3B = on3B.query('minBase == 3')

### Combine

In [19]:
# Stack the batter rows and the per-base runner rows into one frame with a fresh index
df = pd.concat([atBat, on1B, on2B, on3B], ignore_index=True)

# Number the rows within each at bat, starting at 1
df['atBatIndexNum'] = 1 + df.groupby(['gamePk', 'atBatIndex']).cumcount()

# Order by game, at bat, then position within the at bat
df = df.sort_values(by=['gamePk', 'atBatIndex', 'atBatIndexNum'])

### Start Locations

In [20]:
# Start-location dummies: 1 on every row of an at bat in which any kept
# runner started on that base
start_flags = (('pre_1b', 1), ('pre_2b', 2), ('pre_3b', 3))

# Row level: did this row's runner start on the base?
for flag_col, base_number in start_flags:
    df[flag_col] = (df['minBase'] == base_number).astype('int')

# At-bat level: spread the group maximum to every row of the at bat
for flag_col, _ in start_flags:
    df[flag_col] = df.groupby(['gamePk', 'atBatIndex'])[flag_col].transform('max')

### Fix End Locations

In [21]:
end_flags = (('post_1b', 1), ('post_2b', 2), ('post_3b', 3))

# End locations: Runner - did this row's runner finish on the base?
for flag_col, base_number in end_flags:
    df[flag_col] = (df['maxBase'] == base_number).astype('int')

# End locations: At Bat (team) - did any runner in the at bat finish on the base?
for flag_col, _ in end_flags:
    df[flag_col] = df.groupby(['gamePk', 'atBatIndex'])[flag_col].transform('max')

# End locations: Blocked - this occurs when someone other than the runner is already on a base
# Note: You can't be blocked from advancing to a base you're on or have passed
for blocked_col, post_col, base_number in (
    ('blocked_1b', 'post_1b', 1),
    ('blocked_2b', 'post_2b', 2),
    ('blocked_3b', 'post_3b', 3),
):
    base_taken = df[post_col] == 1
    finished_short = df['maxBase'] < base_number
    df[blocked_col] = (base_taken & finished_short).astype('int')

### Events

In [22]:
# Add the modelled event column (eventsModel) via the shared helper
df = create_events(df)

# Integer code for each modelled event; events missing from the mapping become NaN
df['eventsModelInt'] = df['eventsModel'].map({
    'b1': 1,
    'b2': 2,
    'b3': 3,
    'hr': 4,
    'bb': 5,
    'hbp': 6,
    'so': 7,
    'fo': 8,
    'go': 9,
    'lo': 10,
    'po': 11,
})

### Out locations

In [23]:
# A runner is out when their furthest "base" is the out code (5)
df['out'] = (df['maxBase'] == 5).astype('int')

# For each starting position (0 = home/at bat): flag the out on the runner's
# row, then spread it to every row of the at bat
for out_col, start_base in (('out_home', 0), ('out_1b', 1), ('out_2b', 2), ('out_3b', 3)):
    was_out = df['out'] == 1
    started_here = df['minBase'] == start_base
    df[out_col] = (was_out & started_here).astype('int')

for out_col in ('out_home', 'out_1b', 'out_2b', 'out_3b'):
    df[out_col] = df.groupby(['gamePk', 'atBatIndex'])[out_col].transform('max')

### Cuts

In [24]:
# # Drop less relevant events
# df = df.query('eventsModel != "Cut"')

# Keep only regular season games, then drop duplicates
# (should be very rare. I believe they're mlb's errors, not mine).
# The steps are chained and end in .copy() so that df is an independent frame:
# the original mutated a query() result in place and then assigned columns to
# it, which raises SettingWithCopyWarning.
df = (
    df.query('game_type == "R"')
    .drop_duplicates(subset=['gamePk', 'atBatIndex', 'minBase'], keep='first')
    .copy()
)

# Calculate outs in PA (number of kept runner rows flagged as out)
df['outs_calculated'] = df.groupby(['gamePk', 'atBatIndex'])['out'].transform('sum')

# Sometimes, there will be two outs without a double play recorded (typically a pickoff) but we need these for the math to work
df['double_play'] = np.where(df['outs_calculated'] == 2, 1, df['double_play'])
# Sometimes, there will be no outs on a play that's traditionally an out. These are errors.
df['error'] = np.where((df['outs_calculated'] == 0) & (df['eventType'] == 'fielders_choice'), 1, df['error'])

# Drop triple plays (.copy() so later cells can add columns without warnings)
df = df.query('outs_calculated != 3').copy()

### Read in Steamer

In [25]:
# Load the weekly log of Steamer hitter projections
steamer_log_path = os.path.join(baseball_path, "A03. Steamer", "steamer_hitters_weekly_log.csv")
steamer_hitters_df = pd.read_csv(steamer_log_path, encoding='iso-8859-1')

# Clean with the shared helper, then drop rows missing any of the batter_stats_fg columns
steamer_hitters_df2 = clean_steamer_hitters(steamer_hitters_df)
steamer_hitters_df2.dropna(subset=batter_stats_fg, inplace=True)

In [26]:
# Strip the dashes from game_date and store the result as an integer (YYYYMMDD)
date_text = df['game_date'].str.replace("-", "")
df['date'] = date_text.astype('int')

In [27]:
# Distinct dates on which a Steamer projection exists.
# Each play is later matched to the most recent of these on or before its game date.
batter_steamer_dates = [*steamer_hitters_df2['date'].unique()]

def find_steamer_date(date, steamer_dates):
    """Return the largest entry of ``steamer_dates`` that is <= ``date``.

    Args:
        date: The value to look up.
        steamer_dates: Iterable of candidate values comparable with ``date``.

    Returns:
        The greatest candidate not exceeding ``date``, or ``None`` when no
        candidate qualifies.
    """
    eligible = [candidate for candidate in steamer_dates if candidate <= date]
    if not eligible:
        return None
    return max(eligible)

# For every play, look up the latest Steamer projection date on or before
# the game date and store it in the "batter_date" column
df["batter_date"] = df["date"].apply(find_steamer_date, args=(batter_steamer_dates,))

### Identify steals and attempts

In [28]:
# Successful steals: the movement reason is a stolen base of 2B / 3B
df['sb_2b'] = df['movementReason'].eq('r_stolen_base_2b').astype('int')
df['sb_3b'] = df['movementReason'].eq('r_stolen_base_3b').astype('int')

# Attempts: a stolen base, a caught stealing, or a pickoff caught stealing at that base
attempt_reasons_2b = ['r_stolen_base_2b', 'r_caught_stealing_2b', 'r_pickoff_caught_stealing_2b']
attempt_reasons_3b = ['r_stolen_base_3b', 'r_caught_stealing_3b', 'r_pickoff_caught_stealing_3b']
df['sba_2b'] = df['movementReason'].isin(attempt_reasons_2b).astype('int')
df['sba_3b'] = df['movementReason'].isin(attempt_reasons_3b).astype('int')

In [29]:
# Sanity check: share of all rows in df flagged as a successful steal of second
df['sb_2b'].mean()

0.008657461877197356

In [31]:
# Season = the first four characters of the YYYYMMDD date
df['year'] = df["date"].astype('str').str.slice(stop=4).astype('int')

# One 0/1 indicator column per season, named year_<season>
dummy_years = pd.get_dummies(df['year'], prefix='year').astype('int')

# Attach the indicator columns to the right of the existing columns
df = pd.concat(objs=[df, dummy_years], axis=1)

# Runner ids must be integers
df['runner_id'] = df['runner_id'].astype(int)

In [32]:
# Attach each runner's Steamer projection as of the matched projection date.
# Inner join: plays whose runner has no projection on that date are dropped.
play_columns = [
    'year', 'gamePk', 'eventsModel', 'atBatIndex', 'atBatIndexNum', 'minBase', 'maxBase',
    'runner_id', 'movementReason', 'batter_date', 'sba_2b', 'sba_3b', 'sb_2b', 'sb_3b',
    'outs_pre', 'pre_1b', 'pre_2b', 'pre_3b',
] + list(dummy_years.columns)
projection_columns = ['mlbamid', 'date', 'sb', 'sbo', 'sba']

steal_df = df[play_columns].merge(
    steamer_hitters_df2[projection_columns],
    left_on=['runner_id', 'batter_date'],
    right_on=['mlbamid', 'date'],
    how='inner',
)

Define model inputs

In [33]:
# Projected attempt rate: sba divided by sbo
steal_df['sba_imp'] = steal_df['sba'].div(steal_df['sbo'])
# Projected success rate: sb divided by sba
steal_df['sb_imp'] = steal_df['sb'].div(steal_df['sba'])

Sort

In [34]:
# Order rows by game, at bat, then position within the at bat
steal_df = steal_df.sort_values(by=['gamePk', 'atBatIndex', 'atBatIndexNum'], ascending=True)

Only keep runners with meaningful sample size

Note: this may drop very late season base runners as sbo are projected for the rest of season

In [35]:
# Minimum sbo a runner's projection must exceed to be kept
cutoff = 30

steal_df = steal_df.query(f'sbo > {cutoff}')
steal_df = steal_df.reset_index(drop=True)

Summarize the projected rates (the 50% row is the median). The medians serve as fill-in rates in the matchup files for players with small sbo values and unusually high rates; they are not used in this notebook.

In [36]:
# Summary statistics of the projected attempt and success rates for 2023 onward
steal_df.query('year >= 2023')[['sba_imp', 'sb_imp']].describe()

Unnamed: 0,sba_imp,sb_imp
count,384981.0,384981.0
mean,0.0994,0.7385
std,0.082,0.0382
min,0.0031,0.642
25%,0.0364,0.705
50%,0.0746,0.739
75%,0.1407,0.767
max,0.5621,0.848


In [37]:
# Observed attempt rate on second (2024 onward) for rows where a runner started
# on first, no runner started on second, and the row's runner started on first
steal_df.query('pre_1b == 1 and pre_2b == 0 and minBase == 1').query('year >= 2024')['sba_2b'].mean()

0.08830815393740624

# Models

### Steals

### Attempt to steal 2B

In [39]:
# Class balance of the attempt-on-second target in the modelling sample
steal_df.query('pre_1b == 1 and pre_2b == 0 and minBase == 1').query('year >= 2024')['sba_2b'].value_counts()

sba_2b
0    26130
1     2531
Name: count, dtype: int64

In [None]:
%%time
# Modelling sample: 2024+ rows for the runner who started on first with no
# runner starting on second. The query is evaluated once and reused for both
# the features and the target. No missing-value handling is done here.
sba_2b_sample = steal_df.query('pre_1b == 1 and pre_2b == 0 and minBase == 1').query('year >= 2024')
X = sba_2b_sample[['outs_pre', 'sba_imp', 'sb_imp']]
y = sba_2b_sample['sba_2b']

# Split the data into training and testing sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the MLPClassifier
predict_sba_2b = MLPClassifier(hidden_layer_sizes=(4,4), activation='relu', early_stopping=False, max_iter=100, random_state=10)
predict_sba_2b.fit(X_train, y_train)

# Get probability predictions on the test set
probabilities = predict_sba_2b.predict_proba(X_test)

# Create DataFrame from probabilities (first column = no attempt, second = attempt),
# indexed like X_test so it lines up in the concat below
probability_df = pd.DataFrame(probabilities, columns=['sba_2b_not', 'sba_2b_pred'], index=X_test.index)

# Concatenate probability_df with y_test and X_test
sba_2b_df = pd.concat([X_test, y_test, probability_df], axis=1)

# Create directory
os.makedirs(os.path.join(model_path, "M05. Steals", todaysdate), exist_ok=True)

# Save model; the context manager closes the file even if pickling fails
# (the handle from a bare open() was never closed)
with open(os.path.join(model_path, "M05. Steals", todaysdate, "predict_sba_2b.sav"), 'wb') as model_file:
    pickle.dump(predict_sba_2b, model_file)

In [None]:
# Number of equal-frequency bins used to compare predictions with outcomes
quantiles = 40

# Bin the test rows by predicted attempt probability
sba_2b_df['quantile'] = pd.qcut(sba_2b_df['sba_2b_pred'], q=quantiles, labels=False)

# Mean observed attempt rate and mean prediction within each bin
sba_2b_df_quantiles = (
    sba_2b_df.groupby('quantile')[['sba_2b', 'sba_2b_pred']].mean().reset_index()
)

In [None]:
# Calibration plot by quantile: mean prediction in red, observed rate in black
for series_name, line_colour in (('sba_2b_pred', 'red'), ('sba_2b', 'black')):
    plt.plot(sba_2b_df_quantiles['quantile'], sba_2b_df_quantiles[series_name], color=line_colour)
plt.show()

In [None]:
# Mean predicted attempt probability vs. observed attempt rate on the test rows
sba_2b_df[['sba_2b_pred', 'sba_2b']].mean()

### Attempt to steal 3B

In [None]:
%%time
# Modelling sample: 2024+ rows for the runner who started on second with no
# runner starting on third. The query is evaluated once and reused for both
# the features and the target. No missing-value handling is done here, and the
# year dummies are not used as features.
sba_3b_sample = steal_df.query('pre_2b == 1 and pre_3b == 0 and minBase == 2').query('year >= 2024')
X = sba_3b_sample[['outs_pre', 'sba_imp', 'sb_imp']]
y = sba_3b_sample['sba_3b']

# Split the data into training and testing sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the MLPClassifier
predict_sba_3b = MLPClassifier(hidden_layer_sizes=(4,4,), activation='relu', early_stopping=False, max_iter=100, random_state=30000000)
predict_sba_3b.fit(X_train, y_train)

# Get probability predictions on the test set
probabilities = predict_sba_3b.predict_proba(X_test)

# Create DataFrame from probabilities (first column = no attempt, second = attempt),
# indexed like X_test so it lines up in the concat below
probability_df = pd.DataFrame(probabilities, columns=['sba_3b_not', 'sba_3b_pred'], index=X_test.index)

# Concatenate probability_df with y_test and X_test
sba_3b_df = pd.concat([X_test, y_test, probability_df], axis=1)

# Create directory
os.makedirs(os.path.join(model_path, "M05. Steals", todaysdate), exist_ok=True)

# Save model; the context manager closes the file even if pickling fails
# (the handle from a bare open() was never closed)
with open(os.path.join(model_path, "M05. Steals", todaysdate, "predict_sba_3b.sav"), 'wb') as model_file:
    pickle.dump(predict_sba_3b, model_file)

In [None]:
# Number of equal-frequency bins used to compare predictions with outcomes
quantiles = 10

# Add xtiles (to examine how well predictions match actual results).
# The bin count comes from `quantiles` rather than a second hard-coded 10,
# so changing the setting above actually changes the binning.
sba_3b_df['quantile'] = pd.qcut(sba_3b_df['sba_3b_pred'], quantiles, labels=False)

# Mean observed attempt rate and mean prediction within each bin
sba_3b_df_quantiles = sba_3b_df.groupby('quantile')[['sba_3b', 'sba_3b_pred']].mean().reset_index()

In [None]:
# Calibration plot by quantile: mean prediction in red, observed rate in black
for series_name, line_colour in (('sba_3b_pred', 'red'), ('sba_3b', 'black')):
    plt.plot(sba_3b_df_quantiles['quantile'], sba_3b_df_quantiles[series_name], color=line_colour)
plt.show()

In [None]:
# Mean predicted attempt probability vs. observed attempt rate on the test rows
sba_3b_df[['sba_3b_pred', 'sba_3b']].mean()

### Steal 2B

In [None]:
%%time
# Modelling sample: 2023+ steal attempts of second by the runner who started on
# first with no runner starting on second. The query is evaluated once and
# reused for both the features and the target. No missing-value handling is
# done here, and the year dummies are not used as features.
sb_2b_sample = steal_df.query('pre_1b == 1 and pre_2b == 0 and sba_2b == 1 and minBase == 1').query('year >= 2023')
X = sb_2b_sample[['outs_pre', 'sba_imp', 'sb_imp']]
y = sb_2b_sample['sb_2b']

# Split the data into training and testing sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the MLPClassifier
predict_sb_2b = MLPClassifier(hidden_layer_sizes=(4,4), activation='relu', max_iter=100, random_state=1000)
predict_sb_2b.fit(X_train, y_train)

# Get probability predictions on the test set
probabilities = predict_sb_2b.predict_proba(X_test)

# Create DataFrame from probabilities (first column = unsuccessful, second = successful),
# indexed like X_test so it lines up in the concat below
probability_df = pd.DataFrame(probabilities, columns=['sb_2b_not', 'sb_2b_pred'], index=X_test.index)

# Concatenate probability_df with y_test and X_test
sb_2b_df = pd.concat([X_test, y_test, probability_df], axis=1)

# Create directory
os.makedirs(os.path.join(model_path, "M05. Steals", todaysdate), exist_ok=True)

# Save model; the context manager closes the file even if pickling fails
# (the handle from a bare open() was never closed)
with open(os.path.join(model_path, "M05. Steals", todaysdate, "predict_sb_2b.sav"), 'wb') as model_file:
    pickle.dump(predict_sb_2b, model_file)

In [None]:
quantiles = 10

# Add xtiles (to examine how well predictions match actual results)
sb_2b_df['quantile'] = pd.qcut(sb_2b_df['sb_2b_pred'], quantiles, labels=False)
globals()["sb_2b_df"] = sb_2b_df.groupby('quantile')[['sb_2b', 'sb_2b_pred']].mean().reset_index()

In [None]:
# Create figures
plt.plot(sb_2b_df['quantile'], sb_2b_df['sb_2b_pred'], color='red')
plt.plot(sb_2b_df['quantile'], sb_2b_df['sb_2b'], color='black')
plt.show() 

In [None]:
sb_2b_df[['sb_2b_pred', 'sb_2b']].mean()

### Steal 3B

In [None]:
%%time
# Modelling sample: 2023+ steal attempts of third by the runner who started on
# second with no runner starting on third. The query is evaluated once and
# reused for both the features and the target. No missing-value handling is
# done here, and the year dummies are not used as features.
sb_3b_sample = steal_df.query('pre_2b == 1 and pre_3b == 0 and sba_3b == 1 and minBase == 2').query('year >= 2023')
X = sb_3b_sample[['outs_pre', 'sba_imp', 'sb_imp']]
y = sb_3b_sample['sb_3b']

# Split the data into training and testing sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the MLPClassifier
predict_sb_3b = MLPClassifier(hidden_layer_sizes=(10,10,), activation='relu', max_iter=100, random_state=42)
predict_sb_3b.fit(X_train, y_train)

# Get probability predictions on the test set
probabilities = predict_sb_3b.predict_proba(X_test)

# Create DataFrame from probabilities (first column = unsuccessful, second = successful),
# indexed like X_test so it lines up in the concat below
probability_df = pd.DataFrame(probabilities, columns=['sb_3b_not', 'sb_3b_pred'], index=X_test.index)

# Concatenate probability_df with y_test and X_test
sb_3b_df = pd.concat([X_test, y_test, probability_df], axis=1)

# Create directory
os.makedirs(os.path.join(model_path, "M05. Steals", todaysdate), exist_ok=True)

# Save model; the context manager closes the file even if pickling fails
# (the handle from a bare open() was never closed)
with open(os.path.join(model_path, "M05. Steals", todaysdate, "predict_sb_3b.sav"), 'wb') as model_file:
    pickle.dump(predict_sb_3b, model_file)

In [None]:
quantiles = 10

# Add xtiles (to examine how well predictions match actual results)
sb_3b_df['quantile'] = pd.qcut(sb_3b_df['sb_3b_pred'], quantiles, labels=False)
globals()["sb_3b_df"] = sb_3b_df.groupby('quantile')[['sb_3b', 'sb_3b_pred']].mean().reset_index()

In [None]:
# Create figures
plt.plot(sb_3b_df['quantile'], sb_3b_df['sb_3b_pred'], color='red')
plt.plot(sb_3b_df['quantile'], sb_3b_df['sb_3b'], color='black')
plt.show() 

In [None]:
sb_3b_df[['sb_3b_pred', 'sb_3b']].mean()