# M04. Steals

This notebook models stolen-base attempts and successes for runners on 1B and 2B, using the number of outs, season dummies, and imputed stolen-base attempt/success rates from Steamer. <br>
**Warning:** This notebook was adapted from M03. Base Running and still contains some vestigial code. That code is harmless; this is just a reminder.

In [1]:
# Run the shared notebooks so their names land in this kernel's namespace.
# NOTE(review): names used later but never defined in this notebook (pd, np, os,
# pickle, create_engine, todaysdate, model_path, ...) presumably come from these — confirm.
%run "U1. Imports.ipynb"
%run "U2. Utilities.ipynb"
%run "U3. Classes.ipynb"

# Root folder for the CSV inputs read below.
# NOTE(review): absolute, machine-specific path; the notebook will not run elsewhere as-is.
baseball_path = r'C:\Users\james\Documents\MLB\Database'

# SQLite database file and an engine pointing at it.
# NOTE(review): `engine` is not referenced again in the visible part of this notebook.
db_path = r'C:\Users\james\Documents\MLB\Database\MLBDB.db'
engine = create_engine(f'sqlite:///{db_path}')




In [2]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import log_loss, classification_report, f1_score, make_scorer
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.inspection import permutation_importance
from sklearn.datasets import make_classification
from sklearn.neighbors import KNeighborsClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.cluster import KMeans
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from tensorflow import keras
from sklearn.impute import SimpleImputer
from sklearn.svm import SVC

import joblib
import matplotlib.pyplot as plt

In [3]:
# Render floats with four decimal places in displayed frames and series
pd.options.display.float_format = lambda value: '%.4f' % value

In [4]:
# Run the data-source notebooks in this kernel.
# NOTE(review): helpers used below but not defined here (create_events,
# clean_steamer_hitters, batter_stats_fg) presumably come from these or the U-notebooks — confirm.
%run "A02. MLB API.ipynb"
%run "A03. Steamer.ipynb"

### Read in plays

In [5]:
# df2015 = plays_statsapi("04/01/2015", "10/31/2015")
# df2016 = plays_statsapi("04/01/2016", "10/31/2016")
# df2017 = plays_statsapi("04/01/2017", "10/31/2017")
# df2018 = plays_statsapi("04/01/2018", "10/31/2018")
# df2019 = plays_statsapi("04/01/2019", "10/31/2019")
# df2020 = plays_statsapi("04/01/2020", "10/31/2020")
# df2021 = plays_statsapi("04/01/2021", "10/31/2021")
# df2022 = plays_statsapi("04/01/2022", "10/31/2022")
# df2023 = plays_statsapi("04/01/2023", "10/31/2023")

In [6]:
# df = pd.concat([df2015, df2016, df2017, df2018, df2019, df2020, df2021, df2022, df2023], axis=0)
# df = df.query('game_type == "R"')

NameError: name 'df2015' is not defined

In [None]:
# df.to_csv(os.path.join(baseball_path, "Complete Dataset.csv"), index=False)

In [9]:
# Load the cached play-level export and keep only rows whose game_type is "R"
complete_dataset_csv = os.path.join(baseball_path, "Complete Dataset.csv")
df = pd.read_csv(complete_dataset_csv).query('game_type == "R"')

r_adv_force: advanced on a ball in play because they were forced to <br>
r_adv_play: advanced on a ball in play without being forced to <br>
r_force_out: out on a force play <br>
r_adv_throw: advanced on the throw, not the contact <br>
r_runner_out: out not on a force play <br>
r_thrown_out: out on a hit (base runner) <br>
r_doubled_off: out on a ball caught and thrown to base <br>
r_out_stretching: out on a hit (hitter) <br>

We now have where every runner, including the batter, started and finished. However, we have two problems:
- Some base runners don't move. They are not included in the dataset yet. 
- Some base runners move more than once in a play. They may go from 1B to 2B on a hit and then 2B to 3B on a throw.

### Clean and Create Variables

In [10]:
# Fill in missings
# Assign the result back instead of calling fillna(inplace=True) on df['description']:
# the in-place call on a selected column is chained assignment, which pandas warns
# about and which no longer updates df once Copy-on-Write is enabled.
df['description'] = df['description'].fillna("Missing")

# Identify errors (description mentions "error", case-insensitive)
df['error'] = df['description'].str.contains('error', case=False).astype('int')
# Double Play dummy (will be cleaned a bit later)
df['double_play'] = df['eventType'].isin(['grounded_into_double_play', 'double_play', 'sac_fly_double_play', 'strikeout_double_play', 'sac_bunt_double_play']).astype(int)

# Create bottom half inning dummy
df['bottom'] = (df['halfInning'] == "bottom").astype('int')

# Determine outs before at bat:
#   1. take the previous row's 'outs' within the same game / inning / half,
#   2. use the smallest such value across all rows of the at bat,
#   3. turn a carried-over 3 into 0.
df['outs_pre'] = df.groupby(['gamePk', 'inning', 'bottom'])['outs'].shift(1)
df['outs_pre'] = df.groupby(['gamePk', 'atBatIndex'])['outs_pre'].transform('min')
df['outs_pre'] = np.where(df['outs_pre'] == 3, 0, df['outs_pre'])

# Fill in missings (rows with no previous row in their half inning get 0 outs)
df['outs_pre'] = df['outs_pre'].fillna(0)

### Multiple Movements

Identify where a runner starts and ends in an at bat. Only keep one instance. 

In [11]:
# Determine start and end base by number
# 0 is AB, 1 is 1B, 2 is 2B, 3 is 3B, 4 is scored, 5 is out
def _leading_digit_or_zero(base_label):
    """Return the first character of `base_label` as an int, or 0 if it is not a digit."""
    first_char = base_label[0]
    if first_char.isdigit():
        return int(first_char)
    return 0


def _start_base_to_int(start):
    """Encode a 'start' value: missing -> 0 (at bat), otherwise its leading digit (or 0)."""
    if pd.isna(start):
        return 0
    return _leading_digit_or_zero(start)


def _end_base_to_int(end):
    """Encode an 'end' value: missing -> 5 (out), 'score' -> 4, otherwise its leading digit (or 0)."""
    if pd.isna(end):
        return 5
    if end.lower() == 'score':
        return 4
    return _leading_digit_or_zero(end)


df['startInt'] = df['start'].apply(_start_base_to_int)
df['endInt'] = df['end'].apply(_end_base_to_int)

In [12]:
# Per runner within an at bat: lowest starting base and highest ending base.
# One grouping is reused for both transforms.
runner_in_at_bat = df.groupby(['gamePk', 'atBatIndex', 'runner_id'])
df['minBase'] = runner_in_at_bat['startInt'].transform('min')
df['maxBase'] = runner_in_at_bat['endInt'].transform('max')

The filter below is what the base running model (M03) uses to drop at bats with atypical movement. It is disabled here because the steal model obviously needs to keep steals; the commented-out code is left only for comparison.

In [13]:
# # List of typical movement types
# movementReason_list = ['r_adv_force', 'r_adv_play', 'r_force_out', 'r_adv_throw', 'r_runner_out', 'r_thrown_out', 'r_doubled_off', 'r_out_stretching']
# df['movementTypical'] = df['movementReason'].apply(lambda x: 1 if x in movementReason_list or pd.isna(x) else 0)
# # Apply to whole at bat. We'll only keep those for which all movement is typical.
# df['movementTypical'] = df.groupby(['gamePk', 'atBatIndex'])['movementTypical'].transform('min')

In [14]:
# df = df.query('movementTypical == 1')
# df.drop(columns={'movementTypical'}, inplace=True)

In [15]:
# Drop duplicates, keeping first (which has almost all steals)

In [16]:
# Keep only the first row for each runner within an at bat
df = df.drop_duplicates(subset=['gamePk', 'atBatIndex', 'runner_id'], keep='first')

### Stationary Runners

After the play, we expect a runner on every base that shows up as some runner's `maxBase`. An occupied base that is not explained this way must hold a runner who did not move.

In [17]:
# For each base at which there is a runner after the play, there should be an explanation for it in "movement"
# expectedNb: some runner in the at bat has maxBase == N (flag shared by every row of the at bat)
for base_num, expected_col in ((1, 'expected1b'), (2, 'expected2b'), (3, 'expected3b')):
    df[expected_col] = (df['maxBase'] == base_num).astype('int')
    df[expected_col] = df.groupby(['gamePk', 'atBatIndex'])[expected_col].transform('max')

# If there's someone at a base and we don't know how they got there, they were already there.
for occupant_col, expected_col, missing_col in (
    ('postOnFirst', 'expected1b', 'missing1b'),
    ('postOnSecond', 'expected2b', 'missing2b'),
    ('postOnThird', 'expected3b', 'missing3b'),
):
    base_is_occupied = df[occupant_col].notna()
    base_is_unexplained = df[expected_col] == 0
    df[missing_col] = (base_is_occupied & base_is_unexplained).astype(int)

In [18]:
# Number the rows within each at bat, starting at 1
rows_by_at_bat = df.groupby(['gamePk', 'atBatIndex'])
df['atBatIndexNum'] = rows_by_at_bat.cumcount() + 1

In [19]:
# Add those missing at each base, only copying the first instance
def _stationary_runner_rows(plays, base_num, missing_col):
    """Copy the first row of each at bat flagged in `missing_col` and relabel it as a
    runner who starts and ends on base `base_num`.

    Only the six start/end columns are overwritten; every other column keeps the
    value of the row that was copied.
    NOTE(review): that includes runner_id and movementReason, so these rows carry the
    copied runner's identity rather than the stationary runner's — confirm this is
    acceptable for the Steamer merge and the steal flags later on.
    """
    base_label = f"{base_num}B"
    is_template_row = (plays[missing_col] == 1) & (plays['atBatIndexNum'] == 1)
    stationary = plays[is_template_row].copy()

    # Modify the specific columns for the copied rows
    stationary['start'] = base_label
    stationary['end'] = base_label
    stationary['startInt'] = base_num
    stationary['endInt'] = base_num
    stationary['minBase'] = base_num
    stationary['maxBase'] = base_num
    return stationary


### 1B
copied1b = _stationary_runner_rows(df, 1, 'missing1b')

### 2B
copied2b = _stationary_runner_rows(df, 2, 'missing2b')

### 3B
copied3b = _stationary_runner_rows(df, 3, 'missing3b')

In [20]:
# Stack the stationary-runner rows under the original rows and renumber the index
frames_to_stack = [df, copied1b, copied2b, copied3b]
df = pd.concat(frames_to_stack, axis=0, ignore_index=True)

In [21]:
# Order rows by game, at bat, and row number within the at bat
df = df.sort_values(by=['gamePk', 'atBatIndex', 'atBatIndexNum'])

### Start Locations

In [22]:
# Any runner started on these bases
# For each base: flag rows whose runner started there (minBase), then spread the
# flag to every row of the at bat by taking the max within 'gamePk' and 'atBatIndex'.
for base_num, pre_col in ((1, 'pre_1b'), (2, 'pre_2b'), (3, 'pre_3b')):
    df[pre_col] = (df['minBase'] == base_num).astype('int')

for pre_col in ('pre_1b', 'pre_2b', 'pre_3b'):
    df[pre_col] = df.groupby(['gamePk', 'atBatIndex'])[pre_col].transform('max')

### Fix End Locations

In [23]:
base_columns = ((1, 'post_1b', 'blocked_1b'), (2, 'post_2b', 'blocked_2b'), (3, 'post_3b', 'blocked_3b'))

# End locations: Runner
for base_num, post_col, _ in base_columns:
    df[post_col] = (df['maxBase'] == base_num).astype('int')

# End locations: At Bat (team)
for _, post_col, _ in base_columns:
    df[post_col] = df.groupby(['gamePk', 'atBatIndex'])[post_col].transform('max')

# End locations: Blocked - this occurs when someone other than the runner is already on a base
# Note: You can't be blocked from advancing to a base you're on or have passed
for base_num, post_col, blocked_col in base_columns:
    base_taken = df[post_col] == 1
    runner_short_of_base = df['maxBase'] < base_num
    df[blocked_col] = (base_taken & runner_short_of_base).astype('int')

### Events

In [24]:
# Create game events (create_events is a project helper defined outside this notebook)
df = create_events(df)

# Encode events as integer; eventsModel values not listed here map to NaN
events_model_codes = {
    'b1': 1, 'b2': 2, 'b3': 3, 'hr': 4,
    'bb': 5, 'hbp': 6, 'so': 7,
    'fo': 8, 'go': 9, 'lo': 10, 'po': 11,
}
df['eventsModelInt'] = df['eventsModel'].map(events_model_codes)

### Out locations

In [25]:
# Determine if a runner is out (maxBase 5 is the "out" code)
df['out'] = (df['maxBase'] == 5).astype('int')

# Where the runner who was put out started: 0 is the batter, 1-3 are the bases
out_columns = (('out_home', 0), ('out_1b', 1), ('out_2b', 2), ('out_3b', 3))
runner_is_out = df['out'] == 1
for out_col, origin_base in out_columns:
    df[out_col] = (runner_is_out & (df['minBase'] == origin_base)).astype('int')

# Spread each flag to every row of the at bat
for out_col, _ in out_columns:
    df[out_col] = df.groupby(['gamePk', 'atBatIndex'])[out_col].transform('max')

### Cuts

In [26]:
# # Drop less relevant events
# df = df.query('eventsModel != "Cut"')

# Duplicates (should be very rare. I believe they're mlb's errors, not mine)
df = df.drop_duplicates(subset=['gamePk', 'atBatIndex', 'minBase'], keep='first')

# Calculate outs in PA
df['outs_calculated'] = df.groupby(['gamePk', 'atBatIndex'])['out'].transform('sum')

# Sometimes, there will be two outs without a double play recorded (typically a pickoff) but we need these for the math to work
two_outs_recorded = df['outs_calculated'] == 2
df['double_play'] = np.where(two_outs_recorded, 1, df['double_play'])
# Sometimes, there will be no outs on a play that's traditionally an out. These are errors.
no_out_fielders_choice = (df['outs_calculated'] == 0) & (df['eventType'] == 'fielders_choice')
df['error'] = np.where(no_out_fielders_choice, 1, df['error'])

# Drop triple plays
df = df[df['outs_calculated'] != 3]

### Read in Steamer

In [27]:
# Read in Steamer hitters (file name suggests a weekly log of projections)
steamer_hitters_df = pd.read_csv(os.path.join(baseball_path, "A03. Steamer", "steamer_hitters_weekly_log.csv"), encoding='iso-8859-1')
# Clean with the project helper clean_steamer_hitters (not defined in this notebook;
# presumably provided by the "A03. Steamer" notebook run above — confirm)
steamer_hitters_df2 = clean_steamer_hitters(steamer_hitters_df)
# Drop rows missing any column listed in batter_stats_fg (a list defined elsewhere).
# NOTE(review): this mutates the object returned by clean_steamer_hitters in place; if
# that helper returns its input rather than a copy, steamer_hitters_df loses rows too.
steamer_hitters_df2.dropna(subset=batter_stats_fg, inplace=True)

In [28]:
# Convert to YYYYMMDD int by removing the literal dashes from game_date
date_digits = df['game_date'].str.replace("-", "", regex=False)
df['date'] = date_digits.astype('int')

In [29]:
# Find the dates of Steamer projections
# We'll take the most recent and merge in that projection for each player
# (distinct values of the 'date' column in the cleaned Steamer frame, in order of first appearance)
batter_steamer_dates = list(steamer_hitters_df2['date'].unique())

# Define a function to find the largest number in "steamer_dates" less than or equal to a given "date"
def find_steamer_date(date, steamer_dates):
    """Return the latest entry of `steamer_dates` that is not after `date`.

    Parameters
    ----------
    date : comparable (here a YYYYMMDD int)
        The date to look up.
    steamer_dates : iterable
        Candidate dates; they do not need to be sorted.

    Returns
    -------
    The largest candidate <= `date`, or None when no candidate qualifies
    (including when `steamer_dates` is empty).
    """
    eligible_dates = [candidate for candidate in steamer_dates if candidate <= date]
    if not eligible_dates:
        return None
    return max(eligible_dates)

# For every row, look up the most recent Steamer date on or before the game date
# (None when the game precedes every Steamer date)
df["batter_date"] = df["date"].apply(find_steamer_date, steamer_dates=batter_steamer_dates)

### Identify steals and attempts

In [30]:
# Season taken from the first four characters of game_date
df['year'] = df['game_date'].str[:4].astype('int')

# 2023 rows only
df2 = df[df['year'] == 2023]

# Count instances where description contains 'steal'
mentions_steal = df2['description'].str.contains('steal', case=False)
steal_df = df2[mentions_steal]

print("Number of instances where description contains 'steal' after dropping duplicates:", len(steal_df))

Number of instances where description contains 'steal' after dropping duplicates: 1136


In [31]:
# Frequency of each movementReason among the 2023 rows
df.query('year == 2023')['movementReason'].value_counts()

movementReason
r_adv_force                       29415
r_adv_play                        13032
r_force_out                        7026
r_stolen_base_2b                   3070
r_caught_stealing_2b                611
r_stolen_base_3b                    454
r_runner_out                        297
r_doubled_off                       256
r_defensive_indiff                  228
r_pickoff_1b                        153
r_thrown_out                        144
r_pickoff_caught_stealing_2b        135
r_pickoff_error_1b                   74
r_caught_stealing_3b                 63
r_adv_throw                          48
r_rundown                            38
r_pickoff_error_2b                   28
r_caught_stealing_home               23
r_out_returning                      21
r_stolen_base_home                   20
r_pickoff_2b                         18
r_pickoff_caught_stealing_3b         17
r_pickoff_3b                          8
r_interference                        8
r_hbr                    

In [32]:
# Successful steals (sb_*) and steal attempts (sba_*) by target base, from movementReason.
# An attempt is a stolen base, a caught stealing, or a pickoff caught stealing.
steal_flag_reasons = {
    'sb_2b': ['r_stolen_base_2b'],
    'sb_3b': ['r_stolen_base_3b'],
    'sba_2b': ['r_stolen_base_2b', 'r_caught_stealing_2b', 'r_pickoff_caught_stealing_2b'],
    'sba_3b': ['r_stolen_base_3b', 'r_caught_stealing_3b', 'r_pickoff_caught_stealing_3b'],
}
for flag_col, reasons in steal_flag_reasons.items():
    df[flag_col] = df['movementReason'].isin(reasons).astype('int')

In [33]:
# Season from the first four digits of the YYYYMMDD date
df['year'] = df["date"].astype('str').str.slice(0, 4).astype('int')

# Creating dummy variables (one 0/1 column per season, named year_<season>)
dummy_years = pd.get_dummies(df['year'], prefix='year').astype('int')

# Concatenating dummy variables with original DataFrame
# (running this cell twice would append the year_* columns a second time)
df = pd.concat((df, dummy_years), axis=1)

In [133]:
# Merge each runner row with the Steamer projection for that runner on batter_date.
# Inner join: rows without a matching projection are dropped.
play_cols = [
    'gamePk', 'atBatIndex', 'minBase', 'runner_id', 'batter_date',
    'sba_2b', 'sba_3b', 'sb_2b', 'sb_3b',
    'outs_pre', 'pre_1b', 'pre_2b', 'pre_3b',
] + list(dummy_years.columns)
steamer_cols = ['mlbamid', 'date', 'sba_imp', 'sbr']

steal_df = df[play_cols].merge(
    steamer_hitters_df2[steamer_cols],
    how='inner',
    left_on=['runner_id', 'batter_date'],
    right_on=['mlbamid', 'date'],
)

In [35]:
# steal_df.groupby('runner_id')[['sba_2b', 'sb_2b']].sum().reset_index(drop=True).sort_values('sb_2b', ascending=False)

In [36]:
# Total successful steals (of 2B and 3B combined) in the merged data for 2022 and 2023
steal_df.query('year_2022 == 1 or year_2023 == 1')[['sb_2b', 'sb_3b']].sum().sum()

6094

# Models

### Steals

### Attempt to steal 2B

In [135]:
%%time
# Select relevant columns. No imputation happens here: the rows are used as they
# come out of the Steamer merge.
# Sample: the runner who started on 1B, in at bats where 2B was open.
feature_cols = ['outs_pre', 'sba_imp', 'sbr'] + list(dummy_years.columns)
sba_2b_sample = steal_df.query('pre_1b == 1 and pre_2b == 0 and minBase == 1')
X = sba_2b_sample[feature_cols]
y = sba_2b_sample['sba_2b']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the MLPClassifier
model_sba_2b = MLPClassifier(hidden_layer_sizes=(100,), activation='relu', max_iter=100, early_stopping=True, random_state=42)
model_sba_2b.fit(X_train, y_train)

# Get probability predictions on the test set
probabilities = model_sba_2b.predict_proba(X_test)

# Create DataFrame from probabilities (one column per class label)
class_labels = model_sba_2b.classes_
probability_columns = [f'sba_2b_{label}' for label in class_labels]
probability_df = pd.DataFrame(probabilities, columns=probability_columns, index=X_test.index)

# Concatenate probability_df with y_test and X_test
sba_2b_df = pd.concat([X_test, y_test, probability_df], axis=1)

# Save model
sba_2b_filename = f"model_sba_2b_{todaysdate}.sav"
print(sba_2b_filename)

# Use a context manager so the file handle is flushed and closed even if pickling fails
with open(os.path.join(model_path, sba_2b_filename), 'wb') as model_file:
    pickle.dump(model_sba_2b, model_file)

model_sba_2b_20240305.sav
CPU times: total: 1.38 s
Wall time: 9.72 s


In [136]:
# steal_df.head(50)

In [137]:
# Summary of the 2023 test rows: compare the mean of sba_2b with the mean predicted probability sba_2b_1
sba_2b_df.query('year_2023 == 1').describe()

Unnamed: 0,outs_pre,sba_imp,sbr,year_2015,year_2016,year_2017,year_2018,year_2019,year_2020,year_2021,year_2022,year_2023,sba_2b,sba_2b_0,sba_2b_1
count,7189.0,7189.0,7189.0,7189.0,7189.0,7189.0,7189.0,7189.0,7189.0,7189.0,7189.0,7189.0,7189.0,7189.0,7189.0
mean,0.899,0.0853,0.7214,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.095,0.906,0.094
std,0.7408,0.0507,0.0309,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2932,0.0481,0.0481
min,0.0,0.0041,0.642,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.7713,0.0351
25%,0.0,0.0382,0.698,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.8775,0.0549
50%,1.0,0.0801,0.715,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.9127,0.0873
75%,1.0,0.1467,0.741,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.9451,0.1225
max,2.0,0.15,0.83,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.9649,0.2287


### Attempt to steal 3B

In [139]:
%%time
# Select relevant columns. No imputation happens here: the rows are used as they
# come out of the Steamer merge.
# Sample: the runner who started on 2B, in at bats where 3B was open.
feature_cols = ['outs_pre', 'sba_imp', 'sbr'] + list(dummy_years.columns)
sba_3b_sample = steal_df.query('pre_2b == 1 and pre_3b == 0 and minBase == 2')
X = sba_3b_sample[feature_cols]
y = sba_3b_sample['sba_3b']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the MLPClassifier
model_sba_3b = MLPClassifier(hidden_layer_sizes=(30,), activation='relu', max_iter=30, random_state=42)
model_sba_3b.fit(X_train, y_train)

# Get probability predictions on the test set
probabilities = model_sba_3b.predict_proba(X_test)

# Create DataFrame from probabilities (one column per class label)
class_labels = model_sba_3b.classes_
probability_columns = [f'sba_3b_{label}' for label in class_labels]
probability_df = pd.DataFrame(probabilities, columns=probability_columns, index=X_test.index)

# Concatenate probability_df with y_test and X_test
sba_3b_df = pd.concat([X_test, y_test, probability_df], axis=1)

# Save model
sba_3b_filename = f"model_sba_3b_{todaysdate}.sav"
print(sba_3b_filename)

# Use a context manager so the file handle is flushed and closed even if pickling fails
with open(os.path.join(model_path, sba_3b_filename), 'wb') as model_file:
    pickle.dump(model_sba_3b, model_file)

model_sba_3b_20240305.sav
CPU times: total: 4.02 s
Wall time: 4.09 s


In [140]:
# Summary of the 2023 test rows: compare the mean of sba_3b with the mean predicted probability sba_3b_1
sba_3b_df.query('year_2023 == 1').describe()

Unnamed: 0,outs_pre,sba_imp,sbr,year_2015,year_2016,year_2017,year_2018,year_2019,year_2020,year_2021,year_2022,year_2023,sba_3b,sba_3b_0,sba_3b_1
count,3755.0,3755.0,3755.0,3755.0,3755.0,3755.0,3755.0,3755.0,3755.0,3755.0,3755.0,3755.0,3755.0,3755.0,3755.0
mean,0.9409,0.084,0.7208,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0306,0.9718,0.0282
std,0.7249,0.0509,0.0313,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1723,0.0285,0.0285
min,0.0,0.0042,0.646,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.9067,0.0015
25%,0.0,0.0366,0.697,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.9575,0.0062
50%,1.0,0.0771,0.714,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.9853,0.0147
75%,1.0,0.145,0.74,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.9938,0.0425
max,2.0,0.15,0.83,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.9985,0.0933


In [55]:
# 2023 successful-steal totals for 2B and 3B in the merged data.
# Unlike the model cells, these filters do not restrict on minBase.
(steal_df.query('pre_1b == 1 and pre_2b == 0 and year_2023 == 1')[['sb_2b']].sum(), steal_df.query('pre_2b == 1 and pre_3b == 0 and year_2023 == 1')[['sb_3b']].sum())

(sb_2b    2906
 dtype: int64,
 sb_3b    454
 dtype: int64)

### Steal 2B

In [141]:
%%time
# Select relevant columns. No imputation happens here: the rows are used as they
# come out of the Steamer merge.
# Sample: runners who started on 1B with 2B open AND attempted to steal 2B.
feature_cols = ['outs_pre', 'sba_imp', 'sbr'] + list(dummy_years.columns)
sb_2b_sample = steal_df.query('pre_1b == 1 and pre_2b == 0 and sba_2b == 1 and minBase == 1')
X = sb_2b_sample[feature_cols]
y = sb_2b_sample['sb_2b']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the MLPClassifier
model_sb_2b = MLPClassifier(hidden_layer_sizes=(100,), activation='relu', max_iter=100, early_stopping=True, random_state=42)
model_sb_2b.fit(X_train, y_train)

# Get probability predictions on the test set
probabilities = model_sb_2b.predict_proba(X_test)

# Create DataFrame from probabilities (one column per class label)
class_labels = model_sb_2b.classes_
probability_columns = [f'sb_2b_{label}' for label in class_labels]
probability_df = pd.DataFrame(probabilities, columns=probability_columns, index=X_test.index)

# Concatenate probability_df with y_test and X_test
sb_2b_df = pd.concat([X_test, y_test, probability_df], axis=1)

# Save model
sb_2b_filename = f"model_sb_2b_{todaysdate}.sav"
print(sb_2b_filename)

# Use a context manager so the file handle is flushed and closed even if pickling fails
with open(os.path.join(model_path, sb_2b_filename), 'wb') as model_file:
    pickle.dump(model_sb_2b, model_file)

model_sb_2b_20240305.sav
CPU times: total: 219 ms
Wall time: 895 ms


In [142]:
# Summary of all test rows (every season): compare the mean of sb_2b with the mean predicted probability sb_2b_1
sb_2b_df.describe()

Unnamed: 0,outs_pre,sba_imp,sbr,year_2015,year_2016,year_2017,year_2018,year_2019,year_2020,year_2021,year_2022,year_2023,sb_2b,sb_2b_0,sb_2b_1
count,4757.0,4757.0,4757.0,4757.0,4757.0,4757.0,4757.0,4757.0,4757.0,4757.0,4757.0,4757.0,4757.0,4757.0,4757.0
mean,1.1757,0.1172,0.6973,0.1188,0.1326,0.1232,0.115,0.1076,0.042,0.0982,0.1181,0.1444,0.7322,0.2673,0.7327
std,0.7775,0.0411,0.0468,0.3236,0.3392,0.3287,0.319,0.3099,0.2007,0.2976,0.3228,0.3516,0.4429,0.0513,0.0513
min,0.0,0.0052,0.576,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1712,0.5998
25%,1.0,0.0856,0.664,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.229,0.6904
50%,1.0,0.1443,0.699,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.2694,0.7306
75%,2.0,0.15,0.731,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.3096,0.771
max,2.0,0.15,0.827,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.4002,0.8288


### Steal 3B

In [143]:
%%time
# Select relevant columns. No imputation happens here: the rows are used as they
# come out of the Steamer merge.
# Sample: runners who started on 2B with 3B open AND attempted to steal 3B.
feature_cols = ['outs_pre', 'sba_imp', 'sbr'] + list(dummy_years.columns)
sb_3b_sample = steal_df.query('pre_2b == 1 and pre_3b == 0 and sba_3b == 1 and minBase == 2')
X = sb_3b_sample[feature_cols]
y = sb_3b_sample['sb_3b']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the MLPClassifier
model_sb_3b = MLPClassifier(hidden_layer_sizes=(30,), activation='relu', max_iter=40, random_state=42)
model_sb_3b.fit(X_train, y_train)

# Get probability predictions on the test set
probabilities = model_sb_3b.predict_proba(X_test)

# Create DataFrame from probabilities (one column per class label)
class_labels = model_sb_3b.classes_
probability_columns = [f'sb_3b_{label}' for label in class_labels]
probability_df = pd.DataFrame(probabilities, columns=probability_columns, index=X_test.index)

# Concatenate probability_df with y_test and X_test
sb_3b_df = pd.concat([X_test, y_test, probability_df], axis=1)

# Save model
sb_3b_filename = f"model_sb_3b_{todaysdate}.sav"
print(sb_3b_filename)

# Use a context manager so the file handle is flushed and closed even if pickling fails
with open(os.path.join(model_path, sb_3b_filename), 'wb') as model_file:
    pickle.dump(model_sb_3b, model_file)

model_sb_3b_20240305.sav
CPU times: total: 172 ms
Wall time: 203 ms


In [144]:
# Summary of all test rows (every season): compare the mean of sb_3b with the mean predicted probability sb_3b_1
sb_3b_df.describe()

Unnamed: 0,outs_pre,sba_imp,sbr,year_2015,year_2016,year_2017,year_2018,year_2019,year_2020,year_2021,year_2022,year_2023,sb_3b,sb_3b_0,sb_3b_1
count,665.0,665.0,665.0,665.0,665.0,665.0,665.0,665.0,665.0,665.0,665.0,665.0,665.0,665.0,665.0
mean,1.2195,0.1194,0.6973,0.1534,0.1414,0.1218,0.1173,0.0887,0.0376,0.0917,0.0902,0.1579,0.791,0.2153,0.7847
std,0.647,0.0394,0.0473,0.3606,0.3486,0.3273,0.322,0.2846,0.1904,0.2889,0.2867,0.3649,0.4069,0.0516,0.0516
min,0.0,0.0052,0.583,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1205,0.6612
25%,1.0,0.0928,0.664,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.1735,0.7422
50%,1.0,0.1441,0.699,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.2269,0.7731
75%,2.0,0.15,0.729,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.2578,0.8265
max,2.0,0.15,0.82,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.3388,0.8795
