# M04. Steals

This predicts stolen bases using outs and some imputed stolen base attempt/success rates from Steamer. <br>
Warning: This was modified from M03. Base Running and contains some vestigial code. This isn't a problem, but just a reminder.

In [6]:
%run "U1. Imports.ipynb"
%run "U2. Utilities.ipynb"
%run "U3. Classes.ipynb"

# Folder holding the local data files read later in this notebook
# (the complete play-by-play CSV and the Steamer logs)
baseball_path = r'C:\Users\james\Documents\MLB\Database'

# SQLite database file located in that same folder
db_path = r'C:\Users\james\Documents\MLB\Database\MLBDB.db'
# Engine for the SQLite file. create_engine is not imported in this notebook;
# presumably it comes from one of the %run notebooks above - TODO confirm
engine = create_engine(f'sqlite:///{db_path}')

In [7]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import log_loss, classification_report, f1_score, make_scorer
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.inspection import permutation_importance
from sklearn.datasets import make_classification
from sklearn.neighbors import KNeighborsClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.cluster import KMeans
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from tensorflow import keras
from sklearn.impute import SimpleImputer
from sklearn.svm import SVC

import joblib
import matplotlib.pyplot as plt

In [8]:
# Display floats with four decimal places
pd.set_option('display.float_format', '{:.4f}'.format)

In [9]:
%run "A02. MLB API.ipynb"
%run "A03. Steamer.ipynb"

### Read in plays

In [10]:
# df2015 = plays_statsapi("04/01/2015", "10/31/2015")
# df2016 = plays_statsapi("04/01/2016", "10/31/2016")
# df2017 = plays_statsapi("04/01/2017", "10/31/2017")
# df2018 = plays_statsapi("04/01/2018", "10/31/2018")
# df2019 = plays_statsapi("04/01/2019", "10/31/2019")
# df2020 = plays_statsapi("04/01/2020", "10/31/2020")
# df2021 = plays_statsapi("04/01/2021", "10/31/2021")
# df2022 = plays_statsapi("04/01/2022", "10/31/2022")
# df2023 = plays_statsapi("04/01/2023", "10/31/2023")

In [11]:
# df = pd.concat([df2015, df2016, df2017, df2018, df2019, df2020, df2021, df2022, df2023], axis=0)
# df = df.query('game_type == "R"')

In [12]:
# df.to_csv(os.path.join(baseball_path, "Complete Dataset.csv"), index=False)

In [43]:
# Load the saved play-by-play export
complete_dataset_file = os.path.join(baseball_path, "Complete Dataset.csv")
df = pd.read_csv(complete_dataset_file)
# Restrict to regular season games (game_type "R")
df = df[df['game_type'] == "R"]

r_adv_force: advanced on a ball in play because they were forced to <br>
r_adv_play: advanced on a ball in play without being forced to <br>
r_force_out: out on a force play <br>
r_adv_throw: advanced on the throw, not the contact <br>
r_runner_out: out not on a force play <br>
r_thrown_out: out on a hit (base runner) <br>
r_doubled_off: out on a ball caught and thrown to base <br>
r_out_stretching: out on a hit (hitter) <br>

We now have where every runner, including the batter, started and finished. However, we have two problems:
- Some base runners don't move. They are not included in the dataset yet. 
- Some base runners move more than once in a play. They may go from 1B to 2B on a hit and then 2B to 3B on a throw.

### Clean and Create Variables

In [44]:
# Fill in missing descriptions.
# The result is assigned back rather than calling fillna(inplace=True) on the
# column selection: that chained in-place form operates on an intermediate
# object, triggers a warning in recent pandas and does not update df at all
# under Copy-on-Write.
df['description'] = df['description'].fillna("Missing")

# Identify errors (description mentions "error", case-insensitive)
df['error'] = df['description'].str.contains('error', case=False).astype('int')
# Double Play dummy (will be cleaned a bit later)
double_play_events = ['grounded_into_double_play', 'double_play', 'sac_fly_double_play', 'strikeout_double_play', 'sac_bunt_double_play']
df['double_play'] = df['eventType'].isin(double_play_events).astype(int)

# Create bottom half inning dummy
df['bottom'] = (df['halfInning'] == "bottom").astype('int')

# Determine outs before at bat
# Start from the previous row's out count within the same half inning
df['outs_pre'] = df.groupby(['gamePk', 'inning', 'bottom'])['outs'].shift(1)
# An at bat can have several rows; give all of them the smallest value
df['outs_pre'] = df.groupby(['gamePk', 'atBatIndex'])['outs_pre'].transform('min')
# A carried-over count of 3 is reset to 0
df['outs_pre'] = np.where(df['outs_pre'] == 3, 0, df['outs_pre'])

# Rows with no previous row in their half inning have no value yet: treat as 0 outs
df['outs_pre'] = df['outs_pre'].fillna(0)

### Multiple Movements

Identify where a runner starts and ends in an at bat. Only keep one instance. 

In [45]:
# Translate the text base labels into base numbers
# 0 is AB, 1 is 1B, 2 is 2B, 3 is 3B, 4 is scored, 5 is out
def _leading_digit_or_zero(label):
    # "1B" -> 1, "2B" -> 2, ...; anything not starting with a digit -> 0
    first_char = label[0]
    if first_char.isdigit():
        return int(first_char)
    return 0


def _start_base_number(label):
    # A missing start is coded as 0
    if pd.isna(label):
        return 0
    return _leading_digit_or_zero(label)


def _end_base_number(label):
    # A missing end is coded as 5 (out); "score" in any casing is coded as 4
    if pd.isna(label):
        return 5
    if label.lower() == 'score':
        return 4
    return _leading_digit_or_zero(label)


df['startInt'] = df['start'].apply(_start_base_number)
df['endInt'] = df['end'].apply(_end_base_number)

In [46]:
# Per runner within an at bat: lowest starting base and highest ending base
per_runner = df.groupby(['gamePk', 'atBatIndex', 'runner_id'])
lowest_start = per_runner['startInt'].transform('min')
highest_end = per_runner['endInt'].transform('max')
df['minBase'] = lowest_start
df['maxBase'] = highest_end

The following is removed from the base running model, but we want to keep steals for the steal model, obviously, so this is left just for comparison.

In [20]:
# # List of typical movement types
# movementReason_list = ['r_adv_force', 'r_adv_play', 'r_force_out', 'r_adv_throw', 'r_runner_out', 'r_thrown_out', 'r_doubled_off', 'r_out_stretching']
# df['movementTypical'] = df['movementReason'].apply(lambda x: 1 if x in movementReason_list or pd.isna(x) else 0)
# # Apply to whole at bat. We'll only keep those for which all movement is typical.
# df['movementTypical'] = df.groupby(['gamePk', 'atBatIndex'])['movementTypical'].transform('min')

In [21]:
# df = df.query('movementTypical == 1')
# df.drop(columns={'movementTypical'}, inplace=True)

In [22]:
# Drop duplicates, keeping first (which has almost all steals)

In [47]:
# Keep one row per runner per at bat (the first one encountered)
df = df.drop_duplicates(subset=['gamePk', 'atBatIndex', 'runner_id'], keep='first')

### Stationary Runners

After the play, we expect a runner on every base that some runner's maxBase points to.

In [48]:
# Preview the first five rows, including the derived columns created above
df.head()

Unnamed: 0,atBatIndex,inning,halfInning,outs,type,id,event,eventType,description,rbi,awayScore,homeScore,batter,batterName,batSide,pitcher,pitcherName,pitchHand,postOnFirst,postOnSecond,postOnThird,runner_id,start,end,movementReason,isScoringEvent,earned,gamePk,weather,wind,venue,date,away_name,home_name,game_date,game_type,venue_id,error,double_play,bottom,outs_pre,startInt,endInt,minBase,maxBase
6063,0,1,top,1,atBat,572761,Groundout,field_out,"Matt Carpenter grounds out, second baseman Tommy La Stella to first baseman Anthony Rizzo.",0,0,0,572761,Matt Carpenter,L,452657,Jon Lester,L,,,,572761,,,,False,False,413661,"44 degrees, Clear.","7 mph, In From CF.",Wrigley Field.,"April 5, 2015",St. Louis Cardinals,Chicago Cubs,2015-04-05,R,17,0,0,0,0.0,0,5,0,5
6064,1,1,top,1,atBat,518792,Double,double,Jason Heyward doubles (1) on a line drive to right fielder Jorge Soler.,0,0,0,518792,Jason Heyward,L,452657,Jon Lester,L,,"{'id': 518792, 'fullName': 'Jason Heyward', 'link': '/api/v1/people/518792'}",,518792,,2B,,False,False,413661,"44 degrees, Clear.","7 mph, In From CF.",Wrigley Field.,"April 5, 2015",St. Louis Cardinals,Chicago Cubs,2015-04-05,R,17,0,0,0,1.0,0,2,0,2
6065,2,1,top,1,atBat,407812,Single,single,Matt Holliday singles on a line drive to right fielder Jorge Soler. Jason Heyward scores.,1,1,0,407812,Matt Holliday,R,452657,Jon Lester,L,"{'id': 407812, 'fullName': 'Matt Holliday', 'link': '/api/v1/people/407812'}",,,407812,,1B,,False,False,413661,"44 degrees, Clear.","7 mph, In From CF.",Wrigley Field.,"April 5, 2015",St. Louis Cardinals,Chicago Cubs,2015-04-05,R,17,0,0,0,1.0,0,1,0,1
6066,2,1,top,1,atBat,518792,Single,single,Matt Holliday singles on a line drive to right fielder Jorge Soler. Jason Heyward scores.,1,1,0,407812,Matt Holliday,R,452657,Jon Lester,L,"{'id': 407812, 'fullName': 'Matt Holliday', 'link': '/api/v1/people/407812'}",,,518792,2B,score,r_adv_play,True,True,413661,"44 degrees, Clear.","7 mph, In From CF.",Wrigley Field.,"April 5, 2015",St. Louis Cardinals,Chicago Cubs,2015-04-05,R,17,0,0,0,1.0,2,4,2,4
6067,3,1,top,2,atBat,425509,Strikeout,strikeout,Jhonny Peralta strikes out swinging.,0,1,0,425509,Jhonny Peralta,R,452657,Jon Lester,L,"{'id': 407812, 'fullName': 'Matt Holliday', 'link': '/api/v1/people/407812'}",,,425509,,,,False,False,413661,"44 degrees, Clear.","7 mph, In From CF.",Wrigley Field.,"April 5, 2015",St. Louis Cardinals,Chicago Cubs,2015-04-05,R,17,0,0,0,1.0,0,5,0,5


In [24]:
# For each base at which there is a runner after the play, there should be an explanation for it in "movement"
# expectedNb = 1 on every row of an at bat in which some runner's maxBase is N
for base_num in (1, 2, 3):
    expected_col = f'expected{base_num}b'
    df[expected_col] = (df['maxBase'] == base_num).astype('int')
    df[expected_col] = df.groupby(['gamePk', 'atBatIndex'])[expected_col].transform('max')

# If there's someone at a base and we don't know how they got there, they were already there.
for post_col, base_num in (('postOnFirst', 1), ('postOnSecond', 2), ('postOnThird', 3)):
    base_occupied = df[post_col].notna()
    no_explanation = df[f'expected{base_num}b'] == 0
    df[f'missing{base_num}b'] = (base_occupied & no_explanation).astype(int)

In [25]:
# Number the rows within each at bat, starting from 1
df['atBatIndexNum'] = df.groupby(['gamePk', 'atBatIndex']).cumcount().add(1)

In [26]:
# Add those missing at each base, only copying the first instance
def _stationary_runner_rows(plays, base_num, missing_col, post_col):
    """Build one synthetic row per flagged at bat for a runner who stayed on a base.

    The first row of each at bat flagged by `missing_col` is copied and its
    start/end columns are rewritten so the runner starts and ends on
    `base_num`. The copied row describes a different runner, so runner_id is
    replaced with the id of the occupant recorded in `post_col`; otherwise the
    later merge on runner_id would attach another player's projections.
    """
    first_rows = (plays[missing_col] == 1) & (plays['atBatIndexNum'] == 1)
    rows = plays[first_rows].copy()

    # Modify the specific columns for the copied rows
    base_label = f"{base_num}B"
    rows['start'] = base_label
    rows['end'] = base_label
    rows['startInt'] = base_num
    rows['endInt'] = base_num
    rows['minBase'] = base_num
    rows['maxBase'] = base_num

    # post_col holds the occupant as a dict-like value, or its text form after
    # a CSV round trip, e.g. "{'id': 407812, 'fullName': ...}". Extract the id.
    occupant_id = rows[post_col].astype(str).str.extract(r"""['"]id['"]:\s*(\d+)""", expand=False)
    # Keep the copied id wherever no occupant id could be parsed
    parsed_id = pd.to_numeric(occupant_id, errors='coerce')
    rows['runner_id'] = parsed_id.fillna(rows['runner_id']).astype(rows['runner_id'].dtype)
    return rows


### 1B
copied1b = _stationary_runner_rows(df, 1, 'missing1b', 'postOnFirst')

### 2B
copied2b = _stationary_runner_rows(df, 2, 'missing2b', 'postOnSecond')

### 3B
copied3b = _stationary_runner_rows(df, 3, 'missing3b', 'postOnThird')

In [27]:
# Append the stationary-runner rows to the play data and renumber the index
stationary_rows = [copied1b, copied2b, copied3b]
df = pd.concat([df] + stationary_rows, ignore_index=True)

In [28]:
# Order rows by game, at bat, and row number within the at bat
df = df.sort_values(by=['gamePk', 'atBatIndex', 'atBatIndexNum'])

### Start Locations

In [29]:
# Any runner started on these bases
# Create start location dummies from each row's own starting base
for base_num in (1, 2, 3):
    df[f'pre_{base_num}b'] = (df['minBase'] == base_num).astype('int')

# Take the max within each (gamePk, atBatIndex) so every row of the at bat
# carries the full set of bases that were occupied at the start
for base_num in (1, 2, 3):
    pre_col = f'pre_{base_num}b'
    df[pre_col] = df.groupby(['gamePk', 'atBatIndex'])[pre_col].transform('max')

### Fix End Locations

In [30]:
# End locations: Runner
# Flag the base each runner finished on
for base_num in (1, 2, 3):
    df[f'post_{base_num}b'] = (df['maxBase'] == base_num).astype('int')

# End locations: At Bat (team)
# A base counts as occupied on every row of the at bat if any runner finished there
for base_num in (1, 2, 3):
    post_col = f'post_{base_num}b'
    df[post_col] = df.groupby(['gamePk', 'atBatIndex'])[post_col].transform('max')

# End locations: Blocked - this occurs when someone other than the runner is already on a base
# Note: You can't be blocked from advancing to a base you're on or have passed
for base_num in (1, 2, 3):
    base_taken = df[f'post_{base_num}b'] == 1
    runner_short_of_base = df['maxBase'] < base_num
    df[f'blocked_{base_num}b'] = (base_taken & runner_short_of_base).astype('int')

### Events

In [31]:
# Create game events
df = create_events(df)

# Encode events as integer: position in this list, counting from 1
event_labels = ['b1', 'b2', 'b3', 'hr', 'bb', 'hbp', 'so', 'fo', 'go', 'lo', 'po']
event_codes = {label: code for code, label in enumerate(event_labels, start=1)}
df['eventsModelInt'] = df['eventsModel'].map(event_codes)

### Out locations

In [32]:
# Determine if a runner is out (an ending base of 5 means out)
df['out'] = (df['maxBase'] == 5).astype('int')

# Starting base of the runner who was out (0 = the at bat itself)
out_origin = {'out_home': 0, 'out_1b': 1, 'out_2b': 2, 'out_3b': 3}
for out_col, origin_base in out_origin.items():
    runner_out = df['out'] == 1
    started_here = df['minBase'] == origin_base
    df[out_col] = (runner_out & started_here).astype('int')

# Spread each flag to every row of the at bat
for out_col in out_origin:
    df[out_col] = df.groupby(['gamePk', 'atBatIndex'])[out_col].transform('max')

### Cuts

In [33]:
# # Drop less relevant events
# df = df.query('eventsModel != "Cut"')

# Duplicates (should be very rare. I believe they're mlb's errors, not mine)
df = df.drop_duplicates(subset=['gamePk', 'atBatIndex', 'minBase'], keep='first')

# Calculate outs in PA
df['outs_calculated'] = df.groupby(['gamePk', 'atBatIndex'])['out'].transform('sum')

# Sometimes, there will be two outs without a double play recorded (typically a pickoff) but we need these for the math to work
two_outs = df['outs_calculated'] == 2
df['double_play'] = df['double_play'].mask(two_outs, 1)
# Sometimes, there will be no outs on a play that's traditionally an out. These are errors.
no_out_fielders_choice = (df['outs_calculated'] == 0) & (df['eventType'] == 'fielders_choice')
df['error'] = df['error'].mask(no_out_fielders_choice, 1)

# Drop triple plays
df = df[df['outs_calculated'] != 3]

### Read in Steamer

In [34]:
# Read in Steamer hitters
steamer_hitters_file = os.path.join(baseball_path, "A03. Steamer", "steamer_hitters_weekly_log.csv")
steamer_hitters_df = pd.read_csv(steamer_hitters_file, encoding='iso-8859-1')
# Clean, then drop rows missing any of the batter_stats_fg columns
steamer_hitters_df2 = clean_steamer_hitters(steamer_hitters_df)
steamer_hitters_df2.dropna(subset=batter_stats_fg, inplace=True)

In [35]:
# Convert game_date ("YYYY-MM-DD" text) to a YYYYMMDD int
game_date_digits = df['game_date'].str.replace("-", "", regex=False)
df['date'] = game_date_digits.astype('int')

In [36]:
# Find the dates of Steamer projections
# We'll take the most recent and merge in that projection for each player
# (distinct values of the 'date' column in the cleaned Steamer data)
batter_steamer_dates = list(steamer_hitters_df2['date'].unique())

# Find the latest Steamer date that is not after a given date
def find_steamer_date(date, steamer_dates):
    """Return the largest value in steamer_dates that is <= date, or None if there is none."""
    eligible_dates = (d for d in steamer_dates if d <= date)
    return max(eligible_dates, default=None)

# Apply the function to create the "batter_date" column in your DataFrame:
# for each row, the most recent Steamer date on or before the game date
df["batter_date"] = df["date"].apply(lambda x: find_steamer_date(x, batter_steamer_dates))

### Identify steals and attempts

In [37]:
# sb_* flags successful steals; sba_* flags attempts
# (successful steal, caught stealing, or pickoff caught stealing)
steal_reasons = {
    'sb_2b': ['r_stolen_base_2b'],
    'sb_3b': ['r_stolen_base_3b'],
    'sba_2b': ['r_stolen_base_2b', 'r_caught_stealing_2b', 'r_pickoff_caught_stealing_2b'],
    'sba_3b': ['r_stolen_base_3b', 'r_caught_stealing_3b', 'r_pickoff_caught_stealing_3b'],
}
for flag_col, reasons in steal_reasons.items():
    df[flag_col] = df['movementReason'].isin(reasons).astype('int')

In [38]:
# Create year variable from the first four digits of the YYYYMMDD date
date_text = df["date"].astype('str')
df['year'] = date_text.str[:4].astype('int')

# Creating dummy variables: one 0/1 column per year, prefixed "year"
dummy_years = pd.get_dummies(df['year'], prefix='year').astype('int')

# Concatenating dummy variables with original DataFrame
df = pd.concat((df, dummy_years), axis=1)

In [39]:
# Merge each runner row with that runner's Steamer projection for the matched date.
# Inner join: rows without a matching projection are dropped.
play_cols = ['year', 'gamePk', 'eventsModel', 'atBatIndex', 'minBase', 'runner_id', 'movementReason', 'batter_date', 'sba_2b', 'sba_3b', 'sb_2b', 'sb_3b', 'outs_pre', 'pre_1b', 'pre_2b', 'pre_3b']
steamer_cols = ['mlbamid', 'date', 'sba_imp', 'sbr']
steal_df = df[play_cols + list(dummy_years.columns)].merge(
    steamer_hitters_df2[steamer_cols],
    how='inner',
    left_on=['runner_id', 'batter_date'],
    right_on=['mlbamid', 'date'],
)

In [40]:
# Total successful steals among rows for a runner on first with second base open, 2022-2023 only
recent_on_first = steal_df.query('pre_1b == 1 and pre_2b == 0 and minBase == 1 and (year_2022 == 1 or year_2023 == 1)')
recent_on_first[['sb_2b', 'sb_3b']].sum().sum()

4877

Note: runner_id doesn't seem to correspond to the correct runners. Not sure if this is problematic.

In [41]:
# Order by game, at bat, then starting base (ascending)
steal_df = steal_df.sort_values(by=['gamePk', 'atBatIndex', 'minBase'])

In [42]:
# Preview the first 100 rows of the merged steal dataset
steal_df.head(100)

Unnamed: 0,year,gamePk,eventsModel,atBatIndex,minBase,runner_id,movementReason,batter_date,sba_2b,sba_3b,sb_2b,sb_3b,outs_pre,pre_1b,pre_2b,pre_3b,year_2015,year_2016,year_2017,year_2018,year_2019,year_2020,year_2021,year_2022,year_2023,mlbamid,date,sba_imp,sbr
0,2015,413649,so,0,0,594777,,20140928,0,0,0,0,0.0,0,0,0,1,0,0,0,0,0,0,0,0,594777.0,20140928,0.0977,0.6429
26,2015,413649,hr,1,0,545361,,20140928,0,0,0,0,1.0,0,0,0,1,0,0,0,0,0,0,0,0,545361.0,20140928,0.15,0.7201
58,2015,413649,so,2,0,405395,,20140928,0,0,0,0,1.0,0,0,0,1,0,0,0,0,0,0,0,0,405395.0,20140928,0.0405,0.6667
89,2015,413649,so,3,0,459964,,20140928,0,0,0,0,2.0,0,0,0,1,0,0,0,0,0,0,0,0,459964.0,20140928,0.0653,0.6226
112,2015,413649,go,4,0,457706,,20140928,0,0,0,0,0.0,0,0,0,1,0,0,0,0,0,0,0,0,457706.0,20140928,0.143,0.6921
151,2015,413649,b2,5,0,452234,,20140928,0,0,0,0,1.0,0,0,0,1,0,0,0,0,0,0,0,0,452234.0,20140928,0.0238,0.6329
163,2015,413649,go,6,0,429664,,20140928,0,0,0,0,1.0,0,1,0,1,0,0,0,0,0,0,0,0,429664.0,20140928,0.0579,0.6286
152,2015,413649,go,6,2,452234,r_adv_play,20140928,0,0,0,0,1.0,0,1,0,1,0,0,0,0,0,0,0,0,452234.0,20140928,0.0238,0.6329
193,2015,413649,po,7,0,443558,,20140928,0,0,0,0,2.0,0,0,0,1,0,0,0,0,0,0,0,0,443558.0,20140928,0.0646,0.6447
224,2015,413649,so,8,0,501896,,20140928,0,0,0,0,0.0,0,0,0,1,0,0,0,0,0,0,0,0,501896.0,20140928,0.0341,0.6


# Models

### Steals

### Attempt to steal 2B

In [None]:
%%time
# Modeling sample: rows for a runner on first base with second base open.
# No missing-value handling is done here; rows are used as they come from steal_df.
on_first = steal_df.query('pre_1b == 1 and pre_2b == 0 and minBase == 1')
feature_cols = ['outs_pre', 'sba_imp', 'sbr'] + list(dummy_years.columns)
X = on_first[feature_cols]
# Target: 1 if the runner attempted to steal second
y = on_first['sba_2b']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the MLPClassifier
model_sba_2b = MLPClassifier(hidden_layer_sizes=(100,100), activation='relu', max_iter=100, early_stopping=True, random_state=44)
model_sba_2b.fit(X_train, y_train)

# Get probability predictions on the test set
probabilities = model_sba_2b.predict_proba(X_test)

# Create DataFrame from probabilities, one column per class label
class_labels = model_sba_2b.classes_
probability_columns = [f'sba_2b_{label}' for label in class_labels]
probability_df = pd.DataFrame(probabilities, columns=probability_columns, index=X_test.index)

# Concatenate probability_df with y_test and X_test
sba_2b_df = pd.concat([X_test, y_test, probability_df], axis=1)

# Save model under a dated filename
sba_2b_filename = f"model_sba_2b_{todaysdate}.sav"
print(sba_2b_filename)

# The context manager flushes and closes the file once the pickle is written
with open(os.path.join(model_path, sba_2b_filename), 'wb') as model_file:
    pickle.dump(model_sba_2b, model_file)

In [None]:
# Add xtiles (to examine how well predictions match actual results)
sba_2b_df['decile'] = pd.qcut(sba_2b_df['sba_2b_1'], q=10, labels=False)
# Replace the row-level frame with per-decile means of the actual and predicted values
sba_2b_df = sba_2b_df.groupby('decile')[['sba_2b', 'sba_2b_1']].mean().reset_index()

In [None]:
# Plot by decile: predicted probability (red) against actual rate (black)
deciles = sba_2b_df['decile']
plt.plot(deciles, sba_2b_df['sba_2b_1'], color='red')
plt.plot(deciles, sba_2b_df['sba_2b'], color='black')
plt.show()

### Attempt to steal 3B

In [None]:
%%time
# Modeling sample: rows for a runner on second base with third base open.
# No missing-value handling is done here; rows are used as they come from steal_df.
on_second = steal_df.query('pre_2b == 1 and pre_3b == 0 and minBase == 2')
feature_cols = ['outs_pre', 'sba_imp', 'sbr'] + list(dummy_years.columns)
X = on_second[feature_cols]
# Target: 1 if the runner attempted to steal third
y = on_second['sba_3b']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the MLPClassifier
model_sba_3b = MLPClassifier(hidden_layer_sizes=(30,), activation='relu', max_iter=30, random_state=42)
model_sba_3b.fit(X_train, y_train)

# Get probability predictions on the test set
probabilities = model_sba_3b.predict_proba(X_test)

# Create DataFrame from probabilities, one column per class label
class_labels = model_sba_3b.classes_
probability_columns = [f'sba_3b_{label}' for label in class_labels]
probability_df = pd.DataFrame(probabilities, columns=probability_columns, index=X_test.index)

# Concatenate probability_df with y_test and X_test
sba_3b_df = pd.concat([X_test, y_test, probability_df], axis=1)

# Save model under a dated filename
sba_3b_filename = f"model_sba_3b_{todaysdate}.sav"
print(sba_3b_filename)

# The context manager flushes and closes the file once the pickle is written
with open(os.path.join(model_path, sba_3b_filename), 'wb') as model_file:
    pickle.dump(model_sba_3b, model_file)

In [None]:
# Add xtiles (to examine how well predictions match actual results)
sba_3b_df['decile'] = pd.qcut(sba_3b_df['sba_3b_1'], q=10, labels=False)
# Replace the row-level frame with per-decile means of the actual and predicted values
sba_3b_df = sba_3b_df.groupby('decile')[['sba_3b', 'sba_3b_1']].mean().reset_index()

In [None]:
# Plot by decile: predicted probability (red) against actual rate (black)
deciles = sba_3b_df['decile']
plt.plot(deciles, sba_3b_df['sba_3b_1'], color='red')
plt.plot(deciles, sba_3b_df['sba_3b'], color='black')
plt.show()

### Steal 2B

In [None]:
%%time
# Modeling sample: steal attempts of second base (runner on first, second base open).
# No missing-value handling is done here; rows are used as they come from steal_df.
attempts_2b = steal_df.query('pre_1b == 1 and pre_2b == 0 and sba_2b == 1 and minBase == 1')
feature_cols = ['outs_pre', 'sba_imp', 'sbr'] + list(dummy_years.columns)
X = attempts_2b[feature_cols]
# Target: 1 if the attempt was a successful steal
y = attempts_2b['sb_2b']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the MLPClassifier
model_sb_2b = MLPClassifier(hidden_layer_sizes=(80,), activation='relu', max_iter=100, early_stopping=True, random_state=45)
model_sb_2b.fit(X_train, y_train)

# Get probability predictions on the test set
probabilities = model_sb_2b.predict_proba(X_test)

# Create DataFrame from probabilities, one column per class label
class_labels = model_sb_2b.classes_
probability_columns = [f'sb_2b_{label}' for label in class_labels]
probability_df = pd.DataFrame(probabilities, columns=probability_columns, index=X_test.index)

# Concatenate probability_df with y_test and X_test
sb_2b_df = pd.concat([X_test, y_test, probability_df], axis=1)

# Save model under a dated filename
sb_2b_filename = f"model_sb_2b_{todaysdate}.sav"
print(sb_2b_filename)

# The context manager flushes and closes the file once the pickle is written
with open(os.path.join(model_path, sb_2b_filename), 'wb') as model_file:
    pickle.dump(model_sb_2b, model_file)

In [None]:
# Add xtiles (to examine how well predictions match actual results)
sb_2b_df['decile'] = pd.qcut(sb_2b_df['sb_2b_1'], q=10, labels=False)
# Replace the row-level frame with per-decile means of the actual and predicted values
sb_2b_df = sb_2b_df.groupby('decile')[['sb_2b', 'sb_2b_1']].mean().reset_index()

In [None]:
# Plot by decile: predicted probability (red) against actual rate (black)
deciles = sb_2b_df['decile']
plt.plot(deciles, sb_2b_df['sb_2b_1'], color='red')
plt.plot(deciles, sb_2b_df['sb_2b'], color='black')
plt.show()

### Steal 3B

In [None]:
%%time
# Modeling sample: steal attempts of third base (runner on second, third base open).
# No missing-value handling is done here; rows are used as they come from steal_df.
attempts_3b = steal_df.query('pre_2b == 1 and pre_3b == 0 and sba_3b == 1 and minBase == 2')
feature_cols = ['outs_pre', 'sba_imp', 'sbr'] + list(dummy_years.columns)
X = attempts_3b[feature_cols]
# Target: 1 if the attempt was a successful steal
y = attempts_3b['sb_3b']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the MLPClassifier
model_sb_3b = MLPClassifier(hidden_layer_sizes=(100,100,100), activation='relu', early_stopping=True, max_iter=100, random_state=42)
model_sb_3b.fit(X_train, y_train)

# Get probability predictions on the test set
probabilities = model_sb_3b.predict_proba(X_test)

# Create DataFrame from probabilities, one column per class label
class_labels = model_sb_3b.classes_
probability_columns = [f'sb_3b_{label}' for label in class_labels]
probability_df = pd.DataFrame(probabilities, columns=probability_columns, index=X_test.index)

# Concatenate probability_df with y_test and X_test
sb_3b_df = pd.concat([X_test, y_test, probability_df], axis=1)

# Save model under a dated filename
sb_3b_filename = f"model_sb_3b_{todaysdate}.sav"
print(sb_3b_filename)

# The context manager flushes and closes the file once the pickle is written
with open(os.path.join(model_path, sb_3b_filename), 'wb') as model_file:
    pickle.dump(model_sb_3b, model_file)

In [None]:
# Add xtiles (to examine how well predictions match actual results)
sb_3b_df['decile'] = pd.qcut(sb_3b_df['sb_3b_1'], q=10, labels=False)
# Replace the row-level frame with per-decile means of the actual and predicted values
sb_3b_df = sb_3b_df.groupby('decile')[['sb_3b', 'sb_3b_1']].mean().reset_index()

In [None]:
# Plot by decile: predicted probability (red) against actual rate (black)
deciles = sb_3b_df['decile']
plt.plot(deciles, sb_3b_df['sb_3b_1'], color='red')
plt.plot(deciles, sb_3b_df['sb_3b'], color='black')
plt.show()