In [2]:
%run "U1. Imports.ipynb"
%run "U2. Utilities.ipynb"
%run "U3. Classes.ipynb"

baseball_path = r'C:\Users\james\Documents\MLB\Database'

db_path = r'C:\Users\james\Documents\MLB\Database\MLBDB.db'
engine = create_engine(f'sqlite:///{db_path}')

In [3]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import log_loss, classification_report, f1_score, make_scorer
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.inspection import permutation_importance
from sklearn.datasets import make_classification
from sklearn.neighbors import KNeighborsClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.cluster import KMeans
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from tensorflow import keras
from sklearn.impute import SimpleImputer
from sklearn.svm import SVC

import joblib
import matplotlib.pyplot as plt

In [4]:
# Render every float in DataFrame output with exactly two decimal places.
pd.options.display.float_format = '{:.2f}'.format


In [5]:
# Execute the companion notebooks in this kernel so the names they define become
# available here (presumably the MLB API and Steamer helpers used further down —
# TODO confirm which names each notebook provides).
%run "A02. MLB API.ipynb"
%run "A03. Steamer.ipynb"

In [314]:
# Build the plate-appearance table that every later cell works on.
# create_pa_inputs is not defined in this notebook's visible cells (presumably it
# comes from one of the %run notebooks). The two 2023 arguments look like a
# first/last season range — TODO confirm, along with the meaning of short/long.
complete_dataset = create_pa_inputs(2023, 2023, short=50, long=300)

In [315]:
columns_to_process = ['preOnFirst', 'preOnSecond', 'preOnThird', 'postOnFirst', 'postOnSecond', 'postOnThird']

# Matches the 'fullName' entry inside the text form of a runner dict, e.g.
#   {'id': 671213, 'fullName': 'Triston Casas', 'link': '/api/v1/people/671213'}
# Python reprs a string that contains an apostrophe with double quotes
# ('fullName': "Travis d'Arnaud"), so both quoting styles must be accepted;
# otherwise every runner with an apostrophe in his name is blanked out.
_FULL_NAME_PATTERN = re.compile(r"""'fullName':\s*(?:'([^']*)'|"([^"]*)")""")


def _extract_full_name(value):
    """Return the runner's full name held in one base-occupancy cell.

    Parameters
    ----------
    value : object
        Either a runner dict, the text form of such a dict, or a missing
        value (NaN / None / empty string).

    Returns
    -------
    str
        The runner's full name, or '' when no name can be found.
    """
    if isinstance(value, dict):
        # Read the name directly when the cell still holds the original dict.
        return str(value.get('fullName') or '')
    found = _FULL_NAME_PATTERN.search(str(value))
    if found is None:
        return ''
    single_quoted, double_quoted = found.groups()
    return single_quoted if single_quoted is not None else double_quoted


# Replace each base-occupancy cell with just the runner's name ('' = base empty).
# NOTE: this cell is not idempotent. Running it a second time on already-extracted
# names blanks them, because a bare name contains no 'fullName' entry.
for column in columns_to_process:
    complete_dataset[column] = complete_dataset[column].map(_extract_full_name)

In [316]:
# outs_pre is later used as a model feature, so store it as integers.
complete_dataset['outs_pre'] = complete_dataset['outs_pre'].astype(int)

# Collapse every run of whitespace in the play description into a single space
# (the value is converted to text first, so missing descriptions become strings too).
complete_dataset['description'] = complete_dataset['description'].map(
    lambda text: re.sub(r'\s+', ' ', str(text))
)

In [323]:
# Encode each distinct 'eventType' label as an integer code and store the codes
# as a categorical column. The fitted `label_encoder` is kept so the codes can be
# mapped back to their event-type labels later on.
label_encoder = LabelEncoder()
label_encoder.fit(complete_dataset['eventType'])
event_type_codes = label_encoder.transform(complete_dataset['eventType']).astype(int)
complete_dataset['eventTypeInt'] = pd.Categorical(event_type_codes)

In [317]:
def determine_ab_dest(row):
    """Label where the batter ended up after the plate appearance.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide 'batterName', 'postOnFirst', 'postOnSecond',
        'postOnThird' (runner names, '' when the base is empty) and 'eventType'.

    Returns
    -------
    str
        'onFirst', 'onSecond' or 'onThird' when the batter occupies that base
        after the play, 'scored' when the event is a home run, otherwise 'NA'.
    """
    batter = row['batterName']
    # Compare names exactly. A substring test would treat an empty batter name
    # as present on every base, and would confuse e.g. "Luis Garcia" with
    # "Luis Garcia Jr.". Missing (non-string) names simply never match.
    if isinstance(batter, str) and batter:
        post_bases = (
            ('postOnFirst', 'onFirst'),
            ('postOnSecond', 'onSecond'),
            ('postOnThird', 'onThird'),
        )
        for column, destination in post_bases:
            if row[column] == batter:
                return destination
    if row['eventType'] == "home_run":
        return 'scored'
    return 'NA'

In [318]:
def determine_on_1b_dest(row):
    """Label where the runner who started on first base ended up.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide 'preOnFirst', 'postOnFirst', 'postOnSecond',
        'postOnThird' (runner names, '' when the base is empty) and
        'description' (the play description text).

    Returns
    -------
    str
        'NA' when first base was empty before the play; 'onFirst', 'onSecond'
        or 'onThird' when the same runner occupies that base afterwards;
        'scored' when the description contains "<runner> scores"; otherwise 'NA'.

    Notes
    -----
    The "scores" check relies on the runner's name appearing in the
    description, so a play whose description credits a different player
    is labelled 'NA'.
    """
    runner = row['preOnFirst']
    if runner == "":
        return "NA"
    # Exact comparison: a substring test would confuse e.g. "Luis Garcia"
    # with "Luis Garcia Jr.".
    post_bases = (
        ('postOnFirst', 'onFirst'),
        ('postOnSecond', 'onSecond'),
        ('postOnThird', 'onThird'),
    )
    for column, destination in post_bases:
        if row[column] == runner:
            return destination
    if re.search(fr"{re.escape(runner)}\s*scores", row['description'], flags=re.IGNORECASE):
        return 'scored'
    return "NA"

In [319]:
def determine_on_2b_dest(row):
    """Label where the runner who started on second base ended up.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide 'preOnSecond', 'postOnSecond', 'postOnThird' (runner
        names, '' when the base is empty) and 'description' (the play
        description text).

    Returns
    -------
    str
        'NA' when second base was empty before the play; 'onSecond' or
        'onThird' when the same runner occupies that base afterwards;
        'scored' when the description contains "<runner> scores"; otherwise 'NA'.
    """
    runner = row['preOnSecond']
    if runner == "":
        return "NA"
    # Exact comparison: a substring test would confuse e.g. "Luis Garcia"
    # with "Luis Garcia Jr.".
    for column, destination in (('postOnSecond', 'onSecond'), ('postOnThird', 'onThird')):
        if row[column] == runner:
            return destination
    if re.search(fr"{re.escape(runner)}\s*scores", row['description'], flags=re.IGNORECASE):
        return 'scored'
    return 'NA'

In [320]:
def determine_on_3b_dest(row):
    """Label where the runner who started on third base ended up.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide 'preOnThird' and 'postOnThird' (runner names, '' when
        the base is empty) and 'description' (the play description text).

    Returns
    -------
    str
        'NA' when third base was empty before the play; 'onThird' when the
        same runner is still there afterwards; 'scored' when the description
        contains "<runner> scores"; otherwise 'NA'.
    """
    runner = row['preOnThird']
    if runner == "":
        return "NA"
    # Exact comparison: a substring test would confuse e.g. "Luis Garcia"
    # with "Luis Garcia Jr.".
    if row['postOnThird'] == runner:
        return 'onThird'
    if re.search(fr"{re.escape(runner)}\s*scores", row['description'], flags=re.IGNORECASE):
        return 'scored'
    return "NA"


In [321]:
# Label, row by row, where each pre-existing runner and the batter ended up.
destination_labelers = {
    'on_1b_dest': determine_on_1b_dest,
    'on_2b_dest': determine_on_2b_dest,
    'on_3b_dest': determine_on_3b_dest,
    'ab_dest': determine_ab_dest,
}
for dest_column, labeler in destination_labelers.items():
    complete_dataset[dest_column] = complete_dataset.apply(labeler, axis=1)

In [322]:
# One 0/1 flag per base: 1 when a runner name is recorded there before the play.
occupancy_flags = (
    ('on_1b', 'preOnFirst'),
    ('on_2b', 'preOnSecond'),
    ('on_3b', 'preOnThird'),
)
for flag_column, runner_column in occupancy_flags:
    complete_dataset[flag_column] = complete_dataset[runner_column].ne('').astype(int)

In [324]:
# Number of rows in the dataset.
complete_dataset.shape[0]

187676

In [335]:
# Spot-check the third-base labels on the last five of the first 857 rows.
complete_dataset.loc[:, ['description', 'preOnThird', 'postOnThird', 'on_3b_dest']].head(857).tail()

Unnamed: 0,description,preOnThird,postOnThird,on_3b_dest
852,Marcus Semien singles on a fly ball to right fielder Nick Castellanos. Josh H. Smith to 3rd.,,Josh H. Smith,
853,Corey Seager walks.,Josh H. Smith,Josh H. Smith,onThird
854,Nathaniel Lowe singles on a ground ball to pitcher Gregory Soto. Josh H. Smith scores. Marcus Semien to 3rd. Corey Seager to 2nd.,Josh H. Smith,Marcus Semien,scored
855,"Adolis Garcia singles on a sharp line drive to right fielder Nick Castellanos. Corey Seager scores. Nathaniel Lowe scores. Adolis Garcia out at 2nd on the throw, right fielder Nick Castellanos to shortstop Trea Turner.",Marcus Semien,,
856,J.T. Realmuto pops out to second baseman Marcus Semien.,,,


In [336]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.preprocessing import LabelEncoder

# Train one MLP classifier per destination label and attach the predicted class
# probabilities to the held-out rows. Works on the module-level `complete_dataset`.

# Upper bound on the number of leading rows used for modelling.
MAX_MODEL_ROWS = 500000

# Drop rows with missing target values. The determine_*_dest functions always
# return a string (using 'NA' for "no destination"), so this only removes rows
# if NaNs were introduced some other way. The explicit copy makes the column
# assignment below operate on an independent frame.
complete_dataset = complete_dataset.dropna(subset=['on_3b_dest', 'on_2b_dest', 'on_1b_dest', 'ab_dest']).copy()

# Convert 'eventTypeInt' to categorical
complete_dataset['eventTypeInt'] = pd.Categorical(complete_dataset['eventTypeInt'])

# Select relevant features and target variables
# NOTE(review): eventTypeInt appears to reach the MLP as a single integer code,
# which would impose an arbitrary ordering on event types; one-hot encoding would
# avoid that. Left unchanged so results stay comparable — confirm before changing.
features = ['on_1b', 'on_2b', 'on_3b', 'outs_pre', 'eventTypeInt']
targets = ['on_3b_dest', 'on_2b_dest', 'on_1b_dest', 'ab_dest']

model_rows = complete_dataset.head(MAX_MODEL_ROWS)
X = model_rows[features]
y = model_rows[targets]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the neural network model using MultiOutputClassifier
model = MultiOutputClassifier(MLPClassifier(random_state=42))
model.fit(X_train, y_train)

# Get probabilities for each outcome for each target: one array per target,
# with one column per class of that target's estimator.
predicted_probs = model.predict_proba(X_test)

# X_test is a slice of X; take an explicit copy before adding columns so the
# assignments below do not write into a view (SettingWithCopyWarning).
X_test = X_test.copy()

# Add one "<target>_prob_<class>" column per (target, class) pair and remember
# the column names in creation order.
prob_columns = []
for i, target in enumerate(targets):
    for j, class_ in enumerate(model.estimators_[i].classes_):
        col_name = f"{target}_prob_{class_}"
        X_test[col_name] = predicted_probs[i][:, j]
        prob_columns.append(col_name)

# Inverse transform 'eventTypeInt' to 'eventType'
X_test['eventType'] = label_encoder.inverse_transform(X_test['eventTypeInt'])

In [337]:
# Put the held-out features (plus predicted probabilities) next to the true labels.
outcome_df = pd.concat((X_test, y_test), axis='columns')

In [339]:
# First few held-out home runs, with predicted probabilities and true labels.
outcome_df.loc[outcome_df['eventType'] == "home_run"].head()

Unnamed: 0,on_1b,on_2b,on_3b,outs_pre,eventTypeInt,on_3b_dest_prob_NA,on_3b_dest_prob_onThird,on_3b_dest_prob_scored,on_2b_dest_prob_NA,on_2b_dest_prob_onSecond,on_2b_dest_prob_onThird,on_2b_dest_prob_scored,on_1b_dest_prob_NA,on_1b_dest_prob_onFirst,on_1b_dest_prob_onSecond,on_1b_dest_prob_onThird,on_1b_dest_prob_scored,ab_dest_prob_NA,ab_dest_prob_onFirst,ab_dest_prob_onSecond,ab_dest_prob_onThird,ab_dest_prob_scored,eventType,on_3b_dest,on_2b_dest,on_1b_dest,ab_dest
108883,0,0,0,2,14,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,home_run,,,,scored
50780,1,0,1,2,14,0.02,0.55,0.42,1.0,0.0,0.0,0.0,0.01,0.0,0.24,0.01,0.74,0.0,0.0,0.0,0.0,1.0,home_run,scored,,scored,scored
24203,0,0,0,1,14,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,home_run,,,,scored
113157,0,0,0,2,14,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,home_run,,,,scored
130925,1,0,0,0,14,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.01,0.0,0.12,0.0,0.86,0.0,0.0,0.0,0.0,1.0,home_run,,,scored,scored


In [343]:
# Home runs where first base is flagged as occupied but that runner was not
# labelled 'scored' — used to inspect how the labelling behaves on these plays.
complete_dataset.loc[
    (complete_dataset['eventType'] == "home_run")
    & (complete_dataset['on_1b'] == 1)
    & (complete_dataset['on_1b_dest'] != "scored"),
    ['description', 'preOnFirst', 'preOnSecond', 'preOnThird', 'postOnFirst', 'on_1b', 'on_1b_dest'],
]

Unnamed: 0,description,preOnFirst,preOnSecond,preOnThird,postOnFirst,on_1b,on_1b_dest
2566,"Umpire reviewed (home run), call on the field was upheld: Adam Duvall homers (2) on a line drive to left center field. Rob Refsnyder scores.",Masataka Yoshida,,,,1,
6658,Jorge Soler homers (2) on a line drive to left center field.,Jazz Chisholm Jr.,,,,1,
12642,Alejandro Kirk homers (1) on a fly ball to left field. Matt Chapman scores. Jordan Luplow scores.,Brandon Belt,,Matt Chapman,,1,
22336,Brandon Lowe homers (6) on a fly ball to right center field. Vidal Brujan scores.,Yandy Diaz,,,,1,
42048,Tucupita Marcano homers (1) on a fly ball to center field.,Connor Joe,,,,1,
44483,Josh Naylor homers (5) on a fly ball to right center field. Jose Ramirez scores. Tyler Freeman scores.,Josh Bell,,Jose Ramirez,,1,
46927,Nolan Gorman homers (10) on a fly ball to right field. Lars Nootbaar scores. Andrew Knizner scores.,Paul Goldschmidt,Lars Nootbaar,,,1,
48039,Jorge Soler homers (10) on a fly ball to left center field. Jon Berti scores.,Luis Arraez,,,,1,
54012,J.D. Martinez homers (8) on a fly ball to center field.,Max Muncy,,,,1,
69579,Will Benson homers (1) on a fly ball to right field. Jake Fraley scores.,TJ Hopkins,,,,1,


In [6]:
# Fetch the raw play-by-play payload for gamePk 718580 and display it.
# statsapi is not imported in this notebook's visible cells; presumably it is
# brought in by one of the %run notebooks.
game = statsapi.get('game_playByPlay', {'gamePk': 718580})
game

{'copyright': 'Copyright 2023 MLB Advanced Media, L.P.  Use of any content on this page acknowledges agreement to the terms posted here http://gdx.mlb.com/components/copyright.txt',
 'allPlays': [{'result': {'type': 'atBat',
    'event': 'Groundout',
    'eventType': 'field_out',
    'description': 'Taylor Ward grounds out, pitcher Tanner Houck to first baseman Triston Casas.',
    'rbi': 0,
    'awayScore': 0,
    'homeScore': 0,
    'isOut': True},
   'about': {'atBatIndex': 0,
    'halfInning': 'top',
    'isTopInning': True,
    'inning': 1,
    'startTime': '2023-04-14T23:11:55.735Z',
    'endTime': '2023-04-14T23:12:44.934Z',
    'isComplete': True,
    'isScoringPlay': False,
    'hasReview': False,
    'hasOut': True,
    'captivatingIndex': 0},
   'count': {'balls': 1, 'strikes': 2, 'outs': 1},
   'matchup': {'batter': {'id': 621493,
     'fullName': 'Taylor Ward',
     'link': '/api/v1/people/621493'},
    'batSide': {'code': 'R', 'description': 'Right'},
    'pitcher': {'id'

In [381]:
# Re-run the MLB API notebook (presumably to pick up edits to create_game),
# then build and display the table for gamePk 716628.
%run "A02. MLB API.ipynb"

game_df = create_game(716628)
game_df

Unnamed: 0,atBatIndex,inning,halfInning,outs,type,id,event,eventType,description,rbi,awayScore,homeScore,batter,batterName,batSide,pitcher,pitcherName,pitchHand,postOnFirst,postOnSecond,postOnThird,runner_id,start,end,movementReason,isScoringEvent,earned,gamePk,weather,wind,venue,date
0,0,1,top,1,atBat,518934,Pop Out,field_out,DJ LeMahieu pops out to second baseman Enmanuel Valdez.,0,0,0,518934,DJ LeMahieu,R,601713,Nick Pivetta,R,,,,518934,,,,False,False,716628,"74 degrees, Cloudy.","4 mph, Varies.",Fenway Park.,"September 12, 2023"
1,1,1,top,2,atBat,592450,Strikeout,strikeout,Aaron Judge strikes out swinging.,0,0,0,592450,Aaron Judge,R,601713,Nick Pivetta,R,,,,592450,,,,False,False,716628,"74 degrees, Cloudy.","4 mph, Varies.",Fenway Park.,"September 12, 2023"
2,2,1,top,3,atBat,650402,Strikeout,strikeout,Gleyber Torres strikes out swinging.,0,0,0,650402,Gleyber Torres,R,601713,Nick Pivetta,R,,,,650402,,,,False,False,716628,"74 degrees, Cloudy.","4 mph, Varies.",Fenway Park.,"September 12, 2023"
3,3,1,bottom,1,atBat,657077,Groundout,field_out,"Alex Verdugo grounds out, second baseman Gleyber Torres to first baseman DJ LeMahieu.",0,0,0,657077,Alex Verdugo,L,681190,Randy Vasquez,R,,,,657077,,,,False,False,716628,"74 degrees, Cloudy.","4 mph, Varies.",Fenway Park.,"September 12, 2023"
4,4,1,bottom,2,atBat,646240,Groundout,field_out,"Rafael Devers grounds out, second baseman Gleyber Torres to first baseman DJ LeMahieu.",0,0,0,646240,Rafael Devers,L,681190,Randy Vasquez,R,,,,646240,,,,False,False,716628,"74 degrees, Cloudy.","4 mph, Varies.",Fenway Park.,"September 12, 2023"
5,5,1,bottom,3,atBat,457759,Groundout,field_out,"Justin Turner grounds out softly, catcher Kyle Higashioka to first baseman DJ LeMahieu.",0,0,0,457759,Justin Turner,R,681190,Randy Vasquez,R,,,,457759,,,,False,False,716628,"74 degrees, Cloudy.","4 mph, Varies.",Fenway Park.,"September 12, 2023"
6,6,2,top,1,atBat,519317,Strikeout,strikeout,Giancarlo Stanton strikes out swinging.,0,0,0,519317,Giancarlo Stanton,R,601713,Nick Pivetta,R,,,,519317,,,,False,False,716628,"74 degrees, Cloudy.","4 mph, Varies.",Fenway Park.,"September 12, 2023"
7,7,2,top,2,atBat,683011,Groundout,field_out,"Anthony Volpe grounds out, shortstop Ceddanne Rafaela to first baseman Triston Casas.",0,0,0,683011,Anthony Volpe,R,601713,Nick Pivetta,R,,,,683011,,,,False,False,716628,"74 degrees, Cloudy.","4 mph, Varies.",Fenway Park.,"September 12, 2023"
8,8,2,top,3,atBat,643396,Strikeout,strikeout,Isiah Kiner-Falefa strikes out swinging.,0,0,0,643396,Isiah Kiner-Falefa,R,601713,Nick Pivetta,R,,,,643396,,,,False,False,716628,"74 degrees, Cloudy.","4 mph, Varies.",Fenway Park.,"September 12, 2023"
9,9,2,bottom,0,atBat,671213,Single,single,Triston Casas singles on a line drive to left fielder Everson Pereira.,0,0,0,671213,Triston Casas,L,681190,Randy Vasquez,R,"{'id': 671213, 'fullName': 'Triston Casas', 'link': '/api/v1/people/671213'}",,,671213,,1B,,False,False,716628,"74 degrees, Cloudy.","4 mph, Varies.",Fenway Park.,"September 12, 2023"


Goal: use start (the starting base), outs_pre, eventType, on_1b, on_2b, and on_3b to predict end (the ending base).
To do this, create outs_pre by comparing each at-bat with the previous at-bat.
Using rows from the same at-bat, check whether runners are on base.

In [None]:
This approach will handle advances and everything else correctly.
However, it won't be obvious when a fielding error happens.
It may make sense to predict errors first and then predict the base outcomes.
That way, you know when runs should be charged as unearned.

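Proposed order:
1. Simulate whether the batter reached on a fielding error.
   If yes, then the run is unearned.
2. Then choose the outcome (single, double, etc.).
3. Then predict whether there was an error on the play.
   If yes, then the run is unearned.
4. Then determine the outcome (base).
Open question: do we need variables for a blocked 3B, blocked 2B, and blocked 1B?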

Maybe try to determine whether the batter reached on an error.

In [382]:
# Display the raw play-by-play dict again. `game` was fetched in an earlier cell
# for gamePk 718580, which is a different game from the 716628 used for game_df.
game

{'copyright': 'Copyright 2023 MLB Advanced Media, L.P.  Use of any content on this page acknowledges agreement to the terms posted here http://gdx.mlb.com/components/copyright.txt',
 'allPlays': [{'result': {'type': 'atBat',
    'event': 'Groundout',
    'eventType': 'field_out',
    'description': 'Taylor Ward grounds out, pitcher Tanner Houck to first baseman Triston Casas.',
    'rbi': 0,
    'awayScore': 0,
    'homeScore': 0,
    'isOut': True},
   'about': {'atBatIndex': 0,
    'halfInning': 'top',
    'isTopInning': True,
    'inning': 1,
    'startTime': '2023-04-14T23:11:55.735Z',
    'endTime': '2023-04-14T23:12:44.934Z',
    'isComplete': True,
    'isScoringPlay': False,
    'hasReview': False,
    'hasOut': True,
    'captivatingIndex': 0},
   'count': {'balls': 1, 'strikes': 2, 'outs': 1},
   'matchup': {'batter': {'id': 621493,
     'fullName': 'Taylor Ward',
     'link': '/api/v1/people/621493'},
    'batSide': {'code': 'R', 'description': 'Right'},
    'pitcher': {'id'