# M03. Predict Pulls
- Predict when a pitcher is removed (pulled) from the game

In [None]:
%run "U1. Imports.ipynb"
%run "U2. Utilities.ipynb"
%run "U3. Classes.ipynb"
%run "D3. Simulation Functions.ipynb"

# Folder that holds the local MLB database files
# NOTE(review): absolute, machine-specific path — consider making this configurable
baseball_path = r'C:\Users\james\Documents\MLB\Database'

# SQLite file inside that folder, and the SQLAlchemy engine used by the queries below
db_path = baseball_path + r'\MLBDB.db'
engine = create_engine('sqlite:///' + db_path)

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import log_loss, classification_report, f1_score, make_scorer
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.inspection import permutation_importance
from sklearn.datasets import make_classification
from sklearn.neighbors import KNeighborsClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.cluster import KMeans
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from tensorflow import keras
from sklearn.impute import SimpleImputer

import joblib
import matplotlib.pyplot as plt

In [None]:
%run "A03. Steamer.ipynb"

### Dataset

In [None]:
# Read every row and column of the "Dataset" table into a DataFrame
sql_query = '''
    SELECT *
    FROM "Dataset"
'''

complete_dataset = pd.read_sql_query(sql=sql_query, con=engine)

In [None]:
# Preview the first five rows of the loaded dataset
complete_dataset.head(n=5)

### Steamer

In [None]:
# Read every row and column of the "Steamer Pitchers" table into a DataFrame
sql_query = '''
  SELECT *
  FROM "Steamer Pitchers"
'''

steamer_pitchers_df = pd.read_sql_query(sql=sql_query, con=engine)

In [None]:
# Clean the raw Steamer table, then keep only rows where every stat in
# pitcher_stats_fg2 is present.
# dropna returns a new frame here instead of modifying the helper's return value
# in place, so nothing is mutated if clean_steamer_pitchers hands back its input
# (or a slice of it), and the cell gives the same result when re-run.
steamer_pitchers_df2 = clean_steamer_pitchers(steamer_pitchers_df).dropna(subset=pitcher_stats_fg2)

In [None]:
# Preview the first five rows of the cleaned Steamer projections
steamer_pitchers_df2.head(n=5)

### Merge

In [None]:
# Collect the distinct dates that appear in the Steamer projections.
# Each dataset row is later matched to the most recent of these dates.
pitcher_steamer_dates = [*steamer_pitchers_df2['date'].unique()]

# Look up the latest Steamer date that is not after a given date
def find_steamer_date(date, steamer_dates):
    """Return the largest value in ``steamer_dates`` that is <= ``date``.

    Returns None when no value in ``steamer_dates`` qualifies (including when
    ``steamer_dates`` is empty).
    """
    eligible_dates = (candidate for candidate in steamer_dates if candidate <= date)
    return max(eligible_dates, default=None)

# For every row, find the latest Steamer date on or before the row's date
# and store it in the "pitcher_date" column (used as the merge key below)
complete_dataset["pitcher_date"] = complete_dataset["date"].apply(
    find_steamer_date, args=(pitcher_steamer_dates,)
)

In [None]:
# Left-join the Steamer stats onto every dataset row, matching on pitcher id and
# the projection date chosen for that row. Clashing right-hand columns get "_fg".
complete_merged_df = complete_dataset.merge(
    steamer_pitchers_df2[['mlbamid', 'date'] + pitcher_stats_fg2],
    how='left',
    left_on=['pitcher', 'pitcher_date'],
    right_on=['mlbamid', 'date'],
    suffixes=("", "_fg"),
)
# If the join produced several rows for the same plate appearance, keep the last one
complete_merged_df = complete_merged_df.drop_duplicates(subset=['date', 'gamePk', 'atBatIndex'], keep='last')
complete_merged_df.head()

In [None]:
def pull_dataset(df):
    """Build the starting-pitcher "pull" dataset from plate-appearance rows.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per plate appearance. Must contain 'halfInning', 'homeScore',
        'awayScore', 'rbi', 'date', 'gamePk', 'pitcher', 'pitcherName', 'batter',
        'batterName', 'atBatIndex', 'top', and every column named in the
        module-level lists events_list, pull_inputs and pitcher_stats_fg2.
        The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        Rows faced by each game's starting pitchers, sorted by date, game, half
        and atBatIndex, with:
        - 'pitcher_score' / 'batter_score': current score of each side,
        - '<stat>_sum' columns: running totals per (gamePk, pitcher),
        - 'start': 1 on the first plate appearance of each game half,
        - 'pulled': 1 on the starter's last plate appearance in the game.
        Rows missing any pitcher_stats_fg2 value are dropped.
    """
    # Derive the helper columns on a new frame so the caller's DataFrame is untouched
    df = df.assign(
        # Scores for both the batter's and the pitcher's team
        # (when halfInning is "top" the pitcher's team is scored as homeScore)
        pitcher_score=np.where(df['halfInning'] == "top", df['homeScore'], df['awayScore']),
        batter_score=np.where(df['halfInning'] == "top", df['awayScore'], df['homeScore']),
        # Each row counts as one batter faced (summed cumulatively below)
        faced=1,
        # Convert to numeric
        rbi=df['rbi'].astype('int'),
        # Year is the first four characters of the date
        year=df['date'].astype('str').str[:4],
        # Bottom-of-inning flag (sortable, unlike the "top"/"bottom" label)
        bottom=(df['top'] == 0).astype('int'),
    )

    # Sort BEFORE the cumulative sums: cumsum follows row order, so the running
    # totals are only meaningful when each pitcher's rows are in atBatIndex order.
    df = df.sort_values(by=['date', 'gamePk', 'bottom', 'atBatIndex'])

    # Running totals of every event, RBI and batters faced per pitcher per game
    sums_list = ['gamePk', 'pitcher'] + events_list + ['rbi', 'faced']
    sums = df[sums_list].groupby(['gamePk', 'pitcher']).cumsum().add_suffix("_sum")
    df = pd.concat([df, sums], axis=1)

    # The starter faces the lowest atBatIndex of each game half
    df['atBatIndex_min'] = df.groupby(['gamePk', 'bottom'])['atBatIndex'].transform('min')
    df['start'] = (df['atBatIndex'] == df['atBatIndex_min']).astype('int')

    # Carry the start flag forward: the running sum is 1 on every row of a starter
    # and 0 on every row of any other pitcher
    df['starter'] = df.groupby(['pitcher', 'gamePk'])['start'].cumsum()

    # Keep only starters; copy so the column assignments below act on an
    # independent frame rather than a slice (avoids SettingWithCopyWarning)
    df = df.query('starter == 1').copy()

    # The starter is pulled at their highest atBatIndex
    df['atBatIndex_max'] = df.groupby(['gamePk', 'bottom'])['atBatIndex'].transform('max')
    df['pulled'] = (df['atBatIndex'] == df['atBatIndex_max']).astype('int')

    # Names of the running-total columns created above
    rolled_sums_list = [f'{stat}_sum' for stat in events_list] + ['rbi_sum', 'faced_sum']
    # Variables to keep
    keep_list = ['date', 'year', 'gamePk', 'pitcher', 'pitcherName', 'batter', 'batterName', 'atBatIndex', 'pitcher_score', 'batter_score'] + pull_inputs + pitcher_stats_fg2 + rolled_sums_list + ['start', 'pulled']

    # Keep relevant variables and drop rows without Steamer projections
    return df[keep_list].dropna(subset=pitcher_stats_fg2)

In [None]:
# Build the starter pull dataset from the merged plate-appearance data
pulls_dataset = pull_dataset(df=complete_merged_df)

In [None]:
# Load the previously saved scaler for the pitcher Steamer stats
# NOTE(review): pickle.load executes code stored in the file — only load trusted files
scaler_path = os.path.join(model_path, "pitcher_stats_fg_scaler_20231027.pkl")
with open(scaler_path, mode="rb") as file:
    pitcher_stats_fg_scaler = pickle.load(file)

In [None]:
# Scale the pitcher stats with the scaler loaded from disk.
# Use transform, not fit_transform: refitting would discard the saved scaling
# parameters and replace them with statistics from this dataset (including the
# rows later used for testing), so features would no longer match the saved scaler.
pulls_dataset[pitcher_stats_fg] = pitcher_stats_fg_scaler.transform(pulls_dataset[pitcher_stats_fg])

In [None]:
# Full dataset
X = pulls_dataset[pull_inputs3]
y = pulls_dataset['pulled']

# Split into training and testing groups, year by year: the first two thirds of
# each year's rows (in row order) train the model and the rest test it.
# Both halves use the same cutoff so every row lands in exactly one set;
# separate int(n*2/3) and int(n/3) counts can leave one row per year in neither.
# Note: despite the names, X_train / X_test hold full rows (features and target).
X_train = pulls_dataset.groupby(pulls_dataset['year']).apply(lambda x: x.head(len(x) * 2 // 3))
X_test = pulls_dataset.groupby(pulls_dataset['year']).apply(lambda x: x.iloc[len(x) * 2 // 3:])

### Train Models

##### Dataset

### Pulls

In [None]:
%%time
pulls_filename = "model_pulls_" + "voting_" + f"{todaysdate}.sav"

# Define the individual models in the ensemble
models = [
    LogisticRegression(solver='lbfgs', max_iter=20),  
    LogisticRegression(solver='saga', max_iter=20),   
    # MLPClassifier(hidden_layer_sizes=(100,100), activation='relu', random_state=1, max_iter=100),  
    ]


# Create the ensemble classifier using VotingClassifier
model_pulls = VotingClassifier(estimators=[('model'+str(i+1), model) for i, model in enumerate(models)], voting='soft', n_jobs=-2).fit(X_train[pull_inputs3], X_train[['pulled']].values.ravel())

# Save model
pickle.dump(model_pulls, open(os.path.join(model_path, pulls_filename), 'wb'))

In [None]:
# Predict class probabilities for the test rows
proba = model_pulls.predict_proba(X_test[pull_inputs3])
# First probability column -> "kept", second -> "pulled"
# (assumes the model's class order is [0, 1] — TODO confirm via model_pulls.classes_)
X_test['is_kept_pred'], X_test['is_pulled_pred'] = proba[:, 0], proba[:, 1]

In [None]:
# Bucket the test rows into deciles of predicted pull probability
# (to examine how well predictions match actual results)
X_test['decile'] = pd.qcut(X_test['is_pulled_pred'], 10, labels=False)

# Average every numeric column within each decile.
# numeric_only=True skips text columns (e.g. player names), which would
# otherwise make mean() raise a TypeError on pandas >= 2.0.
is_pulled_df = X_test.groupby('decile').mean(numeric_only=True).reset_index()

In [None]:
# Compare the mean predicted pull probability with the actual pull rate per decile
fig, ax = plt.subplots()
ax.plot(is_pulled_df['decile'], is_pulled_df['is_pulled_pred'], color='red', label='Predicted pull probability')
ax.plot(is_pulled_df['decile'], is_pulled_df['pulled'], color='black', label='Actual pull rate')
ax.set_xlabel('Decile of predicted pull probability')
ax.set_ylabel('Mean within decile')
ax.set_title('Predicted vs. actual pulls by decile')
ax.legend()
plt.show()