# Imputations

In [1]:
%run "Imports.ipynb"

# import ast
# import datetime
# import dateutil.parser
# import distutils.dir_util
# import glob
# import IPython.display
# import json
# import math
# import numpy as np
# import os
# import pandas as pd
# import pathlib
# import pickle
# import pyautogui
# import pytz
# import re
# import requests
# import selenium
# import shutil
# import statsapi
# import statsmodels.formula.api as smf
# import time
# import unidecode
# import warnings
# import webbrowser
# import xlrd
# import random
# import urllib
# from urllib.request import urlopen, Request
# import zipfile

# from bs4 import BeautifulSoup
# from datetime import date
# from IPython.display import display, Javascript
# from joblib import Parallel, delayed
# from pathlib import Path
# from sklearn.preprocessing import StandardScaler
# from selenium.webdriver.common.by import By
# from selenium.webdriver.common.keys import Keys
# from selenium.webdriver.support import expected_conditions as EC
# from selenium.webdriver.support.ui import WebDriverWait
# from selenium import webdriver
# from openpyxl import load_workbook
# from functools import partial

# from statsapi import get
# from pydfs_lineup_optimizer import get_optimizer, Site, Sport, Player, TeamStack, PlayerFilter, RandomFantasyPointsStrategy

# os.chdir(r"C:\Users\james\Documents\MLB\Code")

# import smtplib
# import ssl
# from email.mime.text import MIMEText
# from email.mime.multipart import MIMEMultipart
# from email.mime.base import MIMEBase
# from email import encoders

# # Ensure the warning is ignored only once
# warnings.simplefilter(action="ignore")

# # Display the DataFrame
# pd.set_option("display.max_rows", None)
# pd.set_option("display.max_columns", None)
# pd.set_option("display.width", None)
# pd.set_option("display.max_colwidth", None)

# # Set paths
# model_path = r"C:\Users\james\Documents\MLB\Code\Models"
# baseball_path = r"C:\Users\james\Documents\MLB\Data2"
# download_path = r"C:\Users\james\Downloads"


In [2]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import log_loss, classification_report, f1_score, make_scorer
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.inspection import permutation_importance
from sklearn.datasets import make_classification
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.cluster import KMeans
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from tensorflow import keras
from sklearn.impute import SimpleImputer

import joblib
import matplotlib.pyplot as plt

In [3]:
%run "Utilities.ipynb"

In [5]:
# This reads in Chadwick register with player codes.
keep_list = ['key_mlbam', 'key_fangraphs', 'key_bbref_minors', 'key_bbref', 'name_first', 'name_last']
chadwick = read_chadwick(keep_list)

In [7]:
# This reads in a map of team name, codes, and the shorthand MLB uses in their URLs
team_map = pd.read_csv(os.path.join(baseball_path, "Utilities", "Team Map.csv"))

# We just need teams right now
team_map = team_map[['FULLNAME', 'BBREFTEAM', 'MLBURL', 'FANGRAPHSTEAM', 'VENUE_ID', 'SFBBTEAM', 'DKTEAM', 'ROTOWIRETEAM', 'FANPROSTEAM']]

# Create sample dataset

In [8]:
%run "04. Dataset.ipynb"

In [9]:
# Read in sample, up until today's date
sample = create_model_input(todaysdate)

### FanGraphs

In [10]:
# Append all FanGraphs projections together and save it as a CSV
batters_list = []
# Loop over all FanGraphs files
for filename in os.listdir(r"C:\Users\james\Documents\MLB\Data2\7. Stats\B. Clean FanGraphs\Batters"):
    # Extract date
    date = filename[12:20]
    # Read in dataframe
    df = pd.read_csv(os.path.join(baseball_path, "7. Stats", "B. Clean FanGraphs", "Batters", filename), encoding='iso-8859-1')
    # Add date column
    df['date'] = date
    
    # Append dataframe to list
    batters_list.append(df)
    
# Create combined dataframe
batters_fg_sample = pd.concat(batters_list, axis=0)

# Write to CSV
batters_fg_sample.to_csv(os.path.join(baseball_path, "Inputs", "Batters FanGraphs.csv"))

In [11]:
# Append all FanGraphs projections together and save it as a CSV
pitchers_list = []
# Loop over all FanGraphs files
for filename in os.listdir(r"C:\Users\james\Documents\MLB\Data2\7. Stats\B. Clean FanGraphs\Pitchers"):
    # Extract date
    date = filename[13:21]
    # Read in dataframe
    df = pd.read_csv(os.path.join(baseball_path, "7. Stats", "B. Clean FanGraphs", "Pitchers", filename), encoding='iso-8859-1')
    # Create date column
    df['date'] = date
    
    try:
        # Depending on the origin of the file (Steamer vs. FanGraphs), you may need to rename certain variables
        df.rename(columns={'H9':'H/9', 'HR9':'HR/9', 'K9':'K/9', 'BB9':'BB/9'}, inplace=True)
    except:
        pass
    
    # Append dataframe to list
    pitchers_list.append(df)
    
# Create combined dataframe
pitchers_fg_sample = pd.concat(pitchers_list, axis=0)

# Write to CSV
pitchers_fg_sample.to_csv(os.path.join(baseball_path, "Inputs", "Pitchers FanGraphs.csv"))

In [12]:
# Read in FanGraphs batter projections for each day
batters_fg_sample = pd.read_csv(os.path.join(baseball_path, "Inputs", "Batters FanGraphs.csv"))
# Convert date to string for merge
batters_fg_sample['date'] = batters_fg_sample['date'].astype('str')

# Merge sample data (Stats API and Statcast) with projections (Fangraphs)
sample = sample.merge(batters_fg_sample, left_on=['batter', 'date'], right_on=['mlbamid', 'date'], how='inner', suffixes=("", "_b"))
# Delete to clear up space
del batters_fg_sample

# Read in FanGraphs pitcher projections for each day
pitchers_fg_sample = pd.read_csv(os.path.join(baseball_path, "Inputs", "Pitchers FanGraphs.csv"))
# Convert date to string for merge
pitchers_fg_sample['date'] = pitchers_fg_sample['date'].astype('str')

# Merge sample data (Stats API and Statcast) with projections (Fangraphs)
sample = sample.merge(pitchers_fg_sample, left_on=['pitcher', 'date'], right_on=['mlbamid', 'date'], how='inner', suffixes=("", "_p"))
# Delete to clear up space
del pitchers_fg_sample

# Model Inputs

In [13]:
# Basic stat prefixes
stats = ['b1','b2','b3','hr','bb','hbp',
         'so','fo','go','lo','po',
         'iso','slg','obp','woba','estimated_woba_using_speedangle',
         'to_left','to_middle','to_right',
         'hard_hit','barrel','totalDistance','launchSpeed','maxSpeed','maxSpin',
         'ab','pa']

### Batter

In [14]:
# Create position/length-specific inputs from stats
batter_stats_short = [f"{stat}_b" for stat in stats]
batter_stats_long  = [f"{stat}_b_long" for stat in stats]

# FanGraphs stats
batter_stats_fg =     ['b1_rate','b2_rate','b3_rate','hr_rate','bb_rate','hbp_rate','so_rate', 'woba', 'slg', 'obp']

### Pitcher

In [15]:
# Create position/length-specific inputs from stats
pitcher_stats_short = [f"{stat}_p" for stat in stats]
pitcher_stats_long  = [f"{stat}_p_long" for stat in stats]

# FanGraphs stats
pitcher_stats_fg =    ['H/9','HR/9','K/9','BB/9']

### Other

In [16]:
# Venues
venue_nums = ['1', '2', '3', '4', '5', '7', '10', '12', '13', '14', '15', '16', '17', '19', '22', '31', '32', 
              '680', '2392', '2394', '2395', '2535', '2536', '2602', '2680', '2681', '2701', '2735', '2756', 
              '2889', '3289', '3309', '3312', '3313', '4169', '4705', '5010', '5325', '5365', '5381', '5445']

venues = [f"venue_{num}" for num in venue_nums]

# Years
years = [f"year_{year}" for year in range(2015,2024)]

# Matchup, weather, and game stat
other_list = ['p_L','b_L','x_vect','y_vect','temperature','onFirst','onSecond','onThird','inning','top','score_diff']

### Exclusions

In [17]:
# Stats that do not apply to the position or we just don't want
exclude = ["maxSpeed_b", "maxSpin_b", "maxSpeed_b_long", "maxSpin_b_long", 
           "totalDistance_p", "totalDistance_p_long", "launchSpeed_p", "launchSpeed_p_long",
           "ab_b", "pa_b", "ab_b_long", "pa_b_long", 
           "ab_p", "pa_p", "ab_p_long", "pa_p_long"]

### Input Lists

In [18]:
# Batter inputs (into final mode)
batter_stats = batter_stats_short + batter_stats_long
batter_stats = [item for item in batter_stats if item not in exclude]

# Pitcher inputs (into final mode)
pitcher_stats = pitcher_stats_short + pitcher_stats_long
pitcher_stats = [item for item in pitcher_stats if item not in exclude]

# All inputs into final model
inputs = batter_stats + pitcher_stats + venues + years + other_list


# Add additional variables for ease of use
inputs_plus = inputs + ['batterName', 'pitcherName', 'batter', 'pitcher', 'batSide', 'pitchHand', 'eventsModel']

### Clean Data

In [19]:
# Get rid of small samples when training
# Important: Figure out if you want this!
# sample = sample.query('pa_b_long >= 40').query('pa_p_long >= 40')

# Get rid of PA outcomes that are not valid outputs
sample = sample.query('eventsModel != "Cut"').reset_index(drop=True)

# Count outs
sample['is_out'] = sample[['so', 'fo', 'go', 'lo', 'po']].sum(axis=1)
# Rounding is necessary because SOs are adjusted for park factors, so they might be just above or just below 1.
# This isn't an amazing solution, so I could probably do this more cleanly
sample['is_out'] = sample['is_out'].round()

In [20]:
# Set directory to models folder 
os.chdir(r"C:\Users\james\Documents\MLB\Code\Models")

In [21]:
# Remove early rows because they'll treat all players like rookies
sample = sample.drop(index=sample.index[:10000])
sample.reset_index(inplace=True, drop=True)

### Standardize FG Stats

In [22]:
# Standardize the data using StandardScaler
scaler = StandardScaler()
batter_stats_fg_scaled = scaler.fit_transform(sample[batter_stats_fg])
batter_stats_fg_scaled = pd.DataFrame(batter_stats_fg_scaled, columns=batter_stats_fg)

# Save the trained StandardScaler object
scaler_filename = "batter_stats_fg_scaler.pkl"
with open(scaler_filename, "wb") as file:
    pickle.dump(scaler, file)
    
# Standardize the data using StandardScaler
scaler = StandardScaler()
pitcher_stats_fg_scaled = scaler.fit_transform(sample[pitcher_stats_fg])
pitcher_stats_fg_scaled = pd.DataFrame(pitcher_stats_fg_scaled, columns=pitcher_stats_fg)

# Save the trained StandardScaler object
scaler_filename = "pitcher_stats_fg_scaler.pkl"
with open(scaler_filename, "wb") as file:
    pickle.dump(scaler, file)

### Standardize Stats API and Statcast Stats

In [23]:
# Standardize the data using StandardScaler
scaler = StandardScaler()
batter_stats_scaled = scaler.fit_transform(sample[batter_stats])
batter_stats_scaled = pd.DataFrame(batter_stats_scaled, columns=batter_stats)

# Save the trained StandardScaler object
scaler_filename = "batter_stats_scaler.pkl"
with open(scaler_filename, "wb") as file:
    pickle.dump(scaler, file)
    
# Standardize the data using StandardScaler
scaler = StandardScaler()
pitcher_stats_scaled = scaler.fit_transform(sample[pitcher_stats])
pitcher_stats_scaled = pd.DataFrame(pitcher_stats_scaled, columns=pitcher_stats)

# Save the trained StandardScaler object
scaler_filename = "pitcher_stats_scaler.pkl"
with open(scaler_filename, "wb") as file:
    pickle.dump(scaler, file)

### Data

In [24]:
# Create working dataset
# Extra variables
model_extra_vars = venues + years + other_list 
extra_variable_df = sample[model_extra_vars]

# Event variables
eventsModel_df = sample[['pa_b', 'pa_p', 'year', 'is_out', 'eventsModel']]

In [25]:
# Concatenate all together
df = pd.concat([batter_stats_scaled, pitcher_stats_scaled, batter_stats_fg_scaled, pitcher_stats_fg_scaled, extra_variable_df, eventsModel_df], axis=1)
# Since stats are normalized, this should just assume league average when missing
df.fillna(0, inplace=True)

In [42]:
pitcher_stats_scaled['b1_b'].head()

0   -2.856849
1    3.278646
2    1.288464
3   -0.849955
4    0.599671
Name: b1_b, dtype: float64

### Imputations

In [33]:
batter_stats_fg2 = batter_stats_fg + ['b_L', 'p_L']

# Create a copy of the DataFrame with only relevant columns
df_filtered = df[batter_stats_fg2 + batter_stats + ['pa_b']].copy()

# Drop rows with missing values in the features or target columns
df_filtered.dropna(subset=batter_stats_fg2 + batter_stats, inplace=True)

# Separate the features (batter_stats_fg2) and target (batter_stats) columns
features = df_filtered[batter_stats_fg2]
target = df_filtered[batter_stats]

# Create and fit the model
model = keras.Sequential([
    keras.layers.Dense(25, activation='relu', input_shape=(len(batter_stats_fg2),)),
    # keras.layers.Dense(25, activation='relu'),
    # keras.layers.Dense(25, activation='relu'),
    keras.layers.Dense(len(batter_stats))  # Output layer with the same number of units as the target columns
])

# Compile the model
model.compile(loss='mean_squared_error', optimizer='adam')

# Train the model
model.fit(features, target, epochs=10, batch_size=32)

# Pickle
model_filename = "batter_imputations.pkl"
with open(model_filename, "wb") as file:
    pickle.dump(model, file)
    
# Use the trained model to make predictions
prediction = model.predict(df.loc[df['pa_b'] < 40, batter_stats_fg2])

# Impute missing values in batter_stats with the predicted values
df.loc[df['pa_b'] < 40, batter_stats] = prediction

Epoch 1/10


ValueError: in user code:

    File "C:\Users\james\anaconda3\lib\site-packages\keras\engine\training.py", line 1284, in train_function  *
        return step_function(self, iterator)
    File "C:\Users\james\anaconda3\lib\site-packages\keras\engine\training.py", line 1268, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "C:\Users\james\anaconda3\lib\site-packages\keras\engine\training.py", line 1249, in run_step  **
        outputs = model.train_step(data)
    File "C:\Users\james\anaconda3\lib\site-packages\keras\engine\training.py", line 1051, in train_step
        loss = self.compute_loss(x, y, y_pred, sample_weight)
    File "C:\Users\james\anaconda3\lib\site-packages\keras\engine\training.py", line 1109, in compute_loss
        return self.compiled_loss(
    File "C:\Users\james\anaconda3\lib\site-packages\keras\engine\compile_utils.py", line 265, in __call__
        loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    File "C:\Users\james\anaconda3\lib\site-packages\keras\losses.py", line 142, in __call__
        losses = call_fn(y_true, y_pred)
    File "C:\Users\james\anaconda3\lib\site-packages\keras\losses.py", line 268, in call  **
        return ag_fn(y_true, y_pred, **self._fn_kwargs)
    File "C:\Users\james\anaconda3\lib\site-packages\keras\losses.py", line 1470, in mean_squared_error
        return backend.mean(tf.math.squared_difference(y_pred, y_true), axis=-1)

    ValueError: Dimensions must be equal, but are 46 and 92 for '{{node mean_squared_error/SquaredDifference}} = SquaredDifference[T=DT_FLOAT](sequential_1/dense_5/BiasAdd, Cast)' with input shapes: [?,46], [?,92].


In [None]:
pitcher_stats_fg2 = pitcher_stats_fg + ['b_L', 'p_L']

# Create a copy of the DataFrame with only relevant columns
df_filtered = df[pitcher_stats_fg2 + pitcher_stats + ['pa_p']].copy()

# Drop rows with missing values in the features or target columns
df_filtered.dropna(subset=pitcher_stats_fg2 + pitcher_stats, inplace=True)

# Separate the features (pitcher_stats_fg) and target (pitcher_stats) columns
features = df_filtered[pitcher_stats_fg2]
target = df_filtered[pitcher_stats]

# Create and fit the model
model = keras.Sequential([
    keras.layers.Dense(25, activation='relu', input_shape=(len(pitcher_stats_fg2),)),
    keras.layers.Dense(25, activation='relu'),
    keras.layers.Dense(25, activation='relu'),
    keras.layers.Dense(len(pitcher_stats))  # Output layer with the same number of units as the target columns
])

# Compile the model
model.compile(loss='mean_squared_error', optimizer='adam')

# Train the model
# model.fit(features_imputed, target, epochs=5, batch_size=32)
model.fit(features, target, epochs=10, batch_size=32)

# Use the trained model to make predictions
prediction = model.predict(df.loc[df['pa_p'] < 40, pitcher_stats_fg2])

# Pickle
model_filename = "pitcher_imputations.pkl"
with open(model_filename, "wb") as file:
    pickle.dump(model, file)

# Impute missing values in pitcher_stats with the predicted values
df.loc[df['pa_p'] < 40, pitcher_stats] = prediction


# I think what you want to do is add batter hand and pitcher hand. 
# For batters, batter hand will just be an input and pitcher hand will be entered twice in 7. Stats, once to figure out _l and then _r

# Impute (for re-running without retraining)

In [None]:
# Read in batter imputation model
kmeans_model_filename = "batter_imputations.pkl"
with open(kmeans_model_filename, "rb") as file:
    batter_kmeans = pickle.load(file)
    
# Add handedness to FanGraphs stats
batter_stats_fg2 = batter_stats_fg + ['b_L', 'p_L']

# Use FanGraphs stats to predict API/Statcast stats for those with limited samples
prediction = batter_kmeans.predict(df.loc[df['pa_b'] < 40, batter_stats_fg2])

# Impute missing values in batter_stats with the predicted values
# df.loc[df['pa_b'] < 40, batter_stats] = prediction
df.loc[df['pa_b'] < 40, batter_stats] = prediction[:sum(df['pa_b'] < 40)]



# Read in pitcher imputation model
kmeans_model_filename = "pitcher_imputations.pkl"
with open(kmeans_model_filename, "rb") as file:
    pitcher_kmeans = pickle.load(file)
    
# Add handedness to FanGraphs stats
pitcher_stats_fg2 = pitcher_stats_fg + ['b_L', 'p_L']
    
# Use FanGraphs stats to predict API/Statcast stats for those with limited samples
prediction = pitcher_kmeans.predict(df.loc[df['pa_p'] < 40, pitcher_stats_fg2])

# Impute missing values in pitcher_stats with the predicted values
# df.loc[df['pa_p'] < 40, pitcher_stats] = prediction
df.loc[df['pa_p'] < 40, batter_stats] = prediction[:sum(df['pa_p'] < 40)]


In [None]:
# Create imputation flags (could move this up, might make more sense)
df['imp_b'] = (df['pa_b'] < 40).astype('int')
df['imp_p'] = (df['pa_p'] < 40).astype('int')

### Train models

##### Create dataset

In [None]:
keep_list = batter_stats + pitcher_stats + venues + years + other_list + ['pa_b', 'pa_p', 'imp_b', 'imp_p', 'year', 'is_out', 'eventsModel']
model_dataset = df[keep_list]

In [None]:
model_dataset.isna().sum()

In [None]:
model_dataset.fillna(0, inplace=True)

In [None]:
outs_dataset = model_dataset[model_dataset['eventsModel'].isin(['so', 'lo', 'go', 'fo', 'po'])].copy()
safe_dataset = model_dataset[~model_dataset['eventsModel'].isin(['so', 'lo', 'go', 'fo', 'po'])].copy()

In [None]:
# Split into training and testing groups
X_train = model_dataset.groupby(model_dataset['year']).apply(lambda x: x.head(int(len(x)*2/3)))
X_test = model_dataset.groupby(model_dataset['year']).apply(lambda x: x.tail(int(len(x)*1/3)))

In [None]:
outs_dataset_train = X_train[X_train['eventsModel'].isin(['so', 'lo', 'go', 'fo', 'po'])].copy()
safe_dataset_train = X_train[~X_train['eventsModel'].isin(['so', 'lo', 'go', 'fo', 'po'])].copy()

In [None]:
outs_dataset_test = X_test[X_test['eventsModel'].isin(['so', 'lo', 'go', 'fo', 'po'])].copy()
safe_dataset_test = X_test[~X_test['eventsModel'].isin(['so', 'lo', 'go', 'fo', 'po'])].copy()

### Out vs. Safe

In [None]:
inputs = inputs + ['imp_b', 'imp_p']

In [None]:
# inputs.remove('imp_b')
# inputs.remove('imp_p')

In [None]:
%%time

solver = 'lbfgs'

iters = 200

filename = "model_binary_" + "voting" + "_100_new.sav"

print(filename)

# Define the individual models in the ensemble
models = [
    LogisticRegression(solver='lbfgs', max_iter=20),  
    LogisticRegression(solver='saga', max_iter=20),   
    MLPClassifier(hidden_layer_sizes=(100,100), activation='relu', random_state=1, max_iter=15),  
]


# Create the ensemble classifier using VotingClassifier
# model_binary = VotingClassifier(estimators=[('model'+str(i+1), model) for i, model in enumerate(models)], voting='soft', n_jobs=-2).fit(model_dataset[inputs], model_dataset[['is_out']].values.ravel())
model_binary = VotingClassifier(estimators=[('model'+str(i+1), model) for i, model in enumerate(models)], voting='soft', n_jobs=-2).fit(X_train[inputs], X_train[['is_out']].values.ravel())
# model_binary = LogisticRegression(solver=solver, max_iter=iters).fit(X_train[inputs], X_train[['is_out']].values.ravel())


# Save model
pickle.dump(model_binary, open(filename, 'wb'))

In [None]:
proba = model_binary.predict_proba(X_test[inputs])
X_test['is_safe_pred'] = proba[:, 0]  # Assign the first column of probabilities
X_test['is_out_pred'] = proba[:, 1]  # Assign the second column of probabilities

In [None]:
# model_dataset['decile'] = pd.qcut(model_dataset['is_out_pred'], 10, labels=False)

# df_name = "is_out" + "_df"
# globals()[df_name] = model_dataset.groupby('decile').mean().reset_index()

X_test['decile'] = pd.qcut(X_test['is_out_pred'], 5, labels=False)

df_name = "is_out" + "_df"
globals()[df_name] = X_test.query('imp_b == 0').query('imp_p == 0').groupby('decile').mean().reset_index()

In [None]:
plt.plot(is_out_df['decile'], is_out_df['is_out_pred'], color='red')
plt.plot(is_out_df['decile'], is_out_df['is_out'], color='black')
plt.show() 

### Outs

In [None]:
%%time
layers = (30,30,30,30,30)
# layers = (25,25,25,25,25)
layers_str = ''.join(str(x) for x in layers)

iters = 20

filename = "model_outs_" + layers_str + "_" + str(iters) + "_100.sav"
print(filename)
# Define the individual models in the ensemble
models = [
    MLPClassifier(hidden_layer_sizes=(layers), activation='relu', verbose=True, alpha=0.0001, early_stopping=True, validation_fraction=0.1, random_state=1, max_iter=iters),
]

# Create the ensemble classifier using VotingClassifier
# model_outs = VotingClassifier(estimators=[('model'+str(i+1), model) for i, model in enumerate(models)], voting='soft', n_jobs=-2).fit(outs_dataset[inputs], outs_dataset[['eventsModel']].values.ravel())
model_outs = VotingClassifier(estimators=[('model'+str(i+1), model) for i, model in enumerate(models)], voting='soft', n_jobs=-2).fit(outs_dataset_train[inputs], outs_dataset_train[['eventsModel']].values.ravel())

# Save model
pickle.dump(model_outs, open(filename, 'wb'))

In [None]:
outs_outputs = list(model_outs.classes_)
outs_outputs_pred = [x + "_pred" for x in outs_outputs]
outs_outputs

In [None]:
# outs_dataset[outs_outputs_pred] = model_outs.predict_proba(outs_dataset[inputs])
# outs_dataset_test[outs_outputs_pred] = model_outs.predict_proba(outs_dataset_test[inputs])

proba = model_outs.predict_proba(outs_dataset_test[inputs])
for i, col in enumerate(outs_outputs_pred):
    outs_dataset_test[f'{col}'] = proba[:, i]

In [None]:
# # Create deciles
# for var in outs_outputs:
#     outs_dataset[f'{var}_act'] = (outs_dataset['eventsModel'] == var).astype('int')
#     outs_dataset['decile'] = pd.qcut(outs_dataset[f'{var}_pred'], 10, labels=False)
#     df_name = var + "_df"
#     globals()[df_name] = outs_dataset.groupby('decile').mean().reset_index()
    
# Create deciles
for var in outs_outputs:
    outs_dataset_test[f'{var}_act'] = (outs_dataset_test['eventsModel'] == var).astype('int')
    outs_dataset_test['decile'] = pd.qcut(outs_dataset_test[f'{var}_pred'], 10, labels=False)
    df_name = var + "_df"
    globals()[df_name] = outs_dataset_test.query('imp_b == 0').query('imp_p == 0').groupby('decile').mean().reset_index()

In [None]:
# Create figures
fig, axs = plt.subplots(2, 3, figsize=(12, 8))

for i, var in enumerate(outs_outputs):
    row = i // 3  # Calculate the row index based on the iteration
    col = i % 3   # Calculate the column index based on the iteration
    df_name = var + "_df"
    axs[row, col].plot(globals()[df_name]['decile'], globals()[df_name][f'{var}_pred'], color='red')
    axs[row, col].plot(globals()[df_name]['decile'], globals()[df_name][f'{var}_act'], color='black')
    axs[row, col].set_title(var)
    # axs[row, col].set_ylim(0,0.35)


# Add some space between subplots to prevent overlapping
fig.tight_layout(pad=.0)

# Show the figure
plt.show()

### Safe

In [None]:
# Parameters
layers = (30,30,30,30,30)
# layers = (25,25,25,25,25)
layers_str = ''.join(str(x) for x in layers)
model = "safe"
iters = 15
alpha = 0.0001
activation = 'relu'
short = 100


# inputs = batter_stats_safe + pitcher_stats_safe + batter_stats_safe_long + pitcher_stats_safe_long + venues + years + other_list

filename = "model_" + model + "_" + activation + "_" + layers_str + "_" + str(iters) + "_" + str(short) + ".sav"
print(filename)
# Define the individual models in the ensemble
models = [
    MLPClassifier(hidden_layer_sizes=(layers), activation=activation, verbose=True, alpha=0.0001, early_stopping=True, validation_fraction=0.1, random_state=1, max_iter=iters),
    MLPClassifier(hidden_layer_sizes=(layers), activation=activation, verbose=True, alpha=0.0001, early_stopping=True, validation_fraction=0.1, random_state=2, max_iter=iters),
    MLPClassifier(hidden_layer_sizes=(layers), activation=activation, verbose=True, alpha=0.0001, early_stopping=True, validation_fraction=0.1, random_state=3, max_iter=iters),
    # MLPClassifier(hidden_layer_sizes=(layers), activation=activation, verbose=True, alpha=0.0001, early_stopping=True, validation_fraction=0.1, random_state=15, max_iter=iters),
]

# Create the ensemble classifier using VotingClassifier
# model_safe = VotingClassifier(estimators=[('model'+str(i+1), model) for i, model in enumerate(models)], voting='soft', n_jobs=-2).fit(safe_dataset[inputs], safe_dataset[['eventsModel']].values.ravel())
model_safe = VotingClassifier(estimators=[('model'+str(i+1), model) for i, model in enumerate(models)], voting='soft', n_jobs=-2).fit(safe_dataset_train[inputs], safe_dataset_train[['eventsModel']].values.ravel())
# model_safe = MLPClassifier(hidden_layer_sizes=(layers), activation=activation, verbose=True, alpha=0.0001, early_stopping=True, validation_fraction=0.1, random_state=7, max_iter=iters).fit(safe_dataset_train[inputs], safe_dataset_train[['eventsModel']].values.ravel())

# Save model
pickle.dump(model_safe, open(filename, 'wb'))

In [None]:
len(inputs)

In [None]:
safe_outputs = list(model_safe.classes_)
safe_outputs_pred = [x + "_pred" for x in safe_outputs]
safe_outputs

In [None]:
# safe_dataset[safe_outputs_pred] = model_safe.predict_proba(safe_dataset[inputs])
# safe_dataset_test[safe_outputs_pred] = model_safe.predict_proba(safe_dataset_test[inputs])

proba = model_safe.predict_proba(safe_dataset_test[inputs])
for i, col in enumerate(safe_outputs_pred):
    safe_dataset_test[f'{col}'] = proba[:, i]

In [None]:
# # Create deciles
# for var in safe_outputs:
#     safe_dataset[f'{var}_act'] = (safe_dataset['eventsModel'] == var).astype('int')
#     safe_dataset['decile'] = pd.qcut(safe_dataset[f'{var}_pred'], 10, labels=False)
#     df_name = var + "_df"
#     globals()[df_name] = safe_dataset.groupby('decile').mean().reset_index()
    
# Create deciles
for var in safe_outputs:
    safe_dataset_test[f'{var}_act'] = (safe_dataset_test['eventsModel'] == var).astype('int')
    safe_dataset_test['decile'] = pd.qcut(safe_dataset_test[f'{var}_pred'], 10, labels=False)
    df_name = var + "_df"
    globals()[df_name] = safe_dataset_test.query('imp_b == 0').query('imp_p == 0').groupby('decile').mean().reset_index()

In [None]:
# Create figures
fig, axs = plt.subplots(2, 3, figsize=(12, 8))

for i, var in enumerate(safe_outputs):
    row = i // 3  # Calculate the row index based on the iteration
    col = i % 3   # Calculate the column index based on the iteration
    df_name = var + "_df"
    axs[row, col].plot(globals()[df_name]['decile'], globals()[df_name][f'{var}_pred'], color='red')
    axs[row, col].plot(globals()[df_name]['decile'], globals()[df_name][f'{var}_act'], color='black')
    axs[row, col].set_title(var)
    # axs[row, col].set_ylim(globals()[df_name][f'{var}_act'].min(),globals()[df_name][f'{var}_act'].max())


# Add some space between subplots to prevent overlapping
fig.tight_layout(pad=.0)

# Show the figure
plt.show()

In [None]:
breaksfadf

# Single Model

In [None]:
# Parameters
layers = (25,25)
layers_str = ''.join(str(x) for x in layers)
model = "full"
iters = 10
alpha = 0.0001
activation = 'relu'
short = 100

filename = "model_" + model + "_" + activation + "_" + layers_str + "_" + str(iters) + "_" + str(short) + ".sav"
print(filename)

# # Define the individual models in the ensemble
# models = [
#     MLPClassifier(hidden_layer_sizes=(layers), activation='relu', verbose=True, alpha=0.0001, early_stopping=True, validation_fraction=0.1, random_state=1, max_iter=iters),
# ]

# # Create the ensemble classifier using VotingClassifier
# # model_binary = VotingClassifier(estimators=[('model'+str(i+1), model) for i, model in enumerate(models)], voting='soft', n_jobs=-2).fit(model_dataset[inputs], model_dataset[['is_out']].values.ravel())
# model_full = VotingClassifier(estimators=[('model'+str(i+1), model) for i, model in enumerate(models)], voting='soft', n_jobs=-2).fit(X_train[inputs], X_train[['eventsModel']].values.ravel())

model_full = MLPClassifier(hidden_layer_sizes=(layers), activation='relu', verbose=True, alpha=0.0001, early_stopping=True, validation_fraction=0.1, random_state=1, max_iter=iters).fit(X_train[inputs], X_train[['eventsModel']].values.ravel())


# Save model
pickle.dump(model_full, open(filename, 'wb'))

In [None]:
full_outputs = list(model_full.classes_)
full_outputs_pred = [x + "_pred" for x in full_outputs]
full_outputs

In [None]:
# model_dataset[full_outsputs_pred] = model_full.predict_proba(model_dataset[inputs])
X_test[full_outputs_pred] = model_full.predict_proba(X_test[inputs])

In [None]:
# # Create deciles
# for var in full_outputs:
#     model_dataset[f'{var}_act'] = (model_dataset['eventsModel'] == var).astype('int')
#     model_dataset['decile'] = pd.qcut(model_dataset[f'{var}_pred'], 10, labels=False)
#     df_name = var + "_df"
#     globals()[df_name] = model_dataset.groupby('decile').mean().reset_index()
    
# Create deciles
for var in full_outputs:
    X_test[f'{var}_act'] = (X_test['eventsModel'] == var).astype('int')
    X_test['decile'] = pd.qcut(X_test[f'{var}_pred'], 10, labels=False)
    df_name = var + "_df"
    globals()[df_name] = X_test.groupby('decile').mean().reset_index()

In [None]:
# Create figures
fig, axs = plt.subplots(4, 3, figsize=(12, 16))

for i, var in enumerate(full_outputs):
    row = i // 3  # Calculate the row index based on the iteration
    col = i % 3   # Calculate the column index based on the iteration
    df_name = var + "_df"
    axs[row, col].plot(globals()[df_name]['decile'], globals()[df_name][f'{var}_pred'], color='red')
    axs[row, col].plot(globals()[df_name]['decile'], globals()[df_name][f'{var}_act'], color='black')
    axs[row, col].set_title(var)
    # axs[row, col].set_ylim(0,0.67)


# Add some space between subplots to prevent overlapping
fig.tight_layout(pad=.0)

# Show the figure
plt.show()

In [None]:
# To do: create a way to calculate probabilities for individual matchups so you can test