# Imputations

In [4]:
%run "U1. Imports.ipynb"
%run "U2. Utilities.ipynb"

In [3]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import log_loss, classification_report, f1_score, make_scorer
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.inspection import permutation_importance
from sklearn.datasets import make_classification
from sklearn.neighbors import KNeighborsClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.cluster import KMeans
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from tensorflow import keras
from sklearn.impute import SimpleImputer

import joblib
import matplotlib.pyplot as plt

In [5]:
# Load the Chadwick register of player codes.
# keep_list is handed to read_chadwick (defined outside this notebook's visible cells;
# presumably it selects these columns -- confirm against the utilities notebook).
keep_list = [
    'key_mlbam',
    'key_fangraphs',
    'key_bbref_minors',
    'key_bbref',
    'name_first',
    'name_last',
]
chadwick = read_chadwick(keep_list)

In [6]:
# Load the team lookup table: team names, per-site team codes, and the shorthand MLB uses in its URLs
team_map_path = os.path.join(baseball_path, "Utilities", "Team Map.csv")
team_map = pd.read_csv(team_map_path)

# Only the team-identifying columns are needed right now
team_map = team_map.loc[:, [
    'FULLNAME',
    'BBREFTEAM',
    'MLBURL',
    'FANGRAPHSTEAM',
    'VENUE_ID',
    'SFBBTEAM',
    'DKTEAM',
    'ROTOWIRETEAM',
    'FANPROSTEAM',
]]

# Create sample dataset

In [7]:
%run "04. Dataset.ipynb"

In [8]:
# Read in sample, up until today's date
# NOTE(review): create_model_input and todaysdate are not defined in any cell shown here --
# presumably they come from the notebooks executed with %run above; confirm.
sample = create_model_input(todaysdate)

### FanGraphs

In [9]:
# Append all FanGraphs batter projections together and save the result as a CSV.
# The folder is built once from baseball_path and used for both listing and reading, so the
# two can never point at different locations (the listing used to be a hard-coded absolute path).
batters_fg_dir = os.path.join(baseball_path, "7. Stats", "B. Clean FanGraphs", "Batters")

batters_list = []
# Loop over all FanGraphs files
for filename in os.listdir(batters_fg_dir):
    # The date is taken from fixed character positions 12-19 of the file name
    # (assumes every file shares the same name prefix -- TODO confirm naming convention)
    date = filename[12:20]
    # Read in dataframe
    df = pd.read_csv(os.path.join(batters_fg_dir, filename), encoding='iso-8859-1')
    # Add date column
    df['date'] = date

    # Append dataframe to list
    batters_list.append(df)

# Create combined dataframe
batters_fg_sample = pd.concat(batters_list, axis=0)

# Write to CSV (the index is written too, as before)
batters_fg_sample.to_csv(os.path.join(baseball_path, "Inputs", "Batters FanGraphs.csv"))

In [10]:
# Append all FanGraphs pitcher projections together and save the result as a CSV.
# The folder is built once from baseball_path and used for both listing and reading, so the
# two can never point at different locations (the listing used to be a hard-coded absolute path).
pitchers_fg_dir = os.path.join(baseball_path, "7. Stats", "B. Clean FanGraphs", "Pitchers")

pitchers_list = []
# Loop over all FanGraphs files
for filename in os.listdir(pitchers_fg_dir):
    # The date is taken from fixed character positions 13-20 of the file name
    # (assumes every file shares the same name prefix -- TODO confirm naming convention)
    date = filename[13:21]
    # Read in dataframe
    df = pd.read_csv(os.path.join(pitchers_fg_dir, filename), encoding='iso-8859-1')
    # Create date column
    df['date'] = date

    # Depending on the origin of the file (Steamer vs. FanGraphs), certain variables need renaming.
    # DataFrame.rename skips labels that are absent by default, so no try/except is needed;
    # the former bare except could only have hidden unrelated errors.
    df.rename(columns={'H9':'H/9', 'HR9':'HR/9', 'K9':'K/9', 'BB9':'BB/9'}, inplace=True)

    # Append dataframe to list
    pitchers_list.append(df)

# Create combined dataframe
pitchers_fg_sample = pd.concat(pitchers_list, axis=0)

# Write to CSV (the index is written too, as before)
pitchers_fg_sample.to_csv(os.path.join(baseball_path, "Inputs", "Pitchers FanGraphs.csv"))

In [None]:
# --- Batters ---
# Load the combined daily FanGraphs batter projections; date is cast to string so it can be a merge key
batters_fg_path = os.path.join(baseball_path, "Inputs", "Batters FanGraphs.csv")
batters_fg_sample = pd.read_csv(batters_fg_path).astype({'date': 'str'})

# Attach batter projections to the sample (Stats API and Statcast); inner join keeps matched rows only,
# and clashing projection columns get the "_b" suffix
sample = pd.merge(
    sample,
    batters_fg_sample,
    left_on=['batter', 'date'],
    right_on=['mlbamid', 'date'],
    how='inner',
    suffixes=("", "_b"),
)
# Free the memory held by the projections frame
del batters_fg_sample

# --- Pitchers ---
# Load the combined daily FanGraphs pitcher projections; date is cast to string so it can be a merge key
pitchers_fg_path = os.path.join(baseball_path, "Inputs", "Pitchers FanGraphs.csv")
pitchers_fg_sample = pd.read_csv(pitchers_fg_path).astype({'date': 'str'})

# Attach pitcher projections to the sample; clashing projection columns get the "_p" suffix
sample = pd.merge(
    sample,
    pitchers_fg_sample,
    left_on=['pitcher', 'date'],
    right_on=['mlbamid', 'date'],
    how='inner',
    suffixes=("", "_p"),
)
# Free the memory held by the projections frame
del pitchers_fg_sample

### Clean Data

In [None]:
# Small-sample filter for training -- currently disabled
# Important: Figure out if you want this!
# sample = sample.query('pa_b_long >= 40').query('pa_p_long >= 40')

# Keep only plate appearances whose outcome is a valid model output
sample = sample.query('eventsModel != "Cut"').reset_index(drop=True)

# Count outs by summing the out-type columns, then round to the nearest whole number.
# Rounding is necessary because SOs are adjusted for park factors, so the sum may land just above
# or just below 1. This isn't an amazing solution and could probably be done more cleanly.
out_columns = ['so', 'fo', 'go', 'lo', 'po']
sample['is_out'] = sample[out_columns].sum(axis=1).round()

In [None]:
# Set directory to models folder
# Every relative file name used after this point (the scaler .pkl files, the imputation .pkl files,
# and the model .sav files) is resolved against this folder.
# NOTE(review): this is a hard-coded, machine-specific absolute path; consider deriving it from a
# configurable base path.
os.chdir(r"C:\Users\james\Documents\MLB\Code\Models")

In [None]:
# Discard the first 10,000 rows -- this early in the sample every player would be treated like a rookie --
# and renumber the remaining rows from zero
sample = sample.drop(index=sample.index[:10000]).reset_index(drop=True)

### Standardize FG Stats

In [None]:
# --- Batter FanGraphs stats ---
# Fit a StandardScaler on the batter FanGraphs columns and keep the scaled values as a DataFrame
scaler = StandardScaler()
batter_stats_fg_scaled = pd.DataFrame(
    scaler.fit_transform(sample[batter_stats_fg]),
    columns=batter_stats_fg,
)

# Persist the fitted scaler (written to the current working directory)
scaler_filename = "batter_stats_fg_scaler.pkl"
with open(scaler_filename, "wb") as file:
    pickle.dump(scaler, file)

# --- Pitcher FanGraphs stats ---
# Same procedure with a fresh scaler for the pitcher FanGraphs columns
scaler = StandardScaler()
pitcher_stats_fg_scaled = pd.DataFrame(
    scaler.fit_transform(sample[pitcher_stats_fg]),
    columns=pitcher_stats_fg,
)

# Persist the fitted scaler (written to the current working directory)
scaler_filename = "pitcher_stats_fg_scaler.pkl"
with open(scaler_filename, "wb") as file:
    pickle.dump(scaler, file)

### Standardize Stats API and Statcast Stats

In [None]:
# --- Batter Stats API / Statcast stats ---
# Fit a StandardScaler on the batter columns and keep the scaled values as a DataFrame
scaler = StandardScaler()
batter_stats_scaled = pd.DataFrame(
    scaler.fit_transform(sample[batter_stats]),
    columns=batter_stats,
)

# Persist the fitted scaler (written to the current working directory)
scaler_filename = "batter_stats_scaler.pkl"
with open(scaler_filename, "wb") as file:
    pickle.dump(scaler, file)

# --- Pitcher Stats API / Statcast stats ---
# Same procedure with a fresh scaler for the pitcher columns
scaler = StandardScaler()
pitcher_stats_scaled = pd.DataFrame(
    scaler.fit_transform(sample[pitcher_stats]),
    columns=pitcher_stats,
)

# Persist the fitted scaler (written to the current working directory)
scaler_filename = "pitcher_stats_scaler.pkl"
with open(scaler_filename, "wb") as file:
    pickle.dump(scaler, file)

### Data

In [None]:
# Pieces of the working dataset that are taken from the sample unscaled

# Extra variables: venue, year and other columns (the lists are defined outside this notebook's visible cells)
model_extra_vars = venues + years + other_list
extra_variable_df = sample.loc[:, model_extra_vars]

# Event variables: plate-appearance counts, year, the out flag and the modelled event
event_columns = ['pa_b', 'pa_p', 'year', 'is_out', 'eventsModel']
eventsModel_df = sample.loc[:, event_columns]

In [None]:
# Place the scaled stat frames, the extra variables and the event variables side by side,
# then fill gaps with 0. Since the stats are normalized, 0 should amount to assuming
# league average wherever a value is missing.
frames_to_join = [
    batter_stats_scaled,
    pitcher_stats_scaled,
    batter_stats_fg_scaled,
    pitcher_stats_fg_scaled,
    extra_variable_df,
    eventsModel_df,
]
df = pd.concat(frames_to_join, axis=1).fillna(0)

### Imputations

In [None]:
# batter_stats_fg2 = batter_stats_fg + ['b_L', 'p_L']

# # Create a copy of the DataFrame with only relevant columns
# df_filtered = df[batter_stats_fg2 + batter_stats + ['pa_b']].copy()

# # Drop rows with missing values in the features or target columns
# df_filtered.dropna(subset=batter_stats_fg2 + batter_stats, inplace=True)

# # Separate the features (batter_stats_fg2) and target (batter_stats) columns
# features = df_filtered[batter_stats_fg2]
# target = df_filtered[batter_stats]

# # Create and fit the model
# model = keras.Sequential([
#     keras.layers.Dense(25, activation='relu', input_shape=(len(batter_stats_fg2),)),
#     keras.layers.Dense(25, activation='relu'),
#     keras.layers.Dense(25, activation='relu'),
#     keras.layers.Dense(len(batter_stats))  # Output layer with the same number of units as the target columns
# ])

# # Compile the model
# model.compile(loss='mean_squared_error', optimizer='adam')

# # Train the model
# model.fit(features, target, epochs=20, batch_size=32)

# # Pickle
# model_filename = "batter_imputations.pkl"
# with open(model_filename, "wb") as file:
#     pickle.dump(model, file)
    
# # Use the trained model to make predictions
# prediction = model.predict(df.loc[df['pa_b'] < 40, batter_stats_fg2])

# # Impute missing values in batter_stats with the predicted values
# df.loc[df['pa_b'] < 40, batter_stats] = prediction

In [None]:
# pitcher_stats_fg2 = pitcher_stats_fg + ['b_L', 'p_L']

# # Create a copy of the DataFrame with only relevant columns
# df_filtered = df[pitcher_stats_fg2 + pitcher_stats + ['pa_p']].copy()

# # Drop rows with missing values in the features or target columns
# df_filtered.dropna(subset=pitcher_stats_fg2 + pitcher_stats, inplace=True)

# # Separate the features (pitcher_stats_fg) and target (pitcher_stats) columns
# features = df_filtered[pitcher_stats_fg2]
# target = df_filtered[pitcher_stats]

# # Create and fit the model
# model = keras.Sequential([
#     keras.layers.Dense(25, activation='relu', input_shape=(len(pitcher_stats_fg2),)),
#     keras.layers.Dense(25, activation='relu'),
#     keras.layers.Dense(25, activation='relu'),
#     keras.layers.Dense(len(pitcher_stats))  # Output layer with the same number of units as the target columns
# ])

# # Compile the model
# model.compile(loss='mean_squared_error', optimizer='adam')

# # Train the model
# # model.fit(features_imputed, target, epochs=5, batch_size=32)
# model.fit(features, target, epochs=20, batch_size=32)

# # Use the trained model to make predictions
# prediction = model.predict(df.loc[df['pa_p'] < 40, pitcher_stats_fg2])

# # Pickle
# model_filename = "pitcher_imputations.pkl"
# with open(model_filename, "wb") as file:
#     pickle.dump(model, file)

# # Impute missing values in pitcher_stats with the predicted values
# df.loc[df['pa_p'] < 40, pitcher_stats] = prediction

# Impute (for re-running without retraining)

In [None]:
# Impute API/Statcast stats for players with fewer than 40 plate appearances, using previously
# saved imputation models that predict those stats from FanGraphs projections plus handedness.
# NOTE(review): pickle.load can execute arbitrary code -- only load model files you produced yourself.
# NOTE(review): the variables are called *_kmeans, but nothing in this cell requires a KMeans model;
# any object with a .predict method works.

# Read in batter imputation model
kmeans_model_filename = "batter_imputations.pkl"
with open(kmeans_model_filename, "rb") as file:
    batter_kmeans = pickle.load(file)

# Add handedness to FanGraphs stats
batter_stats_fg2 = batter_stats_fg + ['b_L', 'p_L']

# Rows whose batter has a limited sample
low_pa_b = df['pa_b'] < 40

# Use FanGraphs stats to predict API/Statcast stats for those with limited samples
prediction = batter_kmeans.predict(df.loc[low_pa_b, batter_stats_fg2])

# Overwrite batter_stats on those rows with the predicted values
df.loc[low_pa_b, batter_stats] = prediction[:low_pa_b.sum()]


# Read in pitcher imputation model
kmeans_model_filename = "pitcher_imputations.pkl"
with open(kmeans_model_filename, "rb") as file:
    pitcher_kmeans = pickle.load(file)

# Add handedness to FanGraphs stats
pitcher_stats_fg2 = pitcher_stats_fg + ['b_L', 'p_L']

# Rows whose pitcher has a limited sample
low_pa_p = df['pa_p'] < 40

# Use FanGraphs stats to predict API/Statcast stats for those with limited samples
prediction = pitcher_kmeans.predict(df.loc[low_pa_p, pitcher_stats_fg2])

# Overwrite pitcher_stats on those rows with the predicted values.
# This previously wrote into batter_stats, which clobbered batter columns with pitcher
# predictions and left the pitcher columns un-imputed.
df.loc[low_pa_p, pitcher_stats] = prediction[:low_pa_p.sum()]


In [None]:
# Flag rows below the 40-plate-appearance threshold, for batter and pitcher, as 1/0 integers
# (could move this up, might make more sense)
df['imp_b'] = df['pa_b'].lt(40).astype('int')
df['imp_p'] = df['pa_p'].lt(40).astype('int')

### Train models

##### Create dataset

In [None]:
# Columns carried into the modelling dataset: stat columns, extra variables,
# then plate-appearance counts, imputation flags, year and the targets
keep_list = (
    batter_stats
    + pitcher_stats
    + venues
    + years
    + other_list
    + ['pa_b', 'pa_p', 'imp_b', 'imp_p', 'year', 'is_out', 'eventsModel']
)
model_dataset = df.loc[:, keep_list]

In [None]:
# Split the full dataset by event type: the five out events versus everything else
is_out_event = model_dataset['eventsModel'].isin(['so', 'lo', 'go', 'fo', 'po'])
outs_dataset = model_dataset[is_out_event].copy()
safe_dataset = model_dataset[~is_out_event].copy()

In [None]:
# Split into training and testing groups
# Within each year, the first int(2/3 * n) rows go to training and the last int(1/3 * n) rows go to testing.
# Both sizes are truncated toward zero, so when n is not a multiple of 3 one row in the middle lands in
# neither set; the two sets never overlap.
# Despite the X_ prefix, both frames keep every column of model_dataset, targets included.
# NOTE(review): the split is positional, so it is only a chronological split if the rows within each year
# are in date order -- confirm.
X_train = model_dataset.groupby(model_dataset['year']).apply(lambda x: x.head(int(len(x)*2/3)))
X_test = model_dataset.groupby(model_dataset['year']).apply(lambda x: x.tail(int(len(x)*1/3)))

In [None]:
# Training rows split by event type: the five out events versus everything else
train_is_out_event = X_train['eventsModel'].isin(['so', 'lo', 'go', 'fo', 'po'])
outs_dataset_train = X_train[train_is_out_event].copy()
safe_dataset_train = X_train[~train_is_out_event].copy()

In [None]:
# Testing rows split by event type: the five out events versus everything else
test_is_out_event = X_test['eventsModel'].isin(['so', 'lo', 'go', 'fo', 'po'])
outs_dataset_test = X_test[test_is_out_event].copy()
safe_dataset_test = X_test[~test_is_out_event].copy()

### Out vs. Safe

In [None]:
# Display the current list of model input columns.
# NOTE(review): inputs is not defined in any cell shown here -- presumably it comes from one of the
# notebooks executed with %run; confirm.
inputs

In [68]:
# Add the imputation flags to the model inputs.
# Only flags not already present are appended, so re-running this cell cannot duplicate columns.
inputs = inputs + [flag for flag in ['imp_b', 'imp_p'] if flag not in inputs]

In [None]:
%%time

solver = 'lbfgs'

iters = 200

filename = "model_binary_" + "voting" + "_100_new.sav"

print(filename)

# Define the individual models in the ensemble
models = [
    LogisticRegression(solver='lbfgs', max_iter=20),  
    LogisticRegression(solver='saga', max_iter=20),   
    MLPClassifier(hidden_layer_sizes=(100,100), activation='relu', random_state=1, max_iter=15),  
]


# Create the ensemble classifier using VotingClassifier
# model_binary = VotingClassifier(estimators=[('model'+str(i+1), model) for i, model in enumerate(models)], voting='soft', n_jobs=-2).fit(model_dataset[inputs], model_dataset[['is_out']].values.ravel())
model_binary = VotingClassifier(estimators=[('model'+str(i+1), model) for i, model in enumerate(models)], voting='soft', n_jobs=-2).fit(X_train[inputs], X_train[['is_out']].values.ravel())
# model_binary = LogisticRegression(solver=solver, max_iter=iters).fit(X_train[inputs], X_train[['is_out']].values.ravel())


# Save model
pickle.dump(model_binary, open(filename, 'wb'))

In [None]:
# Score the test rows with the out-vs-safe model.
# The first probability column is stored as the "safe" prediction and the second as the "out" prediction.
proba = model_binary.predict_proba(X_test[inputs])
safe_proba, out_proba = proba[:, 0], proba[:, 1]
X_test['is_safe_pred'] = safe_proba
X_test['is_out_pred'] = out_proba

In [None]:
# model_dataset['decile'] = pd.qcut(model_dataset['is_out_pred'], 10, labels=False)

# df_name = "is_out" + "_df"
# globals()[df_name] = model_dataset.groupby('decile').mean().reset_index()

# Bucket test rows by predicted out probability.
# The column is called 'decile' but only 5 quantile buckets are used here.
X_test['decile'] = pd.qcut(X_test['is_out_pred'], 5, labels=False)

# Per-bucket means over rows where neither batter nor pitcher was imputed, stored as is_out_df.
# numeric_only=True: X_test still carries the string column eventsModel, which cannot be averaged.
df_name = "is_out" + "_df"
globals()[df_name] = X_test.query('imp_b == 0').query('imp_p == 0').groupby('decile').mean(numeric_only=True).reset_index()

In [None]:
# Calibration plot per bucket: mean predicted out probability (red) against mean actual is_out (black)
for column, color in (('is_out_pred', 'red'), ('is_out', 'black')):
    plt.plot(is_out_df['decile'], is_out_df[column], color=color)
plt.show()

### Outs

In [71]:
%%time
layers = (30,30,30)
# layers = (25,25,25,25,25)
layers_str = ''.join(str(x) for x in layers)
activation = 'relu'

iters = 15

filename = "model_outs_" + "_" + activation + layers_str + "_" + str(iters) + "_100.sav"
print(filename)
# Define the individual models in the ensemble
models = [
    MLPClassifier(hidden_layer_sizes=(layers), activation=activation, verbose=True, alpha=0.0001, early_stopping=True, validation_fraction=0.1, random_state=1, max_iter=iters),
]

# Create the ensemble classifier using VotingClassifier
# model_outs = VotingClassifier(estimators=[('model'+str(i+1), model) for i, model in enumerate(models)], voting='soft', n_jobs=-2).fit(outs_dataset[inputs], outs_dataset[['eventsModel']].values.ravel())
model_outs = VotingClassifier(estimators=[('model'+str(i+1), model) for i, model in enumerate(models)], voting='soft', n_jobs=-2).fit(outs_dataset_train[inputs], outs_dataset_train[['eventsModel']].values.ravel())

# Save model
pickle.dump(model_outs, open(filename, 'wb'))

model_outs__relu303030_15_100.sav
CPU times: total: 1.27 s
Wall time: 2min 34s


In [None]:
# Class labels learned by the outs model, and the matching "<label>_pred" column names
outs_outputs = list(model_outs.classes_)
outs_outputs_pred = [f"{label}_pred" for label in outs_outputs]
outs_outputs

In [None]:
# outs_dataset[outs_outputs_pred] = model_outs.predict_proba(outs_dataset[inputs])
# outs_dataset_test[outs_outputs_pred] = model_outs.predict_proba(outs_dataset_test[inputs])

# Score the out rows of the test set and store one probability column per class
proba = model_outs.predict_proba(outs_dataset_test[inputs])
for i in range(len(outs_outputs_pred)):
    col = outs_outputs_pred[i]
    outs_dataset_test[col] = proba[:, i]

In [None]:
# # Create deciles
# for var in outs_outputs:
#     outs_dataset[f'{var}_act'] = (outs_dataset['eventsModel'] == var).astype('int')
#     outs_dataset['decile'] = pd.qcut(outs_dataset[f'{var}_pred'], 10, labels=False)
#     df_name = var + "_df"
#     globals()[df_name] = outs_dataset.groupby('decile').mean().reset_index()

# Create deciles
# For each out type: flag the actual outcome, bucket rows by predicted probability into deciles,
# and store the per-decile means of the non-imputed rows in a global named "<type>_df".
for var in outs_outputs:
    outs_dataset_test[f'{var}_act'] = (outs_dataset_test['eventsModel'] == var).astype('int')
    # 'decile' is overwritten on every pass, so after the loop it reflects the last class only
    outs_dataset_test['decile'] = pd.qcut(outs_dataset_test[f'{var}_pred'], 10, labels=False)
    df_name = var + "_df"
    # numeric_only=True: the frame still carries the string column eventsModel, which cannot be averaged
    globals()[df_name] = outs_dataset_test.query('imp_b == 0').query('imp_p == 0').groupby('decile').mean(numeric_only=True).reset_index()

In [None]:
# One calibration panel per out type on a 2x3 grid: predicted (red) vs. actual (black) rate per decile
fig, axs = plt.subplots(2, 3, figsize=(12, 8))

for i, var in enumerate(outs_outputs):
    # Panels are filled left to right, top to bottom
    row, col = divmod(i, 3)
    df_name = var + "_df"
    calibration = globals()[df_name]
    panel = axs[row, col]
    panel.plot(calibration['decile'], calibration[f'{var}_pred'], color='red')
    panel.plot(calibration['decile'], calibration[f'{var}_act'], color='black')
    panel.set_title(var)
    # panel.set_ylim(0,0.35)


# Lay out the subplots with zero padding
fig.tight_layout(pad=.0)

# Show the figure
plt.show()

### Safe

In [None]:
# Safe model: predicts the event type among the non-out rows of the training set, then is saved to disk.

# Parameters
layers = (30,30,30,30,30)
# layers = (25,25,25,25,25)
layers_str = ''.join(str(x) for x in layers)
model = "safe"
iters = 15
alpha = 0.0001
activation = 'relu'
short = 100


# inputs = batter_stats_safe + pitcher_stats_safe + batter_stats_safe_long + pitcher_stats_safe_long + venues + years + other_list

filename = "model_" + model + "_" + activation + "_" + layers_str + "_" + str(iters) + "_" + str(short) + ".sav"
print(filename)
# Define the individual models in the ensemble: identical MLPs that differ only in random seed.
# The alpha parameter above is now actually passed through (it was hard-coded to the same value before).
# Add another seed (e.g. 15) to the tuple for a further ensemble member.
models = [
    MLPClassifier(hidden_layer_sizes=(layers), activation=activation, verbose=True, alpha=alpha, early_stopping=True, validation_fraction=0.1, random_state=seed, max_iter=iters)
    for seed in (1, 2, 3)
]

# Create the ensemble classifier using VotingClassifier (soft voting).
# The comprehension variable is named estimator so it does not read like the "model" string above.
# model_safe = VotingClassifier(estimators=[('model'+str(i+1), model) for i, model in enumerate(models)], voting='soft', n_jobs=-2).fit(safe_dataset[inputs], safe_dataset[['eventsModel']].values.ravel())
model_safe = VotingClassifier(estimators=[('model'+str(i+1), estimator) for i, estimator in enumerate(models)], voting='soft', n_jobs=-2).fit(safe_dataset_train[inputs], safe_dataset_train[['eventsModel']].values.ravel())
# model_safe = MLPClassifier(hidden_layer_sizes=(layers), activation=activation, verbose=True, alpha=0.0001, early_stopping=True, validation_fraction=0.1, random_state=7, max_iter=iters).fit(safe_dataset_train[inputs], safe_dataset_train[['eventsModel']].values.ravel())

# Save model; the with-block guarantees the file is flushed and closed even if pickling fails
with open(filename, 'wb') as file:
    pickle.dump(model_safe, file)

In [None]:
# Display how many input columns are passed to the models
len(inputs)

In [None]:
# Class labels learned by the safe model, and the matching "<label>_pred" column names
safe_outputs = list(model_safe.classes_)
safe_outputs_pred = [f"{label}_pred" for label in safe_outputs]
safe_outputs

In [None]:
# safe_dataset[safe_outputs_pred] = model_safe.predict_proba(safe_dataset[inputs])
# safe_dataset_test[safe_outputs_pred] = model_safe.predict_proba(safe_dataset_test[inputs])

# Score the non-out rows of the test set and store one probability column per class
proba = model_safe.predict_proba(safe_dataset_test[inputs])
for i in range(len(safe_outputs_pred)):
    col = safe_outputs_pred[i]
    safe_dataset_test[col] = proba[:, i]

In [None]:
# # Create deciles
# for var in safe_outputs:
#     safe_dataset[f'{var}_act'] = (safe_dataset['eventsModel'] == var).astype('int')
#     safe_dataset['decile'] = pd.qcut(safe_dataset[f'{var}_pred'], 10, labels=False)
#     df_name = var + "_df"
#     globals()[df_name] = safe_dataset.groupby('decile').mean().reset_index()

# Create deciles
# For each safe outcome: flag the actual outcome, bucket rows by predicted probability into deciles,
# and store the per-decile means of the non-imputed rows in a global named "<outcome>_df".
for var in safe_outputs:
    safe_dataset_test[f'{var}_act'] = (safe_dataset_test['eventsModel'] == var).astype('int')
    # 'decile' is overwritten on every pass, so after the loop it reflects the last class only
    safe_dataset_test['decile'] = pd.qcut(safe_dataset_test[f'{var}_pred'], 10, labels=False)
    df_name = var + "_df"
    # numeric_only=True: the frame still carries the string column eventsModel, which cannot be averaged
    globals()[df_name] = safe_dataset_test.query('imp_b == 0').query('imp_p == 0').groupby('decile').mean(numeric_only=True).reset_index()

In [None]:
# One calibration panel per safe outcome on a 2x3 grid: predicted (red) vs. actual (black) rate per decile
fig, axs = plt.subplots(2, 3, figsize=(12, 8))

for i, var in enumerate(safe_outputs):
    # Panels are filled left to right, top to bottom
    row, col = divmod(i, 3)
    df_name = var + "_df"
    calibration = globals()[df_name]
    panel = axs[row, col]
    panel.plot(calibration['decile'], calibration[f'{var}_pred'], color='red')
    panel.plot(calibration['decile'], calibration[f'{var}_act'], color='black')
    panel.set_title(var)
    # panel.set_ylim(calibration[f'{var}_act'].min(),calibration[f'{var}_act'].max())


# Lay out the subplots with zero padding
fig.tight_layout(pad=.0)

# Show the figure
plt.show()

In [None]:
# NOTE(review): this name is not defined anywhere in the visible notebook, so the cell raises NameError.
# Presumably that is deliberate -- a stop that keeps "Run All" from continuing into the
# single-model section below; confirm, and consider an explicit, self-explanatory stop instead.
breaksfadf

# Single Model

In [None]:
# Single model: one MLP predicting the event type directly from all training rows, then saved to disk.

# Parameters
layers = (25,25)
layers_str = ''.join(str(x) for x in layers)
model = "full"
iters = 10
alpha = 0.0001
activation = 'relu'
short = 100

filename = "model_" + model + "_" + activation + "_" + layers_str + "_" + str(iters) + "_" + str(short) + ".sav"
print(filename)

# # Define the individual models in the ensemble
# models = [
#     MLPClassifier(hidden_layer_sizes=(layers), activation='relu', verbose=True, alpha=0.0001, early_stopping=True, validation_fraction=0.1, random_state=1, max_iter=iters),
# ]

# # Create the ensemble classifier using VotingClassifier
# # model_binary = VotingClassifier(estimators=[('model'+str(i+1), model) for i, model in enumerate(models)], voting='soft', n_jobs=-2).fit(model_dataset[inputs], model_dataset[['is_out']].values.ravel())
# model_full = VotingClassifier(estimators=[('model'+str(i+1), model) for i, model in enumerate(models)], voting='soft', n_jobs=-2).fit(X_train[inputs], X_train[['eventsModel']].values.ravel())

# activation and alpha now come from the parameters above (same values as the former literals),
# so the file name and the fitted model cannot drift apart
model_full = MLPClassifier(hidden_layer_sizes=(layers), activation=activation, verbose=True, alpha=alpha, early_stopping=True, validation_fraction=0.1, random_state=1, max_iter=iters).fit(X_train[inputs], X_train[['eventsModel']].values.ravel())


# Save model; the with-block guarantees the file is flushed and closed even if pickling fails
with open(filename, 'wb') as file:
    pickle.dump(model_full, file)

In [None]:
# Class labels learned by the single model, and the matching "<label>_pred" column names
full_outputs = list(model_full.classes_)
full_outputs_pred = [f"{label}_pred" for label in full_outputs]
full_outputs

In [None]:
# model_dataset[full_outsputs_pred] = model_full.predict_proba(model_dataset[inputs])

# Score the test rows with the single model and store one probability column per class.
# Columns are written one at a time, the same way the outs and safe cells do it, instead of
# assigning a 2-D array to a list of not-yet-existing column names.
proba = model_full.predict_proba(X_test[inputs])
for i, col in enumerate(full_outputs_pred):
    X_test[f'{col}'] = proba[:, i]

In [None]:
# # Create deciles
# for var in full_outputs:
#     model_dataset[f'{var}_act'] = (model_dataset['eventsModel'] == var).astype('int')
#     model_dataset['decile'] = pd.qcut(model_dataset[f'{var}_pred'], 10, labels=False)
#     df_name = var + "_df"
#     globals()[df_name] = model_dataset.groupby('decile').mean().reset_index()

# Create deciles
# For each event type: flag the actual outcome, bucket test rows by predicted probability into deciles,
# and store the per-decile means in a global named "<event>_df".
# Unlike the outs/safe cells, imputed rows are not filtered out here.
for var in full_outputs:
    X_test[f'{var}_act'] = (X_test['eventsModel'] == var).astype('int')
    # 'decile' is overwritten on every pass, so after the loop it reflects the last class only
    X_test['decile'] = pd.qcut(X_test[f'{var}_pred'], 10, labels=False)
    df_name = var + "_df"
    # numeric_only=True: X_test still carries the string column eventsModel, which cannot be averaged
    globals()[df_name] = X_test.groupby('decile').mean(numeric_only=True).reset_index()

In [None]:
# One calibration panel per event type on a 4x3 grid: predicted (red) vs. actual (black) rate per decile
fig, axs = plt.subplots(4, 3, figsize=(12, 16))

for i, var in enumerate(full_outputs):
    # Panels are filled left to right, top to bottom
    row, col = divmod(i, 3)
    df_name = var + "_df"
    calibration = globals()[df_name]
    panel = axs[row, col]
    panel.plot(calibration['decile'], calibration[f'{var}_pred'], color='red')
    panel.plot(calibration['decile'], calibration[f'{var}_act'], color='black')
    panel.set_title(var)
    # panel.set_ylim(0,0.67)


# Lay out the subplots with zero padding
fig.tight_layout(pad=.0)

# Show the figure
plt.show()

In [None]:
# To do: create a way to calculate probabilities for individual matchups so you can test