# M01. Impute Inputs
- This imputes model inputs using Steamer projections
- Type: Model
- Run Frequency: Irregular
- Sources:
    - MLB API
    - Steamer
- Dates:
    - Created: 1/28/2024
    - Updated: 1/31/2024

### Imports

In [6]:
%run "U1. Imports.ipynb"
%run "U2. Utilities.ipynb"
%run "U3. Classes.ipynb"

In [7]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import log_loss, classification_report, f1_score, make_scorer
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler
from sklearn.inspection import permutation_importance
from sklearn.datasets import make_classification
from sklearn.neighbors import KNeighborsClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.cluster import KMeans
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from tensorflow import keras
from sklearn.impute import SimpleImputer
from sklearn.svm import SVC

import joblib
import matplotlib.pyplot as plt

In [8]:
%run "A02. MLB API.ipynb"
%run "A03. Steamer.ipynb"

### Batters

##### Dataset

In [9]:
# Build the raw dataset for the batter section.
# NOTE(review): the positional arguments look like a start season, an end season and two
# sample-size/window settings — confirm against create_pa_inputs, which is defined outside this notebook.
hitters_df = create_pa_inputs(2015, 2023, 50, 300)

In [10]:
# Keep a single row per game / batter / handedness combination, preferring the last occurrence.
# Reassigning instead of using inplace=True avoids mutating the object returned by
# create_pa_inputs and keeps the frame's lineage explicit.
hitters_df = hitters_df.drop_duplicates(subset=['gamePk', 'batter', 'b_L', 'p_L'], keep='last')

In [11]:
# Keep only the id, date, handedness and imp_b columns plus the model inputs.
# .copy() makes this an independent frame: later cells assign columns on hitters_df
# (scaled inputs, steamer_date), and writing to a slice of another frame is what
# triggers pandas' SettingWithCopyWarning.
hitters_df = hitters_df[['batter', 'date', 'b_L', 'p_L', 'imp_b'] + batter_inputs].copy()

In [12]:
# Fit a StandardScaler on the batter inputs, then overwrite those columns with the scaled values
batter_stats_scaler = StandardScaler().fit(hitters_df[batter_inputs])
hitters_df[batter_inputs] = batter_stats_scaler.transform(hitters_df[batter_inputs])

# Persist the fitted scaler so the same scaling can be re-applied elsewhere
batter_stats_scaler_path = os.path.join(model_path, "batter_stats_scaler_20231027.pkl")
with open(batter_stats_scaler_path, "wb") as scaler_file:
    pickle.dump(batter_stats_scaler, scaler_file)

##### Steamer

In [13]:
# Load the weekly Steamer hitter log (the file is read as ISO-8859-1)
steamer_hitters_path = os.path.join(baseball_path, "A03. Steamer", "steamer_hitters_weekly_log.csv")
steamer_hitters_df = pd.read_csv(steamer_hitters_path, encoding='iso-8859-1')
# Run the cleaning helper; the cleaned frame is what the rest of the batter section uses
steamer_hitters_df2 = clean_steamer_hitters(steamer_hitters_df)

In [14]:
# Fit a StandardScaler on the Steamer hitter stats, then overwrite those columns with the scaled values
batter_stats_fg_scaler = StandardScaler().fit(steamer_hitters_df2[batter_stats_fg])
steamer_hitters_df2[batter_stats_fg] = batter_stats_fg_scaler.transform(steamer_hitters_df2[batter_stats_fg])

# Persist the fitted scaler so the same scaling can be re-applied elsewhere
batter_stats_fg_scaler_path = os.path.join(model_path, "batter_stats_fg_scaler_20231027.pkl")
with open(batter_stats_fg_scaler_path, "wb") as scaler_file:
    pickle.dump(batter_stats_fg_scaler, scaler_file)

##### Create compatible dates

In [15]:
# Unique dates present in the cleaned Steamer hitter data. Each row of hitters_df is mapped
# below to the latest of these dates that is <= the row's own date (the steamer_date column).
steamer_dates = list(steamer_hitters_df2['date'].unique())

# Define a function to find the largest number in "steamer_dates" less than or equal to a given "date"
def find_steamer_date(date, dates=None):
    """Return the latest Steamer date that is less than or equal to ``date``.

    Parameters
    ----------
    date
        The date to look up; it must be comparable with the entries of ``dates``.
    dates : iterable, optional
        Candidate Steamer dates. When omitted, the notebook-level ``steamer_dates``
        list is used, so existing ``.apply(find_steamer_date)`` calls behave as before.
        Passing the list explicitly avoids depending on a global that is rebound
        later in the notebook.

    Returns
    -------
    The largest candidate that is <= ``date``, or ``None`` when no candidate qualifies
    (including when the candidate list is empty).
    """
    candidates = steamer_dates if dates is None else dates
    return max((d for d in candidates if d <= date), default=None)

# Apply the function row by row to create the "steamer_date" column in hitters_df.
# Rows dated before every Steamer date get a missing value (the function returns None for them).
hitters_df["steamer_date"] = hitters_df["date"].apply(find_steamer_date)

##### Merge

In [16]:
# Columns to pull from Steamer: the id/date columns plus the projection stats
batter_stats_fg_plus = ['mlbamid', 'steamerid', 'date'] + batter_stats_fg
# Inner-join each hitters_df row to the Steamer row for the same player and steamer_date
hitters_merged_df = hitters_df.merge(
    steamer_hitters_df2[batter_stats_fg_plus],
    how='inner',
    left_on=['batter', 'steamer_date'],
    right_on=['mlbamid', 'date'],
)
# Drop rows missing any model input, then rows missing any Steamer stat
hitters_merged_df = hitters_merged_df.dropna(subset=batter_inputs)
hitters_merged_df = hitters_merged_df.dropna(subset=batter_stats_fg)

##### Impute

In [17]:
# Imputation features: the Steamer stats followed by the two handedness columns
batter_stats_fg_imp = [*batter_stats_fg, 'b_L', 'p_L']

In [19]:
%%time
# Hidden-layer sizes for the imputation network: three layers of 30 units each
layers = (30, 30, 30)

# The filename embeds todaysdate, so runs on different dates are saved to different files
batter_imputations_model_filename = f"batter_imputations_model_{todaysdate}.sav"
print(batter_imputations_model_filename)

# Create the MLPRegressor model (random_state is fixed so training is repeatable).
# NOTE(review): max_iter=10 is a very small iteration budget — confirm that stopping this early is intended.
batter_imputation_model = MLPRegressor(hidden_layer_sizes=layers, activation='relu', random_state=10, learning_rate_init=0.0001, max_iter=10)

# Features are the Steamer stats plus handedness; targets are the batter model inputs
X_train = hitters_merged_df[batter_stats_fg_imp]
y_train = hitters_merged_df[batter_inputs]

# Train the model
batter_imputation_model.fit(X_train, y_train)

# Overwrite the batter input columns in hitters_merged_df with the model's predictions
hitters_merged_df.loc[:, batter_inputs] = batter_imputation_model.predict(X_train)

# Save model; the context manager guarantees the file handle is flushed and closed,
# which the previous bare open(...) call did not
with open(os.path.join(model_path, batter_imputations_model_filename), 'wb') as model_file:
    pickle.dump(batter_imputation_model, model_file)

batter_imputations_model_20240130.sav


In [None]:
# # # Testing (would need to add back pa_b to dataset):
# # hitters_merged_df = hitters_merged_df.iloc[10000:].query('pa_p < 40')

# # Add hands to use in imputation
# batter_stats_fg_imp = batter_stats_fg + ['b_L', 'p_L']

# # Separate the features and target columns
# features = hitters_merged_df[batter_stats_fg_imp]
# target = hitters_merged_df[batter_inputs]

# # Create and fit the model
# batter_imputations_model = keras.Sequential([
#     keras.layers.Dense(30, activation='relu', input_shape=(len(batter_stats_fg_imp),)),
#     keras.layers.Dense(30, activation='relu'),
#     keras.layers.Dense(30, activation='relu'),
#     keras.layers.Dense(len(batter_inputs))  
#     ])

# # Compile the model
# batter_imputations_model.compile(loss='mean_squared_error', optimizer='adam')

# # Train the model
# batter_imputations_model.fit(features, target, epochs=10, batch_size=25)

# # Pickle
# with open(os.path.join(model_path, "batter_imputations_model_20231210.pkl"), "wb") as file:
#     pickle.dump(batter_imputations_model, file)

In [None]:
# # Use the trained model to make predictions
# hitters_merged_df[batter_inputs] = batter_imputations_model.predict(hitters_merged_df[batter_stats_fg_imp])

### Pitchers

##### Dataset

In [20]:
# Build the raw dataset for the pitcher section (same call and arguments as the batter section).
# NOTE(review): the positional arguments look like a start season, an end season and two
# sample-size/window settings — confirm against create_pa_inputs, which is defined outside this notebook.
pitchers_df = create_pa_inputs(2015, 2023, 50, 300)

In [21]:
# Keep a single row per game / pitcher / handedness combination, preferring the last occurrence.
# Reassigning instead of using inplace=True avoids mutating the object returned by
# create_pa_inputs and keeps the frame's lineage explicit.
pitchers_df = pitchers_df.drop_duplicates(subset=['gamePk', 'pitcher', 'b_L', 'p_L'], keep='last')

In [22]:
# Keep only the id, date, handedness and imp_p columns plus the model inputs.
# .copy() makes this an independent frame: later cells assign columns on pitchers_df
# (scaled inputs, steamer_date), and writing to a slice of another frame is what
# triggers pandas' SettingWithCopyWarning.
pitchers_df = pitchers_df[['pitcher', 'date', 'b_L', 'p_L', 'imp_p'] + pitcher_inputs].copy()

In [23]:
# Fit a StandardScaler on the pitcher inputs, then overwrite those columns with the scaled values
pitcher_stats_scaler = StandardScaler().fit(pitchers_df[pitcher_inputs])
pitchers_df[pitcher_inputs] = pitcher_stats_scaler.transform(pitchers_df[pitcher_inputs])

# Persist the fitted scaler so the same scaling can be re-applied elsewhere
pitcher_stats_scaler_path = os.path.join(model_path, "pitcher_stats_scaler_20231027.pkl")
with open(pitcher_stats_scaler_path, "wb") as scaler_file:
    pickle.dump(pitcher_stats_scaler, scaler_file)

##### Steamer

In [24]:
# Load the weekly Steamer pitcher log (the file is read as ISO-8859-1)
steamer_pitchers_path = os.path.join(baseball_path, "A03. Steamer", "steamer_pitchers_weekly_log.csv")
steamer_pitchers_df = pd.read_csv(steamer_pitchers_path, encoding='iso-8859-1')
# Run the cleaning helper; the cleaned frame is what the rest of the pitcher section uses
steamer_pitchers_df2 = clean_steamer_pitchers(steamer_pitchers_df)

In [25]:
# Fit a StandardScaler on the Steamer pitcher stats, then overwrite those columns with the scaled values
pitcher_stats_fg_scaler = StandardScaler().fit(steamer_pitchers_df2[pitcher_stats_fg])
steamer_pitchers_df2[pitcher_stats_fg] = pitcher_stats_fg_scaler.transform(steamer_pitchers_df2[pitcher_stats_fg])

# Persist the fitted scaler so the same scaling can be re-applied elsewhere
pitcher_stats_fg_scaler_path = os.path.join(model_path, "pitcher_stats_fg_scaler_20231027.pkl")
with open(pitcher_stats_fg_scaler_path, "wb") as scaler_file:
    pickle.dump(pitcher_stats_fg_scaler, scaler_file)

##### Create compatible dates

In [26]:
# Unique dates present in the cleaned Steamer pitcher data. Each row of pitchers_df is mapped
# below to the latest of these dates that is <= the row's own date (the steamer_date column).
# Note: this rebinds the steamer_dates list that the batter section created earlier.
steamer_dates = list(steamer_pitchers_df2['date'].unique())

# Define a function to find the largest number in "steamer_dates" less than or equal to a given "date"
def find_steamer_date(date, dates=None):
    """Return the latest Steamer date that is less than or equal to ``date``.

    Parameters
    ----------
    date
        The date to look up; it must be comparable with the entries of ``dates``.
    dates : iterable, optional
        Candidate Steamer dates. When omitted, the notebook-level ``steamer_dates``
        list is used, so existing ``.apply(find_steamer_date)`` calls behave as before.
        Passing the list explicitly avoids depending on a global that is rebound
        between the batter and pitcher sections.

    Returns
    -------
    The largest candidate that is <= ``date``, or ``None`` when no candidate qualifies
    (including when the candidate list is empty).
    """
    candidates = steamer_dates if dates is None else dates
    return max((d for d in candidates if d <= date), default=None)

# Apply the function row by row to create the "steamer_date" column in pitchers_df.
# Rows dated before every Steamer date get a missing value (the function returns None for them).
pitchers_df["steamer_date"] = pitchers_df["date"].apply(find_steamer_date)

##### Merge

In [27]:
# Columns to pull from Steamer: the id/date columns plus the pitcher_stats_fg2 stats
pitcher_stats_fg_plus = ['mlbamid', 'steamerid', 'date'] + pitcher_stats_fg2
# Inner-join each pitchers_df row to the Steamer row for the same player and steamer_date
pitchers_merged_df = pitchers_df.merge(
    steamer_pitchers_df2[pitcher_stats_fg_plus],
    how='inner',
    left_on=['pitcher', 'steamer_date'],
    right_on=['mlbamid', 'date'],
)
# Drop rows missing any model input, then rows missing any Steamer stat
pitchers_merged_df = pitchers_merged_df.dropna(subset=pitcher_inputs)
pitchers_merged_df = pitchers_merged_df.dropna(subset=pitcher_stats_fg2)

##### Impute

In [28]:
%%time
# Hidden-layer sizes for the imputation network: three layers of 30 units each
layers = (30, 30, 30)

# The filename embeds todaysdate, so runs on different dates are saved to different files
pitcher_imputations_model_filename = f"pitcher_imputations_model_{todaysdate}.sav"
print(pitcher_imputations_model_filename)

# Create the MLPRegressor model (random_state is fixed so training is repeatable).
# NOTE(review): max_iter=10 is a very small iteration budget — confirm that stopping this early is intended.
pitcher_imputation_model = MLPRegressor(hidden_layer_sizes=layers, activation='relu', random_state=10, learning_rate_init=0.0001, max_iter=10)

# Features are the columns in pitcher_stats_fg_imp; targets are the pitcher model inputs.
# NOTE(review): pitcher_stats_fg_imp is not assigned in any live cell of this notebook (only in
# commented-out code) — confirm it is provided by one of the %run notebooks, otherwise a
# fresh-kernel run fails here with a NameError.
X_train = pitchers_merged_df[pitcher_stats_fg_imp]
y_train = pitchers_merged_df[pitcher_inputs]

# Train the model
pitcher_imputation_model.fit(X_train, y_train)

# Overwrite the pitcher input columns in pitchers_merged_df with the model's predictions
pitchers_merged_df.loc[:, pitcher_inputs] = pitcher_imputation_model.predict(X_train)

# Save model; the context manager guarantees the file handle is flushed and closed,
# which the previous bare open(...) call did not
with open(os.path.join(model_path, pitcher_imputations_model_filename), 'wb') as model_file:
    pickle.dump(pitcher_imputation_model, model_file)

pitcher_imputations_model_20240130.sav
CPU times: total: 4.72 s
Wall time: 52.1 s


In [29]:
# Preview the first merged row to check the columns after the merge and the prediction overwrite
pitchers_merged_df.head(1)

Unnamed: 0,pitcher,date_x,b_L,p_L,imp_p,b1_p,b2_p,b3_p,hr_p,bb_p,hbp_p,so_p,fo_p,go_p,lo_p,po_p,estimated_woba_using_speedangle_p,to_left_p,to_middle_p,to_right_p,hard_hit_p,barrel_p,iso_p,slg_p,obp_p,woba_p,maxSpeed_p,maxSpin_p,b1_p_long,b2_p_long,b3_p_long,hr_p_long,bb_p_long,hbp_p_long,so_p_long,fo_p_long,go_p_long,lo_p_long,po_p_long,estimated_woba_using_speedangle_p_long,to_left_p_long,to_middle_p_long,to_right_p_long,hard_hit_p_long,barrel_p_long,iso_p_long,slg_p_long,obp_p_long,woba_p_long,maxSpeed_p_long,maxSpin_p_long,steamer_date,mlbamid,steamerid,date_y,H9,HR9,K9,BB9,GBrate,FBrate,LDrate,SIERA,reliability,IP_start,IP,relief_IP
0,516935,20150329,True,False,1,0.046442,0.18048,0.193359,0.043585,0.167076,-0.20762,-0.436946,0.623687,-0.284312,0.148319,0.363476,-0.053814,-0.486914,-0.308343,0.545219,-0.209528,-0.088011,0.086627,0.103915,0.122609,0.131124,-0.373686,-0.54988,-0.018921,0.136351,0.21838,-0.005096,0.117795,-0.327746,-0.485725,0.830932,-0.3048,0.139718,0.396802,-0.214289,-0.665332,-0.2912,0.556387,-0.249455,-0.126824,0.085324,0.121095,0.157762,0.129971,-0.412215,-0.72518,20140928,516935.0,6415,20140928,-1.27412,-1.213385,0.73309,-1.050256,-1.480633,1.502674,-0.059549,-1.057632,0.68738,0.0,1.0,1.0


In [None]:
# # # Testing:
# # hitters_merged_df = hitters_merged_df.iloc[10000:].query('pa_b < 40')

# # Add hands to use in imputation
# pitcher_stats_fg_imp = pitcher_stats_fg + ['b_L', 'p_L']

# # Separate the features and target columns
# features = pitchers_merged_df[pitcher_stats_fg_imp]
# target = pitchers_merged_df[pitcher_inputs]

# # Create and fit the model
# pitcher_imputations_model = keras.Sequential([
#     keras.layers.Dense(30, activation='relu', input_shape=(len(pitcher_stats_fg_imp),)),
#     keras.layers.Dense(30, activation='relu'),
#     keras.layers.Dense(30, activation='relu'),
#     keras.layers.Dense(len(pitcher_inputs))  
#     ])

# # Compile the model
# pitcher_imputations_model.compile(loss='mean_squared_error', optimizer='adam')

# # Train the model
# pitcher_imputations_model.fit(features, target, epochs=10, batch_size=35)

# # Pickle
# with open(os.path.join(model_path, "pitcher_imputations_model_20231210.pkl"), "wb") as file:
#     pickle.dump(pitcher_imputations_model, file)

In [None]:
# # Use the trained model to make predictions
# pitchers_merged_df[pitcher_inputs] = pitcher_imputations_model.predict(pitchers_merged_df[pitcher_stats_fg_imp])