# M01. Impute Inputs
- Trains and saves the scalers and neural-network models used to impute batter and pitcher model inputs from Steamer projections
- Type: Model
- Run Frequency: Irregular
- Sources:
    - MLB API
    - Steamer
- Dates:
    - Created: 1/28/2024
    - Updated: 1/31/2024

### Imports

In [1]:
%run "U1. Imports.ipynb"
%run "U2. Utilities.ipynb"
%run "U3. Classes.ipynb"

In [2]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import log_loss, classification_report, f1_score, make_scorer
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler
from sklearn.inspection import permutation_importance
from sklearn.datasets import make_classification
from sklearn.neighbors import KNeighborsClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.cluster import KMeans
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from tensorflow import keras
from sklearn.impute import SimpleImputer
from sklearn.svm import SVC

import joblib
import matplotlib.pyplot as plt




In [3]:
%run "A02. MLB API.ipynb"
%run "A03. Steamer.ipynb"

### Batters

##### Dataset

In [4]:
# NOTE(review): "A02. MLB API.ipynb" is already run in an earlier cell; this runs the "-WIP" variant
# afterwards, so names defined by both notebooks presumably end up bound to the WIP versions.
# Confirm that depending on a work-in-progress notebook here is intended.
%run "A02. MLB API-WIP.ipynb"

In [5]:
# Build the raw input table via create_pa_inputs.
# NOTE(review): 2015 and 2024 are presumably the first/last seasons and short/long the
# rolling-window sizes; verify against the helper's definition.
hitters_df = create_pa_inputs(
    park_factors,
    team_map,
    2015,
    2024,
    short=50,
    long=300,
    adjust=True,
)

In [6]:
# Keep only the last occurrence of each (gamePk, batter, b_L, p_L) combination.
# The frame is modified in place.
batter_dedupe_keys = ['gamePk', 'batter', 'b_L', 'p_L']
hitters_df.drop_duplicates(subset=batter_dedupe_keys, keep='last', inplace=True)

In [7]:
# Narrow the frame to the identifier/context columns followed by the batter model inputs.
batter_keep_columns = ['batter', 'date', 'b_L', 'p_L', 'imp_b'] + batter_inputs
hitters_df = hitters_df[batter_keep_columns]

In [8]:
# Fit a StandardScaler on the batter inputs, then overwrite those columns with the scaled values
batter_stats_scaler = StandardScaler()
batter_stats_scaler.fit(hitters_df[batter_inputs])
hitters_df[batter_inputs] = batter_stats_scaler.transform(hitters_df[batter_inputs])

# Persist the fitted scaler under a date-stamped file name in model_path
batter_stats_scaler_path = os.path.join(model_path, f"batter_stats_scaler_{todaysdate}.pkl")
with open(batter_stats_scaler_path, "wb") as scaler_file:
    pickle.dump(batter_stats_scaler, scaler_file)

##### Steamer

In [9]:
# Load the weekly Steamer hitter log (file is read as ISO-8859-1)
steamer_hitters_path = os.path.join(baseball_path, "A03. Steamer", "steamer_hitters_weekly_log.csv")
steamer_hitters_df = pd.read_csv(steamer_hitters_path, encoding='iso-8859-1')
# Run it through the clean_steamer_hitters helper
steamer_hitters_df2 = clean_steamer_hitters(steamer_hitters_df)

In [10]:
# Fit a StandardScaler on the Steamer hitter stats, then overwrite those columns with the scaled values
batter_stats_fg_scaler = StandardScaler()
batter_stats_fg_scaler.fit(steamer_hitters_df2[batter_stats_fg])
steamer_hitters_df2[batter_stats_fg] = batter_stats_fg_scaler.transform(steamer_hitters_df2[batter_stats_fg])

# Persist the fitted scaler under a date-stamped file name in model_path
batter_stats_fg_scaler_path = os.path.join(model_path, f"batter_stats_fg_scaler_{todaysdate}.pkl")
with open(batter_stats_fg_scaler_path, "wb") as scaler_file:
    pickle.dump(batter_stats_fg_scaler, scaler_file)

##### Create compatible dates

In [11]:
# Collect the distinct dates present in the cleaned Steamer hitter table.
# find_steamer_date reads this list to pick, for a given date, the latest entry that is <= it.
steamer_date_values = steamer_hitters_df2['date'].unique()
steamer_dates = list(steamer_date_values)

# Define a function to find the largest value in a list of Steamer dates that is less than or equal to a given "date"
def find_steamer_date(date, dates=None):
    """Return the largest entry of ``dates`` that is <= ``date``.

    Parameters
    ----------
    date
        The value to look up; it is compared against each candidate with ``<=``.
    dates : iterable, optional
        Candidate Steamer dates. When omitted, the module-level ``steamer_dates``
        list is used (read at call time), so one-argument calls such as
        ``Series.apply(find_steamer_date)`` keep working. Passing it explicitly
        avoids depending on whichever ``steamer_dates`` was assigned most recently.

    Returns
    -------
    The matching entry of ``dates``, or ``None`` when no entry is <= ``date``
    (including when ``dates`` is empty).
    """
    if dates is None:
        dates = steamer_dates
    return max((candidate for candidate in dates if candidate <= date), default=None)

# Look up the matching Steamer date for every row's "date" and store it as "steamer_date"
batter_steamer_dates = hitters_df["date"].apply(find_steamer_date)
hitters_df["steamer_date"] = batter_steamer_dates

##### Merge

In [12]:
# Columns carried over from the Steamer table: ids, date, and the Steamer stats
batter_stats_fg_plus = ['mlbamid', 'steamerid', 'date'] + batter_stats_fg
# Inner-join each row to the Steamer row with the same player id and matched Steamer date
steamer_hitter_columns = steamer_hitters_df2[batter_stats_fg_plus]
hitters_merged_df = hitters_df.merge(
    steamer_hitter_columns,
    left_on=['batter', 'steamer_date'],
    right_on=['mlbamid', 'date'],
    how='inner',
)
# Drop any row with a missing model input or a missing Steamer stat
hitters_merged_df = hitters_merged_df.dropna(subset=batter_inputs + batter_stats_fg)

##### Impute

In [13]:
# Imputation features: the Steamer stats followed by the b_L and p_L hand columns
batter_stats_fg_imp = [*batter_stats_fg, 'b_L', 'p_L']

In [14]:
%%time
# Define the architecture of the neural network
layers = (30, 30, 30)  # Example architecture

batter_imputations_model_filename = f"batter_imputations_model_{todaysdate}.sav"
print(batter_imputations_model_filename)

# Create the MLPRegressor model
batter_imputation_model = MLPRegressor(hidden_layer_sizes=layers, activation='relu', random_state=10, learning_rate_init=0.0001, max_iter=10)

# Prepare data for training
X_train = hitters_merged_df[batter_stats_fg_imp]
y_train = hitters_merged_df[batter_inputs]

# Train the model
batter_imputation_model.fit(X_train, y_train)

# Update y_train with imputed values
hitters_merged_df.loc[:, batter_inputs] = batter_imputation_model.predict(X_train)

# Save model
pickle.dump(batter_imputation_model, open(os.path.join(model_path, batter_imputations_model_filename), 'wb'))

batter_imputations_model_20240609.sav
CPU times: total: 1.7 s
Wall time: 25.6 s


In [15]:
# # # Testing (would need to add back pa_b to dataset):
# # hitters_merged_df = hitters_merged_df.iloc[10000:].query('pa_p < 40')

# # Add hands to use in imputation
# batter_stats_fg_imp = batter_stats_fg + ['b_L', 'p_L']

# # Separate the features and target columns
# features = hitters_merged_df[batter_stats_fg_imp]
# target = hitters_merged_df[batter_inputs]

# # Create and fit the model
# batter_imputations_model = keras.Sequential([
#     keras.layers.Dense(30, activation='relu', input_shape=(len(batter_stats_fg_imp),)),
#     keras.layers.Dense(30, activation='relu'),
#     keras.layers.Dense(30, activation='relu'),
#     keras.layers.Dense(len(batter_inputs))  
#     ])

# # Compile the model
# batter_imputations_model.compile(loss='mean_squared_error', optimizer='adam')

# # Train the model
# batter_imputations_model.fit(features, target, epochs=10, batch_size=25)

# # Pickle
# with open(os.path.join(model_path, "batter_imputations_model_20231210.pkl"), "wb") as file:
#     pickle.dump(batter_imputations_model, file)

In [16]:
# # Use the trained model to make predictions
# hitters_merged_df[batter_inputs] = batter_imputations_model.predict(hitters_merged_df[batter_stats_fg_imp])

### Pitchers

##### Dataset

In [18]:
# Build the raw input table for the pitcher pipeline via create_pa_inputs.
# NOTE(review): this call has exactly the same arguments as the one that built hitters_df;
# if the helper is deterministic and expensive, consider computing it once and reusing the result.
pitchers_df = create_pa_inputs(
    park_factors,
    team_map,
    2015,
    2024,
    short=50,
    long=300,
    adjust=True,
)

In [19]:
# Keep only the last occurrence of each (gamePk, pitcher, b_L, p_L) combination.
# The frame is modified in place.
pitcher_dedupe_keys = ['gamePk', 'pitcher', 'b_L', 'p_L']
pitchers_df.drop_duplicates(subset=pitcher_dedupe_keys, keep='last', inplace=True)

In [20]:
# Narrow the frame to the identifier/context columns followed by the pitcher model inputs.
pitcher_keep_columns = ['pitcher', 'date', 'b_L', 'p_L', 'imp_p'] + pitcher_inputs
pitchers_df = pitchers_df[pitcher_keep_columns]

In [21]:
# Fit a StandardScaler on the pitcher inputs, then overwrite those columns with the scaled values
pitcher_stats_scaler = StandardScaler()
pitcher_stats_scaler.fit(pitchers_df[pitcher_inputs])
pitchers_df[pitcher_inputs] = pitcher_stats_scaler.transform(pitchers_df[pitcher_inputs])

# Persist the fitted scaler under a date-stamped file name in model_path
pitcher_stats_scaler_path = os.path.join(model_path, f"pitcher_stats_scaler_{todaysdate}.pkl")
with open(pitcher_stats_scaler_path, "wb") as scaler_file:
    pickle.dump(pitcher_stats_scaler, scaler_file)

##### Steamer

In [22]:
# Load the weekly Steamer pitcher log (file is read as ISO-8859-1)
steamer_pitchers_path = os.path.join(baseball_path, "A03. Steamer", "steamer_pitchers_weekly_log.csv")
steamer_pitchers_df = pd.read_csv(steamer_pitchers_path, encoding='iso-8859-1')
# Run it through the clean_steamer_pitchers helper
steamer_pitchers_df2 = clean_steamer_pitchers(steamer_pitchers_df)

In [23]:
# Fit a StandardScaler on the Steamer pitcher stats, then overwrite those columns with the scaled values
pitcher_stats_fg_scaler = StandardScaler()
pitcher_stats_fg_scaler.fit(steamer_pitchers_df2[pitcher_stats_fg])
steamer_pitchers_df2[pitcher_stats_fg] = pitcher_stats_fg_scaler.transform(steamer_pitchers_df2[pitcher_stats_fg])

# Persist the fitted scaler under a date-stamped file name in model_path
pitcher_stats_fg_scaler_path = os.path.join(model_path, f"pitcher_stats_fg_scaler_{todaysdate}.pkl")
with open(pitcher_stats_fg_scaler_path, "wb") as scaler_file:
    pickle.dump(pitcher_stats_fg_scaler, scaler_file)

##### Create compatible dates

In [24]:
# Collect the distinct dates present in the cleaned Steamer pitcher table.
# This rebinds the module-level steamer_dates list that find_steamer_date reads.
steamer_date_values = steamer_pitchers_df2['date'].unique()
steamer_dates = list(steamer_date_values)

# Define a function to find the largest value in a list of Steamer dates that is less than or equal to a given "date"
def find_steamer_date(date, dates=None):
    """Return the largest entry of ``dates`` that is <= ``date``.

    Parameters
    ----------
    date
        The value to look up; it is compared against each candidate with ``<=``.
    dates : iterable, optional
        Candidate Steamer dates. When omitted, the module-level ``steamer_dates``
        list is used (read at call time), so one-argument calls such as
        ``Series.apply(find_steamer_date)`` keep working. Passing it explicitly
        avoids depending on whichever ``steamer_dates`` was assigned most recently.

    Returns
    -------
    The matching entry of ``dates``, or ``None`` when no entry is <= ``date``
    (including when ``dates`` is empty).
    """
    if dates is None:
        dates = steamer_dates
    return max((candidate for candidate in dates if candidate <= date), default=None)

# Look up the matching Steamer date for every row's "date" and store it as "steamer_date"
pitcher_steamer_dates = pitchers_df["date"].apply(find_steamer_date)
pitchers_df["steamer_date"] = pitcher_steamer_dates

##### Merge

In [25]:
# Columns carried over from the Steamer table: ids, date, and the Steamer stats
# NOTE(review): the Steamer scaler is fit on pitcher_stats_fg, but this cell keeps
# pitcher_stats_fg2; confirm the two lists are meant to differ.
pitcher_stats_fg_plus = ['mlbamid', 'steamerid', 'date'] + pitcher_stats_fg2
# Inner-join each row to the Steamer row with the same player id and matched Steamer date
steamer_pitcher_columns = steamer_pitchers_df2[pitcher_stats_fg_plus]
pitchers_merged_df = pitchers_df.merge(
    steamer_pitcher_columns,
    left_on=['pitcher', 'steamer_date'],
    right_on=['mlbamid', 'date'],
    how='inner',
)
# Drop any row with a missing model input or a missing Steamer stat
pitchers_merged_df = pitchers_merged_df.dropna(subset=pitcher_inputs + pitcher_stats_fg2)

##### Impute

In [26]:
%%time
# Define the architecture of the neural network
layers = (30, 30, 30)

pitcher_imputations_model_filename = f"pitcher_imputations_model_{todaysdate}.sav"
print(pitcher_imputations_model_filename)

# Create the MLPRegressor model
pitcher_imputation_model = MLPRegressor(hidden_layer_sizes=layers, activation='relu', random_state=10, learning_rate_init=0.0001, max_iter=10)

# Prepare data for training
X_train = pitchers_merged_df[pitcher_stats_fg_imp]
y_train = pitchers_merged_df[pitcher_inputs]

# Train the model
pitcher_imputation_model.fit(X_train, y_train)

# Update y_train with imputed values
pitchers_merged_df.loc[:, pitcher_inputs] = pitcher_imputation_model.predict(X_train)

# Save model
pickle.dump(pitcher_imputation_model, open(os.path.join(model_path, pitcher_imputations_model_filename), 'wb'))

pitcher_imputations_model_20240609.sav
CPU times: total: 1.22 s
Wall time: 14.8 s


In [27]:
# Display the first row of the merged pitcher frame as a quick check of the columns
pitchers_merged_df.head(n=1)

Unnamed: 0,pitcher,date_x,b_L,p_L,imp_p,b1_p,b2_p,b3_p,hr_p,bb_p,hbp_p,so_p,fo_p,go_p,lo_p,po_p,estimated_woba_using_speedangle_p,to_left_p,to_middle_p,to_right_p,hard_hit_p,barrel_p,iso_p,slg_p,obp_p,woba_p,maxSpeed_p,maxSpin_p,b1_p_long,b2_p_long,b3_p_long,hr_p_long,bb_p_long,hbp_p_long,so_p_long,fo_p_long,go_p_long,lo_p_long,po_p_long,estimated_woba_using_speedangle_p_long,to_left_p_long,to_middle_p_long,to_right_p_long,hard_hit_p_long,barrel_p_long,iso_p_long,slg_p_long,obp_p_long,woba_p_long,maxSpeed_p_long,maxSpin_p_long,steamer_date,mlbamid,steamerid,date_y,H9,HR9,K9,BB9,GBrate,FBrate,LDrate,SIERA,reliability,IP_start,IP,relief_IP
0,452657,20150405,True,True,1,-0.103212,-0.286744,-0.012186,-0.442047,-0.672855,-0.130437,-0.398778,-0.392039,0.106572,-0.132655,-0.116328,-0.335421,-0.544868,-0.123354,0.535352,-0.496083,-0.49003,-0.413301,-0.450207,-0.643997,-0.576587,-0.64155,-0.447487,-0.220119,-0.424015,-0.068831,-0.540779,-0.832561,-0.049541,-0.614406,-0.486469,-0.007169,-0.183065,-0.146227,-0.47416,-0.574572,-0.172521,0.557646,-0.685939,-0.621043,-0.64064,-0.716733,-0.922776,-0.899126,-0.712021,-0.61496,20140928,452657.0,4930,20140928,-1.456957,-1.943896,1.0015,-1.335449,0.414186,-0.409941,-0.054406,-1.30813,0.84896,6.301224,1.184,0.0


In [None]:
# # # Testing:
# # hitters_merged_df = hitters_merged_df.iloc[10000:].query('pa_b < 40')

# # Add hands to use in imputation
# pitcher_stats_fg_imp = pitcher_stats_fg + ['b_L', 'p_L']

# # Separate the features and target columns
# features = pitchers_merged_df[pitcher_stats_fg_imp]
# target = pitchers_merged_df[pitcher_inputs]

# # Create and fit the model
# pitcher_imputations_model = keras.Sequential([
#     keras.layers.Dense(30, activation='relu', input_shape=(len(pitcher_stats_fg_imp),)),
#     keras.layers.Dense(30, activation='relu'),
#     keras.layers.Dense(30, activation='relu'),
#     keras.layers.Dense(len(pitcher_inputs))  
#     ])

# # Compile the model
# pitcher_imputations_model.compile(loss='mean_squared_error', optimizer='adam')

# # Train the model
# pitcher_imputations_model.fit(features, target, epochs=10, batch_size=35)

# # Pickle
# with open(os.path.join(model_path, "pitcher_imputations_model_20231210.pkl"), "wb") as file:
#     pickle.dump(pitcher_imputations_model, file)

In [None]:
# # Use the trained model to make predictions
# pitchers_merged_df[pitcher_inputs] = pitcher_imputations_model.predict(pitchers_merged_df[pitcher_stats_fg_imp])