# M01. Impute Inputs
- Normalizes model inputs
- Normalizes Steamer projections
- Uses Steamer projections to impute model inputs

In [1]:
%run "U1. Imports.ipynb"
%run "U2. Utilities.ipynb"
%run "U3. Classes.ipynb"

# Root folder of the local MLB data store; joined with sub-folder names when the Steamer CSV logs are read below.
# NOTE(review): absolute, machine-specific Windows path — consider a shared config so the notebook runs elsewhere.
baseball_path = r'C:\Users\james\Documents\MLB\Database'

# SQLite database file and the engine built from its 'sqlite:///' URL.
# NOTE(review): create_engine is presumably SQLAlchemy's, brought in by the %run'd imports notebook — confirm.
db_path = r'C:\Users\james\Documents\MLB\Database\MLBDB.db'
engine = create_engine(f'sqlite:///{db_path}')




In [2]:
%run "A02. MLB API.ipynb"
%run "A03. Steamer.ipynb"

### Batters

##### Dataset

In [4]:
# Build the raw dataset for the batter pass via create_pa_inputs (defined outside this notebook).
# NOTE(review): 2015 and 2023 are presumably the first and last seasons to include — confirm against create_pa_inputs.
hitters_df = create_pa_inputs(2015, 2023)

In [5]:
# Keep one row per (gamePk, batter, b_L, p_L) combination, retaining the last occurrence.
# Reassign rather than using inplace=True so the frame returned by create_pa_inputs is never mutated in place.
hitters_df = hitters_df.drop_duplicates(subset=['gamePk', 'batter', 'b_L', 'p_L'], keep='last')

In [6]:
# Narrow the frame to the identifier/handedness/imputation-flag columns followed by the model input columns.
hitters_df = hitters_df[['batter', 'date', 'b_L', 'p_L', 'imp_b', *batter_inputs]]

In [10]:
from sklearn.preprocessing import MinMaxScaler

# Min-max rescale every batter input column into [-1, 1]; fit and transform as two explicit steps.
batter_stats_scaler = MinMaxScaler(feature_range=(-1, 1))
batter_stats_scaler.fit(hitters_df[batter_inputs])
hitters_df[batter_inputs] = batter_stats_scaler.transform(hitters_df[batter_inputs])

# Persist the fitted scaler so the same scaling can be reapplied later.
batter_scaler_path = os.path.join(model_path, "batter_stats_normal_20240128.pkl")
with open(batter_scaler_path, "wb") as pkl_file:
    pickle.dump(batter_stats_scaler, pkl_file)


##### Steamer

In [11]:
# Load the weekly Steamer hitter log from the "A03. Steamer" folder (file is ISO-8859-1 encoded).
steamer_hitters_csv = os.path.join(baseball_path, "A03. Steamer", "steamer_hitters_weekly_log.csv")
steamer_hitters_df = pd.read_csv(steamer_hitters_csv, encoding='iso-8859-1')
# Run the raw log through the project's hitter cleaning helper.
steamer_hitters_df2 = clean_steamer_hitters(steamer_hitters_df)

In [12]:
# Min-max rescale the Steamer hitter columns into [-1, 1] (this is a MinMaxScaler, not a StandardScaler).
batter_stats_fg_scaler = MinMaxScaler(feature_range=(-1, 1))
batter_stats_fg_scaler.fit(steamer_hitters_df2[batter_stats_fg])
steamer_hitters_df2[batter_stats_fg] = batter_stats_fg_scaler.transform(steamer_hitters_df2[batter_stats_fg])

# Persist the fitted MinMaxScaler alongside the other model artifacts.
batter_fg_scaler_path = os.path.join(model_path, "batter_stats_fg_normal_20240128.pkl")
with open(batter_fg_scaler_path, "wb") as pkl_file:
    pickle.dump(batter_stats_fg_scaler, pkl_file)

##### Create compatible dates

In [13]:
# Collect the distinct dates present in the cleaned Steamer hitter log; each row of hitters_df is
# later matched to the greatest of these that is <= its own date.
unique_hitter_steamer_dates = steamer_hitters_df2['date'].unique()
steamer_dates = list(unique_hitter_steamer_dates)

# Find the largest value in "steamer_dates" (or an explicitly supplied list) that is <= a given "date"
def find_steamer_date(date, dates=None):
    """Return the greatest candidate date that is less than or equal to ``date``.

    Parameters
    ----------
    date : comparable
        Value to look up; must support ``<=`` against the candidate dates.
    dates : iterable, optional
        Candidate dates to search. When omitted, the notebook-level
        ``steamer_dates`` list is used, so ``Series.apply(find_steamer_date)``
        keeps working unchanged.

    Returns
    -------
    The largest candidate ``<= date``, or ``None`` if no candidate qualifies
    (including when the candidate collection is empty).
    """
    # Passing ``dates`` explicitly avoids relying on whichever list the global currently points to.
    candidates = steamer_dates if dates is None else dates
    return max((d for d in candidates if d <= date), default=None)

# Tag every hitter row with the Steamer date chosen by find_steamer_date for that row's own date.
hitter_game_dates = hitters_df["date"]
hitters_df["steamer_date"] = hitter_game_dates.apply(find_steamer_date)

##### Merge

In [14]:
# Steamer columns carried into the merge: the two id columns, the log date, and the Steamer stat columns.
batter_stats_fg_plus = ['mlbamid', 'steamerid', 'date', *batter_stats_fg]
# Inner-join each hitter row to the Steamer row with the same player id and the matched steamer_date.
hitters_merged_df = pd.merge(
    hitters_df,
    steamer_hitters_df2[batter_stats_fg_plus],
    how='inner',
    left_on=['batter', 'steamer_date'],
    right_on=['mlbamid', 'date'],
)
# Discard any row with a missing model input or a missing Steamer stat.
hitters_merged_df = hitters_merged_df.dropna(subset=[*batter_inputs, *batter_stats_fg])

##### Impute

In [15]:
# # Testing (would need to add back pa_b to dataset):
# hitters_merged_df = hitters_merged_df.iloc[10000:].query('pa_p < 40')

# Network inputs: the Steamer stat columns plus the two hand columns.
batter_stats_fg_imp = [*batter_stats_fg, 'b_L', 'p_L']

# Features come from the Steamer side, targets are the scaled model inputs.
features = hitters_merged_df[batter_stats_fg_imp]
target = hitters_merged_df[batter_inputs]

# Three 30-unit ReLU hidden layers, then a linear output layer with one unit per target column.
# NOTE(review): no random seed is set in this notebook (unless a %run'd notebook sets one), so weights vary run to run.
batter_imputations_model = keras.Sequential()
batter_imputations_model.add(
    keras.layers.Dense(30, activation='relu', input_shape=(len(batter_stats_fg_imp),))
)
batter_imputations_model.add(keras.layers.Dense(30, activation='relu'))
batter_imputations_model.add(keras.layers.Dense(30, activation='relu'))
batter_imputations_model.add(keras.layers.Dense(len(batter_inputs)))

# Mean-squared-error regression optimized with Adam.
batter_imputations_model.compile(optimizer='adam', loss='mean_squared_error')

# Ten passes over the data in mini-batches of 25 rows.
batter_imputations_model.fit(x=features, y=target, batch_size=25, epochs=10)

# Serialize the trained network next to the scalers.
# NOTE(review): pickling a Keras model depends on the installed Keras/TensorFlow version; model.save() is the
# documented route, but switching would also require changing whatever loads this file — confirm before changing.
batter_model_path = os.path.join(model_path, "batter_imputations_model_20240128.pkl")
with open(batter_model_path, "wb") as pkl_file:
    pickle.dump(batter_imputations_model, pkl_file)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [16]:
# # Use the trained model to make predictions
# hitters_merged_df[batter_inputs] = batter_imputations_model.predict(hitters_merged_df[batter_stats_fg_imp])

### Pitchers

##### Dataset

In [18]:
# Rebuild the raw dataset for the pitcher pass; this is the same create_pa_inputs call used in the Batters section.
# NOTE(review): 2015 and 2023 are presumably the first and last seasons to include — confirm against create_pa_inputs.
pitchers_df = create_pa_inputs(2015, 2023)

In [19]:
# Keep one row per (gamePk, pitcher, b_L, p_L) combination, retaining the last occurrence.
# Reassign rather than using inplace=True so the frame returned by create_pa_inputs is never mutated in place.
pitchers_df = pitchers_df.drop_duplicates(subset=['gamePk', 'pitcher', 'b_L', 'p_L'], keep='last')

In [20]:
# Narrow the frame to the identifier/handedness/imputation-flag columns followed by the model input columns.
pitchers_df = pitchers_df[['pitcher', 'date', 'b_L', 'p_L', 'imp_p', *pitcher_inputs]]

In [21]:
# Min-max rescale every pitcher input column into [-1, 1] (this is a MinMaxScaler, not a StandardScaler).
pitcher_stats_scaler = MinMaxScaler(feature_range=(-1, 1))
pitcher_stats_scaler.fit(pitchers_df[pitcher_inputs])
pitchers_df[pitcher_inputs] = pitcher_stats_scaler.transform(pitchers_df[pitcher_inputs])

# Persist the fitted MinMaxScaler so the same scaling can be reapplied later.
pitcher_scaler_path = os.path.join(model_path, "pitcher_stats_normal_20240128.pkl")
with open(pitcher_scaler_path, "wb") as pkl_file:
    pickle.dump(pitcher_stats_scaler, pkl_file)

##### Steamer

In [22]:
# Load the weekly Steamer pitcher log from the "A03. Steamer" folder (file is ISO-8859-1 encoded).
steamer_pitchers_csv = os.path.join(baseball_path, "A03. Steamer", "steamer_pitchers_weekly_log.csv")
steamer_pitchers_df = pd.read_csv(steamer_pitchers_csv, encoding='iso-8859-1')
# Run the raw log through the project's pitcher cleaning helper.
steamer_pitchers_df2 = clean_steamer_pitchers(steamer_pitchers_df)

In [23]:
# Min-max rescale the Steamer pitcher columns into [-1, 1] (this is a MinMaxScaler, not a StandardScaler).
pitcher_stats_fg_scaler = MinMaxScaler(feature_range=(-1, 1))
pitcher_stats_fg_scaler.fit(steamer_pitchers_df2[pitcher_stats_fg])
steamer_pitchers_df2[pitcher_stats_fg] = pitcher_stats_fg_scaler.transform(steamer_pitchers_df2[pitcher_stats_fg])

# Persist the fitted MinMaxScaler alongside the other model artifacts.
pitcher_fg_scaler_path = os.path.join(model_path, "pitcher_stats_fg_normal_20240128.pkl")
with open(pitcher_fg_scaler_path, "wb") as pkl_file:
    pickle.dump(pitcher_stats_fg_scaler, pkl_file)

##### Create compatible dates

In [24]:
# Collect the distinct dates present in the cleaned Steamer pitcher log; each row of pitchers_df is
# later matched to the greatest of these that is <= its own date. This rebinds the steamer_dates global.
unique_pitcher_steamer_dates = steamer_pitchers_df2['date'].unique()
steamer_dates = list(unique_pitcher_steamer_dates)

# Find the largest value in "steamer_dates" (or an explicitly supplied list) that is <= a given "date"
def find_steamer_date(date, dates=None):
    """Return the greatest candidate date that is less than or equal to ``date``.

    Parameters
    ----------
    date : comparable
        Value to look up; must support ``<=`` against the candidate dates.
    dates : iterable, optional
        Candidate dates to search. When omitted, the notebook-level
        ``steamer_dates`` list is used, so ``Series.apply(find_steamer_date)``
        keeps working unchanged.

    Returns
    -------
    The largest candidate ``<= date``, or ``None`` if no candidate qualifies
    (including when the candidate collection is empty).
    """
    # Passing ``dates`` explicitly avoids relying on whichever list the global currently points to.
    candidates = steamer_dates if dates is None else dates
    return max((d for d in candidates if d <= date), default=None)

# Tag every pitcher row with the Steamer date chosen by find_steamer_date for that row's own date.
pitcher_game_dates = pitchers_df["date"]
pitchers_df["steamer_date"] = pitcher_game_dates.apply(find_steamer_date)

##### Merge

In [25]:
# Steamer columns carried into the merge: the two id columns, the log date, and the Steamer stat columns.
# NOTE(review): this cell uses pitcher_stats_fg2, while the scaler and imputation cells use pitcher_stats_fg —
# confirm the two lists are meant to differ.
pitcher_stats_fg_plus = ['mlbamid', 'steamerid', 'date', *pitcher_stats_fg2]
# Inner-join each pitcher row to the Steamer row with the same player id and the matched steamer_date.
pitchers_merged_df = pd.merge(
    pitchers_df,
    steamer_pitchers_df2[pitcher_stats_fg_plus],
    how='inner',
    left_on=['pitcher', 'steamer_date'],
    right_on=['mlbamid', 'date'],
)
# Discard any row with a missing model input or a missing Steamer stat.
pitchers_merged_df = pitchers_merged_df.dropna(subset=[*pitcher_inputs, *pitcher_stats_fg2])

##### Impute

In [26]:
# # Testing:
# hitters_merged_df = hitters_merged_df.iloc[10000:].query('pa_b < 40')

# Network inputs: the Steamer stat columns plus the two hand columns.
pitcher_stats_fg_imp = [*pitcher_stats_fg, 'b_L', 'p_L']

# Features come from the Steamer side, targets are the scaled model inputs.
features = pitchers_merged_df[pitcher_stats_fg_imp]
target = pitchers_merged_df[pitcher_inputs]

# Three 30-unit ReLU hidden layers, then a linear output layer with one unit per target column.
# NOTE(review): no random seed is set in this notebook (unless a %run'd notebook sets one), so weights vary run to run.
pitcher_imputations_model = keras.Sequential()
pitcher_imputations_model.add(
    keras.layers.Dense(30, activation='relu', input_shape=(len(pitcher_stats_fg_imp),))
)
pitcher_imputations_model.add(keras.layers.Dense(30, activation='relu'))
pitcher_imputations_model.add(keras.layers.Dense(30, activation='relu'))
pitcher_imputations_model.add(keras.layers.Dense(len(pitcher_inputs)))

# Mean-squared-error regression optimized with Adam.
pitcher_imputations_model.compile(optimizer='adam', loss='mean_squared_error')

# Ten passes over the data in mini-batches of 35 rows.
pitcher_imputations_model.fit(x=features, y=target, batch_size=35, epochs=10)

# Serialize the trained network next to the scalers.
# NOTE(review): pickling a Keras model depends on the installed Keras/TensorFlow version; model.save() is the
# documented route, but switching would also require changing whatever loads this file — confirm before changing.
pitcher_model_path = os.path.join(model_path, "pitcher_imputations_model_20240128.pkl")
with open(pitcher_model_path, "wb") as pkl_file:
    pickle.dump(pitcher_imputations_model, pkl_file)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [27]:
# Summary statistics of the scaled pitcher inputs after the merge (values should sit within [-1, 1]).
pitchers_merged_df.loc[:, pitcher_inputs].describe()

Unnamed: 0,b1_p,b2_p,b3_p,hr_p,bb_p,hbp_p,so_p,fo_p,go_p,lo_p,po_p,estimated_woba_using_speedangle_p,to_left_p,to_middle_p,to_right_p,hard_hit_p,barrel_p,iso_p,slg_p,obp_p,woba_p,maxSpeed_p,maxSpin_p,b1_p_long,b2_p_long,b3_p_long,hr_p_long,bb_p_long,hbp_p_long,so_p_long,fo_p_long,go_p_long,lo_p_long,po_p_long,estimated_woba_using_speedangle_p_long,to_left_p_long,to_middle_p_long,to_right_p_long,hard_hit_p_long,barrel_p_long,iso_p_long,slg_p_long,obp_p_long,woba_p_long,maxSpeed_p_long,maxSpin_p_long
count,330184.0,330184.0,330184.0,330184.0,330184.0,330184.0,330184.0,330184.0,330184.0,330184.0,330184.0,330184.0,330184.0,330184.0,330184.0,330184.0,330184.0,330184.0,330184.0,330184.0,330184.0,330184.0,330184.0,330184.0,330184.0,330184.0,330184.0,330184.0,330184.0,330184.0,330184.0,330184.0,330184.0,330184.0,330184.0,330184.0,330184.0,330184.0,330184.0,330184.0,330184.0,330184.0,330184.0,330184.0,330184.0,330184.0
mean,-0.747596,-0.92846,-0.99147,-0.954764,-0.839251,-0.978003,-0.609699,-0.764772,-0.538981,-0.88876,-0.907515,-0.766342,-0.639556,-0.494802,-0.587939,-0.543131,-0.915862,-0.920252,-0.846721,-0.584374,-0.763551,-0.020739,0.444617,-0.74675,-0.928494,-0.991247,-0.955752,-0.840589,-0.978402,-0.610567,-0.766822,-0.533682,-0.888119,-0.908202,-0.769311,-0.644298,-0.499565,-0.589345,-0.552133,-0.919423,-0.921355,-0.847695,-0.585707,-0.764603,-0.012781,0.525551
std,0.117134,0.061908,0.025993,0.04828,0.101579,0.037828,0.156708,0.127032,0.187506,0.08064,0.07998,0.071731,0.165615,0.16814,0.180104,0.17093,0.070612,0.059545,0.067305,0.116702,0.073327,0.092816,0.197758,0.093795,0.048833,0.020492,0.037706,0.083275,0.029492,0.131281,0.103403,0.159819,0.061619,0.064109,0.058421,0.137508,0.133868,0.151107,0.139179,0.052952,0.04677,0.053111,0.092978,0.058006,0.093698,0.223555
min,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
25%,-0.82,-0.966337,-1.0,-1.0,-0.898993,-1.0,-0.710706,-0.84,-0.68,-0.96,-0.96,-0.80895,-0.76,-0.6,-0.72,-0.64,-0.96,-0.959729,-0.889264,-0.655807,-0.808853,-0.03186,0.347233,-0.792157,-0.951022,-1.0,-0.974763,-0.888341,-1.0,-0.687806,-0.823789,-0.633333,-0.916667,-0.944444,-0.795952,-0.72973,-0.56,-0.686667,-0.625,-0.948276,-0.944931,-0.873016,-0.630297,-0.792035,-0.023638,0.399091
50%,-0.751787,-0.933003,-1.0,-0.9675,-0.853452,-1.0,-0.618849,-0.76,-0.56,-0.88,-0.92,-0.767464,-0.64,-0.48,-0.6,-0.56,-0.92,-0.928078,-0.852514,-0.587449,-0.767119,-0.012333,0.433307,-0.748932,-0.931683,-0.997159,-0.958763,-0.848597,-0.98556,-0.6169,-0.772894,-0.544218,-0.890625,-0.913333,-0.766895,-0.656863,-0.493333,-0.602041,-0.546667,-0.921569,-0.924341,-0.849926,-0.58863,-0.76632,-0.004111,0.513499
75%,-0.680804,-0.897175,-1.0,-0.93343,-0.782887,-0.96,-0.519536,-0.68,-0.44,-0.84,-0.88,-0.724172,-0.529412,-0.4,-0.48,-0.44,-0.88,-0.89029,-0.811314,-0.517926,-0.723027,0.007194,0.543972,-0.706692,-0.910798,-0.987581,-0.941782,-0.80285,-0.969231,-0.541605,-0.718447,-0.446667,-0.862745,-0.88,-0.738899,-0.56,-0.428571,-0.493333,-0.47619,-0.893333,-0.902951,-0.826362,-0.54645,-0.740596,0.015416,0.648222
max,0.978022,1.0,1.0,0.925926,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.980573,1.0,1.0,1.0,1.0,1.0,0.925926,0.925926,1.0,0.925926,1.0,1.0,0.978022,1.0,1.0,0.925926,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.980573,1.0,1.0,1.0,1.0,1.0,0.925926,0.925926,1.0,0.925926,1.0,1.0


In [28]:
# # Use the trained model to make predictions
# pitchers_merged_df[pitcher_inputs] = pitcher_imputations_model.predict(pitchers_merged_df[pitcher_stats_fg_imp])

In [29]:
# Same summary as the earlier describe cell; the prediction overwrite between them is commented out,
# so the statistics are unchanged.
pitchers_merged_df.loc[:, pitcher_inputs].describe()

Unnamed: 0,b1_p,b2_p,b3_p,hr_p,bb_p,hbp_p,so_p,fo_p,go_p,lo_p,po_p,estimated_woba_using_speedangle_p,to_left_p,to_middle_p,to_right_p,hard_hit_p,barrel_p,iso_p,slg_p,obp_p,woba_p,maxSpeed_p,maxSpin_p,b1_p_long,b2_p_long,b3_p_long,hr_p_long,bb_p_long,hbp_p_long,so_p_long,fo_p_long,go_p_long,lo_p_long,po_p_long,estimated_woba_using_speedangle_p_long,to_left_p_long,to_middle_p_long,to_right_p_long,hard_hit_p_long,barrel_p_long,iso_p_long,slg_p_long,obp_p_long,woba_p_long,maxSpeed_p_long,maxSpin_p_long
count,330184.0,330184.0,330184.0,330184.0,330184.0,330184.0,330184.0,330184.0,330184.0,330184.0,330184.0,330184.0,330184.0,330184.0,330184.0,330184.0,330184.0,330184.0,330184.0,330184.0,330184.0,330184.0,330184.0,330184.0,330184.0,330184.0,330184.0,330184.0,330184.0,330184.0,330184.0,330184.0,330184.0,330184.0,330184.0,330184.0,330184.0,330184.0,330184.0,330184.0,330184.0,330184.0,330184.0,330184.0,330184.0,330184.0
mean,-0.747596,-0.92846,-0.99147,-0.954764,-0.839251,-0.978003,-0.609699,-0.764772,-0.538981,-0.88876,-0.907515,-0.766342,-0.639556,-0.494802,-0.587939,-0.543131,-0.915862,-0.920252,-0.846721,-0.584374,-0.763551,-0.020739,0.444617,-0.74675,-0.928494,-0.991247,-0.955752,-0.840589,-0.978402,-0.610567,-0.766822,-0.533682,-0.888119,-0.908202,-0.769311,-0.644298,-0.499565,-0.589345,-0.552133,-0.919423,-0.921355,-0.847695,-0.585707,-0.764603,-0.012781,0.525551
std,0.117134,0.061908,0.025993,0.04828,0.101579,0.037828,0.156708,0.127032,0.187506,0.08064,0.07998,0.071731,0.165615,0.16814,0.180104,0.17093,0.070612,0.059545,0.067305,0.116702,0.073327,0.092816,0.197758,0.093795,0.048833,0.020492,0.037706,0.083275,0.029492,0.131281,0.103403,0.159819,0.061619,0.064109,0.058421,0.137508,0.133868,0.151107,0.139179,0.052952,0.04677,0.053111,0.092978,0.058006,0.093698,0.223555
min,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
25%,-0.82,-0.966337,-1.0,-1.0,-0.898993,-1.0,-0.710706,-0.84,-0.68,-0.96,-0.96,-0.80895,-0.76,-0.6,-0.72,-0.64,-0.96,-0.959729,-0.889264,-0.655807,-0.808853,-0.03186,0.347233,-0.792157,-0.951022,-1.0,-0.974763,-0.888341,-1.0,-0.687806,-0.823789,-0.633333,-0.916667,-0.944444,-0.795952,-0.72973,-0.56,-0.686667,-0.625,-0.948276,-0.944931,-0.873016,-0.630297,-0.792035,-0.023638,0.399091
50%,-0.751787,-0.933003,-1.0,-0.9675,-0.853452,-1.0,-0.618849,-0.76,-0.56,-0.88,-0.92,-0.767464,-0.64,-0.48,-0.6,-0.56,-0.92,-0.928078,-0.852514,-0.587449,-0.767119,-0.012333,0.433307,-0.748932,-0.931683,-0.997159,-0.958763,-0.848597,-0.98556,-0.6169,-0.772894,-0.544218,-0.890625,-0.913333,-0.766895,-0.656863,-0.493333,-0.602041,-0.546667,-0.921569,-0.924341,-0.849926,-0.58863,-0.76632,-0.004111,0.513499
75%,-0.680804,-0.897175,-1.0,-0.93343,-0.782887,-0.96,-0.519536,-0.68,-0.44,-0.84,-0.88,-0.724172,-0.529412,-0.4,-0.48,-0.44,-0.88,-0.89029,-0.811314,-0.517926,-0.723027,0.007194,0.543972,-0.706692,-0.910798,-0.987581,-0.941782,-0.80285,-0.969231,-0.541605,-0.718447,-0.446667,-0.862745,-0.88,-0.738899,-0.56,-0.428571,-0.493333,-0.47619,-0.893333,-0.902951,-0.826362,-0.54645,-0.740596,0.015416,0.648222
max,0.978022,1.0,1.0,0.925926,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.980573,1.0,1.0,1.0,1.0,1.0,0.925926,0.925926,1.0,0.925926,1.0,1.0,0.978022,1.0,1.0,0.925926,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.980573,1.0,1.0,1.0,1.0,1.0,0.925926,0.925926,1.0,0.925926,1.0,1.0


In [30]:
# Compare the distributions of hr_p and hr_p_long across all merged pitcher rows.
pitchers_merged_df.loc[:, ['hr_p', 'hr_p_long']].describe()

Unnamed: 0,hr_p,hr_p_long
count,330184.0,330184.0
mean,-0.954764,-0.955752
std,0.04828,0.037706
min,-1.0,-1.0
25%,-1.0,-0.974763
50%,-0.9675,-0.958763
75%,-0.93343,-0.941782
max,0.925926,0.925926


In [31]:
# Same hr_p / hr_p_long comparison, restricted to rows where imp_p == 0.
(
    pitchers_merged_df
    .query('imp_p == 0')
    [['hr_p', 'hr_p_long']]
    .describe()
)

Unnamed: 0,hr_p,hr_p_long
count,292672.0,292672.0
mean,-0.954592,-0.955707
std,0.039819,0.023665
min,-1.0,-1.0
25%,-1.0,-0.971718
50%,-0.966452,-0.95778
75%,-0.93343,-0.942053
max,-0.640339,-0.752728


In [32]:
# Work on an independent deep copy so pitchers_merged_df itself stays untouched.
pitchers_merged_df2 = pitchers_merged_df.copy(deep=True)

In [33]:
# Run the trained network on the Steamer-side features, then overwrite the input columns of the copy
# with the model's predictions.
imputed_pitcher_inputs = pitcher_imputations_model.predict(pitchers_merged_df2[pitcher_stats_fg_imp])
pitchers_merged_df2[pitcher_inputs] = imputed_pitcher_inputs

