# M02. Stat Imputations
- This imputes model inputs using Steamer projections
- Type: Model
- Run Frequency: Irregular
- Sources:
    - MLB API
    - Steamer
- Created: 1/28/2024
- Updated: 12/17/2024

To do:
- Note: Imputations portion has largely been phased out. Just giving small-sample batters 0s and including Steamer inputs for all players. May want to clean up or remove this going forward.
- Add better evaluations?

### Imports

In [None]:
%run "U1. Imports.ipynb"
%run "U2. Functions.ipynb"
%run "U3. Classes.ipynb"
%run "U4. Datasets.ipynb"
%run "U5. Models.ipynb"

Create directory

In [None]:
# Make sure today's output folder for this notebook exists before anything is saved into it
_m02_output_dir = os.path.join(model_path, "M02. Stat Imputations", todaysdate)
os.makedirs(_m02_output_dir, exist_ok=True)

### Data

##### Plate Appearances

Hitters

In [None]:
# Load "Final Dataset.csv" from the baseball data folder
_final_dataset_csv = os.path.join(baseball_path, "Final Dataset.csv")
hitters_df = pd.read_csv(_final_dataset_csv)

Pitchers

In [None]:
# Pitchers start from an independent copy of the same dataset, taken before any
# hitter-specific de-duplication or filtering is applied to hitters_df below.
pitchers_df = hitters_df.copy()

##### Steamer

Hitters

In [None]:
# Weekly log of Steamer hitter projections (file is read as ISO-8859-1, not UTF-8)
_steamer_hitters_csv = os.path.join(baseball_path, "A03. Steamer", "steamer_hitters_weekly_log.csv")
steamer_hitters_df = pd.read_csv(_steamer_hitters_csv, encoding='iso-8859-1')

Pitchers

In [None]:
# Weekly log of Steamer pitcher projections (file is read as ISO-8859-1, not UTF-8)
_steamer_pitchers_csv = os.path.join(baseball_path, "A03. Steamer", "steamer_pitchers_weekly_log.csv")
steamer_pitchers_df = pd.read_csv(_steamer_pitchers_csv, encoding='iso-8859-1')

### Clean

##### Plate Appearances

Hitters

Keep one instance of each batter in each game vs. each side

In [None]:
# One row per (game, batter, batter side, pitcher side); the last occurrence wins
hitters_df = hitters_df.drop_duplicates(subset=['gamePk', 'batter', 'b_L', 'p_L'], keep='last')

Keep those with sufficient sample size

In [None]:
# Keep rows with more than 40 plate appearances (pa_b) and no infinite values in any batter input
_enough_pa_b = hitters_df['pa_b'] > 40
_has_inf_b = hitters_df[batter_inputs].isin([np.inf, -np.inf]).any(axis=1)
hitters_df = hitters_df[_enough_pa_b & ~_has_inf_b]

Keep relevant columns

In [None]:
# Identifier / context columns followed by the model input columns
_hitter_keep_cols = ['batter', 'date', 'b_L', 'p_L', 'imp_b'] + batter_inputs
hitters_df = hitters_df[_hitter_keep_cols]

Pitchers

Keep one instance of each pitcher in each game vs. each side

In [None]:
# One row per (game, pitcher, batter side, pitcher side); the last occurrence wins
pitchers_df = pitchers_df.drop_duplicates(subset=['gamePk', 'pitcher', 'b_L', 'p_L'], keep='last')

Keep those with sufficient sample size

In [None]:
# Keep rows with more than 40 plate appearances (pa_p) and no infinite values in any pitcher input
_enough_pa_p = pitchers_df['pa_p'] > 40
_has_inf_p = pitchers_df[pitcher_inputs].isin([np.inf, -np.inf]).any(axis=1)
pitchers_df = pitchers_df[_enough_pa_p & ~_has_inf_p]

Keep relevant columns

In [None]:
# Identifier / context columns followed by the model input columns
_pitcher_keep_cols = ['pitcher', 'date', 'b_L', 'p_L', 'imp_p'] + pitcher_inputs
pitchers_df = pitchers_df[_pitcher_keep_cols]

##### Steamer

Hitters

In [None]:
# Run the shared Steamer hitter cleaner, then discard rows missing any projected stat
steamer_hitters_df2 = (
    clean_steamer_hitters(steamer_hitters_df)
    .dropna(subset=batter_stats_fg)
)

Pitchers

In [None]:
# Run the shared Steamer pitcher cleaner, then discard rows missing any projected stat
steamer_pitchers_df2 = (
    clean_steamer_pitchers(steamer_pitchers_df)
    .dropna(subset=pitcher_stats_fg)
)

### Merge

Hitters

Convert to datetime

In [None]:
# Parse the YYYYMMDD 'date' column on both sides of the upcoming merge
for _frame in (hitters_df, steamer_hitters_df2):
    _frame['date'] = pd.to_datetime(_frame['date'], format='%Y%m%d')

Convert to integer

In [None]:
# Player ids must share an integer dtype so they can be used as the merge_asof "by" keys
hitters_df['batter'] = hitters_df['batter'].astype(int)

# Drop Steamer rows without an MLBAM id first: astype(int) raises on NaN.
# This mirrors the guard already applied to the Steamer pitcher frame.
steamer_hitters_df2 = steamer_hitters_df2.dropna(subset=['mlbamid'])
steamer_hitters_df2['mlbamid'] = steamer_hitters_df2['mlbamid'].astype(int)

Sort

In [None]:
# Order each frame by player id, then date.
# NOTE(review): the merge cell below re-sorts both frames by 'date' alone before calling
# merge_asof, so this ordering is not what merge_asof receives.
hitters_df = hitters_df.sort_values(['batter', 'date'])
steamer_hitters_df2 = steamer_hitters_df2.sort_values(['mlbamid', 'date'])

Merge

In [None]:
# As-of join: attach to every hitter row a Steamer projection row for the same player,
# matched on date (merge_asof's default direction is used). Both sides must be sorted by
# the "on" key, hence the explicit sort by date.
_steamer_hitter_cols = ['mlbamid', 'steamerid', 'date'] + batter_stats_fg
_hitters_by_date = hitters_df.sort_values('date')
_steamer_hitters_by_date = steamer_hitters_df2[_steamer_hitter_cols].sort_values('date')

hitters_df = pd.merge_asof(
    _hitters_by_date,
    _steamer_hitters_by_date,
    left_on='date',
    right_on='date',
    left_by='batter',
    right_by='mlbamid',
)

In [None]:
# Only keep rows dated 2016 or later
_hitter_year = hitters_df['date'].dt.year
hitters_df = hitters_df[_hitter_year >= 2016]

Pitchers

Convert to datetime

In [None]:
# Parse the YYYYMMDD 'date' column on both sides of the upcoming merge
for _frame in (pitchers_df, steamer_pitchers_df2):
    _frame['date'] = pd.to_datetime(_frame['date'], format='%Y%m%d')

Convert to integer

In [None]:
# Player ids must share an integer dtype so they can be used as the merge_asof "by" keys
pitchers_df['pitcher'] = pitchers_df['pitcher'].astype(int)

# Steamer rows without an MLBAM id cannot be cast to int, so they are removed first
steamer_pitchers_df2 = steamer_pitchers_df2.dropna(subset=['mlbamid'])
steamer_pitchers_df2['mlbamid'] = steamer_pitchers_df2['mlbamid'].astype(int)

Sort

In [None]:
# Order each frame by player id, then date.
# NOTE(review): the merge cell below re-sorts both frames by 'date' alone before calling
# merge_asof, so this ordering is not what merge_asof receives.
pitchers_df = pitchers_df.sort_values(['pitcher', 'date'])
steamer_pitchers_df2 = steamer_pitchers_df2.sort_values(['mlbamid', 'date'])

Merge

In [None]:
# As-of join: attach to every pitcher row a Steamer projection row for the same player,
# matched on date (merge_asof's default direction is used). Both sides must be sorted by
# the "on" key, hence the explicit sort by date.
_steamer_pitcher_cols = ['mlbamid', 'steamerid', 'date'] + pitcher_stats_fg
_pitchers_by_date = pitchers_df.sort_values('date')
_steamer_pitchers_by_date = steamer_pitchers_df2[_steamer_pitcher_cols].sort_values('date')

pitchers_df = pd.merge_asof(
    _pitchers_by_date,
    _steamer_pitchers_by_date,
    left_on='date',
    right_on='date',
    left_by='pitcher',
    right_by='mlbamid',
)

In [None]:
# Only keep rows dated 2016 or later
_pitcher_year = pitchers_df['date'].dt.year
pitchers_df = pitchers_df[_pitcher_year >= 2016]

In [None]:
# Visual check: first rows of the batter input columns after the merge
hitters_df[batter_inputs].head()

### Scale

##### Plate Appearances

Hitters

Scale

In [None]:
# Fit the project's MedianCenterer on the batter input columns and overwrite those
# columns in hitters_df with the transformed values.
batter_stats_scaler = MedianCenterer()

hitters_df[batter_inputs] = batter_stats_scaler.fit_transform(hitters_df[batter_inputs])

Save

In [None]:
# Persist the fitted batter-input scaler in today's output folder
_batter_scaler_path = os.path.join(model_path, "M02. Stat Imputations", todaysdate, "scale_batter_stats.pkl")
with open(_batter_scaler_path, "wb") as _scaler_file:
    pickle.dump(batter_stats_scaler, _scaler_file)

Pitchers

Scale

In [None]:
# Fit the project's MedianCenterer on the pitcher input columns and overwrite those
# columns in pitchers_df with the transformed values.
pitcher_stats_scaler = MedianCenterer()
pitchers_df[pitcher_inputs] = pitcher_stats_scaler.fit_transform(pitchers_df[pitcher_inputs])

Save

In [None]:
# Persist the fitted pitcher-input scaler in today's output folder
_pitcher_scaler_path = os.path.join(model_path, "M02. Stat Imputations", todaysdate, "scale_pitcher_stats.pkl")
with open(_pitcher_scaler_path, "wb") as _scaler_file:
    pickle.dump(pitcher_stats_scaler, _scaler_file)

##### Steamer

Hitters

Scale

In [None]:
# Fit a separate MedianCenterer on the Steamer hitter projection columns and overwrite
# them in steamer_hitters_df2.
# NOTE(review): this runs after the merge_asof into hitters_df and only rewrites
# steamer_hitters_df2, so the Steamer columns already merged into hitters_df (and later
# used as model features) are not transformed by this scaler. Confirm that is intended.
batter_stats_fg_scaler = MedianCenterer()

steamer_hitters_df2[batter_stats_fg] = batter_stats_fg_scaler.fit_transform(steamer_hitters_df2[batter_stats_fg])

In [None]:
# Visual check: first rows of the Steamer hitter columns after scaling
steamer_hitters_df2[batter_stats_fg].head()

Save

In [None]:
# Persist the fitted Steamer hitter scaler in today's output folder
_batter_fg_scaler_path = os.path.join(model_path, "M02. Stat Imputations", todaysdate, "scale_batter_stats_steamer.pkl")
with open(_batter_fg_scaler_path, "wb") as _scaler_file:
    pickle.dump(batter_stats_fg_scaler, _scaler_file)

Pitchers

Scale

In [None]:
# Visual check: first rows of the Steamer pitcher columns before scaling
steamer_pitchers_df2[pitcher_stats_fg].head()

In [None]:
# Fit a separate MedianCenterer on the Steamer pitcher projection columns and overwrite
# them in steamer_pitchers_df2.
# NOTE(review): this runs after the merge_asof into pitchers_df and only rewrites
# steamer_pitchers_df2, so the Steamer columns already merged into pitchers_df (and later
# used as model features) are not transformed by this scaler. Confirm that is intended.
pitcher_stats_fg_scaler = MedianCenterer()

steamer_pitchers_df2[pitcher_stats_fg] = pitcher_stats_fg_scaler.fit_transform(steamer_pitchers_df2[pitcher_stats_fg])

In [None]:
# Visual check: last rows of the Steamer pitcher columns after scaling
steamer_pitchers_df2[pitcher_stats_fg].tail()

Save

In [None]:
# Persist the fitted Steamer pitcher scaler in today's output folder
_pitcher_fg_scaler_path = os.path.join(model_path, "M02. Stat Imputations", todaysdate, "scale_pitcher_stats_steamer.pkl")
with open(_pitcher_fg_scaler_path, "wb") as _scaler_file:
    pickle.dump(pitcher_stats_fg_scaler, _scaler_file)

### Impute

Hitters

Stat inputs

In [None]:
# Features for the batter imputation model: Steamer stats plus handedness flags and the imputed flag
batter_stats_fg_imp = [*batter_stats_fg, 'b_L', 'p_L', 'imp_b']

Train/Test Split

Split

In [None]:
# Randomly assign each row to train (0) or test (1); drawing from [0, 0, 1] gives roughly
# a 2:1 train/test ratio. A seeded generator keeps the held-out set reproducible across
# runs (seed matches the batter model's random_state).
_hitter_split_rng = np.random.default_rng(100)
hitters_df['split'] = _hitter_split_rng.choice([0, 0, 1], size=len(hitters_df))

Create masks to identify training and testing datasets

In [None]:
# Boolean row selectors derived from the 'split' column: 0 = train, 1 = test
training_mask = hitters_df['split'].eq(0)
testing_mask = hitters_df['split'].eq(1)

In [None]:
# Visual check: last rows of hitters_df, including the new 'split' column
hitters_df.tail()

In [None]:
# Drop rows missing any target (batter_inputs) or any Steamer feature (batter_stats_fg)
hitters_df = hitters_df.dropna(subset=batter_inputs).dropna(subset=batter_stats_fg)

# Rebuild the masks on the filtered frame. The earlier masks carry the pre-dropna index,
# so indexing with them would rely on pandas silently reindexing the boolean key.
training_mask = hitters_df['split'] == 0
testing_mask = hitters_df['split'] == 1

Train

In [None]:
%%time
# Define the architecture of the neural network
layers = (50,50)

# Create the MLPRegressor model
batter_imputation_model = MLPRegressor(hidden_layer_sizes=layers, activation='relu', random_state=100, learning_rate_init=0.001, max_iter=100)

# Train the model
batter_imputation_model.fit(hitters_df[training_mask][batter_stats_fg_imp], hitters_df[training_mask][batter_inputs])


# Save the model
os.makedirs(os.path.join(model_path, "M02. Stat Imputations", todaysdate), exist_ok=True)
pickle.dump(batter_imputation_model, open(os.path.join(model_path, "M02. Stat Imputations", todaysdate, "impute_batter_stats.sav"), 'wb'))

Predict

In [None]:
# Select the held-out rows once, straight from the frame's own 'split' column (1 = test),
# rather than through the shared testing_mask variable that the pitcher section reassigns.
batter_test_df = hitters_df[hitters_df['split'] == 1]

# Predict every batter input and label the outputs "<input>_pred"
y_test_pred = pd.DataFrame(
    batter_imputation_model.predict(batter_test_df[batter_stats_fg_imp]),
    columns=[f"{col}_pred" for col in batter_inputs],
)

# reset_index() gives the test rows a 0..n-1 index so they line up with the predictions
# (the previous index is kept as an 'index' column)
batter_pred_df = pd.concat([batter_test_df.reset_index(), y_test_pred], axis=1)

Evaluate

In [None]:
### Validate inputs
# Check for the prediction columns before anything indexes them, so a missing column
# produces this explicit error instead of a KeyError inside the loop.
pred_columns = [col + "_pred" for col in batter_inputs]
missing_columns = [col for col in pred_columns if col not in batter_pred_df.columns]
if missing_columns:
    raise ValueError(f"Missing expected columns: {missing_columns}")

# Rows flagged as imputed; computed once and reused below
batter_imputed_rows = batter_pred_df['imp_b'] == 1

### MSEs - Player-level
# Per-row squared error for each stat, averaged over all test rows and over imputed rows only
all_list, imp_list = [], []

for stat in batter_inputs:
    error_col = f'{stat}_square_error'
    batter_pred_df[error_col] = (batter_pred_df[stat] - batter_pred_df[f'{stat}_pred']) ** 2
    all_list.append(batter_pred_df[error_col].mean())
    imp_list.append(batter_pred_df.loc[batter_imputed_rows, error_col].mean())

print("MSE All:    ", np.mean(all_list))
print("MSE Imputed:", np.mean(imp_list))


### MSEs - Stat-level
# Compare the mean actual value with the mean predicted value of each stat (imputed rows only)
batter_imputed_df = batter_pred_df.loc[batter_imputed_rows]
means_actual = batter_imputed_df[batter_inputs].mean()
means_pred = batter_imputed_df[pred_columns].mean()

# Align indices to ensure proper subtraction
means_pred.index = means_actual.index

# Squared difference of the means, averaged across stats
squared_errors = (means_actual - means_pred) ** 2
mse = squared_errors.mean()

print("MSE Stats:  ", mse)

Pitchers

Stat inputs

In [None]:
# Features for the pitcher imputation model: Steamer stats plus handedness flags and the imputed flag
pitcher_stats_fg_imp = [*pitcher_stats_fg, 'b_L', 'p_L', 'imp_p']

Train/Test Split

Split

In [None]:
# Randomly assign each row to train (0) or test (1); drawing from [0, 0, 1] gives roughly
# a 2:1 train/test ratio. A seeded generator keeps the held-out set reproducible across
# runs (seed matches the pitcher model's random_state).
_pitcher_split_rng = np.random.default_rng(1)
pitchers_df['split'] = _pitcher_split_rng.choice([0, 0, 1], size=len(pitchers_df))

Create masks to identify training and testing datasets

In [None]:
# Boolean row selectors derived from the 'split' column: 0 = train, 1 = test
training_mask = pitchers_df['split'].eq(0)
testing_mask = pitchers_df['split'].eq(1)

In [None]:
# Drop rows missing any target (pitcher_inputs) or any Steamer feature (pitcher_stats_fg)
pitchers_df = pitchers_df.dropna(subset=pitcher_inputs).dropna(subset=pitcher_stats_fg)

# Rebuild the masks on the filtered frame. The earlier masks carry the pre-dropna index,
# so indexing with them would rely on pandas silently reindexing the boolean key.
training_mask = pitchers_df['split'] == 0
testing_mask = pitchers_df['split'] == 1

Train

In [None]:
%%time
# Define the architecture of the neural network
layers = (50,50)

# Create the MLPRegressor model
pitcher_imputation_model = MLPRegressor(hidden_layer_sizes=layers, activation='relu', random_state=1, learning_rate_init=0.001, max_iter=100)

# Train the model
pitcher_imputation_model.fit(pitchers_df[training_mask][pitcher_stats_fg_imp], pitchers_df[training_mask][pitcher_inputs])


# Save the model
os.makedirs(os.path.join(model_path, "M02. Stat Imputations", todaysdate), exist_ok=True)
pickle.dump(pitcher_imputation_model, open(os.path.join(model_path, "M02. Stat Imputations", todaysdate, "impute_pitcher_stats.sav"), 'wb'))

Predict

In [None]:
# Select the held-out rows once, straight from the frame's own 'split' column (1 = test),
# rather than through the shared testing_mask variable that the hitter section also assigns.
pitcher_test_df = pitchers_df[pitchers_df['split'] == 1]

# Predict every pitcher input and label the outputs "<input>_pred"
y_test_pred = pd.DataFrame(
    pitcher_imputation_model.predict(pitcher_test_df[pitcher_stats_fg_imp]),
    columns=[f"{col}_pred" for col in pitcher_inputs],
)

# reset_index() gives the test rows a 0..n-1 index so they line up with the predictions
# (the previous index is kept as an 'index' column)
pitcher_pred_df = pd.concat([pitcher_test_df.reset_index(), y_test_pred], axis=1)

Evaluate

In [None]:
### Validate inputs
# Check for the prediction columns before anything indexes them, so a missing column
# produces this explicit error instead of a KeyError inside the loop.
pred_columns = [col + "_pred" for col in pitcher_inputs]
missing_columns = [col for col in pred_columns if col not in pitcher_pred_df.columns]
if missing_columns:
    raise ValueError(f"Missing expected columns: {missing_columns}")

# Rows flagged as imputed; computed once and reused below
pitcher_imputed_rows = pitcher_pred_df['imp_p'] == 1

### MSEs - Player-level
# Per-row squared error for each stat, averaged over all test rows and over imputed rows only
all_list, imp_list = [], []

for stat in pitcher_inputs:
    error_col = f'{stat}_square_error'
    pitcher_pred_df[error_col] = (pitcher_pred_df[stat] - pitcher_pred_df[f'{stat}_pred']) ** 2
    all_list.append(pitcher_pred_df[error_col].mean())
    imp_list.append(pitcher_pred_df.loc[pitcher_imputed_rows, error_col].mean())

print("MSE All:    ", np.mean(all_list))
print("MSE Imputed:", np.mean(imp_list))


### MSEs - Stat-level
# Compare the mean actual value with the mean predicted value of each stat (imputed rows only)
pitcher_imputed_df = pitcher_pred_df.loc[pitcher_imputed_rows]
means_actual = pitcher_imputed_df[pitcher_inputs].mean()
means_pred = pitcher_imputed_df[pred_columns].mean()

# Align indices to ensure proper subtraction
means_pred.index = means_actual.index

# Squared difference of the means, averaged across stats
squared_errors = (means_actual - means_pred) ** 2
mse = squared_errors.mean()

print("MSE Stats:  ", mse)

In [None]:
# Visual check: first rows of the pitcher test frame with predictions and squared-error columns
pitcher_pred_df.head()

In [None]:
# Decile calibration check for one pitcher stat: bucket imputed rows by predicted value
# and compare the mean prediction with the mean actual value inside each bucket.
event = "woba"  # change this as needed

actual_col = f"{event}_p_long"
pred_col = f"{event}_p_long_pred"

# Only rows flagged as imputed; copy so the helper column does not touch pitcher_pred_df
_is_imputed = pitcher_pred_df['imp_p'].eq(1)
imp_pitcher_pred_df = pitcher_pred_df.loc[_is_imputed].copy()

# Ten equal-frequency buckets on the predicted value (integer bucket ids)
imp_pitcher_pred_df['quantile'] = pd.qcut(imp_pitcher_pred_df[pred_col], q=10, labels=False)

# Mean predicted / actual value per bucket
_by_quantile = imp_pitcher_pred_df.groupby('quantile')
quantile_means = _by_quantile[[pred_col, actual_col]].mean().reset_index()

# Square plot with a dashed 45-degree reference line; points on the line are perfectly calibrated
# NOTE(review): axis labels say "Probability" although the plotted columns were passed
# through MedianCenterer earlier — confirm the wording.
fig, ax = plt.subplots(figsize=(6, 6))
ax.plot(quantile_means[pred_col], quantile_means[actual_col], marker='o')
ax.plot([-1, 1], [-1, 1], 'r--')
ax.set_xlabel('Avg Predicted Probability')
ax.set_ylabel('Avg Actual Probability')
ax.set_title(f'Calibration Plot by Decile for {event}')
ax.set_xlim(-1, 1)
ax.set_ylim(-1, 1)
ax.set_aspect('equal', adjustable='box')
ax.grid(True)
plt.show()


To do:
- Improve evaluations
- Consider using MLPClassifier somehow
- Figure out what to train on:
    - All?
    - Just those with large samples?
    - Just those with small samples?

### Required Follow-Ups:
- M03. Plate Appearances