# M02. Stat Imputations
- This imputes model inputs using Steamer projections
- Type: Model
- Run Frequency: Irregular
- Sources:
    - MLB API
    - Steamer
- Dates:
    - Created: 1/28/2024
    - Updated: 12/17/2024

To do:
- Swap to merge_asof
- Add better evaluations?

### Imports

In [1]:
%run "U1. Imports.ipynb"
%run "U2. Utilities.ipynb"
%run "U3. Classes.ipynb"
%run "U4. Datasets.ipynb"
%run "U5. Models.ipynb"

Create directory

In [2]:
# Make sure today's artifact folder for this notebook exists (no error if it already does)
os.makedirs(
    os.path.join(model_path, "M02. Stat Imputations", todaysdate),
    exist_ok=True,
)

Read in Park x Weather Factors

In [3]:
# Park x weather multipliers; passed to create_pa_inputs when building the datasets
multiplier_df = pd.read_csv(
    os.path.join(baseball_path, "Multiplier Dataset.csv")
)

### Batters

##### Dataset

In [4]:
# Build the plate-appearance input dataset.
# NOTE(review): 2015 and 2024 are presumably the first/last seasons — confirm in create_pa_inputs.
hitters_df = create_pa_inputs(
    multiplier_df, 2015, 2024,
    short=50, long=300, adjust=True,
)

In [5]:
# Keep one row (the last) per batter, game and batter/pitcher handedness combination.
# Reassign instead of inplace=True so the frame returned by create_pa_inputs is not mutated.
hitters_df = hitters_df.drop_duplicates(['gamePk', 'batter', 'b_L', 'p_L'], keep='last')

Keep those with sufficient sample size

In [6]:
# Drop rows where pa_b is below 40
hitters_df = hitters_df.loc[hitters_df['pa_b'].ge(40)]

In [7]:
# Keep the batter id, date, hands, imp_b and the model input columns.
# .copy() makes this an independent frame: later cells assign columns into it,
# which would otherwise raise SettingWithCopyWarning on a slice of the filtered frame.
hitters_df = hitters_df[['batter', 'date', 'b_L', 'p_L', 'imp_b'] + batter_inputs].copy()

In [8]:
# Fit a StandardScaler on the batter inputs, then overwrite them with the scaled values
batter_stats_scaler = StandardScaler().fit(hitters_df[batter_inputs])
hitters_df[batter_inputs] = batter_stats_scaler.transform(hitters_df[batter_inputs])

# Persist the fitted scaler in today's model folder
with open(
    os.path.join(model_path, "M02. Stat Imputations", todaysdate, "scale_batter_stats.pkl"),
    "wb",
) as file:
    pickle.dump(batter_stats_scaler, file)

##### Steamer

Read in Steamer hitter data

In [9]:
# Weekly Steamer hitter log; the file is read as ISO-8859-1
steamer_hitters_df = pd.read_csv(
    os.path.join(baseball_path, "A03. Steamer", "steamer_hitters_weekly_log.csv"),
    encoding='iso-8859-1',
)

Clean

In [10]:
# Cleaned Steamer hitter log; the raw frame is kept under its own name.
# NOTE(review): clean_steamer_hitters is defined outside this notebook — see the utility notebooks for what it changes.
steamer_hitters_df2 = clean_steamer_hitters(steamer_hitters_df)

Scale hitter Steamer inputs

In [11]:
# Fit a separate scaler on the Steamer hitter columns and replace them with scaled values
batter_stats_fg_scaler = StandardScaler().fit(steamer_hitters_df2[batter_stats_fg])
steamer_hitters_df2[batter_stats_fg] = batter_stats_fg_scaler.transform(
    steamer_hitters_df2[batter_stats_fg]
)

Save hitter Steamer scaler

In [12]:
# Persist the fitted Steamer hitter scaler in today's model folder
with open(
    os.path.join(model_path, "M02. Stat Imputations", todaysdate, "scale_batter_stats_steamer.pkl"),
    "wb",
) as file:
    pickle.dump(batter_stats_fg_scaler, file)

##### Create compatible dates

In [13]:
# Imported in this cell because the shared import notebook is not visible from here
from bisect import bisect_right

# Distinct Steamer dates, sorted ascending so they can be binary-searched.
# Missing dates are dropped: they can never satisfy `d <= date` and would break the sort order.
steamer_dates = sorted(steamer_hitters_df2['date'].dropna().unique())


def find_steamer_date(date):
    """Return the largest value in ``steamer_dates`` that is <= ``date``.

    Returns None when ``date`` is missing or is earlier than every Steamer date.
    Uses a binary search instead of scanning the whole list for every row.
    """
    if pd.isna(date):
        return None
    # Count of Steamer dates that are <= date; the match, if any, sits just before that position
    position = bisect_right(steamer_dates, date)
    return steamer_dates[position - 1] if position else None


# Create the "steamer_date" column in hitters_df: the Steamer date each row will be joined on
hitters_df["steamer_date"] = hitters_df["date"].apply(find_steamer_date)

##### Merge

In [14]:
# Columns to pull from the Steamer frame: ids, date and the Steamer stats
batter_stats_fg_plus = ['mlbamid', 'steamerid', 'date', *batter_stats_fg]

# Inner join each row to the Steamer row for the same player on its steamer_date
hitters_merged_df = hitters_df.merge(
    steamer_hitters_df2[batter_stats_fg_plus],
    left_on=['batter', 'steamer_date'],
    right_on=['mlbamid', 'date'],
    how='inner',
)

# Drop rows missing any model input or any Steamer stat
hitters_merged_df = hitters_merged_df.dropna(subset=batter_inputs + batter_stats_fg)

##### Impute

In [15]:
# Imputation features: the Steamer stats plus the two hand columns
batter_stats_fg_imp = [*batter_stats_fg, 'b_L', 'p_L']

In [16]:
%%time
# Network architecture: six hidden layers of 11 units each
layers = (11,11,11,11,11,11)

# Create the MLPRegressor model (fixed random_state for reproducible weights)
batter_imputation_model = MLPRegressor(hidden_layer_sizes=layers, activation='relu', random_state=10, 
                                       learning_rate_init=0.0001, max_iter=100)

# Features are the Steamer stats plus hands; targets are the scaled batter inputs
X = hitters_merged_df[batter_stats_fg_imp]
y = hitters_merged_df[batter_inputs]

# Use train_test_split to create train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)  # 80% train, 20% test

# Train the model
batter_imputation_model.fit(X_train, y_train)


# Save the model; the context manager closes the file handle even if pickling fails
model_dir = os.path.join(model_path, "M02. Stat Imputations", todaysdate)
os.makedirs(model_dir, exist_ok=True)
with open(os.path.join(model_dir, "impute_batter_stats.sav"), 'wb') as file:
    pickle.dump(batter_imputation_model, file)

CPU times: total: 1min 22s
Wall time: 1min 24s


##### Predict

In [17]:
# Predict the held-out rows; each output column is named "<stat>_pred"
y_test_pred = pd.DataFrame(
    batter_imputation_model.predict(X_test),
    columns=[f"{col}_pred" for col in batter_inputs],
)

# Put features, actuals and predictions side by side.
# reset_index() gives all three pieces the same 0..n-1 index so concat lines them up row by row.
batter_pred_df = pd.concat(
    [X_test.reset_index(), y_test.reset_index(), y_test_pred],
    axis=1,
)

##### Evaluate

In [18]:
# Per-stat mean squared error on the test set, then the average across stats
mse_list = []
for stat in batter_inputs:
    error_col = f'{stat}_square_error'
    residual = batter_pred_df[stat] - batter_pred_df[f'{stat}_pred']
    batter_pred_df[error_col] = residual ** 2
    mse_list.append(batter_pred_df[error_col].mean())

np.mean(mse_list)

0.6635631478187268

In [19]:
# Best score: 11,11,11 0.6498825700128854

### Pitchers

##### Dataset

In [20]:
# Build the plate-appearance input dataset again for the pitcher side (same arguments as for batters).
# NOTE(review): 2015 and 2024 are presumably the first/last seasons — confirm in create_pa_inputs.
pitchers_df = create_pa_inputs(
    multiplier_df, 2015, 2024,
    short=50, long=300, adjust=True,
)

Keep one instance of each pitcher in each game vs. each side

In [21]:
# Keep one row (the last) per pitcher, game and batter/pitcher handedness combination.
# Reassign instead of inplace=True so the frame returned by create_pa_inputs is not mutated.
pitchers_df = pitchers_df.drop_duplicates(['gamePk', 'pitcher', 'b_L', 'p_L'], keep='last')

Keep those with sufficient sample size

In [22]:
# Drop rows where pa_p is below 40
pitchers_df = pitchers_df.loc[pitchers_df['pa_p'].ge(40)]

Keep relevant columns

In [23]:
# Keep the pitcher id, date, hands, imp_p and the model input columns.
# .copy() makes this an independent frame: later cells assign columns into it,
# which would otherwise raise SettingWithCopyWarning on a slice of the filtered frame.
pitchers_df = pitchers_df[['pitcher', 'date', 'b_L', 'p_L', 'imp_p'] + pitcher_inputs].copy()

Scale pitcher inputs

In [24]:
# Fit a StandardScaler on the pitcher inputs, then overwrite them with the scaled values
pitcher_stats_scaler = StandardScaler().fit(pitchers_df[pitcher_inputs])
pitchers_df[pitcher_inputs] = pitcher_stats_scaler.transform(pitchers_df[pitcher_inputs])

# Persist the fitted scaler in today's model folder
with open(
    os.path.join(model_path, "M02. Stat Imputations", todaysdate, "scale_pitcher_stats.pkl"),
    "wb",
) as file:
    pickle.dump(pitcher_stats_scaler, file)

##### Steamer

Read in Steamer pitcher data

In [25]:
# Weekly Steamer pitcher log; the file is read as ISO-8859-1
steamer_pitchers_df = pd.read_csv(
    os.path.join(baseball_path, "A03. Steamer", "steamer_pitchers_weekly_log.csv"),
    encoding='iso-8859-1',
)

Clean

In [26]:
# Cleaned Steamer pitcher log; the raw frame is kept under its own name.
# NOTE(review): clean_steamer_pitchers is defined outside this notebook — see the utility notebooks for what it changes.
steamer_pitchers_df2 = clean_steamer_pitchers(steamer_pitchers_df)

Scale pitcher Steamer inputs

In [27]:
# Fit a separate scaler on the Steamer pitcher columns and replace them with scaled values
pitcher_stats_fg_scaler = StandardScaler().fit(steamer_pitchers_df2[pitcher_stats_fg])
steamer_pitchers_df2[pitcher_stats_fg] = pitcher_stats_fg_scaler.transform(
    steamer_pitchers_df2[pitcher_stats_fg]
)

Save pitcher Steamer scaler

In [28]:
# Persist the fitted Steamer pitcher scaler in today's model folder
with open(
    os.path.join(model_path, "M02. Stat Imputations", todaysdate, "scale_pitcher_stats_steamer.pkl"),
    "wb",
) as file:
    pickle.dump(pitcher_stats_fg_scaler, file)

##### Create compatible dates

In [29]:
# Imported in this cell because the shared import notebook is not visible from here
from bisect import bisect_right

# Distinct Steamer pitcher dates, sorted ascending so they can be binary-searched.
# Missing dates are dropped: they can never satisfy `d <= date` and would break the sort order.
steamer_dates = sorted(steamer_pitchers_df2['date'].dropna().unique())


def find_steamer_date(date):
    """Return the largest value in ``steamer_dates`` that is <= ``date``.

    Returns None when ``date`` is missing or is earlier than every Steamer date.
    Uses a binary search instead of scanning the whole list for every row.
    """
    if pd.isna(date):
        return None
    # Count of Steamer dates that are <= date; the match, if any, sits just before that position
    position = bisect_right(steamer_dates, date)
    return steamer_dates[position - 1] if position else None


# Create the "steamer_date" column in pitchers_df: the Steamer date each row will be joined on
pitchers_df["steamer_date"] = pitchers_df["date"].apply(find_steamer_date)

##### Merge

In [30]:
# Columns to pull from the Steamer frame: ids, date and the Steamer stats.
# NOTE(review): this uses pitcher_stats_fg2 while the Steamer scaler was fit on pitcher_stats_fg —
# confirm every column used here was actually scaled.
pitcher_stats_fg_plus = ['mlbamid', 'steamerid', 'date', *pitcher_stats_fg2]

# Inner join each row to the Steamer row for the same player on its steamer_date
pitchers_merged_df = pitchers_df.merge(
    steamer_pitchers_df2[pitcher_stats_fg_plus],
    left_on=['pitcher', 'steamer_date'],
    right_on=['mlbamid', 'date'],
    how='inner',
)

# Drop rows missing any model input or any Steamer stat
pitchers_merged_df = pitchers_merged_df.dropna(subset=pitcher_inputs + pitcher_stats_fg2)

##### Impute

In [31]:
%%time
# Network architecture: six hidden layers of 11 units each
layers = (11,11,11,11,11,11)

# Create the MLPRegressor model (fixed random_state for reproducible weights)
pitcher_imputation_model = MLPRegressor(hidden_layer_sizes=layers, activation='relu', random_state=10, 
                                       learning_rate_init=0.001, max_iter=100)

# Features and targets come from pitchers_merged_df.
# NOTE(review): pitcher_stats_fg_imp is not assigned anywhere in this notebook (the batter
# equivalent is built in its own cell) — confirm it comes from the %run utility notebooks,
# otherwise this line raises NameError on a fresh kernel.
X = pitchers_merged_df[pitcher_stats_fg_imp]
y = pitchers_merged_df[pitcher_inputs]

# Use train_test_split to create train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)  # 80% train, 20% test

# Train the model
pitcher_imputation_model.fit(X_train, y_train)


# Save the model; the context manager closes the file handle even if pickling fails
model_dir = os.path.join(model_path, "M02. Stat Imputations", todaysdate)
os.makedirs(model_dir, exist_ok=True)
with open(os.path.join(model_dir, "impute_pitcher_stats.sav"), 'wb') as file:
    pickle.dump(pitcher_imputation_model, file)

CPU times: total: 47.9 s
Wall time: 48.5 s


##### Predict

In [32]:
# Predict the held-out rows; each output column is named "<stat>_pred"
y_test_pred = pd.DataFrame(
    pitcher_imputation_model.predict(X_test),
    columns=[f"{col}_pred" for col in pitcher_inputs],
)

# Put features, actuals and predictions side by side.
# reset_index() gives all three pieces the same 0..n-1 index so concat lines them up row by row.
pitcher_pred_df = pd.concat(
    [X_test.reset_index(), y_test.reset_index(), y_test_pred],
    axis=1,
)

##### Evaluate

In [34]:
# Per-stat mean squared error on the test set, then the average across stats
mse_list = []
for stat in pitcher_inputs:
    error_col = f'{stat}_square_error'
    residual = pitcher_pred_df[stat] - pitcher_pred_df[f'{stat}_pred']
    pitcher_pred_df[error_col] = residual ** 2
    mse_list.append(pitcher_pred_df[error_col].mean())

np.mean(mse_list)

0.7830206297528826

In [None]:
# Best score: 11,11,11 0.781645038177113

### Note: Rerun M03. Plate Appearances if new Stat Imputations models are generated.