# M01. Impute Inputs
- Normalizes model inputs
- Normalizes Steamer projections
- Uses Steamer projections to impute model inputs

In [None]:
%run "U1. Imports.ipynb"
%run "U2. Utilities.ipynb"
%run "U3. Classes.ipynb"
%run "D3. Simulation Functions.ipynb"

# Folder that holds the MLB database files.
# NOTE(review): machine-specific absolute path; adjust before running on another machine.
baseball_path = r'C:\Users\james\Documents\MLB\Database'

# SQLite database file, and the engine handed to every `con=` argument further down.
db_path = r'C:\Users\james\Documents\MLB\Database\MLBDB.db'
sqlite_url = 'sqlite:///' + db_path
engine = create_engine(sqlite_url)

In [None]:
%run "A03. Steamer.ipynb"

### Batters

##### Dataset

In [None]:
# One row per (gamePk, batter): rank each batter's rows within a game by atBatIndex
# (highest first), keep only the top-ranked row, and require 40 <= pa_b <= 300 on it.
sql_query = '''
WITH ranked_data AS (
  SELECT *,
         ROW_NUMBER() OVER (PARTITION BY gamePk, batter ORDER BY atBatIndex DESC) AS rn
  FROM "Dataset"
)
SELECT *
FROM ranked_data
WHERE pa_b >= 40 AND pa_b <= 300 AND rn = 1
'''

hitters_df = pd.read_sql_query(sql=sql_query, con=engine)

In [None]:
# Fit a StandardScaler on the batter stat columns and overwrite those columns with the
# scaled values.
# NOTE: hitters_df is modified in place, so re-running this cell refits the scaler on
# already-scaled data; reload hitters_df first if the cell must be run again.
batter_stats_scaler = StandardScaler()
batter_stats_scaled = batter_stats_scaler.fit_transform(hitters_df[batter_stats])
hitters_df[batter_stats] = batter_stats_scaled

# Persist the fitted scaler next to the other model artifacts
batter_scaler_pkl = os.path.join(model_path, "batter_stats_scaler_20231027.pkl")
with open(batter_scaler_pkl, "wb") as scaler_file:
    pickle.dump(batter_stats_scaler, scaler_file)

##### Steamer

In [None]:
# Load every Steamer hitter projection row with at least 40 projected PA
sql_query = '''
  SELECT *
  FROM "Steamer Hitters"
  WHERE "PA" >= 40
'''

steamer_hitters_df = pd.read_sql_query(sql=sql_query, con=engine)

In [None]:
# Clean the raw Steamer hitter projections with clean_steamer_hitters (a helper that is
# not defined in the visible cells of this notebook). The result, steamer_hitters_df2,
# is the frame that gets scaled and merged in the cells below.
steamer_hitters_df2 = clean_steamer_hitters(steamer_hitters_df)

In [None]:
# Fit a StandardScaler on the Steamer hitter stat columns and overwrite those columns
# with the scaled values.
# NOTE: steamer_hitters_df2 is modified in place, so re-running this cell refits the
# scaler on already-scaled data.
batter_stats_fg_scaler = StandardScaler()
batter_stats_fg_scaled = batter_stats_fg_scaler.fit_transform(steamer_hitters_df2[batter_stats_fg])
steamer_hitters_df2[batter_stats_fg] = batter_stats_fg_scaled

# Persist the fitted scaler next to the other model artifacts
batter_fg_scaler_pkl = os.path.join(model_path, "batter_stats_fg_scaler_20231027.pkl")
with open(batter_fg_scaler_pkl, "wb") as scaler_file:
    pickle.dump(batter_stats_fg_scaler, scaler_file)

##### Merge

In [None]:
# Steamer projections exist only for a discrete set of dates. Tag every row of hitters_df
# with the most recent Steamer date on or before that row's "date", so the two frames can
# be merged on (player, steamer_date).
steamer_dates = list(steamer_hitters_df2['date'].unique())


def find_steamer_date(date):
    """Return the largest value in `steamer_dates` that is <= `date`.

    Reads the module-level `steamer_dates` list at call time.
    Returns None when no Steamer date is on or before `date`.
    """
    return max((d for d in steamer_dates if d <= date), default=None)


# Resolve each distinct game date once and broadcast the answer, instead of re-scanning
# steamer_dates for every single row of hitters_df.
hitter_steamer_date_lookup = {
    game_date: find_steamer_date(game_date)
    for game_date in hitters_df["date"].unique()
}
hitters_df["steamer_date"] = hitters_df["date"].map(hitter_steamer_date_lookup)

In [None]:
# Steamer columns carried into the merged frame: identifiers, projection date, and stats
batter_stats_fg_plus = ['mlbamid', 'steamerid', 'date'] + batter_stats_fg

# Attach to each hitter row the Steamer projection for that player on its steamer_date.
# Inner join: rows with no matching projection are discarded.
hitters_merged_df = (
    pd.merge(
        hitters_df,
        steamer_hitters_df2[batter_stats_fg_plus],
        left_on=['batter', 'steamer_date'],
        right_on=['mlbamid', 'date'],
        how='inner',
    )
    # Only keep one observation per player per game
    # TODO: consider keeping one observation per player per week/Steamer weekly projection
    .drop_duplicates(subset=['gamePk', 'batter'], keep='last')
    # Only keep rows with complete targets ...
    .dropna(subset=batter_stats)
    # ... and complete Steamer features, mirroring the pitchers merge; a NaN feature
    # would otherwise flow straight into the imputation model's training data.
    .dropna(subset=batter_stats_fg)
)

##### Impute

In [None]:
# Model inputs: scaled Steamer stats plus the two handedness columns
batter_stats_fg_imp = batter_stats_fg + ['b_L', 'p_L']

# Inputs (Steamer-side columns) and outputs (dataset-side stat columns)
features = hitters_merged_df[batter_stats_fg_imp]
target = hitters_merged_df[batter_stats]

# Three hidden ReLU layers of 25 units, then a linear layer with one output per batter stat
batter_n_inputs = len(batter_stats_fg_imp)
batter_n_outputs = len(batter_stats)
batter_layers = [keras.layers.Dense(25, activation='relu', input_shape=(batter_n_inputs,))]
batter_layers += [keras.layers.Dense(25, activation='relu') for _hidden in range(2)]
batter_layers.append(keras.layers.Dense(batter_n_outputs))
batter_imputations_model = keras.Sequential(batter_layers)

# Mean squared error regression, optimised with Adam
batter_imputations_model.compile(optimizer='adam', loss='mean_squared_error')

# Fit for 20 epochs in mini-batches of 32 rows
batter_imputations_model.fit(x=features, y=target, batch_size=32, epochs=20)

# Persist the trained model.
# NOTE(review): whether a Keras model can be pickled depends on the installed Keras
# version - confirm this artifact loads where it is consumed.
batter_model_pkl = os.path.join(model_path, "batter_imputations_model_20231027.pkl")
with open(batter_model_pkl, "wb") as model_file:
    pickle.dump(batter_imputations_model, model_file)

In [None]:
# Use the trained model to make predictions from the Steamer + handedness columns.
# NOTE: the predictions are written back into the batter_stats columns, replacing the
# observed (scaled) values that were used as the training target.
hitters_merged_df[batter_stats] = batter_imputations_model.predict(hitters_merged_df[batter_stats_fg_imp])

### Pitchers

##### Dataset

In [None]:
# One row per (gamePk, pitcher): rank each pitcher's rows within a game by atBatIndex
# (highest first), keep only the top-ranked row, and require 40 <= pa_p <= 300 on it.
sql_query = '''
WITH ranked_data AS (
  SELECT *,
         ROW_NUMBER() OVER (PARTITION BY gamePk, pitcher ORDER BY atBatIndex DESC) AS rn
  FROM "Dataset"
)
SELECT *
FROM ranked_data
WHERE pa_p >= 40 AND pa_p <= 300 AND rn = 1
'''

pitchers_df = pd.read_sql_query(sql=sql_query, con=engine)

In [None]:
# Fit a StandardScaler on the pitcher stat columns and overwrite those columns with the
# scaled values.
# NOTE: pitchers_df is modified in place, so re-running this cell refits the scaler on
# already-scaled data; reload pitchers_df first if the cell must be run again.
pitcher_stats_scaler = StandardScaler()
pitcher_stats_scaled = pitcher_stats_scaler.fit_transform(pitchers_df[pitcher_stats])
pitchers_df[pitcher_stats] = pitcher_stats_scaled

# Persist the fitted scaler next to the other model artifacts
pitcher_scaler_pkl = os.path.join(model_path, "pitcher_stats_scaler_20231027.pkl")
with open(pitcher_scaler_pkl, "wb") as scaler_file:
    pickle.dump(pitcher_stats_scaler, scaler_file)

##### Steamer

In [None]:
# Load every Steamer pitcher projection row where "PA" is at least 40.
# NOTE(review): confirm "Steamer Pitchers" really has a PA column - SQLite falls back to
# treating an unknown double-quoted identifier as a string literal instead of erroring.
sql_query = '''
  SELECT *
  FROM "Steamer Pitchers"
  WHERE "PA" >= 40
'''

steamer_pitchers_df = pd.read_sql_query(sql=sql_query, con=engine)

In [None]:
# Clean the raw Steamer pitcher projections with clean_steamer_pitchers (a helper that is
# not defined in the visible cells of this notebook). The result, steamer_pitchers_df2,
# is the frame that gets filtered, scaled and merged in the cells below.
steamer_pitchers_df2 = clean_steamer_pitchers(steamer_pitchers_df)

In [None]:
# Drop, in place, every Steamer pitcher row that is missing any pitcher_stats_fg2 column,
# so the scaler fitted in the next cell only sees complete rows.
steamer_pitchers_df2.dropna(subset=pitcher_stats_fg2, inplace=True)

In [None]:
# Fit a StandardScaler on the Steamer pitcher stat columns.
pitcher_stats_fg_scaler = StandardScaler()
pitcher_stats_fg_scaled = pd.DataFrame(
    pitcher_stats_fg_scaler.fit_transform(steamer_pitchers_df2[pitcher_stats_fg]),
    columns=pitcher_stats_fg,
    # Keep the source index: rows were dropped above, so a fresh RangeIndex would not
    # line up with steamer_pitchers_df2.
    index=steamer_pitchers_df2.index,
)

# Write the scaled values back, as the hitters cell does. Previously the scaled frame was
# built but never used, so the merge and the imputation model saw unscaled Steamer stats
# even though a fitted scaler was being saved.
# NOTE: steamer_pitchers_df2 is modified in place, so re-running this cell refits the
# scaler on already-scaled data.
steamer_pitchers_df2[pitcher_stats_fg] = pitcher_stats_fg_scaled

# Persist the fitted scaler next to the other model artifacts
with open(os.path.join(model_path, "pitcher_stats_fg_scaler_20231027.pkl"), "wb") as file:
    pickle.dump(pitcher_stats_fg_scaler, file)

##### Merge

In [None]:
# Steamer projections exist only for a discrete set of dates. Tag every row of pitchers_df
# with the most recent Steamer date on or before that row's "date", so the two frames can
# be merged on (player, steamer_date).
steamer_dates = list(steamer_pitchers_df2['date'].unique())


def find_steamer_date(date):
    """Return the largest value in `steamer_dates` that is <= `date`.

    Reads the module-level `steamer_dates` list at call time.
    Returns None when no Steamer date is on or before `date`.
    """
    return max((d for d in steamer_dates if d <= date), default=None)


# Resolve each distinct game date once and broadcast the answer, instead of re-scanning
# steamer_dates for every single row of pitchers_df.
pitcher_steamer_date_lookup = {
    game_date: find_steamer_date(game_date)
    for game_date in pitchers_df["date"].unique()
}
pitchers_df["steamer_date"] = pitchers_df["date"].map(pitcher_steamer_date_lookup)

In [None]:
# Steamer columns carried into the merged frame: identifiers, projection date, and stats
pitcher_stats_fg_plus = ['mlbamid', 'steamerid', 'date'] + pitcher_stats_fg2
steamer_pitcher_cols = steamer_pitchers_df2[pitcher_stats_fg_plus]

# Attach to each pitcher row the Steamer projection for that player on its steamer_date.
# Inner join: rows with no matching projection are discarded.
pitchers_merged_df = (
    pd.merge(
        pitchers_df,
        steamer_pitcher_cols,
        how='inner',
        left_on=['pitcher', 'steamer_date'],
        right_on=['mlbamid', 'date'],
    )
    # Only keep one observation per player per game
    # TODO: consider keeping one observation per player per week/Steamer weekly projection
    .drop_duplicates(subset=['gamePk', 'pitcher'], keep='last')
    # Only keep rows with complete targets and complete Steamer columns
    .dropna(subset=pitcher_stats)
    .dropna(subset=pitcher_stats_fg2)
)

##### Impute

In [None]:
# Model inputs: Steamer stats plus the two handedness columns
pitcher_stats_fg_imp = pitcher_stats_fg + ['b_L', 'p_L']

# Inputs (Steamer-side columns) and outputs (dataset-side stat columns)
features = pitchers_merged_df[pitcher_stats_fg_imp]
target = pitchers_merged_df[pitcher_stats]

# Three hidden ReLU layers of 25 units, then a linear layer with one output per pitcher stat
pitcher_n_inputs = len(pitcher_stats_fg_imp)
pitcher_n_outputs = len(pitcher_stats)
pitcher_layers = [keras.layers.Dense(25, activation='relu', input_shape=(pitcher_n_inputs,))]
pitcher_layers += [keras.layers.Dense(25, activation='relu') for _hidden in range(2)]
pitcher_layers.append(keras.layers.Dense(pitcher_n_outputs))
pitcher_imputations_model = keras.Sequential(pitcher_layers)

# Mean squared error regression, optimised with Adam
pitcher_imputations_model.compile(optimizer='adam', loss='mean_squared_error')

# Fit for 20 epochs in mini-batches of 32 rows
pitcher_imputations_model.fit(x=features, y=target, batch_size=32, epochs=20)

# Persist the trained model.
# NOTE(review): whether a Keras model can be pickled depends on the installed Keras
# version - confirm this artifact loads where it is consumed.
pitcher_model_pkl = os.path.join(model_path, "pitcher_imputations_model_20231027.pkl")
with open(pitcher_model_pkl, "wb") as model_file:
    pickle.dump(pitcher_imputations_model, model_file)

In [None]:
# Use the trained model to make predictions from the Steamer + handedness columns.
# NOTE: the predictions are written back into the pitcher_stats columns, replacing the
# observed (scaled) values that were used as the training target.
pitchers_merged_df[pitcher_stats] = pitcher_imputations_model.predict(pitchers_merged_df[pitcher_stats_fg_imp])