# MLB Models

## Data Export
```
# Hitter export: offensive stats ("off_*") for seasons 2015-2022 with venue and
# is_home as extra columns, written to mlb_hitter.csv. Passes --only_starters and
# --no_team, and requests every calculated stat as a target (--target_calc_stats "*").
# NOTE(review): --hist_recent_games 5 --hist_recent_mode ma presumably produces a
# 5-game moving average of recent performance - confirm against dumpdata.sc.
dumpdata.sc --seasons 2015 2016 2017 2018 2019 2020 2021 2022 --stats "off_*" --extra venue is_home \
   --progress mlb_hist_20082022.scored.db --only_starters \
   --no_team --target_calc_stats "*" --hist_recent_games 5 --hist_recent_mode ma \
   -f mlb_hitter.csv

# Pitcher export: identical flags to the hitter export except the stat filter is
# "p_*" and the output file is mlb_pitcher.csv.
dumpdata.sc --seasons 2015 2016 2017 2018 2019 2020 2021 2022 --stats "p_*" --extra venue is_home \
   --progress mlb_hist_20082022.scored.db --only_starters \
   --no_team --target_calc_stats "*" --hist_recent_games 5 --hist_recent_mode ma \
   -f mlb_pitcher.csv

# Team export: offensive stats again, but with --no_player instead of --no_team and
# a single explicit target stat (off_runs); written to mlb_team-runs.csv.
# NOTE(review): presumably this yields one row per team per game - confirm.
dumpdata.sc --seasons 2015 2016 2017 2018 2019 2020 2021 2022 --stats "off_*" --extra venue is_home \
   --progress mlb_hist_20082022.scored.db --no_player \
   --target_stats off_runs --hist_recent_games 5 --hist_recent_mode ma \
   -f mlb_team-runs.csv
```

In [None]:
import pandas as pd

# --- Run configuration -------------------------------------------------------
# Seed passed to the train/test split and to both AutoML searches.
RANDOM_SEED = 1
# AutoML time budget in seconds (the TPOT cell converts it to minutes).
TRAINING_TIME = 1800
# Season excluded from the train/test split and used only for validation scoring.
VALIDATION_SEASON = 2022

# Choose the model to train by name rather than by (un)commenting code blocks.
MODEL_KIND = "hitter"  # one of the MODEL_CONFIGS keys below

# Per-model settings: (target column, model filename prefix, exported CSV path).
MODEL_CONFIGS = {
    "hitter": ("calc:dk_score", "model-mlb-hitter", "/fantasy/mlb_hitter.csv"),
    "pitcher": ("calc:dk_score", "model-mlb-pitcher", "/fantasy/mlb_pitcher.csv"),
    "team-runs": ("stat:off_runs", "model-mlb-team-runs", "/fantasy/mlb_team-runs.csv"),
}
if MODEL_KIND not in MODEL_CONFIGS:
    # Fail early with the valid choices instead of a bare KeyError.
    raise ValueError(
        f"Unknown MODEL_KIND {MODEL_KIND!r}; expected one of {sorted(MODEL_CONFIGS)}"
    )

TARGET_FEATURE, MODEL_PREFIX, _data_path = MODEL_CONFIGS[MODEL_KIND]
df = pd.read_csv(_data_path)

In [None]:
import sklearn.model_selection

# Categorical columns to expand into one-hot indicator columns.
one_hots = ["extra:venue"]

# "pos" may be absent from the loaded export, hence the guard. When present it is
# cast to str on a copy, so the loaded `df` is not mutated by running this cell.
encoder_input_df = df
if "pos" in df:
    one_hots.append("pos")
    encoder_input_df = df.assign(pos=df["pos"].astype(str))

cleaned_df = pd.get_dummies(encoder_input_df, columns=one_hots)
cleaned_df["extra:is_home"] = cleaned_df["extra:is_home"].astype(int)

# get_dummies removes each encoded column and adds new indicator columns, so a
# plain `col == "pos"` test can never match afterwards. Track the generated
# columns explicitly so the position indicators are kept as features.
one_hot_cols = set(cleaned_df.columns) - set(encoder_input_df.columns)

# Model inputs: one-hot indicators, "extra" columns, and the recent/std history columns.
feature_cols = [
    col
    for col in cleaned_df
    if col in one_hot_cols
    or col.startswith("extra")
    or ":recent" in col
    or ":std" in col
]
# Guard against target leakage: the value being predicted must not be an input.
assert TARGET_FEATURE not in feature_cols, "target column leaked into the feature set"

# Every season except the validation season feeds the random train/test split.
train_test_df = cleaned_df.query("season != @VALIDATION_SEASON")

X = train_test_df[feature_cols]
y = train_test_df[TARGET_FEATURE]
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
    X, y, random_state=RANDOM_SEED
)

# The validation season is held out entirely and only used for scoring.
validation_df = cleaned_df.query("season == @VALIDATION_SEASON")
X_val = validation_df[feature_cols]
y_val = validation_df[TARGET_FEATURE]
print(
    f"Training will use {len(feature_cols)} features, {len(X_train)} training cases, {len(X_test)} test cases, {len(X_val)} validation test cases",
)

In [None]:
import joblib
from datetime import datetime

import autosklearn.regression
import sklearn.model_selection
import sklearn.metrics
from pprint import pprint

# Run the auto-sklearn regression search within the configured time budget.
sk_automl = autosklearn.regression.AutoSklearnRegressor(
    time_left_for_this_task=TRAINING_TIME,
    seed=RANDOM_SEED,
    memory_limit=-1,
)
sk_automl.fit(X_train, y_train)

# Show the leaderboard and the models that the search returned.
print(sk_automl.leaderboard())
pprint(sk_automl.show_models(), indent=4)

# Score on the random test split, then on the held-out validation season.
y_hat = sk_automl.predict(X_test)
test_r2 = sklearn.metrics.r2_score(y_test, y_hat)
print("Test R2 score:", test_r2)

y_hat_val = sk_automl.predict(X_val)
validation_r2 = sklearn.metrics.r2_score(y_val, y_hat_val)
print("Validation R2 score:", validation_r2)

# The filename ends with an ISO timestamp whose fractional-seconds part is cut off.
export_stamp = datetime.now().isoformat().rsplit(".", 1)[0]
filename = f"{MODEL_PREFIX}-autosk-{TARGET_FEATURE}-{TRAINING_TIME}.{export_stamp}.pkl"
print(f"Exporting model to '{filename}'")
joblib.dump(sk_automl, filename)


In [None]:
# Import everything this cell uses so it runs on a fresh kernel without first
# executing the auto-sklearn cell, which is the only other place these are imported.
from datetime import datetime
from pprint import pprint

import joblib
import sklearn.metrics
from tpot import TPOTRegressor

# Run the TPOT regression search; TRAINING_TIME is in seconds, TPOT takes minutes.
tpot_automl = TPOTRegressor(
    random_state=RANDOM_SEED,
    max_time_mins=TRAINING_TIME / 60,
    verbosity=3,
)
tpot_automl.fit(X_train, y_train)

# Show the pipeline TPOT selected and fitted.
pprint(tpot_automl.fitted_pipeline_)

# Score on the random test split, then on the held-out validation season.
y_hat = tpot_automl.predict(X_test)
print("Test R2 score:", sklearn.metrics.r2_score(y_test, y_hat))
y_hat_val = tpot_automl.predict(X_val)
print("Validation R2 score:", sklearn.metrics.r2_score(y_val, y_hat_val))

# Only the fitted pipeline is exported, not the TPOTRegressor search object.
# The filename ends with an ISO timestamp whose fractional-seconds part is cut off.
filename = f"{MODEL_PREFIX}-tpot-{TARGET_FEATURE}-{TRAINING_TIME}.{datetime.now().isoformat().rsplit('.', 1)[0]}.pkl"
print(f"Exporting model to '{filename}'")
joblib.dump(tpot_automl.fitted_pipeline_, filename)
