In [None]:
# MLB Models

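## Data Export
Run the following shell commands to export the CSV files (`mlb_hitter.csv`, `mlb_pitcher.csv`, `mlb_team.csv`) that the training cells below load.
```bash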
DB_FILE=${FANTASY_HOME}/mlb_hist_20082022.scored.db
SEASONS="2015 2016 2017 2018 2019 2020 2021 2022"
FANTASY_TARGETS="dk_score y_score"

# dump hitter data
dumpdata.sc --seasons $SEASONS --only_starters \
   --pos LF CF RF 1B 2B 3B SS C OF DH PH --no_team \
   --stats "off_*" \
   --current_extra venue is_home hitting_side opp_starting_pitcher_hand \
   --opp_team_stats errors "p_*" --player_team_stats "off_*" \
   --target_calc_stats $FANTASY_TARGETS \
   --hist_recent_games 5 --hist_recent_mode ma \
   --progress $DB_FILE -f mlb_hitter.csv

# dump pitchers
dumpdata.sc --seasons $SEASONS --no_team \
   --only_starters --pos P \
   --stats "p_*" \
   --current_extra venue is_home "hit_*_opp" \
   --opp_team_stats "off_*" win --player_team_stats win "off_*" \
   --target_calc_stats $FANTASY_TARGETS \
   --hist_recent_games 5 --hist_recent_mode ma \
   --progress $DB_FILE -f mlb_pitcher.csv

# teams
dumpdata.sc --seasons $SEASONS --no_player \
   --stats "*" --opp_team_stats "*" \
   --current_extra venue is_home "*starting_pitcher_hand" "hit_*" \
   --target_stats off_runs win \
   --hist_recent_games 5 --hist_recent_mode ma \
   --progress $DB_FILE -f mlb_team.csv
```

In [None]:
# Seasons used for fitting (2015 through 2021) and the season passed to
# load_data as the validation season.
TRAINING_SEASONS = list(range(2015, 2022))
VALIDATION_SEASON = 2022

# Same value as --hist_recent_games in the data export commands.
RECENT_GAMES = 5

# Seed handed to load_data, train_test and create_fantasy_model.
RANDOM_SEED = 1

# AutoML training budget; units are not visible here — presumably seconds,
# TODO confirm against train_test.
TRAINING_TIME = 1_200



In [None]:
from datetime import datetime

from fantasy_py import PlayerOrTeam

from train_test import load_data, train_test, create_fantasy_model

# --- Active model configuration: team runs -------------------------------
# Alternative configurations are kept commented out below; swap one set of
# assignments in for this one to train a different model.

# Name used for the model and as the prefix of the dumped model file.
MODEL_NAME = "MLB-team-runs"
# (target kind, target name) pair; the alternates below use "stat" or "calc"
# as the kind.
TARGET = ("stat", "off_runs")

# Input CSV produced by the data export step, and the entity type it holds.
DATA_FILENAME = "mlb_team.csv"
P_OR_T = PlayerOrTeam.TEAM

# The team model applies no column drops, row filter, starter restriction
# or position handling.
COLS_TO_DROP = QUERY_FILTER = None
ONLY_STARTERS = INCLUDE_POS = TARGET_POS = None

# TARGET = ("stat", "win")
# MODEL_NAME = "MLB-team-win"
# DATA_FILENAME = "mlb_team.csv"
# P_OR_T = PlayerOrTeam.TEAM
# COLS_TO_DROP = None
# QUERY_FILTER = None
# ONLY_STARTERS = None
# INCLUDE_POS = None
# TARGET_POS = None

# TARGET = ("calc", "dk_score")
# MODEL_NAME = "MLB-P-DK"
# DATA_FILENAME = "mlb_pitcher.csv"
# P_OR_T = PlayerOrTeam.PLAYER
# COLS_TO_DROP = ["*p_po*", "*:p_hold"]
# QUERY_FILTER = None
# ONLY_STARTERS = True
# INCLUDE_POS = False
# TARGET_POS = ["P"]

# TARGET = ("calc", "dk_score")
# MODEL_NAME = "MLB-H-DK"
# DATA_FILENAME = "mlb_hitter.csv"
# P_OR_T = PlayerOrTeam.PLAYER
# COLS_TO_DROP = None
# ONLY_STARTERS = True
# QUERY_FILTER = f"`{':'.join(TARGET)}`.notna()"
# INCLUDE_POS = True
# TARGET_POS = ["1B", "2B", "3B", "SS", "C", "LF", "RF", "CF"]

# Load the exported CSV for the active configuration. load_data returns three
# values, bound here as raw_df, tt_data and one_hot_stats; all three are
# passed on to training / model creation further down.
raw_df, tt_data, one_hot_stats = load_data(
    DATA_FILENAME,
    TARGET,
    VALIDATION_SEASON,
    seed=RANDOM_SEED,
    col_drop_filters=COLS_TO_DROP,
    filtering_query=QUERY_FILTER,
    include_position=INCLUDE_POS,
)

# Sanity check: "extra:venue" must be the only key in one_hot_stats.
# The message reports what was actually found so a mismatch (e.g. after
# switching DATA_FILENAME) can be diagnosed without re-running the cell.
expected_one_hot = ["extra:venue"]
actual_one_hot = list(one_hot_stats.keys())
assert actual_one_hot == expected_one_hot, (
    f"Unexpected one-hot stats for '{DATA_FILENAME}': "
    f"expected {expected_one_hot}, got {actual_one_hot}"
)

# Train, wrap and export one model per AutoML backend. Only "tpot" is
# enabled; an earlier variant of this loop also iterated over "autosk".
for automl_type in ("tpot",):
    # Single timestamp shared by the training run and the model wrapper.
    dt_trained = datetime.now()

    # Fit and evaluate; yields the path of the fitted artifact plus a
    # performance mapping.
    model_filepath, performance = train_test(
        automl_type,
        MODEL_NAME,
        TARGET,
        tt_data,
        RANDOM_SEED,
        TRAINING_TIME,
        dt_trained,
    )
    # Record which season the performance numbers refer to.
    performance["season"] = VALIDATION_SEASON

    # Keyword options for the model wrapper; TARGET_POS is used for both the
    # target and the training positions.
    model_options = dict(
        only_starters=ONLY_STARTERS,
        seed=RANDOM_SEED,
        target_pos=TARGET_POS,
        training_pos=TARGET_POS,
        raw_df=raw_df,
        one_hot_stats=one_hot_stats,
    )
    model = create_fantasy_model(
        MODEL_NAME,
        model_filepath,
        dt_trained,
        tt_data[0],  # first element of tt_data — presumably the training features; TODO confirm
        TARGET,
        TRAINING_TIME,
        P_OR_T,
        RECENT_GAMES,
        automl_type,
        performance,
        TRAINING_SEASONS,
        **model_options,
    )

    # Dump under "<model name>.<target name>.<backend>.model"; model_filepath
    # is rebound to whatever path dump() reports.
    dump_name = ".".join((MODEL_NAME, TARGET[1], automl_type, "model"))
    model_filepath = model.dump(dump_name)
    print(f"Model file saved to '{model_filepath}'")
