# MLB Models

## Data Export
```
# Source database plus the seasons and fantasy-score targets shared by every
# export command below.
DB_FILE="${FANTASY_HOME}/mlb_hist_20082022.scored.db"
SEASONS="2015 2016 2017 2018 2019 2020 2021 2022"
FANTASY_TARGETS="dk_score y_score"

# NOTE: $SEASONS and $FANTASY_TARGETS are intentionally left unquoted: the
# commands rely on sh/bash word splitting to pass one argument per value.
# "$DB_FILE" is quoted so a FANTASY_HOME containing spaces or glob characters
# still reaches dumpdata.sc as a single path argument.

# dump hitter data (output file: mlb_hitter.pq)
dumpdata.sc --seasons $SEASONS --only_starters \
   --pos LF CF RF 1B 2B 3B SS C OF DH PH --no_team \
   --stats "off_*" \
   --current_extra venue is_home hitting_side opp_starting_pitcher_hand \
   --opp_team_stats errors "p_*" --player_team_stats "off_*" \
   --target_calc_stats $FANTASY_TARGETS --target_stats off_hit off_runs \
   --hist_recent_games 5 --hist_recent_mode ma \
   --progress "$DB_FILE" --format parquet -f mlb_hitter.pq

# dump pitchers (output file: mlb_pitcher.pq)
dumpdata.sc --seasons $SEASONS --no_team \
   --only_starters --pos P \
   --stats "p_*" \
   --current_extra venue is_home "hit_*_opp" starting_pitcher_hand \
   --opp_team_stats "off_*" win --player_team_stats win "off_*" \
   --target_calc_stats $FANTASY_TARGETS --target_stats p_k p_ip p_hits \
   --hist_recent_games 5 --hist_recent_mode ma \
   --progress "$DB_FILE" --format parquet -f mlb_pitcher.pq

# dump teams (output file: mlb_team.pq)
dumpdata.sc --seasons $SEASONS --no_player \
   --stats "*" --opp_team_stats "*" \
   --current_extra venue is_home "*starting_pitcher_hand" "hit_*" \
   --target_stats off_runs win \
   --hist_recent_games 5 --hist_recent_mode ma \
   --progress "$DB_FILE" --format parquet -f mlb_team.pq
```

In [None]:
# Experiment-wide settings consumed by the load_data / model_and_test calls.
VALIDATION_SEASON = 2022
TRAINING_SEASONS = list(range(2015, 2022))  # 2015 through 2021 inclusive
# NOTE(review): presumably mirrors `--hist_recent_games 5` from the data export; confirm
RECENT_GAMES = 5
# Handed positionally to model_and_test; units are not visible here (TODO confirm)
TRAINING_TIME = 1200
RANDOM_SEED = 1  # passed as seed= to both load_data and model_and_test
MISSING_DATA_THRESHOLD = 0.1  # passed as load_data(missing_data_threshold=...)
REUSE_EXISTING = False  # passed as model_and_test(reuse_existing=...)


In [None]:
import sys

sys.path.append("..")

from fantasy_py import PlayerOrTeam
from train_test import load_data, model_and_test

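# Model configuration: uncomment ONE of the groups below before running this
# cell. As written, every group is commented out, so the names used by the
# load_data / model_and_test calls at the bottom of the cell (TARGET,
# MODEL_NAME, DATA_FILENAME, P_OR_T, COLS_TO_DROP, QUERY_FILTER, INCLUDE_POS,
# TARGET_POS) are undefined.
# NOTE(review): each group also sets ONLY_STARTERS, but neither call below
# passes it on -- confirm whether it is still needed.

# TARGET = ("stat", "off_runs")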
# MODEL_NAME = "MLB-team-runs"
# DATA_FILENAME = "mlb_team.pq"
# P_OR_T = PlayerOrTeam.TEAM
# COLS_TO_DROP = None
# QUERY_FILTER = None
# ONLY_STARTERS = None
# INCLUDE_POS = None
# TARGET_POS = None

# TARGET = ("stat", "win")
# MODEL_NAME = "MLB-team-win"
# DATA_FILENAME = "mlb_team.pq"
# P_OR_T = PlayerOrTeam.TEAM
# COLS_TO_DROP = None
# QUERY_FILTER = None
# ONLY_STARTERS = None
# INCLUDE_POS = None
# TARGET_POS = None


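# Pitcher models (mlb_pitcher.pq): uncomment a single TARGET / MODEL_NAME pair
# together with the shared settings that follow the pairs.
# TARGET = ("calc", "dk_score")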
# MODEL_NAME = "MLB-P-DK"
# TARGET = ("stat", "p_k")
# MODEL_NAME = "MLB-P-K"
# TARGET = ("stat", "p_ip")
# MODEL_NAME = "MLB-P-IP"
# TARGET = ("stat", "p_hits")
# MODEL_NAME = "MLB-P-HITS"

# DATA_FILENAME = "mlb_pitcher.pq"
# P_OR_T = PlayerOrTeam.PLAYER
# COLS_TO_DROP = ["*p_po*", "*:p_hold", "*p_save"]
# QUERY_FILTER = None
# ONLY_STARTERS = True
# INCLUDE_POS = False
# TARGET_POS = ["P"]


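# Hitter models (mlb_hitter.pq): uncomment a single TARGET / MODEL_NAME pair
# together with the shared settings that follow the pairs. QUERY_FILTER is
# built from TARGET, so TARGET must be defined before it.
# TARGET = ("calc", "dk_score")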
# MODEL_NAME = "MLB-H-DK"
# TARGET = ("stat", "off_hit")
# MODEL_NAME = "MLB-H-hit"
# TARGET = ("stat", "off_runs")
# MODEL_NAME = "MLB-H-run"

# DATA_FILENAME = "mlb_hitter.pq"
# P_OR_T = PlayerOrTeam.PLAYER
# COLS_TO_DROP = None
# ONLY_STARTERS = True
# QUERY_FILTER = f"`{':'.join(TARGET)}`.notna()"
# INCLUDE_POS = True
# TARGET_POS = ["1B", "2B", "3B", "SS", "C", "LF", "RF", "CF"]



# Load the exported data for the selected configuration. load_data returns
# three values; raw_df and tt_data are forwarded to model_and_test below,
# while one_hot_stats is not used again in this cell.
raw_df, tt_data, one_hot_stats = load_data(
    DATA_FILENAME, TARGET, VALIDATION_SEASON,
    seed=RANDOM_SEED,
    col_drop_filters=COLS_TO_DROP,
    filtering_query=QUERY_FILTER,
    include_position=INCLUDE_POS,
    missing_data_threshold=MISSING_DATA_THRESHOLD,
)

# Run model_and_test once per listed automl_type (currently just "tpot").
# `model` is rebound on every pass, so afterwards it holds the last result.
for automl_type in ("tpot",):
    model = model_and_test(
        MODEL_NAME, VALIDATION_SEASON, tt_data, TARGET, TRAINING_TIME,
        automl_type, P_OR_T, RECENT_GAMES, TRAINING_SEASONS,
        seed=RANDOM_SEED,
        # The same position list is supplied for both target and training.
        target_pos=TARGET_POS,
        training_pos=TARGET_POS,
        raw_df=raw_df,
        reuse_existing=REUSE_EXISTING,
    )