# Model Training and Evaluation 

## Predicting MLB game results

This notebook explores training and evaluating machine learning models that predict the outcomes of MLB games, using a dataset composed of current-season statistics.

### Notebook Setup

Import the necessary dependencies.

In [1]:
import pandas as pd
from datetime import datetime
from utils.notebook_setup import setup_notebook_env, load_env_variables

Set up the notebook environment. Add the project root to `sys.path` so higher-level modules can be accessed from this notebook, and load the environment variables that are required to configure the connection to the PostgreSQL database.

In [2]:
# These two calls must run before any project-level imports: per the notebook
# text, they put the project root on sys.path and load the environment
# variables used to configure the PostgreSQL connection.
# NOTE(review): presumably setup_notebook_env() handles sys.path and
# load_env_variables() handles the environment — confirm in utils.notebook_setup.
setup_notebook_env()
load_env_variables()

Now, import the remaining dependencies that couldn't be accessed prior to the notebook environment setup, and connect to the database.

In [3]:
from shared.database import connect_to_db
from machine_learning.data.processing.mlb_data_pipeline import MLBDataPipeline

# Open the database session; its `bind` attribute is what the pandas reads
# further down use as their connection.
session = connect_to_db()

### Model Selection

Read the relevant database tables into DataFrames so training data can be prepared using the data pipeline class.

In [4]:
# Load each source table into its own DataFrame, using the session's bound
# connectable as the pandas connection.
teams_df = pd.read_sql_table(table_name="mlb_teams", con=session.bind)
schedule_df = pd.read_sql_table(table_name="mlb_schedule", con=session.bind)
offensive_stats_df = pd.read_sql_table(
    table_name="mlb_offensive_stats",
    con=session.bind,
)
mlb_defensive_stats_df = pd.read_sql_table(
    table_name="mlb_defensive_stats",
    con=session.bind,
)

Prepare training data for a given date range.

In [5]:
# Date window for the training set. The rendered tail() output includes games
# dated 2024-10-01, so end_date appears to be inclusive — confirm in the pipeline.
start_date = datetime(year=2024, month=4, day=1)
end_date = datetime(year=2024, month=10, day=1)

# Window sizes are handed to the pipeline as-is; presumably they are counts of
# prior games used for rolling and head-to-head features — TODO confirm.
data_pipeline = MLBDataPipeline(head_to_head_window=5, rolling_window=10)

# Build the training frame from the four tables loaded earlier in the notebook.
training_data = data_pipeline.prepare_training_data(
    start_date=start_date,
    end_date=end_date,
    schedule_df=schedule_df,
    teams_df=teams_df,
    offensive_stats_df=offensive_stats_df,
    defensive_stats_df=mlb_defensive_stats_df,
)

In [6]:
# Preview the first rows of the prepared training data.
# NOTE(review): the rendered output shows NaN rolling features for several
# opening-day games and 0.0 for the batting-average/strikeout columns —
# confirm these are expected and handled before modelling.
training_data.head()

Unnamed: 0,home_rolling_win_pct,away_rolling_win_pct,home_rolling_runs_scored,away_rolling_runs_scored,home_rolling_runs_allowed,away_rolling_runs_allowed,home_days_rest,away_days_rest,home_batting_avg,away_batting_avg,...,away_strikeouts,h2h_home_win_pct,h2h_away_win_pct,h2h_games_played,game_id,game_date,home_team_id,away_team_id,home_team_won,run_differential
0,0.2,0.4,5.2,3.5,7.1,5.9,1.0,1.0,0.0,0.0,...,0.0,1.0,0.0,1,746899,2024-04-01,112,115,True,5.0
1,,,,,,,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0,744875,2024-04-01,120,134,False,-4.0
2,,,,,,,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0,747058,2024-04-01,110,118,True,2.0
3,,,,,,,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0,745600,2024-04-01,143,113,False,-3.0
4,,0.4,,4.1,,5.7,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0,746085,2024-04-01,146,108,False,-3.0


In [7]:
# Preview the last rows of the prepared training data.
# NOTE(review): in the rendered output the batting-average and strikeout
# columns are still 0.0 at the end of the date range, which may mean those
# stats are not being populated — verify. Games dated on end_date are present.
training_data.tail()

Unnamed: 0,home_rolling_win_pct,away_rolling_win_pct,home_rolling_runs_scored,away_rolling_runs_scored,home_rolling_runs_allowed,away_rolling_runs_allowed,home_days_rest,away_days_rest,home_batting_avg,away_batting_avg,...,away_strikeouts,h2h_home_win_pct,h2h_away_win_pct,h2h_games_played,game_id,game_date,home_team_id,away_team_id,home_team_won,run_differential
2369,0.6,0.6,5.8,4.8,4.0,3.7,2.0,2.0,0.0,0.0,...,0.0,0.6,0.4,5,747064,2024-09-25,144,121,True,3.0
2370,0.6,0.7,4.5,3.7,4.7,3.5,1.0,1.0,0.0,0.0,...,0.0,0.6,0.4,5,775345,2024-10-01,117,116,False,-2.0
2371,0.7,0.4,5.6,1.9,4.0,2.8,1.0,1.0,0.0,0.0,...,0.0,0.6,0.4,5,775343,2024-10-01,110,118,False,-1.0
2372,0.5,0.5,4.2,4.0,4.1,4.4,1.0,1.0,0.0,0.0,...,0.0,0.8,0.2,5,775340,2024-10-01,158,121,False,-4.0
2373,0.7,0.8,3.8,5.1,3.3,2.0,1.0,1.0,0.0,0.0,...,0.0,0.6,0.4,5,775333,2024-10-01,135,144,True,4.0
