In [1]:
import pandas as pd

from sklearn.linear_model import PoissonRegressor
from sklearn.preprocessing import QuantileTransformer
from sklearn.pipeline import Pipeline

from analysis import wl_accuracy, season_record, runs_per_game
from data_format import create_data

In [2]:
# Version tag of the training export; it selects which game_data_v<N>.csv is read.
version = 6
training_csv = f'../data/baseball/training/game_data_v{version}.csv'
df = pd.read_csv(training_csv)

In [3]:
# Season handed to create_data(..., split_by='season', season=season) and to the
# reporting helpers; the report header later prints it as the season being tested on.
season = 2023

In [4]:
# Columns removed from the feature matrix for both targets.
shared_drop_cols = ['away_score', 'home_score',
                    'away_team', 'home_team']

# Extra columns removed only when the target is the away score.
away_extra_drop_cols = ['away_team_xfip', 'away_starter_xfip',
                        'home_lineup_xwoba', 'home_run_diff',
                        'away_team_xwoba_diff', 'away_run_diff']

# Mirror image of the list above, used when the target is the home score.
home_extra_drop_cols = ['home_team_xfip', 'home_starter_xfip',
                        'away_lineup_xwoba', 'away_run_diff',
                        'home_team_xwoba_diff', 'home_run_diff']

# Both targets are split the same way: by season, using the configured `season`.
split_options = dict(split_by='season', season=season)

x_train_away, x_test_away, y_train_away, y_test_away = create_data(
    df,
    drop_cols=shared_drop_cols + away_extra_drop_cols,
    y_col='away_score',
    **split_options,
)
x_train_home, x_test_home, y_train_home, y_test_home = create_data(
    df,
    drop_cols=shared_drop_cols + home_extra_drop_cols,
    y_col='home_score',
    **split_options,
)

# An earlier variant of this cell called create_data(df, y_col=..., split_by='season')
# with no drop_cols and no season argument; it is kept here only as a note.

In [6]:
# Hyper-parameters shared by the away-score and home-score models.
N_QUANTILES = 10    # n_quantiles passed to QuantileTransformer
POISSON_ALPHA = 9   # alpha (penalty strength) passed to PoissonRegressor
RANDOM_STATE = 0    # seeds QuantileTransformer so any subsampling it does is repeatable


def build_model():
    """Return a fresh, unfitted pipeline: quantile scaling followed by Poisson regression.

    Each call creates new estimator objects, so the away and home models never
    share fitted state.
    """
    return Pipeline([
        ('scaler', QuantileTransformer(n_quantiles=N_QUANTILES, random_state=RANDOM_STATE)),
        ('model', PoissonRegressor(alpha=POISSON_ALPHA)),
    ])


model_away = build_model()
model_home = build_model()

model_away.fit(x_train_away, y_train_away)
model_home.fit(x_train_home, y_train_home)

pred_away = model_away.predict(x_test_away)
pred_home = model_home.predict(x_test_home)

# One row per test game: predicted and actual runs for each side.
results = pd.DataFrame({'away_pred': pred_away, 'home_pred': pred_home, 'away_true': y_test_away, 'home_true': y_test_home})
# A bare `results.describe()` in the middle of a cell is evaluated and thrown away;
# display() (available without import inside IPython/Jupyter) actually renders it.
display(results.describe())

# NOTE(review): the recorded run of this cell shows almost identical runs-per-game
# for every team; the penalty strength may be shrinking the coefficients too hard.
# Worth confirming by sweeping POISSON_ALPHA.
print(f'POISSON REGRESSION MODEL - testing on {season} season')
print('-' * 30)
print(f'Accuracy: {wl_accuracy(results)}%')
print('-' * 30)
season_record(df, results, season=season)
print('-' * 30)
runs_per_game(df, results, season=season)

POISSON REGRESSION MODEL - testing on 2023 season
------------------------------
Accuracy: 53.991769547325106%
------------------------------
Season records:
ATL: 101-61	PHI: 99-63	LAD: 97-65	TOR: 94-68	SF: 94-68	
MIL: 90-72	HOU: 90-72	MIA: 89-73	SEA: 89-73	NYM: 87-75	
BAL: 84-78	MIN: 84-78	NYY: 84-78	TB: 84-78	LAA: 83-79	
SD: 82-80	CHC: 81-81	STL: 81-81	BOS: 81-81	TEX: 81-81	
CIN: 79-83	CLE: 78-84	DET: 78-84	CWS: 77-85	KC: 74-88	
PIT: 70-92	AZ: 69-93	WSH: 59-103	COL: 49-113	OAK: 42-120	
------------------------------
Runs scored per game:
ATL: 4.58 (742)	LAD: 4.58 (741)	HOU: 4.57 (741)	TOR: 4.57 (740)	SEA: 4.57 (740)	
PHI: 4.57 (740)	MIL: 4.56 (739)	BAL: 4.56 (739)	SF: 4.56 (738)	MIA: 4.56 (738)	
STL: 4.56 (738)	MIN: 4.56 (738)	SD: 4.55 (738)	NYM: 4.55 (738)	CHC: 4.55 (738)	
LAA: 4.55 (738)	CIN: 4.55 (737)	DET: 4.55 (737)	NYY: 4.55 (737)	TEX: 4.55 (737)	
KC: 4.55 (737)	CLE: 4.54 (736)	PIT: 4.54 (736)	BOS: 4.54 (736)	TB: 4.54 (735)	
CWS: 4.53 (734)	WSH: 4.53 (734)	AZ: 4.53 (734)	COL: 4

In [7]:
# Score each fitted pipeline on its held-out split; the value is reported under
# the label D^2 (for PoissonRegressor, score() is the fraction of deviance explained).
d2_away = model_away.score(x_test_away, y_test_away)
print(f'D^2 (away) = {d2_away}')

d2_home = model_home.score(x_test_home, y_test_home)
print(f'D^2 (home) = {d2_home}')

D^2 (away) = 0.004085885362741459
D^2 (home) = 0.00388352817694837
