In [1]:
import pandas as pd

from sklearn.linear_model import PoissonRegressor
from sklearn.preprocessing import QuantileTransformer
from sklearn.pipeline import Pipeline

from analysis import wl_accuracy, season_record, runs_per_game
from data_format import create_data

In [2]:
# Version number interpolated into the training CSV filename below.
version = 6
training_csv = f'../data/baseball/training/game_data_v{version}.csv'

# Full game-level frame; later cells split it into train/test sets.
df = pd.read_csv(training_csv)

In [3]:
# Season referenced by the evaluation cell: it appears in the report header and is
# passed to season_record() and runs_per_game().
season = 2023

In [4]:
# Build one train/test split per target: away runs and home runs are modelled separately.
#
# Disabled variant, kept for reference: it additionally dropped a hand-picked set of
# feature columns for each target via `drop_cols`.
# x_train_away, x_test_away, y_train_away, y_test_away = create_data(df, drop_cols=['away_score', 'home_score',
#                                                                               'away_team', 'home_team',
#                                                                               'away_team_xfip', 'away_starter_xfip',
#                                                                               'home_lineup_xwoba', 'home_run_diff',
#                                                                               'away_team_xwoba_diff', 'away_run_diff'],
#                                                                    y_col='away_score',
#                                                                    split_by='season',
#                                                                    season=season)
# x_train_home, x_test_home, y_train_home, y_test_home = create_data(df, drop_cols=['away_score', 'home_score',
#                                                                               'away_team', 'home_team',
#                                                                               'home_team_xfip', 'home_starter_xfip',
#                                                                               'away_lineup_xwoba', 'away_run_diff',
#                                                                               'home_team_xwoba_diff', 'home_run_diff'],
#                                                                    y_col='home_score',
#                                                                    split_by='season',
#                                                                    season=season)

# `season` is passed explicitly (as the disabled variant above did) so the split is tied
# to the same season that the evaluation report names, instead of relying on whatever
# default create_data uses.
# NOTE(review): presumably create_data holds out `season` as the test set when
# split_by='season' -- confirm against data_format.create_data.
x_train_away, x_test_away, y_train_away, y_test_away = create_data(
    df, y_col='away_score', split_by='season', season=season)
x_train_home, x_test_home, y_train_home, y_test_home = create_data(
    df, y_col='home_score', split_by='season', season=season)

In [8]:
def make_poisson_pipeline(n_quantiles=10, alpha=17, random_state=0):
    """Build the unfitted scaler + Poisson regression pipeline used for both targets.

    Parameters
    ----------
    n_quantiles : int
        Number of quantiles for the QuantileTransformer step.
    alpha : float
        Regularisation strength passed to PoissonRegressor.
    random_state : int
        Seed for the QuantileTransformer. The transformer subsamples its input
        at random when the training set is large, so fixing the seed makes
        repeated runs of this cell fit the same transform.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Unfitted pipeline with the steps 'scaler' and 'model'.
    """
    return Pipeline([
        ('scaler', QuantileTransformer(n_quantiles=n_quantiles, random_state=random_state)),
        # ('normalizer', Normalizer()),  # disabled step; Normalizer is not imported in this notebook
        ('model', PoissonRegressor(alpha=alpha)),
    ])


# One independent model per target, built from the same recipe.
model_away = make_poisson_pipeline()
model_home = make_poisson_pipeline()

model_away.fit(x_train_away, y_train_away)
model_home.fit(x_train_home, y_train_home)

pred_away = model_away.predict(x_test_away)
pred_home = model_home.predict(x_test_home)

# Predicted and actual runs side by side; this frame is what the analysis helpers consume.
results = pd.DataFrame({
    'away_pred': pred_away,
    'home_pred': pred_home,
    'away_true': y_test_away,
    'home_true': y_test_home,
})

print(f'POISSON REGRESSION MODEL - testing on {season} season')
print('-' * 30)
print(f'Accuracy: {wl_accuracy(results)}%')
print('-' * 30)
season_record(df, results, season=season)
print('-' * 30)
runs_per_game(df, results, season=season)

POISSON REGRESSION MODEL - testing on 2023 season
------------------------------
Accuracy: 53.991769547325106%
------------------------------
Season records:
ATL: 105-57	LAD: 102-60	PHI: 98-64	TOR: 95-67	HOU: 94-68	
SF: 91-71	MIL: 90-72	SEA: 89-73	TB: 89-73	SD: 87-75	
NYM: 85-77	NYY: 85-77	MIN: 84-78	BAL: 83-79	TEX: 82-80	
CHC: 81-81	MIA: 81-81	STL: 81-81	BOS: 81-81	LAA: 81-81	
CIN: 79-83	CLE: 79-83	DET: 77-85	CWS: 76-86	KC: 71-91	
PIT: 69-93	AZ: 69-93	WSH: 58-104	COL: 47-115	OAK: 41-121	
------------------------------
Runs scored per game:
ATL: 4.59 (744)	LAD: 4.59 (744)	HOU: 4.58 (743)	PHI: 4.58 (742)	TOR: 4.58 (742)	
SEA: 4.58 (741)	MIN: 4.57 (741)	MIL: 4.57 (741)	SD: 4.57 (740)	TB: 4.57 (740)	
SF: 4.57 (740)	NYY: 4.56 (739)	NYM: 4.56 (739)	TEX: 4.56 (738)	BAL: 4.56 (738)	
CHC: 4.56 (738)	MIA: 4.55 (738)	STL: 4.55 (737)	BOS: 4.55 (737)	CLE: 4.54 (736)	
LAA: 4.54 (736)	CIN: 4.54 (736)	DET: 4.54 (735)	CWS: 4.53 (734)	KC: 4.53 (734)	
AZ: 4.53 (733)	PIT: 4.52 (733)	WSH: 4.51 (731)	COL: 

In [7]:
# Report each fitted pipeline's score on its own test split.
# Pipeline.score delegates to the final PoissonRegressor step; the value is labelled D^2 here.
for side, fitted, features, target in (
    ('away', model_away, x_test_away, y_test_away),
    ('home', model_home, x_test_home, y_test_home),
):
    print(f'D^2 ({side}) = {fitted.score(features, target)}')

D^2 (away) = 0.004085885362741459
D^2 (home) = 0.00388352817694837


In [6]:
# Score a separate game file with the already-fitted models.
# The file is loaded under its own name rather than `df`: the evaluation cell passes `df`
# to season_record() / runs_per_game(), so rebinding it here would change what those
# calls see whenever cells are re-run.
test_games = pd.read_csv('../data/baseball/game_data_test.csv')

# Columns removed before prediction (scores and team identifiers).
# NOTE(review): assumes the remaining columns match the features the models were fitted on -- confirm.
non_feature_cols = ['away_score', 'home_score', 'away_team', 'home_team']
x_away = test_games.drop(columns=non_feature_cols)
x_home = test_games.drop(columns=non_feature_cols)

model_away.predict(x_away), model_home.predict(x_home)

(array([4.53658125]), array([4.48248258]))