In [1]:
import pandas as pd

from sklearn.linear_model import PoissonRegressor
from sklearn.preprocessing import QuantileTransformer
from sklearn.pipeline import Pipeline

from analysis import wl_accuracy, season_record, runs_per_game
from data_format import create_data

In [2]:
# Revision number of the training dataset; it selects which versioned CSV is read.
version = 6
training_csv = f'../data/baseball/training/game_data_v{version}.csv'

# Full game-level frame; later cells split it and also pass it to the reporting helpers.
df = pd.read_csv(training_csv)

In [3]:
# Season under evaluation: used in the report header and passed to
# season_record / runs_per_game in the evaluation cell.
season = 2023

In [4]:
# Build one train/test split per target: away runs and home runs are modelled separately.
#
# `season` is passed explicitly so the held-out season is always the one named in the
# evaluation report, rather than whatever create_data falls back to by default.
#
# NOTE(review): an earlier experiment passed per-target `drop_cols` lists instead of
# relying on create_data's default. Each list contained 'away_score', 'home_score',
# 'away_team', 'home_team' plus:
#   away model: 'away_team_xfip', 'away_starter_xfip', 'home_lineup_xwoba',
#               'home_run_diff', 'away_team_xwoba_diff', 'away_run_diff'
#   home model: 'home_team_xfip', 'home_starter_xfip', 'away_lineup_xwoba',
#               'away_run_diff', 'home_team_xwoba_diff', 'home_run_diff'
x_train_away, x_test_away, y_train_away, y_test_away = create_data(
    df, y_col='away_score', split_by='season', season=season)
x_train_home, x_test_home, y_train_home, y_test_home = create_data(
    df, y_col='home_score', split_by='season', season=season)

In [5]:
def build_model():
    """Return an unfitted pipeline: quantile scaling followed by Poisson regression.

    Both the away and home models use the same architecture, so it is defined
    once here. `random_state` is fixed so that any subsampling the
    QuantileTransformer performs while estimating quantiles is repeatable
    across kernel restarts.
    """
    return Pipeline([
        ('scaler', QuantileTransformer(n_quantiles=10, random_state=0)),
        ('model', PoissonRegressor(alpha=9)),
    ])


# One independent model per target (runs scored by the away / home team).
model_away = build_model()
model_home = build_model()

model_away.fit(x_train_away, y_train_away)
model_home.fit(x_train_home, y_train_home)

pred_away = model_away.predict(x_test_away)
pred_home = model_home.predict(x_test_home)

# Predicted and actual runs side by side; this frame is what the analysis helpers consume.
results = pd.DataFrame({
    'away_pred': pred_away,
    'home_pred': pred_home,
    'away_true': y_test_away,
    'home_true': y_test_home,
})

print(f'POISSON REGRESSION MODEL - testing on {season} season')
print('-' * 30)
print(f'Accuracy: {wl_accuracy(results)}%')
print('-' * 30)
season_record(df, results, season=season)
print('-' * 30)
runs_per_game(df, results, season=season)

POISSON REGRESSION MODEL - testing on 2023 season
------------------------------
Accuracy: 57.69547325102881%
------------------------------
Season records:
ATL: 157-5	LAD: 150-12	TOR: 140-22	HOU: 140-22	SEA: 132-30	
PHI: 132-30	TB: 128-34	MIN: 126-36	MIL: 121-41	NYY: 116-46	
SF: 112-50	SD: 111-51	NYM: 93-69	TEX: 85-77	CHC: 83-79	
MIA: 75-87	BAL: 74-88	BOS: 68-94	CLE: 61-101	STL: 59-103	
DET: 52-110	LAA: 45-117	CIN: 44-118	AZ: 30-132	CWS: 26-136	
KC: 26-136	PIT: 21-141	WSH: 11-151	COL: 10-152	OAK: 2-160	
------------------------------
Runs scored per game:
ATL: 4.87 (789)	LAD: 4.85 (786)	HOU: 4.79 (775)	PHI: 4.73 (767)	TOR: 4.73 (766)	
SEA: 4.71 (762)	MIN: 4.69 (760)	SD: 4.68 (757)	MIL: 4.66 (755)	TB: 4.63 (750)	
NYY: 4.62 (748)	SF: 4.61 (747)	TEX: 4.6 (745)	NYM: 4.6 (745)	CHC: 4.57 (740)	
BAL: 4.56 (739)	STL: 4.55 (737)	MIA: 4.52 (732)	BOS: 4.5 (729)	CLE: 4.47 (724)	
LAA: 4.46 (722)	CIN: 4.45 (721)	DET: 4.41 (715)	KC: 4.35 (705)	AZ: 4.35 (704)	
CWS: 4.34 (703)	PIT: 4.31 (699)	WSH: 4.2

In [7]:
# Report each fitted pipeline's score on its held-out split, away model first.
for side, fitted, features, target in (
    ('away', model_away, x_test_away, y_test_away),
    ('home', model_home, x_test_home, y_test_home),
):
    print(f'D^2 ({side}) = {fitted.score(features, target)}')

D^2 (away) = 0.004085885362741459
D^2 (home) = 0.00388352817694837


In [6]:
# Score a separate game file with the already-fitted models.
# The file is loaded under its own name so the training frame `df` is not overwritten:
# the evaluation cell still passes `df` to season_record / runs_per_game, and
# re-running it after this cell would otherwise silently use the wrong data.
test_games = pd.read_csv('../data/baseball/game_data_test.csv')

# Both models take the same feature matrix: everything except the scores and team labels.
x_away = test_games.drop(['away_score', 'home_score', 'away_team', 'home_team'], axis=1)
x_home = x_away.copy()  # independent copy, matching the original's two separate frames

model_away.predict(x_away), model_home.predict(x_home)

(array([4.53658125]), array([4.48248258]))