In [1]:
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from analysis import wl_accuracy, season_record, runs_per_game
from data_format import create_data

In [2]:
version = 6
df = pd.read_csv(f'../data/baseball/training/game_data_v{version}.csv')

In [4]:
season = 2023

In [5]:
x_train_away, x_test_away, y_train_away, y_test_away = create_data(df, drop_cols=['away_score', 'home_score',
                                                                              'away_team', 'home_team',
                                                                              'away_team_xfip', 'away_starter_xfip',
                                                                              'home_lineup_xwoba', 'home_run_diff',
                                                                              'away_team_xwoba_diff', 'away_run_diff'],
                                                                   y_col='away_score',
                                                                   split_by='season',
                                                                   season=season)
x_train_home, x_test_home, y_train_home, y_test_home = create_data(df, drop_cols=['away_score', 'home_score',
                                                                              'away_team', 'home_team',
                                                                              'home_team_xfip', 'home_starter_xfip',
                                                                              'away_lineup_xwoba', 'away_run_diff',
                                                                              'home_team_xwoba_diff', 'home_run_diff'],
                                                                   y_col='home_score',
                                                                   split_by='season',
                                                                   season=season)
# x_train_away, x_test_away, y_train_away, y_test_away = create_data(df, y_col='away_score', split_by='season', season=season)
# x_train_home, x_test_home, y_train_home, y_test_home = create_data(df, y_col='home_score', split_by='season', season=season)

In [6]:
model_away = Pipeline([('scaler', StandardScaler()),
                       ('model', LinearRegression())])
model_home = Pipeline([('scaler', StandardScaler()),
                       ('model', LinearRegression())])

model_away.fit(x_train_away, y_train_away)
model_home.fit(x_train_home, y_train_home)

pred_away = model_away.predict(x_test_away)
pred_home = model_home.predict(x_test_home)

results = pd.DataFrame({'away_pred': pred_away, 'home_pred': pred_home, 'away_true': y_test_away, 'home_true': y_test_home})
results.describe()

print(f'LINEAR REGRESSION MODEL - testing on {season} season')
print('-' * 30)
print(f'Accuracy: {wl_accuracy(results)}%')
print('-' * 30)
season_record(df, results, season=season)
print('-' * 30)
runs_per_game(df, results, season=season)

LINEAR REGRESSION MODEL - testing on 2023 season
------------------------------
Accuracy: 58.2716049382716%
------------------------------
Season records:
ATL: 151-11	LAD: 140-22	TOR: 132-30	PHI: 131-31	MIN: 123-39	
HOU: 121-41	SEA: 117-45	SF: 112-50	SD: 111-51	NYY: 109-53	
MIL: 108-54	TB: 100-62	NYM: 96-66	TEX: 90-72	MIA: 89-73	
CHC: 87-75	BOS: 83-79	STL: 77-85	BAL: 77-85	LAA: 64-98	
DET: 58-104	CIN: 52-110	CLE: 50-112	KC: 41-121	AZ: 32-130	
PIT: 29-133	CWS: 27-135	WSH: 11-151	COL: 7-155	OAK: 5-157	
------------------------------
Runs scored per game:
ATL: 5.33 (864)	LAD: 5.01 (812)	HOU: 4.87 (789)	PHI: 4.8 (777)	STL: 4.78 (775)	
SD: 4.78 (775)	TOR: 4.78 (774)	MIN: 4.72 (764)	SEA: 4.71 (763)	CHC: 4.64 (752)	
TEX: 4.62 (749)	NYM: 4.62 (748)	BAL: 4.59 (743)	NYY: 4.58 (743)	MIL: 4.56 (739)	
LAA: 4.56 (739)	MIA: 4.55 (736)	SF: 4.53 (733)	BOS: 4.5 (729)	CIN: 4.48 (726)	
KC: 4.45 (722)	DET: 4.43 (718)	TB: 4.41 (715)	CLE: 4.36 (706)	PIT: 4.31 (698)	
AZ: 4.29 (696)	CWS: 4.25 (689)	COL: 4.15 (

In [7]:
print(f'R^2 (away) = {model_away.score(x_test_away, y_test_away)}')
print(f'R^2 (home) = {model_home.score(x_test_home, y_test_home)}')

R^2 (away) = 0.05080470455299846
R^2 (home) = 0.03717991399842202
