In [1]:
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from analysis import wl_accuracy, season_record, runs_per_game
from data_format import create_data

In [2]:
version = 6
df = pd.read_csv(f'../data/baseball/training/game_data_v{version}.csv')

In [3]:
season = 2023

In [4]:
# x_train_away, x_test_away, y_train_away, y_test_away = create_data(df, drop_cols=['away_score', 'home_score',
#                                                                               'away_team', 'home_team',
#                                                                               'away_team_xfip', 'away_starter_xfip',
#                                                                               'home_lineup_xwoba', 'home_run_diff',
#                                                                               'away_team_xwoba_diff', 'away_run_diff'],
#                                                                    y_col='away_score',
#                                                                    split_by='season',
#                                                                    season=season)
# x_train_home, x_test_home, y_train_home, y_test_home = create_data(df, drop_cols=['away_score', 'home_score',
#                                                                               'away_team', 'home_team',
#                                                                               'home_team_xfip', 'home_starter_xfip',
#                                                                               'away_lineup_xwoba', 'away_run_diff',
#                                                                               'home_team_xwoba_diff', 'home_run_diff'],
#                                                                    y_col='home_score',
#                                                                    split_by='season',
#                                                                    season=season)
x_train_away, x_test_away, y_train_away, y_test_away = create_data(df, y_col='away_score', split_by='season', season=season)
x_train_home, x_test_home, y_train_home, y_test_home = create_data(df, y_col='home_score', split_by='season', season=season)

In [5]:
model_away = Pipeline([('scaler', StandardScaler()),
                       ('model', LinearRegression())])
model_home = Pipeline([('scaler', StandardScaler()),
                       ('model', LinearRegression())])

model_away.fit(x_train_away, y_train_away)
model_home.fit(x_train_home, y_train_home)

pred_away = model_away.predict(x_test_away)
pred_home = model_home.predict(x_test_home)

results = pd.DataFrame({'away_pred': pred_away, 'home_pred': pred_home, 'away_true': y_test_away, 'home_true': y_test_home})
results.describe()

print(f'LINEAR REGRESSION MODEL - testing on {season} season')
print('-' * 30)
print(f'Accuracy: {wl_accuracy(results)}%')
print('-' * 30)
season_record(df, results, season=season)
print('-' * 30)
runs_per_game(df, results, season=season)

LINEAR REGRESSION MODEL - testing on 2023 season
------------------------------
Accuracy: 58.806584362139915%
------------------------------
Season records:
LAD: 156-6	TB: 150-12	ATL: 149-13	HOU: 148-14	TOR: 122-40	
SD: 121-41	MIN: 119-43	PHI: 115-47	TEX: 114-48	SEA: 109-53	
MIL: 105-57	CHC: 105-57	NYM: 97-65	BAL: 89-73	NYY: 88-74	
SF: 85-77	BOS: 73-89	CLE: 73-89	STL: 70-92	CIN: 54-108	
AZ: 54-108	DET: 53-109	MIA: 49-113	LAA: 45-117	CWS: 30-132	
PIT: 21-141	KC: 18-144	WSH: 10-152	COL: 7-155	OAK: 1-161	
------------------------------
Runs scored per game:
ATL: 5.48 (888)	LAD: 5.45 (882)	HOU: 5.18 (840)	SD: 4.99 (808)	TEX: 4.95 (802)	
TB: 4.86 (787)	TOR: 4.82 (781)	CHC: 4.79 (776)	MIN: 4.77 (773)	STL: 4.75 (769)	
PHI: 4.73 (767)	SEA: 4.73 (766)	BAL: 4.68 (758)	NYM: 4.67 (756)	MIL: 4.64 (751)	
NYY: 4.54 (736)	CLE: 4.54 (735)	BOS: 4.49 (727)	AZ: 4.47 (723)	CIN: 4.43 (718)	
LAA: 4.39 (711)	SF: 4.34 (703)	DET: 4.28 (694)	MIA: 4.2 (680)	CWS: 4.13 (668)	
KC: 4.12 (667)	PIT: 4.07 (659)	WSH: 4.0

In [6]:
print(f'R^2 (away) = {model_away.score(x_test_away, y_test_away)}')
print(f'R^2 (home) = {model_home.score(x_test_home, y_test_home)}')

R^2 (away) = 0.06096846843759929
R^2 (home) = 0.03952141139840959


In [6]:
df = pd.read_csv('../data/baseball/game_data_test.csv')
x_away = df.drop(['away_score', 'home_score', 'away_team', 'home_team'], axis=1)
x_home = df.drop(['away_score', 'home_score', 'away_team', 'home_team'], axis=1)
# x_away, x_home
model_away.predict(x_away), model_home.predict(x_home)

(array([4.39900607]), array([4.94491191]))