In [3]:
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from analysis import wl_accuracy, season_record, runs_per_game
from data_format import create_data

In [4]:
version = 6
df = pd.read_csv(f'./data/baseball/training/game_data_v{version}.csv')

In [26]:
season = 2019

In [27]:
x_train_away, x_test_away, y_train_away, y_test_away = create_data(df, drop_cols=['away_score', 'home_score',
                                                                              'away_team', 'home_team',
                                                                              'away_team_xfip', 'away_starter_xfip',
                                                                              'home_lineup_xwoba', 'home_run_diff',
                                                                              'away_team_xwoba_diff', 'away_run_diff'],
                                                                   y_col='away_score',
                                                                   split_by='season',
                                                                   season=season)
x_train_home, x_test_home, y_train_home, y_test_home = create_data(df, drop_cols=['away_score', 'home_score',
                                                                              'away_team', 'home_team',
                                                                              'home_team_xfip', 'home_starter_xfip',
                                                                              'away_lineup_xwoba', 'away_run_diff',
                                                                              'home_team_xwoba_diff', 'home_run_diff'],
                                                                   y_col='home_score',
                                                                   split_by='season',
                                                                   season=season)

In [28]:
model_away = Pipeline([('scaler', StandardScaler()),
                       ('model', LinearRegression())])
model_home = Pipeline([('scaler', StandardScaler()),
                       ('model', LinearRegression())])

model_away.fit(x_train_away, y_train_away)
model_home.fit(x_train_home, y_train_home)

pred_away = model_away.predict(x_test_away)
pred_home = model_home.predict(x_test_home)

results = pd.DataFrame({'away_pred': pred_away, 'home_pred': pred_home, 'away_true': y_test_away, 'home_true': y_test_home})
results.describe()

print(f'LINEAR REGRESSION MODEL - testing on {season} season')
print(f'Accuracy: {wl_accuracy(results)}%')
print('-' * 30)
season_record(df, results, season=season)
print('-' * 30)
runs_per_game(df, results, season=season)

LINEAR REGRESSION MODEL - testing on 2019 season
Accuracy: 61.09510086455331%
------------------------------
Season records:
LAD: 160-2	HOU: 151-11	NYY: 139-23	TB: 128-34	ATL: 125-37	
BOS: 125-37	CLE: 120-42	WSH: 119-43	MIN: 115-47	MIL: 110-52	
NYM: 107-55	OAK: 105-57	CHC: 101-61	STL: 96-66	LAA: 77-85	
AZ: 73-89	PHI: 72-90	CIN: 64-98	PIT: 57-105	SD: 55-107	
TOR: 53-109	SF: 52-110	COL: 44-118	SEA: 39-123	TEX: 39-123	
DET: 38-124	CWS: 35-127	KC: 15-147	BAL: 8-154	MIA: 7-155	
------------------------------
Runs scored per game:
NYY: 5.24 (848)	MIN: 5.21 (843)	HOU: 5.2 (843)	BOS: 5.2 (842)	OAK: 5.14 (832)	
TB: 5.01 (812)	LAD: 4.99 (809)	CLE: 4.98 (807)	ATL: 4.94 (801)	LAA: 4.85 (785)	
WSH: 4.85 (785)	MIL: 4.78 (775)	NYM: 4.65 (754)	STL: 4.63 (750)	TOR: 4.6 (746)	
PHI: 4.59 (744)	CHC: 4.58 (742)	TEX: 4.57 (741)	PIT: 4.53 (735)	SEA: 4.48 (725)	
AZ: 4.44 (720)	SF: 4.43 (717)	CWS: 4.35 (705)	COL: 4.34 (703)	KC: 4.33 (702)	
CIN: 4.33 (702)	DET: 4.33 (702)	SD: 4.3 (697)	BAL: 4.2 (680)	MIA: 4.1 (