In [5]:
import pandas as pd

from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor

from analysis import wl_accuracy, season_record, runs_per_game
from data_format import create_data

In [2]:
version = 6
df = pd.read_csv(f'./data/baseball/training/game_data_v{version}.csv')

In [3]:
x_train_away, x_test_away, y_train_away, y_test_away = create_data(df, drop_cols=['away_score', 'home_score',
                                                                              'away_team', 'home_team',
                                                                              'away_team_xfip', 'away_starter_xfip',
                                                                              'home_lineup_xwoba', 'home_run_diff',
                                                                              'away_team_xwoba_diff', 'away_run_diff'],
                                                                   y_col=['away_score', 'home_score'],
                                                                   split_by='season')
x_train_home, x_test_home, y_train_home, y_test_home = create_data(df, drop_cols=['away_score', 'home_score',
                                                                              'away_team', 'home_team',
                                                                              'home_team_xfip', 'home_starter_xfip',
                                                                              'away_lineup_xwoba', 'away_run_diff',
                                                                              'home_team_xwoba_diff', 'home_run_diff'],
                                                                   y_col=['home_score', 'away_score'],
                                                                   split_by='season')

In [7]:
model_away = MultiOutputRegressor(RandomForestRegressor(n_estimators=3, max_features='sqrt'))
model_home = MultiOutputRegressor(RandomForestRegressor(n_estimators=3, max_features='sqrt'))

model_away.fit(x_train_away, y_train_away)
model_home.fit(x_train_home, y_train_home)

pred_away = model_away.predict(x_test_away)
pred_home = model_home.predict(x_test_home)

pred_away_score = []
pred_home_score = []

for i in range(len(pred_away)):
    pred_away_score.append((pred_away[i][0] + pred_home[i][1]) / 2)
    pred_home_score.append((pred_home[i][0] + pred_away[i][1]) / 2)

results = pd.DataFrame({'away_pred': pred_away_score, 'home_pred': pred_home_score,
                        'away_true': y_test_away['away_score'], 'home_true': y_test_home['home_score']})
results.describe()

print('RANDOM FOREST REGRESSION MODEL (v5 - incorporating runs allowed)')
print(f'Accuracy: {wl_accuracy(results)}%')
print('-' * 30)
season_record(df, results)
print('-' * 30)
runs_per_game(df, results)
print('-' * 30)
# compare runs scored vs. runs allowed

RANDOM FOREST REGRESSION MODEL (v5 - incorporating runs allowed)
Accuracy: 52.2633744855967%
------------------------------
Season records:
ATL: 113-49	LAD: 109-53	SEA: 104-58	PHI: 104-58	SF: 98-64	
TOR: 96-66	BAL: 96-66	HOU: 96-66	CHC: 95-67	TB: 95-67	
MIL: 93-69	CIN: 86-76	MIA: 86-76	BOS: 86-76	NYM: 85-77	
NYY: 85-77	MIN: 83-79	LAA: 73-89	AZ: 71-91	STL: 71-91	
DET: 70-92	PIT: 64-98	KC: 64-98	CLE: 64-98	TEX: 64-98	
SD: 63-99	WSH: 58-104	COL: 55-107	CWS: 53-109	OAK: 50-112	
------------------------------
Runs scored per game:
TOR: 5.14 (833)	SF: 5.08 (822)	LAD: 5.03 (814)	ATL: 4.96 (804)	SEA: 4.94 (800)	
STL: 4.86 (787)	CHC: 4.84 (784)	BAL: 4.8 (777)	HOU: 4.79 (776)	MIN: 4.75 (769)	
SD: 4.74 (768)	TB: 4.67 (757)	CIN: 4.65 (753)	NYY: 4.62 (749)	BOS: 4.61 (747)	
TEX: 4.6 (746)	PHI: 4.59 (744)	MIL: 4.58 (742)	MIA: 4.57 (740)	NYM: 4.49 (728)	
LAA: 4.45 (721)	DET: 4.43 (718)	AZ: 4.37 (708)	KC: 4.31 (697)	CLE: 4.3 (697)	
PIT: 4.23 (685)	WSH: 4.21 (682)	COL: 4.17 (676)	OAK: 4.16 (674)	CWS: 4.