In [1]:
import pandas as pd

from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor

from analysis import wl_accuracy, season_record, runs_per_game
from data_format import create_data

In [2]:
version = 6
df = pd.read_csv(f'../data/baseball/training/game_data_v{version}.csv')

In [3]:
x_train_away, x_test_away, y_train_away, y_test_away = create_data(df, drop_cols=['away_score', 'home_score',
                                                                              'away_team', 'home_team',
                                                                              'away_team_xfip', 'away_starter_xfip',
                                                                              'home_lineup_xwoba', 'home_run_diff',
                                                                              'away_team_xwoba_diff', 'away_run_diff'],
                                                                   y_col=['away_score', 'home_score'],
                                                                   split_by='season')
x_train_home, x_test_home, y_train_home, y_test_home = create_data(df, drop_cols=['away_score', 'home_score',
                                                                              'away_team', 'home_team',
                                                                              'home_team_xfip', 'home_starter_xfip',
                                                                              'away_lineup_xwoba', 'away_run_diff',
                                                                              'home_team_xwoba_diff', 'home_run_diff'],
                                                                   y_col=['home_score', 'away_score'],
                                                                   split_by='season')

In [5]:
model_away = MultiOutputRegressor(RandomForestRegressor(n_estimators=3, max_features='sqrt'))
model_home = MultiOutputRegressor(RandomForestRegressor(n_estimators=3, max_features='sqrt'))

model_away.fit(x_train_away, y_train_away)
model_home.fit(x_train_home, y_train_home)

pred_away = model_away.predict(x_test_away)
pred_home = model_home.predict(x_test_home)

pred_away_score = []
pred_home_score = []

for i in range(len(pred_away)):
    pred_away_score.append((pred_away[i][0] + pred_home[i][1]) / 2)
    pred_home_score.append((pred_home[i][0] + pred_away[i][1]) / 2)

results = pd.DataFrame({'away_pred': pred_away_score, 'home_pred': pred_home_score,
                        'away_true': y_test_away['away_score'], 'home_true': y_test_home['home_score']})
results.describe()

print('RANDOM FOREST REGRESSION MODEL (v5 - incorporating runs allowed)')
print('-' * 30)
print(f'Accuracy: {wl_accuracy(results)}%')
print('-' * 30)
season_record(df, results)
print('-' * 30)
runs_per_game(df, results)
print('-' * 30)
# compare runs scored vs. runs allowed

RANDOM FOREST REGRESSION MODEL (v5 - incorporating runs allowed)
------------------------------
Accuracy: 53.333333333333336%
------------------------------
Season records:
ATL: 125-37	HOU: 124-38	LAD: 119-43	PHI: 113-49	TOR: 107-55	
MIA: 97-65	MIL: 95-67	SF: 93-69	NYY: 91-71	BAL: 87-75	
SEA: 87-75	LAA: 84-78	MIN: 81-81	SD: 79-83	CIN: 77-85	
NYM: 76-86	KC: 76-86	DET: 75-87	CHC: 74-88	TEX: 74-88	
TB: 73-89	STL: 67-95	PIT: 65-97	CLE: 65-97	WSH: 61-101	
BOS: 60-102	AZ: 59-103	CWS: 58-104	COL: 45-117	OAK: 43-119	
------------------------------
Runs scored per game:
ATL: 5.77 (935)	LAD: 5.51 (892)	HOU: 5.49 (889)	PHI: 5.3 (858)	MIA: 5.28 (856)	
SD: 5.2 (843)	TOR: 5.09 (825)	NYY: 5.04 (817)	MIL: 4.9 (794)	CLE: 4.77 (773)	
SEA: 4.75 (770)	LAA: 4.75 (769)	KC: 4.74 (768)	BAL: 4.73 (766)	SF: 4.66 (756)	
TB: 4.63 (750)	STL: 4.6 (746)	CHC: 4.59 (744)	CIN: 4.58 (741)	DET: 4.56 (738)	
NYM: 4.54 (735)	MIN: 4.46 (722)	PIT: 4.44 (720)	AZ: 4.44 (720)	TEX: 4.41 (715)	
WSH: 4.33 (701)	COL: 4.32 (700)	CWS: