In [1]:
import pandas as pd

from sklearn.ensemble import RandomForestRegressor
from analysis import wl_accuracy, season_record, runs_per_game
from data_format import create_data

In [2]:
# Dataset revision to load; it is interpolated into the CSV file name below.
version = 6
df = pd.read_csv(
    filepath_or_buffer=f'../data/baseball/training/game_data_v{version}.csv',
)

## Random forest regression

In [3]:
# Season handed to create_data (split_by='season') and named as the test season in the report.
season = 2023

In [4]:
# Build one train/test split per regression target. Both calls use the same
# frame and the same season-based split; only the target column differs.
# NOTE(review): presumably create_data keeps both score columns out of the
# features - confirm, otherwise the targets would leak into the inputs.
x_train_away, x_test_away, y_train_away, y_test_away = create_data(
    df,
    y_col='away_score',
    split_by='season',
    season=season,
)
x_train_home, x_test_home, y_train_home, y_test_home = create_data(
    df,
    y_col='home_score',
    split_by='season',
    season=season,
)

In [5]:
# Hyperparameters shared by both forests, defined once so the away and home
# models cannot drift apart. random_state pins the bootstrap sampling and the
# per-split feature sampling, so a Restart & Run All reproduces the same
# predictions and report instead of slightly different numbers on every run.
forest_params = dict(
    n_estimators=500,
    max_depth=5,
    max_features='sqrt',
    random_state=42,
)

# One regressor per target: runs scored by the away team and by the home team.
model_away = RandomForestRegressor(**forest_params)
model_home = RandomForestRegressor(**forest_params)

model_away.fit(x_train_away, y_train_away)
model_home.fit(x_train_home, y_train_home)

In [6]:
# Predicted runs for each side, computed on the held-out test features.
pred_away = model_away.predict(x_test_away)
pred_home = model_home.predict(x_test_home)

# Predictions next to the true scores, one row per test sample.
results = pd.DataFrame(
    {
        'away_pred': pred_away,
        'home_pred': pred_home,
        'away_true': y_test_away,
        'home_true': y_test_home,
    }
)
# Summary statistics, displayed as the cell output.
results.describe()

Unnamed: 0,away_pred,home_pred,away_true,home_true
count,2430.0,2430.0,2430.0,2430.0
mean,4.465435,4.591563,4.634979,4.596296
std,0.563927,0.470511,3.285711,3.150994
min,3.21144,3.38506,0.0,0.0
25%,4.055589,4.228463,2.0,2.0
50%,4.406973,4.54683,4.0,4.0
75%,4.84486,4.986865,6.0,6.0
max,6.894156,5.942409,25.0,20.0


In [7]:
# Text report for the held-out season. Statement order matters: the helpers
# below write their own sections to the output between the divider lines.
# NOTE(review): the "(v1)" label differs from the dataset `version` (6) loaded
# earlier - presumably it is a model label; confirm.
print(f'RANDOM FOREST REGRESSION MODEL (v1) - testing on {season} season')
print('-' * 30)
# The return value of wl_accuracy is interpolated as-is, so it is shown unrounded.
print(f'Accuracy: {wl_accuracy(results)}%')
print('-' * 30)
season_record(df, results, season=season)
print('-' * 30)
runs_per_game(df, results, season=season)

RANDOM FOREST REGRESSION MODEL (v1) - testing on 2023 season
------------------------------
Accuracy: 58.0246913580247%
------------------------------
Season records:
ATL: 158-4	LAD: 150-12	TB: 145-17	HOU: 139-23	TOR: 135-27	
PHI: 131-31	SEA: 129-33	MIN: 124-38	MIL: 119-43	SD: 113-49	
SF: 108-54	NYY: 105-57	NYM: 93-69	TEX: 92-70	CHC: 88-74	
BAL: 75-87	BOS: 69-93	STL: 64-98	MIA: 62-100	CLE: 61-101	
DET: 54-108	LAA: 44-118	CIN: 40-122	AZ: 38-124	CWS: 29-133	
KC: 27-135	PIT: 20-142	WSH: 13-149	COL: 4-158	OAK: 1-161	
------------------------------
Runs scored per game:
ATL: 5.24 (850)	LAD: 5.09 (824)	HOU: 4.91 (796)	TOR: 4.78 (774)	PHI: 4.76 (772)	
SD: 4.76 (771)	MIN: 4.73 (766)	SEA: 4.71 (762)	TB: 4.68 (758)	MIL: 4.66 (755)	
TEX: 4.63 (750)	NYY: 4.6 (745)	SF: 4.57 (740)	STL: 4.57 (740)	NYM: 4.57 (740)	
CHC: 4.56 (739)	BAL: 4.5 (729)	BOS: 4.44 (719)	CLE: 4.43 (718)	CIN: 4.43 (717)	
MIA: 4.43 (717)	LAA: 4.41 (714)	DET: 4.36 (706)	AZ: 4.29 (694)	KC: 4.24 (688)	
CWS: 4.23 (685)	PIT: 4.16 (675

A random forest regressor with 500 decision trees and a maximum depth of 5 produces a win-loss accuracy of about 58%. This is a good result - most other models have an accuracy ranging from 55% to 60%. However, when I aggregate the predicted outcomes into season win-loss records, the results are extremely skewed. The Oakland Athletics (OAK), for instance, are predicted to lose all but one of their 162 games (1-161). This can be partly explained by the low standard deviation of the predicted runs scored (about 0.5 runs, compared with more than 3 runs for the actual scores): because every prediction sits close to the league average, the predicted winner of each game is decided by a small but consistent gap between the two teams' predicted scores, so the team the model rates slightly higher is picked almost every time. One way to increase the variance is to decrease the number of decision trees (the `n_estimators` parameter) in the forest.