In [1]:
import pandas as pd

from sklearn.ensemble import RandomForestRegressor
from analysis import wl_accuracy, season_record, runs_per_game
from data_format import create_data

In [2]:
# Dataset revision; interpolated into the training CSV filename below.
version = 6
training_csv = f'../data/baseball/training/game_data_v{version}.csv'
df = pd.read_csv(training_csv)

## Random forest regression

In [3]:
# Season handed to create_data (split_by='season'), season_record and
# runs_per_game; the report header prints it as the season being tested on.
season = 2023

In [4]:
# Both targets use the same split settings; only the target column differs.
# NOTE(review): presumably split_by='season' holds `season` out as the test
# set (the report header says "testing on {season} season") - confirm in data_format.
split_kwargs = dict(split_by='season', season=season)

(
    x_train_away,
    x_test_away,
    y_train_away,
    y_test_away,
) = create_data(df, y_col='away_score', **split_kwargs)

(
    x_train_home,
    x_test_home,
    y_train_home,
    y_test_home,
) = create_data(df, y_col='home_score', **split_kwargs)

In [5]:
# Fixed seed so that bootstrap sampling and per-split feature sub-sampling
# (max_features='sqrt') are reproducible; without it every Restart & Run All
# yields different predictions, accuracy and season records.
RANDOM_STATE = 42

# Hyperparameters shared by the away-score and home-score regressors.
rf_params = dict(
    n_estimators=500,
    max_depth=5,
    max_features='sqrt',
    random_state=RANDOM_STATE,
)

# One independent forest per target: runs scored by the away team and by the home team.
model_away = RandomForestRegressor(**rf_params)
model_home = RandomForestRegressor(**rf_params)

model_away.fit(x_train_away, y_train_away)
model_home.fit(x_train_home, y_train_home)

In [6]:
# Predicted runs for each side on the held-out split.
pred_away = model_away.predict(x_test_away)
pred_home = model_home.predict(x_test_home)

# Predictions and true scores side by side, one row per test game.
results = pd.DataFrame(
    {
        'away_pred': pred_away,
        'home_pred': pred_home,
        'away_true': y_test_away,
        'home_true': y_test_home,
    }
)

# Summary statistics make it easy to compare the spread of predicted vs. actual scores.
results.describe()

Unnamed: 0,away_pred,home_pred,away_true,home_true
count,2430.0,2430.0,2430.0,2430.0
mean,4.467801,4.591455,4.634979,4.596296
std,0.567272,0.472116,3.285711,3.150994
min,3.232708,3.315216,0.0,0.0
25%,4.056336,4.219196,2.0,2.0
50%,4.407113,4.536327,4.0,4.0
75%,4.847399,4.991816,6.0,6.0
max,7.36534,5.943138,25.0,20.0


In [7]:
# Horizontal rule reused between the sections of the report.
divider = '-' * 30

print(f'RANDOM FOREST REGRESSION MODEL (v1) - testing on {season} season')
print(divider)
print(f'Accuracy: {wl_accuracy(results)}%')
print(divider)
# The recorded output shows season_record and runs_per_game printing their own tables.
season_record(df, results, season=season)
print(divider)
runs_per_game(df, results, season=season)

RANDOM FOREST REGRESSION MODEL (v1) - testing on 2023 season
------------------------------
Accuracy: 57.983539094650205%
------------------------------
Season records:
ATL: 159-3	LAD: 150-12	TB: 139-23	HOU: 138-24	TOR: 135-27	
PHI: 133-29	SEA: 127-35	MIN: 126-36	MIL: 122-40	SD: 114-48	
SF: 110-52	NYY: 103-59	TEX: 95-67	NYM: 93-69	CHC: 89-73	
BAL: 73-89	BOS: 70-92	CLE: 63-99	MIA: 62-100	STL: 61-101	
DET: 51-111	LAA: 45-117	AZ: 39-123	CIN: 38-124	CWS: 30-132	
KC: 29-133	PIT: 18-144	WSH: 10-152	COL: 8-154	OAK: 0-162	
------------------------------
Runs scored per game:
ATL: 5.28 (855)	LAD: 5.1 (826)	HOU: 4.92 (797)	TOR: 4.78 (774)	PHI: 4.77 (773)	
SD: 4.75 (770)	MIN: 4.73 (766)	SEA: 4.7 (761)	TB: 4.67 (756)	MIL: 4.66 (756)	
TEX: 4.65 (754)	NYY: 4.6 (745)	STL: 4.58 (742)	SF: 4.58 (742)	CHC: 4.57 (741)	
NYM: 4.57 (740)	BAL: 4.49 (728)	BOS: 4.44 (720)	CLE: 4.44 (720)	MIA: 4.42 (717)	
CIN: 4.41 (715)	LAA: 4.41 (714)	DET: 4.35 (704)	AZ: 4.28 (694)	KC: 4.24 (686)	
CWS: 4.22 (684)	PIT: 4.15 (67

A random forest regressor with 500 decision trees and a maximum depth of 5 produces an accuracy of about 58%. This is a good result - most other models have an accuracy ranging from 55% to 60%. However, when I combine all the predicted outcomes and print the aggregate (i.e. the season win-loss record), the results are extremely skewed. The Oakland Athletics (OAK), for instance, are predicted to lose every single game of the season (0-162). This can be partly explained by the low standard deviation of predicted runs scored (roughly 0.5 runs, compared with more than 3 runs for the actual scores): because the predictions are compressed around the mean, the predicted winner of each game comes down to a small but consistent difference between the two teams, so the stronger team is picked almost every time. One way to increase the variance is to decrease the number of decision trees (the `n_estimators` parameter) in the forest.

In [19]:
# Ad-hoc prediction on a separate test file.
# Loaded under its own name so the training frame `df` used by the earlier
# cells is not overwritten (re-running those cells would otherwise silently
# operate on this file instead of the training data).
game_test = pd.read_csv('../data/baseball/game_data_test.csv')

# Score and team columns are dropped before predicting.
# NOTE(review): assumes the remaining columns match the features produced by
# create_data for training - confirm against data_format.
non_feature_cols = ['away_score', 'home_score', 'away_team', 'home_team']
x_away = game_test.drop(columns=non_feature_cols)
x_home = game_test.drop(columns=non_feature_cols)

# Displays (away prediction, home prediction) as the cell output.
model_away.predict(x_away), model_home.predict(x_home)

(array([4.27299688]), array([4.33122046]))