In [2]:
import pandas as pd

from sklearn.ensemble import RandomForestRegressor
from analysis import get_feature_importances, wl_accuracy, season_record, runs_per_game
from data_format import create_data

In [3]:
version = 6
df = pd.read_csv(f'./data/baseball/training/game_data_v{version}.csv')

In [4]:
x_train_away, x_test_away, y_train_away, y_test_away = create_data(df, y_col='away_score', split_by='season')
x_train_home, x_test_home, y_train_home, y_test_home = create_data(df, y_col='home_score', split_by='season')

There is another problem with this random forest model, though. The output below shows how much each column is weighted when making predictions for how many runs the home team will score. We can see that, for example, `home_starter_xfip` is weighted at about 12% - but does the home team's starting *pitcher* really have that much effect on the home team's *batting* results? I tried to fine-tune the model by removing unnecessary or irrelevant features.

In [5]:
model_away = RandomForestRegressor(n_estimators=3, max_features='sqrt')
model_home = RandomForestRegressor(n_estimators=3, max_features='sqrt')

model_away.fit(x_train_away, y_train_away)
model_home.fit(x_train_home, y_train_home)

print('INITIAL FEATURE IMPORTANCES:')
print(get_feature_importances(x_test_home.columns, model_home))

INITIAL FEATURE IMPORTANCES:
                 feature  weight
0   away_team_xwoba_diff    5.7%
1      away_lineup_xwoba  12.35%
2         away_team_xfip   6.57%
3      away_starter_xfip   12.6%
4     away_close_win_pct   5.91%
5          away_run_diff   5.89%
6   home_team_xwoba_diff   6.37%
7      home_lineup_xwoba  13.56%
8         home_team_xfip   5.87%
9      home_starter_xfip  12.14%
10    home_close_win_pct   6.02%
11         home_run_diff   6.99%


In [6]:
x_train_away, x_test_away, y_train_away, y_test_away = create_data(df, drop_cols=['away_score', 'home_score',
                                                                              'away_team', 'home_team',
                                                                              'away_team_xfip', 'away_starter_xfip',
                                                                              'home_lineup_xwoba', 'home_run_diff'],
                                                                   y_col='away_score',
                                                                   split_by='season')
x_train_home, x_test_home, y_train_home, y_test_home = create_data(df, drop_cols=['away_score', 'home_score',
                                                                              'away_team', 'home_team',
                                                                              'home_team_xfip', 'home_starter_xfip',
                                                                              'away_lineup_xwoba', 'away_run_diff'],
                                                                   y_col='home_score',
                                                                   split_by='season')

In [8]:
model_away = RandomForestRegressor(n_estimators=3, max_features='sqrt')
model_home = RandomForestRegressor(n_estimators=3, max_features='sqrt')

model_away.fit(x_train_away, y_train_away)
model_home.fit(x_train_home, y_train_home)

pred_away = model_away.predict(x_test_away)
pred_home = model_home.predict(x_test_home)

results = pd.DataFrame({'away_pred': pred_away, 'home_pred': pred_home, 'away_true': y_test_away, 'home_true': y_test_home})
results.describe()

print('RANDOM FOREST REGRESSION MODEL (v3 - selecting features)')
print(f'Accuracy: {wl_accuracy(results)}%')

RANDOM FOREST REGRESSION MODEL (v3 - selecting features)
Accuracy: 53.53909465020577%


In [9]:
print('FINAL FEATURE IMPORTANCES:')
print(get_feature_importances(x_test_home.columns, model_home))

FINAL FEATURE IMPORTANCES:
                feature  weight
0  away_team_xwoba_diff   7.38%
1        away_team_xfip   8.75%
2     away_starter_xfip  22.16%
3    away_close_win_pct   7.81%
4  home_team_xwoba_diff    9.0%
5     home_lineup_xwoba   27.2%
6    home_close_win_pct   8.91%
7         home_run_diff   8.79%


Now, `away_starter_xfip` and `home_lineup_xwoba` are weighted the most, which is more reasonable; the home team's lineup strength and the away team's starting pitcher should matter most when predicting the home team's runs scored.