In [1]:
import pandas as pd

from sklearn.ensemble import RandomForestRegressor
from analysis import get_feature_importances, get_vif, wl_accuracy, season_record, runs_per_game
from data_format import create_data

import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [2]:
# Version suffix of the training CSV to load.
version = 6
training_csv = f'../data/baseball/training/game_data_v{version}.csv'
df = pd.read_csv(training_csv)

In [3]:
# Score and team columns that are excluded from both feature sets.
common_drop_cols = ['away_score', 'home_score', 'away_team', 'home_team']

# Away-score model: additionally exclude the away side's xFIP columns and the
# home side's lineup xwOBA and run differential.
away_drop_cols = common_drop_cols + ['away_team_xfip', 'away_starter_xfip',
                                     'home_lineup_xwoba', 'home_run_diff']
# Home-score model: the mirror image of the away list.
home_drop_cols = common_drop_cols + ['home_team_xfip', 'home_starter_xfip',
                                     'away_lineup_xwoba', 'away_run_diff']

# NOTE(review): split_by='season' presumably separates train/test by season
# rather than at random -- confirm in data_format.create_data.
x_train_away, x_test_away, y_train_away, y_test_away = create_data(
    df, drop_cols=away_drop_cols, y_col='away_score', split_by='season')
x_train_home, x_test_home, y_train_home, y_test_home = create_data(
    df, drop_cols=home_drop_cols, y_col='home_score', split_by='season')

At this point, my mentor raised the question of possible multicollinearity, where two or more features are strongly correlated with each other. Indeed, some features had high variance inflation factors (VIFs). In particular, `home_team_xwoba_diff` had the highest VIF, which makes sense because a team's overall batting stats would generally be similar to the batting stats of the *lineup* it fields in a given game.

In [4]:
# Fit an ordinary least squares model of home_score on the home feature set,
# with an explicit intercept column added, and print the regression summary.
home_design = sm.add_constant(x_train_home)
ols_model = sm.OLS(y_train_home, home_design).fit()
ols_summary = ols_model.summary()
print(ols_summary)

                            OLS Regression Results                            
Dep. Variable:             home_score   R-squared:                       0.042
Model:                            OLS   Adj. R-squared:                  0.042
Method:                 Least Squares   F-statistic:                     71.97
Date:                Wed, 31 Jul 2024   Prob (F-statistic):          1.61e-116
Time:                        13:57:58   Log-Likelihood:                -33365.
No. Observations:               13047   AIC:                         6.675e+04
Df Residuals:                   13038   BIC:                         6.681e+04
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
const                   -4.5704 

In [5]:
# Variance inflation factors for the home feature set before any columns are removed.
print('INITIAL VIF VALUES:')
initial_vif_home = get_vif(x_train_home)
print(initial_vif_home)

INITIAL VIF VALUES:
                feature  variance_inflation_factor
0                 const                1236.384644
1  away_team_xwoba_diff                   3.094334
2        away_team_xfip                   3.203630
3     away_starter_xfip                   1.449220
4    away_close_win_pct                   1.353591
5  home_team_xwoba_diff                   6.005394
6     home_lineup_xwoba                   1.784004
7    home_close_win_pct                   1.445171
8         home_run_diff                   5.582397


After looking at the table above, I decided to remove the `home_team_xwoba_diff` and `home_run_diff` features, both of which had a VIF over 5, from the home team dataset. Similarly, I removed the `away_team_xwoba_diff` and `away_run_diff` features from the away team dataset.

In [6]:
# Score and team columns that are excluded from both feature sets.
common_drop_cols_reduced = ['away_score', 'home_score', 'away_team', 'home_team']

# Same exclusions as the initial split, plus each side's own xwOBA-differential
# and run-differential columns.
away_drop_cols_reduced = common_drop_cols_reduced + [
    'away_team_xfip', 'away_starter_xfip',
    'home_lineup_xwoba', 'home_run_diff',
    'away_team_xwoba_diff', 'away_run_diff',
]
home_drop_cols_reduced = common_drop_cols_reduced + [
    'home_team_xfip', 'home_starter_xfip',
    'away_lineup_xwoba', 'away_run_diff',
    'home_team_xwoba_diff', 'home_run_diff',
]

# Rebuild both train/test splits with the reduced feature sets.
x_train_away, x_test_away, y_train_away, y_test_away = create_data(
    df, drop_cols=away_drop_cols_reduced, y_col='away_score', split_by='season')
x_train_home, x_test_home, y_train_home, y_test_home = create_data(
    df, drop_cols=home_drop_cols_reduced, y_col='home_score', split_by='season')

In [7]:
# Fixed seed so the forests (bootstrap samples and the per-split feature
# subsets drawn under max_features='sqrt') are identical on every
# Restart & Run All. Without it, a 3-tree forest gives a noticeably different
# accuracy each run.
RF_SEED = 42

# One regressor per side: each predicts that team's runs scored.
model_away = RandomForestRegressor(n_estimators=3, max_features='sqrt', random_state=RF_SEED)
model_home = RandomForestRegressor(n_estimators=3, max_features='sqrt', random_state=RF_SEED)

model_away.fit(x_train_away, y_train_away)
model_home.fit(x_train_home, y_train_home)

pred_away = model_away.predict(x_test_away)
pred_home = model_home.predict(x_test_home)

# Predicted and actual scores for both sides, side by side for each test game.
results = pd.DataFrame({'away_pred': pred_away, 'home_pred': pred_home, 'away_true': y_test_away, 'home_true': y_test_home})

print('RANDOM FOREST REGRESSION MODEL (v4 - reducing multicollinearity)')
# NOTE(review): wl_accuracy presumably returns the percentage of games whose
# predicted winner matches the actual winner -- confirm in analysis.py.
print(f'Accuracy: {wl_accuracy(results)}%')

RANDOM FOREST REGRESSION MODEL (v4 - reducing multicollinearity)
Accuracy: 52.345679012345684%


In [8]:
# Feature importances of the fitted home-score forest, labelled with the home feature columns.
print('FINAL FEATURE IMPORTANCES:')
home_feature_names = x_test_home.columns
home_importances = get_feature_importances(home_feature_names, model_home)
print(home_importances)

FINAL FEATURE IMPORTANCES:
                feature  weight
0  away_team_xwoba_diff    8.95
1        away_team_xfip    9.76
2     away_starter_xfip   22.05
3    away_close_win_pct    8.48
4     home_lineup_xwoba   30.80
5    home_close_win_pct   19.97
