In [1]:
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler, QuantileTransformer, Normalizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.linear_model import LinearRegression, LogisticRegression, PoissonRegressor
from sklearn.tree import DecisionTreeRegressor

import graphviz
from sklearn.tree import export_graphviz
from IPython.display import Image

import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

from analysis import get_feature_importances, visualize_decision_trees, get_vif, wl_accuracy, season_record, runs_per_game

In [2]:
version = 6
df = pd.read_csv(f'./data/baseball/training/game_data_v{version}.csv')

In [3]:
# create the appropriate training and testing data based on home/away, dropping columns as needed
def create_data(drop_cols=['away_score', 'home_score', 'away_team', 'home_team'], y_col='away_score', split_by='random'):
    y = df[y_col]
    x = df.drop(drop_cols, axis=1)

    if split_by == 'season':
        x_train, x_test = x.iloc[0:13047], x.iloc[13047:]
        y_train, y_test = y.iloc[0:13047], y.iloc[13047:]
    else:
        x_train, x_test, y_train, y_test = train_test_split(x, y, shuffle=True)
    
    return x_train, x_test, y_train, y_test

In [4]:
x_train_away, x_test_away, y_train_away, y_test_away = create_data(y_col='away_score', split_by='season')
x_train_home, x_test_home, y_train_home, y_test_home = create_data(y_col='home_score', split_by='season')

There is another problem with this random forest model, though. The output below shows how much each column is weighted when making predictions for how many runs the home team will score. We can see that, for example, `home_starter_xfip` is weighted at about 12% - but does the home team's starting *pitcher* really have that much effect on the home team's *batting* results? I tried to fine-tune the model by removing unnecessary or irrelevant features.

In [5]:
model_away = RandomForestRegressor(n_estimators=3, max_features='sqrt')
model_home = RandomForestRegressor(n_estimators=3, max_features='sqrt')

model_away.fit(x_train_away, y_train_away)
model_home.fit(x_train_home, y_train_home)

print('INITIAL FEATURE IMPORTANCES:')
print(get_feature_importances(x_test_home.columns, model_home))

INITIAL FEATURE IMPORTANCES:
                 feature  weight
0   away_team_xwoba_diff   5.03%
1      away_lineup_xwoba  12.14%
2         away_team_xfip   6.25%
3      away_starter_xfip  12.62%
4     away_close_win_pct   5.85%
5          away_run_diff   5.77%
6   home_team_xwoba_diff   6.36%
7      home_lineup_xwoba  13.34%
8         home_team_xfip    6.2%
9      home_starter_xfip  13.04%
10    home_close_win_pct   6.72%
11         home_run_diff   6.68%


In [6]:
x_train_away, x_test_away, y_train_away, y_test_away = create_data(drop_cols=['away_score', 'home_score',
                                                                              'away_team', 'home_team',
                                                                              'away_team_xfip', 'away_starter_xfip',
                                                                              'home_lineup_xwoba', 'home_run_diff'],
                                                                   y_col='away_score',
                                                                   split_by='season')
x_train_home, x_test_home, y_train_home, y_test_home = create_data(drop_cols=['away_score', 'home_score',
                                                                              'away_team', 'home_team',
                                                                              'home_team_xfip', 'home_starter_xfip',
                                                                              'away_lineup_xwoba', 'away_run_diff'],
                                                                   y_col='home_score',
                                                                   split_by='season')

In [7]:
model_away = RandomForestRegressor(n_estimators=3, max_features='sqrt')
model_home = RandomForestRegressor(n_estimators=3, max_features='sqrt')

model_away.fit(x_train_away, y_train_away)
model_home.fit(x_train_home, y_train_home)

pred_away = model_away.predict(x_test_away)
pred_home = model_home.predict(x_test_home)

results = pd.DataFrame({'away_pred': pred_away, 'home_pred': pred_home, 'away_true': y_test_away, 'home_true': y_test_home})
results.describe()

print('RANDOM FOREST REGRESSION MODEL (v3 - selecting features)')
print(f'Accuracy: {wl_accuracy(results)}%')

RANDOM FOREST REGRESSION MODEL (v3 - selecting features)
Accuracy: 52.5514403292181%


In [8]:
print('FINAL FEATURE IMPORTANCES:')
print(get_feature_importances(x_test_home.columns, model_home))

FINAL FEATURE IMPORTANCES:
                feature  weight
0  away_team_xwoba_diff   8.24%
1        away_team_xfip   9.75%
2     away_starter_xfip  21.41%
3    away_close_win_pct   8.18%
4  home_team_xwoba_diff   9.09%
5     home_lineup_xwoba   24.7%
6    home_close_win_pct   8.98%
7         home_run_diff   9.65%


Now, `away_starter_xfip` and `home_lineup_xwoba` are weighted the most, which is more reasonable; the home team's lineup strength and the away team's starting pitcher should matter most when predicting the home team's runs scored.