In [1]:
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler, QuantileTransformer, Normalizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.linear_model import LinearRegression, LogisticRegression, PoissonRegressor
from sklearn.tree import DecisionTreeRegressor

import graphviz
from sklearn.tree import export_graphviz
from IPython.display import Image

import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

from analysis import get_feature_importances, visualize_decision_trees, get_vif, wl_accuracy, season_record, runs_per_game

In [2]:
version = 6
df = pd.read_csv(f'./data/baseball/training/game_data_v{version}.csv')

In [3]:
# create the appropriate training and testing data based on home/away, dropping columns as needed
def create_data(drop_cols=['away_score', 'home_score', 'away_team', 'home_team'], y_col='away_score', split_by='random'):
    y = df[y_col]
    x = df.drop(drop_cols, axis=1)

    if split_by == 'season':
        x_train, x_test = x.iloc[0:13047], x.iloc[13047:]
        y_train, y_test = y.iloc[0:13047], y.iloc[13047:]
    else:
        x_train, x_test, y_train, y_test = train_test_split(x, y, shuffle=True)
    
    return x_train, x_test, y_train, y_test

In [4]:
x_train_away, x_test_away, y_train_away, y_test_away = create_data(drop_cols=['away_score', 'home_score',
                                                                              'away_team', 'home_team',
                                                                              'away_team_xfip', 'away_starter_xfip',
                                                                              'home_lineup_xwoba', 'home_run_diff',
                                                                              'away_team_xwoba_diff', 'away_run_diff'],
                                                                   y_col=['away_score', 'home_score'],
                                                                   split_by='season')
x_train_home, x_test_home, y_train_home, y_test_home = create_data(drop_cols=['away_score', 'home_score',
                                                                              'away_team', 'home_team',
                                                                              'home_team_xfip', 'home_starter_xfip',
                                                                              'away_lineup_xwoba', 'away_run_diff',
                                                                              'home_team_xwoba_diff', 'home_run_diff'],
                                                                   y_col=['home_score', 'away_score'],
                                                                   split_by='season')

In [20]:
model_away = MultiOutputRegressor(RandomForestRegressor(n_estimators=3, max_features='sqrt'))
model_home = MultiOutputRegressor(RandomForestRegressor(n_estimators=3, max_features='sqrt'))

model_away.fit(x_train_away, y_train_away)
model_home.fit(x_train_home, y_train_home)

pred_away = model_away.predict(x_test_away)
pred_home = model_home.predict(x_test_home)

pred_away_score = []
pred_home_score = []

for i in range(len(pred_away)):
    pred_away_score.append((pred_away[i][0] + pred_home[i][1]) / 2)
    pred_home_score.append((pred_home[i][0] + pred_away[i][1]) / 2)

results = pd.DataFrame({'away_pred': pred_away_score, 'home_pred': pred_home_score,
                        'away_true': y_test_away['away_score'], 'home_true': y_test_home['home_score']})
results.describe()

print('RANDOM FOREST REGRESSION MODEL (v5 - incorporating runs allowed)')
print(f'Accuracy: {wl_accuracy(results)}%')
print('-' * 30)
season_record(df, results)
print('-' * 30)
runs_per_game(df, results)

RANDOM FOREST REGRESSION MODEL (v5 - incorporating runs allowed)
Accuracy: 53.53909465020577%
------------------------------
Season records:
TOR: 112-50	LAD: 111-51	ATL: 111-51	PHI: 109-53	MIL: 103-59	
HOU: 102-60	MIA: 97-65	MIN: 92-70	LAA: 91-71	SF: 90-72	
BOS: 85-77	NYY: 85-77	BAL: 82-80	TB: 82-80	DET: 79-83	
CHC: 77-85	AZ: 76-86	SD: 76-86	SEA: 75-87	NYM: 73-89	
STL: 73-89	WSH: 70-92	CIN: 68-94	KC: 66-96	TEX: 65-97	
CWS: 60-102	COL: 59-103	PIT: 57-105	CLE: 57-105	OAK: 47-115	
------------------------------
Runs scored per game:
ATL: 5.59 (906)	LAD: 5.3 (859)	HOU: 5.07 (821)	PHI: 5.01 (811)	SD: 4.8 (778)	
NYY: 4.78 (775)	MIN: 4.78 (774)	DET: 4.76 (772)	MIA: 4.76 (771)	TOR: 4.74 (768)	
SF: 4.7 (761)	STL: 4.7 (761)	LAA: 4.69 (759)	MIL: 4.66 (755)	CIN: 4.64 (751)	
TEX: 4.63 (751)	CHC: 4.55 (738)	TB: 4.5 (730)	CLE: 4.5 (729)	AZ: 4.5 (729)	
WSH: 4.5 (729)	COL: 4.48 (726)	BAL: 4.41 (715)	SEA: 4.41 (714)	PIT: 4.38 (710)	
CWS: 4.38 (709)	BOS: 4.37 (709)	NYM: 4.36 (707)	KC: 4.34 (704)	OAK: 4.0