In [1]:
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler, QuantileTransformer, Normalizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.linear_model import LinearRegression, LogisticRegression, PoissonRegressor
from sklearn.tree import DecisionTreeRegressor

import graphviz
from sklearn.tree import export_graphviz
from IPython.display import Image

import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

from analysis import get_feature_importances, visualize_decision_trees, get_vif, wl_accuracy, season_record, runs_per_game

In [2]:
# Version suffix of the training CSV to load; bump it to switch datasets.
version = 6

# Resolve the versioned file name first, then read it into the notebook-wide frame.
training_csv = f'./data/baseball/training/game_data_v{version}.csv'
df = pd.read_csv(training_csv)

In [3]:
# y = pd.DataFrame({'away_score': df['away_score'], 'home_score': df['home_score']})
# x = df.drop(['away_score', 'home_score', 'away_team', 'home_team', 'away_run_diff', 'home_run_diff'], axis=1)
# x_train, x_test = x.iloc[0:13047], x.iloc[13047:]
# y_train, y_test = y.iloc[0:13047], y.iloc[13047:]

In [4]:
# create the appropriate training and testing data based on home/away, dropping columns as needed
def create_data(drop_cols=('away_score', 'home_score', 'away_team', 'home_team'), y_col='away_score',
                split_by='random', data=None, season_split_idx=13047, random_state=None):
    """Split the game data into train/test features and a single target column.

    Parameters
    ----------
    drop_cols : sequence of str
        Columns removed from the frame to form the feature matrix. Lists and
        tuples are both accepted.
    y_col : str
        Name of the column returned as the target.
    split_by : str
        'season' performs a positional split: rows before ``season_split_idx``
        become the training set and the remaining rows the test set. Any other
        value falls back to a shuffled ``train_test_split``.
    data : pandas.DataFrame, optional
        Frame to split. Defaults to the notebook-level ``df``.
    season_split_idx : int
        Row position at which the 'season' split is made. The default 13047 is
        tied to the ordering of the loaded CSV -- presumably the first row of
        the held-out season; confirm whenever the dataset version changes.
    random_state : int or None
        Forwarded to ``train_test_split`` so the random split can be made
        reproducible. ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    tuple
        ``(x_train, x_test, y_train, y_test)``
    """
    frame = df if data is None else data

    y = frame[y_col]
    # DataFrame.drop treats a tuple as ONE label, so always hand it a list.
    x = frame.drop(list(drop_cols), axis=1)

    if split_by == 'season':
        x_train, x_test = x.iloc[:season_split_idx], x.iloc[season_split_idx:]
        y_train, y_test = y.iloc[:season_split_idx], y.iloc[season_split_idx:]
    else:
        x_train, x_test, y_train, y_test = train_test_split(x, y, shuffle=True, random_state=random_state)

    return x_train, x_test, y_train, y_test

In [5]:
# Columns removed from both feature sets: the two score columns and the two team columns.
shared_drop_cols = ['away_score', 'home_score', 'away_team', 'home_team']

# Additional columns left out of the away-score model's features.
away_drop_cols = shared_drop_cols + ['away_team_xfip', 'away_starter_xfip',
                                     'home_lineup_xwoba', 'home_run_diff',
                                     'away_team_xwoba_diff', 'away_run_diff']

# Additional columns left out of the home-score model's features.
home_drop_cols = shared_drop_cols + ['home_team_xfip', 'home_starter_xfip',
                                     'away_lineup_xwoba', 'away_run_diff',
                                     'home_team_xwoba_diff', 'home_run_diff']

# One positional ('season') split per target column.
away_split = create_data(drop_cols=away_drop_cols, y_col='away_score', split_by='season')
x_train_away, x_test_away, y_train_away, y_test_away = away_split

home_split = create_data(drop_cols=home_drop_cols, y_col='home_score', split_by='season')
x_train_home, x_test_home, y_train_home, y_test_home = home_split

In [8]:
def enter_data(data_away, data_home,
               away_xwoba=0.000, away_team=4.00, away_starter=4.00, away_close=0.5, away_lineup=0.300,
               home_xwoba=0.000, home_team=4.00, home_starter=4.00, home_close=0.5, home_lineup=0.300):
    """Append one matchup's values to the two column -> list dictionaries.

    ``data_away`` receives the away lineup value, both close-win percentages
    and the home ``xwoba_diff`` / team xFIP / starter xFIP values.
    ``data_home`` receives the mirror image: the home lineup value, both
    close-win percentages and the away ``xwoba_diff`` / team xFIP / starter
    xFIP values.

    Both dictionaries are mutated in place and must already contain a list
    under every key written here (a missing key raises ``KeyError``).
    Nothing is returned.
    """
    # (destination dict, column name, value), listed in the order they are appended.
    pending = (
        (data_home, 'away_team_xwoba_diff', away_xwoba),
        (data_home, 'away_team_xfip', away_team),
        (data_home, 'away_starter_xfip', away_starter),
        (data_home, 'away_close_win_pct', away_close),
        (data_away, 'away_lineup_xwoba', away_lineup),
        (data_away, 'away_close_win_pct', away_close),
        (data_away, 'home_team_xwoba_diff', home_xwoba),
        (data_away, 'home_team_xfip', home_team),
        (data_away, 'home_starter_xfip', home_starter),
        (data_away, 'home_close_win_pct', home_close),
        (data_home, 'home_lineup_xwoba', home_lineup),
        (data_home, 'home_close_win_pct', home_close),
    )
    for destination, column, value in pending:
        destination[column].append(value)

In [37]:
# Column names, in order, for the hand-built inputs of the away-score model.
away_model_columns = (
    'away_lineup_xwoba',
    'away_close_win_pct',
    'home_team_xwoba_diff',
    'home_team_xfip',
    'home_starter_xfip',
    'home_close_win_pct',
)

# Column names, in order, for the hand-built inputs of the home-score model.
home_model_columns = (
    'away_team_xwoba_diff',
    'away_team_xfip',
    'away_starter_xfip',
    'away_close_win_pct',
    'home_lineup_xwoba',
    'home_close_win_pct',
)

# Each column starts with its own empty list; one value per matchup is appended later.
test_data_away = {column: [] for column in away_model_columns}

test_data_home = {column: [] for column in home_model_columns}

# test_data_combined = {
#     'away_team_xwoba_diff': [],
#     'away_lineup_xwoba': [],
#     'away_team_xfip': [],
#     'away_starter_xfip': [],
#     'away_close_win_pct': [],
#     'home_team_xwoba_diff': [],
#     'home_lineup_xwoba': [],
#     'home_team_xfip': [],
#     'home_starter_xfip': [],
#     'home_close_win_pct': []
# }

In [38]:
# Hand-built matchups used to sanity-check the two score models.
# enter_data() appends to the module-level lists, so empty them first: re-running
# this cell then rebuilds the six matchups instead of stacking six more rows.
for column_values in list(test_data_away.values()) + list(test_data_home.values()):
    column_values.clear()

# Game #1
# (away) good pitching and good hitting team vs. (home) bad pitching and bad hitting team
#  => should expect away team to win most of the time
enter_data(test_data_away, test_data_home, away_starter=2.50, away_lineup=0.450, home_starter=5.50, home_lineup=0.250)

# Game #2
# (away) bad pitching and bad hitting team vs. (home) good pitching and good hitting team
#  => should expect home team to win most of the time
enter_data(test_data_away, test_data_home, away_starter=5.50, away_lineup=0.250, home_starter=2.50, home_lineup=0.450)

# Game #3
# (away) good pitching and good hitting team vs. (home) average pitching and average hitting team
#  => should expect away team to win most of the time, but less extreme
enter_data(test_data_away, test_data_home, away_starter=2.50, away_lineup=0.450, home_starter=4.00, home_lineup=0.300)

# Game #4
# (away) good pitching and good hitting team vs. (home) good pitching and good hitting team
#  => should expect close to a 50-50 split
enter_data(test_data_away, test_data_home, away_starter=2.50, away_lineup=0.450, home_starter=2.50, home_lineup=0.450)

# Game #5
# (away) average pitching and average hitting team vs. (home) average pitching and average hitting team
#  => should expect close to a 50-50 split
enter_data(test_data_away, test_data_home, away_starter=4.00, away_lineup=0.320, home_starter=4.00, home_lineup=0.320)

# Game #6
# (away) slightly above average pitching and slightly below average hitting vs. (home) slightly below average pitching and slightly below average hitting
#  => should expect slight edge to away team
# NOTE(review): home_lineup=0.410 is far above the 0.300-0.320 used as "average" in the
# other games, which contradicts the description and the expected away edge -- confirm
# the intended value (a typo for something near 0.310?).
enter_data(test_data_away, test_data_home, away_starter=3.80, away_lineup=0.290, home_starter=4.60, home_lineup=0.410)

# NOTE: this rebinds x_test_away / x_test_home, replacing the held-out frames that
# create_data() returned earlier in the notebook with the six synthetic matchups.
x_test_away = pd.DataFrame(test_data_away)
x_test_home = pd.DataFrame(test_data_home)
# x_test = pd.DataFrame(test_data_combined)

In [41]:
# Both score models share the same small-forest configuration.
forest_params = dict(n_estimators=3, max_features='sqrt')

model_away = RandomForestRegressor(**forest_params)
model_home = RandomForestRegressor(**forest_params)
# model = RandomForestRegressor(n_estimators=3, max_features='sqrt')

In [43]:
# Repeated-refit simulation: refit both forests `total_iter` times and, for every
# matchup, count which side is predicted to score more and accumulate the
# predicted run differential (home minus away).
total_iter = 100
base_seed = 42  # fixed so that Restart & Run All reproduces the printed records

n_games = len(x_test_away)
assert len(x_test_home) == n_games, 'away and home test frames must describe the same games'

away_wins = [0] * n_games
home_wins = [0] * n_games
run_diffs = [0] * n_games

for i in range(total_iter):
    # Every refit gets its own deterministic seed (and the two models never share
    # one), so the iterations still differ from each other but a re-run of this
    # cell gives identical results.
    model_away.set_params(random_state=base_seed + 2 * i)
    model_home.set_params(random_state=base_seed + 2 * i + 1)

    model_away.fit(x_train_away, y_train_away)
    model_home.fit(x_train_home, y_train_home)

    pred_away = model_away.predict(x_test_away)
    pred_home = model_home.predict(x_test_home)

    for j in range(n_games):
        if pred_away[j] > pred_home[j]:
            away_wins[j] += 1
        else:
            # An exact tie in predicted runs is credited to the home team.
            home_wins[j] += 1

        run_diffs[j] += pred_home[j] - pred_away[j]

# Rdiff is the mean of (home prediction - away prediction): negative favours the away team.
for i in range(n_games):
    print(f'Game #{i + 1}')
    print(f'Away: {away_wins[i]}-{total_iter - away_wins[i]}')
    print(f'Home: {home_wins[i]}-{total_iter - home_wins[i]}')
    print(f'Rdiff: {round(run_diffs[i] / total_iter, 4)}')
    print()

Game #1
Away: 86-14
Home: 14-86
Rdiff: -2.5367

Game #2
Away: 19-81
Home: 81-19
Rdiff: 2.0317

Game #3
Away: 84-16
Home: 16-84
Rdiff: -2.62

Game #4
Away: 15-85
Home: 85-15
Rdiff: 4.2933

Game #5
Away: 46-54
Home: 54-46
Rdiff: 0.2633

Game #6
Away: 21-79
Home: 79-21
Rdiff: 2.1017

