In [1]:
import random

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler, QuantileTransformer, Normalizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.linear_model import LinearRegression, LogisticRegression, PoissonRegressor
from sklearn.tree import DecisionTreeRegressor

import graphviz
from sklearn.tree import export_graphviz
from IPython.display import Image, display

import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

from analysis import get_feature_importances, visualize_decision_trees, get_vif, wl_accuracy, season_record, runs_per_game

In [20]:
# Revision number of the training export; it is interpolated into the CSV file name below.
version = 6

# Load that revision's game table into `df`, the frame the later cells read from.
df = pd.read_csv(
    f'./data/baseball/training/game_data_v{version}.csv'
)

In [21]:
# create the appropriate training and testing data based on home/away, dropping columns as needed
def create_data(drop_cols=['away_score', 'home_score', 'away_team', 'home_team'], y_col='away_score', split_by='random',
                split_index=13047, random_state=None, data=None):
    """Split the game table into train/test feature frames and one target column.

    Parameters
    ----------
    drop_cols : list of str
        Columns removed from the feature matrix. ``y_col`` is NOT removed
        automatically, so it must be listed here to keep the target out of
        the features. The default list is only read, never mutated.
    y_col : str
        Name of the column returned as the target.
    split_by : str
        ``'season'`` cuts the rows positionally at ``split_index`` (rows before
        it train, rows from it onward test). Any other value falls back to a
        shuffled ``train_test_split``.
    split_index : int, default 13047
        Row position at which the test partition starts for the ``'season'``
        split. This was previously a hard-coded literal (an earlier experiment
        used 10617). Presumably it marks a season boundary in the CSV's row
        order -- TODO confirm against the data.
    random_state : int or None, default None
        Forwarded to ``train_test_split`` for the shuffled split. ``None``
        keeps the original unseeded behaviour; pass an int for a reproducible
        split.
    data : pandas.DataFrame or None, default None
        Table to split. ``None`` uses the notebook-level ``df``.

    Returns
    -------
    tuple
        ``(x_train, x_test, y_train, y_test)``.
    """
    # Fall back to the notebook-global frame only when no table is supplied.
    frame = df if data is None else data

    y = frame[y_col]
    x = frame.drop(drop_cols, axis=1)

    if split_by == 'season':
        # Positional (not label-based) cut, so it relies on the row order of the table.
        x_train, x_test = x.iloc[:split_index], x.iloc[split_index:]
        y_train, y_test = y.iloc[:split_index], y.iloc[split_index:]
    else:
        x_train, x_test, y_train, y_test = train_test_split(x, y, shuffle=True, random_state=random_state)

    return x_train, x_test, y_train, y_test

In [28]:
# Away-score target: drop both scores and both team identifiers, plus six
# other columns excluded from the away feature set.
# NOTE(review): presumably the away side's own pitching metrics and the home
# lineup's hitting metric are left out as irrelevant to away runs -- confirm.
(x_train_away, x_test_away,
 y_train_away, y_test_away) = create_data(
    y_col='away_score',
    split_by='season',
    drop_cols=[
        'away_score', 'home_score',
        'away_team', 'home_team',
        'away_team_xfip', 'away_starter_xfip',
        'home_lineup_xwoba', 'home_run_diff',
        'away_team_xwoba_diff', 'away_run_diff',
    ],
)

# Home-score target: the mirrored column set, using the same season split.
(x_train_home, x_test_home,
 y_train_home, y_test_home) = create_data(
    y_col='home_score',
    split_by='season',
    drop_cols=[
        'away_score', 'home_score',
        'away_team', 'home_team',
        'home_team_xfip', 'home_starter_xfip',
        'away_lineup_xwoba', 'away_run_diff',
        'home_team_xwoba_diff', 'home_run_diff',
    ],
)

In [30]:
# Two independent pipelines (standardise features, then ordinary least squares),
# one per target, so each fits its own scaler and coefficients.
model_away = Pipeline([('scaler', StandardScaler()),
                       ('model', LinearRegression())])
model_home = Pipeline([('scaler', StandardScaler()),
                       ('model', LinearRegression())])

model_away.fit(x_train_away, y_train_away)
model_home.fit(x_train_home, y_train_home)

# Predict on the held-out rows of each split.
pred_away = model_away.predict(x_test_away)
pred_home = model_home.predict(x_test_home)

# Predicted and true scores side by side; the row index comes from the y_test Series.
results = pd.DataFrame({'away_pred': pred_away, 'home_pred': pred_home, 'away_true': y_test_away, 'home_true': y_test_home})

# A bare `results.describe()` in the middle of a cell is computed and thrown away,
# because only a cell's final expression is auto-rendered. Display it explicitly.
display(results.describe())

# Reporting helpers come from the local `analysis` module; their internals are not
# visible here. The recorded output shows they print an accuracy figure, per-team
# season records and per-team runs per game.
print('LINEAR REGRESSION MODEL')
print(f'Accuracy: {wl_accuracy(results)}%')
print('-' * 30)
season_record(df, results)
print('-' * 30)
runs_per_game(df, results)

LINEAR REGRESSION MODEL
Accuracy: 58.2716049382716%
------------------------------
Season records:
ATL: 151-11	LAD: 140-22	TOR: 132-30	PHI: 131-31	MIN: 123-39	
HOU: 121-41	SEA: 117-45	SF: 112-50	SD: 111-51	NYY: 109-53	
MIL: 108-54	TB: 100-62	NYM: 96-66	TEX: 90-72	MIA: 89-73	
CHC: 87-75	BOS: 83-79	STL: 77-85	BAL: 77-85	LAA: 64-98	
DET: 58-104	CIN: 52-110	CLE: 50-112	KC: 41-121	AZ: 32-130	
PIT: 29-133	CWS: 27-135	WSH: 11-151	COL: 7-155	OAK: 5-157	
------------------------------
Runs scored per game:
ATL: 5.33 (864)	LAD: 5.01 (812)	HOU: 4.87 (789)	PHI: 4.8 (777)	STL: 4.78 (775)	
SD: 4.78 (775)	TOR: 4.78 (774)	MIN: 4.72 (764)	SEA: 4.71 (763)	CHC: 4.64 (752)	
TEX: 4.62 (749)	NYM: 4.62 (748)	BAL: 4.59 (743)	NYY: 4.58 (743)	MIL: 4.56 (739)	
LAA: 4.56 (739)	MIA: 4.55 (736)	SF: 4.53 (733)	BOS: 4.5 (729)	CIN: 4.48 (726)	
KC: 4.45 (722)	DET: 4.43 (718)	TB: 4.41 (715)	CLE: 4.36 (706)	PIT: 4.31 (698)	
AZ: 4.29 (696)	CWS: 4.25 (689)	COL: 4.15 (672)	WSH: 4.14 (671)	OAK: 4.0 (648)	
