In [1]:
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler, QuantileTransformer, Normalizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.linear_model import LinearRegression, LogisticRegression, PoissonRegressor
from sklearn.tree import DecisionTreeRegressor

import graphviz
from sklearn.tree import export_graphviz
from IPython.display import Image

import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

from analysis import get_feature_importances, visualize_decision_trees, get_vif, wl_accuracy, season_record, runs_per_game

In [2]:
# Revision of the training export to load; selects game_data_v<version>.csv.
version = 6

# Module-level frame of games; create_data() and the reporting helpers read it.
game_data_path = f'./data/baseball/training/game_data_v{version}.csv'
df = pd.read_csv(game_data_path)

In [3]:
# create the appropriate training and testing data based on home/away, dropping columns as needed
def create_data(drop_cols=('away_score', 'home_score', 'away_team', 'home_team'), y_col='away_score',
                split_by='random', split_index=13047, random_state=None, data=None):
    """Split the game data into train/test features and targets.

    Parameters
    ----------
    drop_cols : str or iterable of str
        Column label(s) removed from the frame to form the feature matrix.
    y_col : str
        Column used as the regression target.
    split_by : str
        'season' performs a positional split at ``split_index``; any other
        value performs a shuffled ``train_test_split``.
    split_index : int
        Number of leading rows used for training when ``split_by='season'``.
        The default is the value that was previously hard-coded here
        (presumably the first row of the held-out season -- confirm against
        the dataset version in use).
    random_state : int or None
        Seed forwarded to ``train_test_split`` for the shuffled split. ``None``
        keeps the previous unseeded behaviour; pass an int for reproducibility.
    data : pandas.DataFrame or None
        Frame to split. ``None`` falls back to the module-level ``df``.

    Returns
    -------
    tuple
        ``(x_train, x_test, y_train, y_test)``.
    """
    frame = df if data is None else data

    # A bare string is one label, not an iterable of single-character labels.
    labels = [drop_cols] if isinstance(drop_cols, str) else list(drop_cols)

    y = frame[y_col]
    x = frame.drop(columns=labels)

    if split_by == 'season':
        # Positional split: rows before split_index train, the remainder tests.
        x_train, x_test = x.iloc[:split_index], x.iloc[split_index:]
        y_train, y_test = y.iloc[:split_index], y.iloc[split_index:]
    else:
        x_train, x_test, y_train, y_test = train_test_split(
            x, y, shuffle=True, random_state=random_state)

    return x_train, x_test, y_train, y_test

In [5]:
# Columns dropped from both feature sets: the two targets and the team columns.
non_feature_cols = ['away_score', 'home_score', 'away_team', 'home_team']

# Additional columns excluded from the away-score model's features.
away_excluded_cols = ['away_team_xfip', 'away_starter_xfip',
                      'home_lineup_xwoba', 'home_run_diff',
                      'away_team_xwoba_diff', 'away_run_diff']

# The same exclusions with the home/away roles swapped, for the home-score model.
home_excluded_cols = ['home_team_xfip', 'home_starter_xfip',
                      'away_lineup_xwoba', 'away_run_diff',
                      'home_team_xwoba_diff', 'home_run_diff']

x_train_away, x_test_away, y_train_away, y_test_away = create_data(
    drop_cols=non_feature_cols + away_excluded_cols,
    y_col='away_score',
    split_by='season',
)
x_train_home, x_test_home, y_train_home, y_test_home = create_data(
    drop_cols=non_feature_cols + home_excluded_cols,
    y_col='home_score',
    split_by='season',
)

In [7]:
def _build_linear_pipeline():
    """Return an unfitted pipeline: StandardScaler followed by LinearRegression."""
    return Pipeline([('scaler', StandardScaler()),
                     ('model', LinearRegression())])


# One independent model per target (away runs, home runs).
model_away = _build_linear_pipeline()
model_home = _build_linear_pipeline()

model_away.fit(x_train_away, y_train_away)
model_home.fit(x_train_home, y_train_home)

pred_away = model_away.predict(x_test_away)
pred_home = model_home.predict(x_test_home)

# The y_test Series provide the row index of `results`; the prediction arrays
# are attached positionally.
# NOTE(review): this assumes both test sets hold the same rows in the same
# order, which the split_by='season' calls give -- confirm if the split changes.
results = pd.DataFrame({'away_pred': pred_away, 'home_pred': pred_home,
                        'away_true': y_test_away, 'home_true': y_test_home})
# To inspect summary statistics, evaluate `results.describe()` as the last
# expression of its own cell; a bare expression mid-cell is not displayed.

print('LINEAR REGRESSION MODEL')
print(f'Accuracy: {wl_accuracy(results)}%')
print('-' * 30)
season_record(df, results)
print('-' * 30)
runs_per_game(df, results)

LINEAR REGRESSION MODEL
Accuracy: 60.041152263374485%
------------------------------
Season records:
HOU: 143-19	ATL: 142-20	LAD: 141-21	PHI: 124-38	TB: 122-40	
TOR: 121-41	SD: 120-42	MIN: 118-44	TEX: 115-47	SEA: 111-51	
MIL: 108-54	CHC: 103-59	NYM: 101-61	BAL: 85-77	CLE: 81-81	
SF: 76-86	BOS: 70-92	NYY: 70-92	LAA: 68-94	AZ: 66-96	
MIA: 63-99	STL: 60-102	CIN: 54-108	DET: 49-113	CWS: 38-124	
PIT: 31-131	KC: 21-141	WSH: 14-148	COL: 10-152	OAK: 5-157	
------------------------------
Runs scored per game:
ATL: 5.52 (894)	LAD: 5.26 (852)	HOU: 4.88 (791)	TEX: 4.88 (790)	PHI: 4.86 (787)	
TB: 4.79 (777)	SD: 4.74 (768)	TOR: 4.7 (762)	CIN: 4.69 (759)	MIN: 4.68 (758)	
CHC: 4.58 (742)	BOS: 4.56 (738)	NYM: 4.54 (736)	SEA: 4.52 (732)	BAL: 4.49 (727)	
STL: 4.48 (726)	AZ: 4.43 (718)	LAA: 4.4 (713)	MIL: 4.35 (705)	NYY: 4.17 (676)	
SF: 4.14 (671)	CLE: 4.14 (670)	MIA: 4.11 (666)	COL: 4.1 (664)	CWS: 4.02 (651)	
PIT: 4.01 (650)	WSH: 3.99 (647)	KC: 3.95 (640)	DET: 3.91 (633)	OAK: 3.55 (575)	
