In [24]:
import pandas as pd
import statsmodels.api as sm
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
import numpy as np
from sklearn.metrics import mean_squared_error

In [22]:
raw = pd.read_csv('data/korbb.csv')
df = raw[[
 'lvl_mlb', 'lvl_aaa', 'lvl_aa', 'lvl_higha', 'lvl_lowa',
       'swing_percentage', 'chase_percentage', 'zone_swing_percentage',
       'out_zone_contact_percentage', 'zone_contact_percentage',
       'swiss_percentage', 'whiff_percentage', 'strikeout_percentage']].copy().rename(columns={'strikeout_percentage': 'target'})

# Linear

In [23]:
df.replace([np.inf, -np.inf], 0, inplace=True)
print(f'Shape of df: {df.shape}')


X = df.drop('target', axis=1)
#drop(['player_id', 'year', 'level', 'target'], axis=1)
xcols = X.columns
X = MinMaxScaler().fit_transform(X)
X = pd.DataFrame(X)
X.columns = list(xcols)

y = df.target

est = sm.OLS(endog=y, exog=X, missing='drop')
fit_model = est.fit()
# l1_model = est.fit_regularized(method='elastic_net', alpha=0.1, L1_wt=1.0, start_params=None)
y_pred = fit_model.predict(X)
print(fit_model.summary())

Shape of df: (2827, 13)
                            OLS Regression Results                            
Dep. Variable:                 target   R-squared:                       0.852
Model:                            OLS   Adj. R-squared:                  0.852
Method:                 Least Squares   F-statistic:                     1479.
Date:                Thu, 23 Jun 2022   Prob (F-statistic):               0.00
Time:                        15:09:53   Log-Likelihood:                -6593.0
No. Observations:                2827   AIC:                         1.321e+04
Df Residuals:                    2815   BIC:                         1.328e+04
Df Model:                          11                                         
Covariance Type:            nonrobust                                         
                                  coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------

In [27]:
print('validation rmse: ' + str(mean_squared_error(y_pred, y, squared=True)))

validation rmse: 6.211706216691758


# XGB

In [28]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, cross_validate, KFold, cross_val_predict, GridSearchCV, RandomizedSearchCV
import datetime

In [None]:
X = df.drop(['player_id', 'year', 'level', 'target'], axis=1)
xcols = X.columns
X = MinMaxScaler().fit_transform(X)
X = pd.DataFrame(X)
X.columns = list(xcols)

y = df.target

In [29]:
seed = 524
test_size = 0.15
X1, X_valid, y1, y_valid = train_test_split(X, y, test_size=test_size, random_state=seed)

seed = 524
test_size = 0.20
X_train, X_test, y_train, y_test = train_test_split(X1, y1, test_size=test_size, random_state=seed)

In [30]:
params = {
        'learning_rate': [0.3, 0.03, 0.003],
        #'n_estimators': [50, 250, 500],
       # 'min_child_weight': [2, 5, 8],
        #'gamma': [0, 1.5, 5],
       # 'subsample': [0.6, 1.0], 
        #'max_depth': [2, 5, 8]
        }

In [39]:
xgb_model = XGBRegressor(booster='gbtree', objective='reg:squarederror')

In [40]:
# this cell takes a long time
print(datetime.datetime.now())
kf = KFold(n_splits=3, shuffle = True, random_state = 524)

gridcv = GridSearchCV(estimator=xgb_model, 
                      param_grid=params, 
                      scoring='neg_root_mean_squared_error', 
                      n_jobs=4, 
                      cv=kf.split(X_train,y_train),  
                      #verbose=3, 
                     return_train_score=True)

gridcv.fit(X_train, y_train, early_stopping_rounds=10, verbose=1,
        eval_set=[(X_test, y_test)])
print(datetime.datetime.now())

2022-06-23 15:29:50.902486
[0]	validation_0-rmse:16.64333
[1]	validation_0-rmse:11.89060
[2]	validation_0-rmse:8.57127
[3]	validation_0-rmse:6.31816
[4]	validation_0-rmse:4.85645
[5]	validation_0-rmse:3.93220
[6]	validation_0-rmse:3.37105
[7]	validation_0-rmse:3.05210
[8]	validation_0-rmse:2.87103
[9]	validation_0-rmse:2.77247
[10]	validation_0-rmse:2.72779
[11]	validation_0-rmse:2.68943
[12]	validation_0-rmse:2.66057
[13]	validation_0-rmse:2.65326
[14]	validation_0-rmse:2.64155
[15]	validation_0-rmse:2.63896
[16]	validation_0-rmse:2.64475
[17]	validation_0-rmse:2.65109
[18]	validation_0-rmse:2.65861
[19]	validation_0-rmse:2.65244
[20]	validation_0-rmse:2.65014
[21]	validation_0-rmse:2.64484
[22]	validation_0-rmse:2.64794
[23]	validation_0-rmse:2.65347
[24]	validation_0-rmse:2.66495
2022-06-23 15:29:51.753684


In [41]:
#print('\n All results:')
#print(gridcv.cv_results_)
print('\n Best estimator:')
print(gridcv.best_estimator_)
print('\n Best score:')
print(gridcv.best_score_)
print('\n Best parameters:')
print(gridcv.best_params_)
results = pd.DataFrame(gridcv.cv_results_)
#results.to_csv('../predictions/xgb-grid-search-results-3fold.csv', index=False)




 Best estimator:
XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
             importance_type=None, interaction_constraints='',
             learning_rate=0.3, max_bin=256, max_cat_to_onehot=4,
             max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
             missing=nan, monotone_constraints='()', n_estimators=100, n_jobs=0,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, ...)

 Best score:
-2.7942090094166487

 Best parameters:
{'learning_rate': 0.3}


In [42]:

y_preds = gridcv.best_estimator_.predict(X_valid)
print('validation rmse: ' + str(mean_squared_error(y_preds, y_valid, squared=True)))


y_preds = gridcv.best_estimator_.predict(X_test)
print('test rmse: ' + str(mean_squared_error(y_preds, y_test, squared=True)))

y_preds = gridcv.best_estimator_.predict(X_train)
print('train rmse: ' + str(mean_squared_error(y_preds, y_train, squared=True)))

validation rmse: 7.404194692513419
test rmse: 6.964131811299392
train rmse: 3.42739642912801


# Small NN

In [3]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Make NumPy printouts easier to read.
np.set_printoptions(precision=3, suppress=True)

import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import regularizers

import tensorflow_docs as tfdocs
import tensorflow_docs.modeling
import tensorflow_docs.plots

In [4]:
def plot_loss(history):
    plt.plot(history.history['loss'], label='loss')
    plt.plot(history.history['val_loss'], label='val_loss')
    plt.ylim([0, 10])
    plt.xlabel('Epoch')
    plt.ylabel('Error')
    plt.legend()
    plt.grid(True)


In [5]:
raw = pd.read_csv('data/korbb.csv')
dataset = raw[[
 'lvl_mlb', 'lvl_aaa', 'lvl_aa', 'lvl_higha', 'lvl_lowa',
       'swing_percentage', 'chase_percentage', 'zone_swing_percentage',
       'out_zone_contact_percentage', 'zone_contact_percentage',
       'swiss_percentage', 'whiff_percentage', 'strikeout_percentage']].copy()

In [6]:
train_dataset = dataset.sample(frac=0.8, random_state=0)
test_dataset = dataset.drop(train_dataset.index)

In [7]:
train_features = train_dataset.copy()
test_features = test_dataset.copy()

train_labels = train_features.pop('strikeout_percentage')
test_labels = test_features.pop('strikeout_percentage')

In [9]:
N_VALIDATION = round(0.2*dataset.shape[0])
N_TRAIN = round(0.8*dataset.shape[0])
BUFFER_SIZE = int(1e4)
BATCH_SIZE = 500
STEPS_PER_EPOCH = N_TRAIN//BATCH_SIZE

In [10]:
lr_schedule = tf.keras.optimizers.schedules.InverseTimeDecay(
  0.001,
  decay_steps=STEPS_PER_EPOCH*1000,
  decay_rate=1,
  staircase=False)

def get_optimizer():
    return tf.keras.optimizers.Adam(lr_schedule)

In [11]:
def get_callbacks(name):
    return [
    tfdocs.modeling.EpochDots(),
    tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=200),
    tf.keras.callbacks.TensorBoard(name),
    ]

In [16]:
def compile_and_fit(model, name, optimizer=None, max_epochs=1000):
    if optimizer is None:
        optimizer = get_optimizer()

    model.compile(optimizer=optimizer,
                loss='mean_squared_error')

    model.summary()

    history = model.fit(
    train_features,
        train_labels,
    steps_per_epoch = STEPS_PER_EPOCH,
    epochs=max_epochs,
       # validation_split=0.2,
    validation_data=(test_features, test_labels),
    callbacks=get_callbacks(name),
    verbose=0)
    
    
    return history

In [17]:
N_FEATURES = 12
tiny_model = tf.keras.Sequential([
    layers.Dense(16, activation='relu', input_shape=(N_FEATURES,), kernel_regularizer=regularizers.l2(0.001)),
    #layers.Dropout(0.5),
    layers.Dense(1)
])

In [18]:
histories = {}

In [19]:
histories['Tiny'] = compile_and_fit(tiny_model, 'sizes/Tiny')

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_2 (Dense)             (None, 16)                208       
                                                                 
 dense_3 (Dense)             (None, 1)                 17        
                                                                 
Total params: 225
Trainable params: 225
Non-trainable params: 0
_________________________________________________________________

Epoch: 0, loss:589.3600,  val_loss:538.7766,  
....................................................................................................
Epoch: 100, loss:27.5272,  val_loss:27.0654,  
....................................................................................................
Epoch: 200, loss:17.8677,  val_loss:16.9818,  
....................................................................................................
Epoch: 