In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler, RobustScaler, PolynomialFeatures
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression , LassoLars, TweedieRegressor
from sklearn.feature_selection import RFE
from sklearn.metrics import mean_squared_error

import acquire
from wrangle import mlb_wrangle
import explore

import warnings
warnings.filterwarnings("ignore")

In [2]:
# mlb_wrangle (imported from the project's wrangle module) returns three objects,
# unpacked here as the train / validate / test splits.
train,validate,test = mlb_wrangle()

In [3]:
# Rebind the three splits to whatever explore.model_prep returns for them; the
# preview in the next cell shows the resulting columns (`owar` plus `scaled_*`).
# NOTE(review): the inputs are overwritten, so re-running only this cell passes
# already-prepped frames back into model_prep — confirm that is safe.
train,validate, test = explore.model_prep(train,validate,test)

In [4]:
# Transposed preview of the first five training rows: one output row per column.
train.head().T

Unnamed: 0,2389,3134,1585,2564,3049
owar,0.1,1.7,1.2,3.9,0.0
scaled_age,1.8,0.0,0.2,-0.6,-0.8
scaled_ops,0.125541,0.380952,0.528139,0.800866,-1.688312
scaled_tb,-0.035714,0.75,0.196429,1.526786,-0.455357
scaled_pitches_faced,-0.088185,0.655822,-0.024829,1.11387,-0.454623
scaled_batted,0.005,0.565,0.095,1.07,-0.42
scaled_raa,-0.666667,1.333333,0.888889,2.0,0.111111
scaled_waa,-0.555556,1.222222,0.888889,2.111111,0.111111
scaled_salary,1.333333,2.166667,4.833333,-4.404167,-4.425
scaled_impact,0.022581,0.354839,0.545161,0.458065,-1.532258


In [5]:
def baseline_model(train,validate,test):
    """Print train and validate RMSE for two constant-prediction baselines.

    The baselines predict the mean and the median of ``train.owar`` for every
    row. Both constants are computed from ``train`` only and are then scored
    against ``train.owar`` and ``validate.owar``.

    Parameters
    ----------
    train, validate : pandas.DataFrame
        Frames with a numeric ``owar`` column (the target).
    test : pandas.DataFrame
        Accepted so the call shape matches the other model functions in this
        notebook; it is not read, so the held-out split stays untouched.

    Returns
    -------
    None
        The scores are printed, rounded to two decimals.
    """
    observed_train = train.owar
    observed_validate = validate.owar

    # Compute each constant once, from train only, so validate is scored
    # strictly out-of-sample.
    baselines = (
        ("Mean", observed_train.mean()),
        ("Median", observed_train.median()),
    )

    for label, constant in baselines:
        # A constant Series on the matching index stands in for the
        # broadcast prediction column.
        rmse_train = mean_squared_error(
            observed_train, pd.Series(constant, index=observed_train.index)
        )**(1/2)
        rmse_validate = mean_squared_error(
            observed_validate, pd.Series(constant, index=observed_validate.index)
        )**(1/2)

        print("RMSE using " + label + "\nTrain/In-Sample: ", round(rmse_train, 2),
              "\nValidate/Out-of-Sample: ", round(rmse_validate, 2))


def linear_model(train,validate,test,n_features=7):
    """Fit OLS on RFE-selected features and print train/validate RMSE.

    Recursive feature elimination (with a plain ``LinearRegression`` as the
    ranking estimator) keeps ``n_features`` columns of ``train`` other than
    ``owar``; a fresh ``LinearRegression`` is then fitted on those columns.

    Parameters
    ----------
    train, validate : pandas.DataFrame
        Frames with the target column ``owar`` plus the feature columns.
    test : pandas.DataFrame
        Accepted so the call shape matches the other model functions in this
        notebook; it is not read here.
    n_features : int, default 7
        Number of features RFE keeps (the value previously hard-coded).

    Returns
    -------
    None
        The selected column names and the rounded RMSE values are printed.
    """
    target_train = train.owar
    target_validate = validate.owar
    x_train = train.drop(columns='owar')

    # The target is passed as a 1-d Series: a one-column DataFrame makes
    # scikit-learn emit a DataConversionWarning (silenced by the notebook's
    # global warnings filter).
    rfe = RFE(LinearRegression(), n_features_to_select=n_features)
    rfe.fit(x_train, target_train)
    rfe_feature = x_train.columns[rfe.support_].tolist()
    print(rfe_feature)

    # `normalize=True` is no longer passed: the keyword was removed from
    # LinearRegression in scikit-learn 1.2 and raises TypeError there. For
    # unregularised least squares with an intercept, rescaling the columns
    # does not change the fitted predictions.
    lm = LinearRegression()
    lm.fit(train[rfe_feature], target_train)

    rmse_train = mean_squared_error(
        target_train, lm.predict(train[rfe_feature])
    )**(1/2)
    rmse_validate = mean_squared_error(
        target_validate, lm.predict(validate[rfe_feature])
    )**(1/2)

    print("RMSE using OLS\nTrain/In-Sample: ", round(rmse_train, 2),
          "\nValidate/Out-of-Sample: ", round(rmse_validate, 2))

def lasso_model(train,validate,test,alpha=1.0,n_features=7):
    """Fit LassoLars on RFE-selected features and print train/validate RMSE.

    Recursive feature elimination (with a plain ``LinearRegression`` as the
    ranking estimator) keeps ``n_features`` columns of ``train`` other than
    ``owar``; ``LassoLars(alpha=alpha)`` is then fitted on those columns.

    Parameters
    ----------
    train, validate : pandas.DataFrame
        Frames with the target column ``owar`` plus the feature columns.
    test : pandas.DataFrame
        Accepted so the call shape matches the other model functions in this
        notebook; it is not read here.
    alpha : float, default 1.0
        Regularisation strength passed to ``LassoLars`` (the value previously
        hard-coded). In the notebook's recorded run the default gives
        1.34 / 1.43, identical to the mean baseline, which suggests every
        coefficient is shrunk to zero at this strength; smaller values are
        worth trying.
    n_features : int, default 7
        Number of features RFE keeps (the value previously hard-coded).

    Returns
    -------
    None
        The selected column names and the rounded RMSE values are printed.
    """
    target_train = train.owar
    target_validate = validate.owar
    x_train = train.drop(columns='owar')

    # The target is passed as a 1-d Series: a one-column DataFrame makes
    # scikit-learn emit a DataConversionWarning (silenced by the notebook's
    # global warnings filter).
    rfe = RFE(LinearRegression(), n_features_to_select=n_features)
    rfe.fit(x_train, target_train)
    rfe_feature = x_train.columns[rfe.support_].tolist()
    print(rfe_feature)

    lasso = LassoLars(alpha=alpha)
    lasso.fit(train[rfe_feature], target_train)

    rmse_train = mean_squared_error(
        target_train, lasso.predict(train[rfe_feature])
    )**(1/2)
    rmse_validate = mean_squared_error(
        target_validate, lasso.predict(validate[rfe_feature])
    )**(1/2)

    print("RMSE using Lasso\nTrain/In-Sample: ", round(rmse_train, 2),
          "\nValidate/Out-of-Sample: ", round(rmse_validate, 2))



def poly_model(train,validate,test,degree=2):
    """Fit a polynomial regression and print train, validate and test RMSE.

    Every column of ``train`` other than ``owar`` is expanded with
    ``PolynomialFeatures(degree=degree)`` (fitted on train, then applied to
    validate and test) and a ``LinearRegression`` is fitted on the expansion.

    Parameters
    ----------
    train, validate, test : pandas.DataFrame
        Frames with the target column ``owar`` plus the feature columns.
        Unlike the other model functions in this notebook, this one also
        scores the test split.
    degree : int, default 2
        Polynomial degree (the value previously hard-coded).

    Returns
    -------
    None
        The RFE column list and the rounded RMSE values are printed.
    """
    target_train = train.owar
    target_validate = validate.owar
    target_test = test.owar

    x_train = train.drop(columns='owar')
    x_validate = validate.drop(columns='owar')
    x_test = test.drop(columns='owar')

    # The target is passed as a 1-d Series: a one-column DataFrame makes
    # scikit-learn emit a DataConversionWarning (silenced by the notebook's
    # global warnings filter).
    # NOTE(review): the RFE selection is only printed. The expansion below is
    # built from ALL feature columns, not from `rfe_feature` — confirm which
    # of the two was intended before relying on the printed list.
    rfe = RFE(LinearRegression(), n_features_to_select=7)
    rfe.fit(x_train, target_train)
    rfe_feature = x_train.columns[rfe.support_].tolist()
    print(rfe_feature)

    # Learn the expansion on train only, then apply the same mapping to the
    # validate and test features.
    pf = PolynomialFeatures(degree=degree)
    x_train_poly2 = pf.fit_transform(x_train)
    x_validate_poly2 = pf.transform(x_validate)
    x_test_poly2 = pf.transform(x_test)

    # `normalize=True` is no longer passed: the keyword was removed from
    # LinearRegression in scikit-learn 1.2 and raises TypeError there.
    # Unregularised least squares gives the same predictions without it, up
    # to floating-point differences on an ill-conditioned expansion.
    lm2 = LinearRegression()
    lm2.fit(x_train_poly2, target_train)

    rmse_train = mean_squared_error(
        target_train, lm2.predict(x_train_poly2)
    )**(1/2)
    rmse_validate = mean_squared_error(
        target_validate, lm2.predict(x_validate_poly2)
    )**(1/2)

    print("RMSE using Polynomial\nTrain/In-Sample: ", round(rmse_train, 2),
          "\nValidate/Out-of-Sample: ", round(rmse_validate, 2))

    rmse_test = mean_squared_error(
        target_test, lm2.predict(x_test_poly2)
    )**(1/2)
    print("RMSE using Polynomial \nTest/Out-of-sample; ", round(rmse_test,2))


In [6]:
# Reference scores: RMSE of always predicting the train mean / median of owar.
baseline_model(train,validate,test)

RMSE using Mean
Train/In-Sample:  1.34 
Validate/Out-of-Sample:  1.43
RMSE using Median
Train/In-Sample:  1.42 
Validate/Out-of-Sample:  1.53


In [7]:
# OLS on the RFE-selected features; prints the selected columns, then train/validate RMSE.
linear_model(train,validate,test)

['scaled_ops', 'scaled_tb', 'scaled_pitches_faced', 'scaled_batted', 'scaled_raa', 'scaled_waa', 'scaled_impact']
RMSE using OLS
Train/In-Sample:  0.37 
Validate/Out-of-Sample:  0.38


In [8]:
# LassoLars (alpha=1.0) on the RFE-selected features; prints the columns, then train/validate RMSE.
lasso_model(train,validate,test)

['scaled_ops', 'scaled_tb', 'scaled_pitches_faced', 'scaled_batted', 'scaled_raa', 'scaled_waa', 'scaled_impact']
RMSE using Lasso
Train/In-Sample:  1.34 
Validate/Out-of-Sample:  1.43


In [9]:
# Degree-2 polynomial regression; prints the RFE column list, then train/validate RMSE and the test RMSE.
poly_model(train,validate,test)

['scaled_ops', 'scaled_tb', 'scaled_pitches_faced', 'scaled_batted', 'scaled_raa', 'scaled_waa', 'scaled_impact']
RMSE using Polynomial
Train/In-Sample:  0.3 
Validate/Out-of-Sample:  0.32
RMSE using Polynomial 
Test/Out-of-sample;  0.41
