# 18.7 - Overfitting and Regularization

* Load the **houseprices** data from Thinkful's database.
* Reimplement your model from the previous checkpoint.
* Try OLS, Lasso, Ridge, and ElasticNet regression using the same model specification. This time, you need to do **k-fold cross-validation** to choose the best hyperparameter values for your models. Scikit-learn has RidgeCV, LassoCV, and ElasticNetCV that you can utilize to do this. Which model is the best? Why?

In [8]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sqlalchemy import create_engine
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_absolute_error
from statsmodels.tools.eval_measures import mse, rmse
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV, ElasticNetCV
import statsmodels.api as sm

import warnings
warnings.filterwarnings('ignore')

postgres_user = 'dsbc_student'
postgres_pw = '7*.8G9QH21'
postgres_host = '142.93.121.174'
postgres_port = '5432'
postgres_db = 'houseprices'

In [9]:
engine = create_engine('postgresql://{}:{}@{}:{}/{}'.format(
    postgres_user, postgres_pw, postgres_host, postgres_port, postgres_db))
houses_df = pd.read_sql_query('select * from houseprices',con=engine)

engine.dispose()

In [10]:
houses_df = pd.concat([houses_df,pd.get_dummies(houses_df.mszoning, prefix="mszoning", drop_first=True)], axis=1)
houses_df = pd.concat([houses_df,pd.get_dummies(houses_df.street, prefix="street", drop_first=True)], axis=1)
houses_df = pd.concat([houses_df,pd.get_dummies(houses_df.street, prefix="kitchenqual", drop_first=True)], axis=1)

cat_column_names = list(pd.get_dummies(houses_df.mszoning, prefix="mszoning", drop_first=True).columns)
cat_column_names = cat_column_names + list(pd.get_dummies(houses_df.street, prefix="street", drop_first=True).columns)
cat_column_names2 = cat_column_names + list(pd.get_dummies(houses_df.street, prefix="kitchenqual", drop_first=True).columns)

In [11]:
# create interaction term
houses_df['int_term'] = houses_df['overallqual'] * houses_df['totalbsmtsf']

# Y is the target variable
Y = np.log1p(houses_df['saleprice'])
# X is the feature set which includes
X = houses_df[['overallqual', 'grlivarea', 'garagecars', 'garagearea', 'totalbsmtsf', 'int_term']  + cat_column_names2]

X = sm.add_constant(X)

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, random_state = 465)

results = sm.OLS(y_train, X_train).fit()

results.summary()

0,1,2,3
Dep. Variable:,saleprice,R-squared:,0.831
Model:,OLS,Adj. R-squared:,0.83
Method:,Least Squares,F-statistic:,517.9
Date:,"Wed, 13 Nov 2019",Prob (F-statistic):,0.0
Time:,13:03:04,Log-Likelihood:,461.17
No. Observations:,1168,AIC:,-898.3
Df Residuals:,1156,BIC:,-837.6
Df Model:,11,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,9.9990,0.098,102.358,0.000,9.807,10.191
overallqual,0.1727,0.008,21.682,0.000,0.157,0.188
grlivarea,0.0002,1.19e-05,17.786,0.000,0.000,0.000
garagecars,0.0841,0.015,5.672,0.000,0.055,0.113
garagearea,9.612e-05,5.04e-05,1.908,0.057,-2.71e-06,0.000
totalbsmtsf,0.0004,4.06e-05,10.262,0.000,0.000,0.000
int_term,-4.622e-05,5.56e-06,-8.310,0.000,-5.71e-05,-3.53e-05
mszoning_FV,0.4105,0.065,6.350,0.000,0.284,0.537
mszoning_RH,0.2906,0.074,3.938,0.000,0.146,0.435

0,1,2,3
Omnibus:,356.207,Durbin-Watson:,1.878
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2810.085
Skew:,-1.183,Prob(JB):,0.0
Kurtosis:,10.221,Cond. No.,2.24e+21


In [12]:
lrm = LinearRegression()

lrm.fit(X_train, y_train)

# We are making predictions here
y_preds_train = lrm.predict(X_train)
y_preds_test = lrm.predict(X_test)

print("R-squared of the model in training set is: {}".format(lrm.score(X_train, y_train)))
print("-----Test set statistics-----")
print("R-squared of the model in test set is: {}".format(lrm.score(X_test, y_test)))
print("Mean absolute error of the prediction is: {}".format(mean_absolute_error(y_test, y_preds_test)))
print("Mean squared error of the prediction is: {}".format(mse(y_test, y_preds_test)))
print("Root mean squared error of the prediction is: {}".format(rmse(y_test, y_preds_test)))
print("Mean absolute percentage error of the prediction is: {}".format(np.mean(np.abs((y_test - y_preds_test) / y_test)) * 100))

R-squared of the model in training set is: 0.8313186827438339
-----Test set statistics-----
R-squared of the model in test set is: 0.829949415218692
Mean absolute error of the prediction is: 0.12334422039102903
Mean squared error of the prediction is: 0.02835519509105203
Root mean squared error of the prediction is: 0.16839000888132297
Mean absolute percentage error of the prediction is: 1.0303957002998065


In [13]:
lrm = LinearRegression()
y_pred = lrm.fit(X, Y).predict(X)

from sklearn.model_selection import cross_val_score
cross_val_score(lrm, X, Y, cv=10)

array([0.83662294, 0.86691217, 0.84618998, 0.796952  , 0.81784156,
       0.8501804 , 0.85966703, 0.83665493, 0.62721508, 0.82338322])

In [14]:
#choose the range in which our CV models scan for alpha
#how is this range determined?

alpha = [np.power(10.0,p) for p in np.arange(-5,10,1)]

In [16]:
lasso_cv = LassoCV(alphas=alpha, cv=10)

lasso_cv.fit(X_train, y_train)

# We are making predictions here
y_preds_train = lasso_cv.predict(X_train)
y_preds_test = lasso_cv.predict(X_test)

print("Best alpha value is: {}".format(lasso_cv.alpha_))
print("R-squared of the model in training set is: {}".format(lasso_cv.score(X_train, y_train)))
print("-----Test set statistics-----")
print("R-squared of the model in test set is: {}".format(lasso_cv.score(X_test, y_test)))
print("Mean absolute error of the prediction is: {}".format(mean_absolute_error(y_test, y_preds_test)))
print("Mean squared error of the prediction is: {}".format(mse(y_test, y_preds_test)))
print("Root mean squared error of the prediction is: {}".format(rmse(y_test, y_preds_test)))
print("Mean absolute percentage error of the prediction is: {}".format(np.mean(np.abs((y_test - y_preds_test) / y_test)) * 100))

Best alpha value is: 0.0001
R-squared of the model in training set is: 0.8311261667899961
-----Test set statistics-----
R-squared of the model in test set is: 0.8277677487203086
Mean absolute error of the prediction is: 0.12383843115794951
Mean squared error of the prediction is: 0.028718978486827074
Root mean squared error of the prediction is: 0.1694667474368558
Mean absolute percentage error of the prediction is: 1.0348885851180014


In [24]:
ridge_cv = RidgeCV(alphas=alpha, cv=5)

ridge_cv.fit(X_train, y_train)

# We are making predictions here
y_preds_train = ridge_cv.predict(X_train)
y_preds_test = ridge_cv.predict(X_test)

print("Best alpha value is: {}".format(ridge_cv.alpha_))
print("R-squared of the model in training set is: {}".format(ridge_cv.score(X_train, y_train)))
print("-----Test set statistics-----")
print("R-squared of the model in test set is: {}".format(ridge_cv.score(X_test, y_test)))
print("Mean absolute error of the prediction is: {}".format(mean_absolute_error(y_test, y_preds_test)))
print("Mean squared error of the prediction is: {}".format(mse(y_test, y_preds_test)))
print("Root mean squared error of the prediction is: {}".format(rmse(y_test, y_preds_test)))
print("Mean absolute percentage error of the prediction is: {}".format(np.mean(np.abs((y_test - y_preds_test) / y_test)) * 100))

Best alpha value is: 1.0
R-squared of the model in training set is: 0.8311132374013046
-----Test set statistics-----
R-squared of the model in test set is: 0.8271453692807561
Mean absolute error of the prediction is: 0.12397217385936399
Mean squared error of the prediction is: 0.028822757550285554
Root mean squared error of the prediction is: 0.16977266431992388
Mean absolute percentage error of the prediction is: 1.0360989833436018


In [25]:
elasticnet_cv = ElasticNetCV(alphas=alpha, cv=5)

elasticnet_cv.fit(X_train, y_train)

# We are making predictions here
y_preds_train = elasticnet_cv.predict(X_train)
y_preds_test = elasticnet_cv.predict(X_test)

print("Best alpha value is: {}".format(elasticnet_cv.alpha_))
print("R-squared of the model in training set is: {}".format(elasticnet_cv.score(X_train, y_train)))
print("-----Test set statistics-----")
print("R-squared of the model in test set is: {}".format(elasticnet_cv.score(X_test, y_test)))
print("Mean absolute error of the prediction is: {}".format(mean_absolute_error(y_test, y_preds_test)))
print("Mean squared error of the prediction is: {}".format(mse(y_test, y_preds_test)))
print("Root mean squared error of the prediction is: {}".format(rmse(y_test, y_preds_test)))
print("Mean absolute percentage error of the prediction is: {}".format(np.mean(np.abs((y_test - y_preds_test) / y_test)) * 100))

Best alpha value is: 0.0001
R-squared of the model in training set is: 0.831260924106127
-----Test set statistics-----
R-squared of the model in test set is: 0.8287539757640325
Mean absolute error of the prediction is: 0.12362133561119794
Mean squared error of the prediction is: 0.028554529418540545
Root mean squared error of the prediction is: 0.1689808551834809
Mean absolute percentage error of the prediction is: 1.0329145335827639
