### Import Libraries

In [1]:
# standard imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# modeling imports
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn import metrics

#### Load data

In [2]:
# Read the cleaned data file into a DataFrame.
# The path is relative to the notebook's working directory.
ames_clean = pd.read_csv('../datasets/ames_clean_v6.csv')

In [3]:
# Shape of the data file as (rows, columns)
ames_clean.shape

(2036, 28)

In [4]:
# Column names in the data file; 'saleprice' is used as the target below,
# every other column is used as a predictor.
ames_clean.columns

Index(['saleprice', 'is_residential', 'neighborhood_order', 'local_conditions',
       'age', 'was_remodeled', 'overall_qual', 'exter_qual',
       'external_feature', 'mas_vnr_area', 'buildingtype_bystory',
       'functional', 'lot_frontage', 'lot_area', 'developed_outside_sf',
       'garage_qual', 'garage_fin*sqft', 'garage_cars', 'paved_drive',
       'bsmt_qual', 'bsmt_type*sf_all', 'bsmt_exposure', 'heating_qc',
       'kitchen_qual', 'fireplace_qu', 'quality_above_sqft', 'totrms_abvgrd',
       'room_size'],
      dtype='object')

### Prepare data for modeling

In [5]:
# Target vector: the sale price column.
y = ames_clean['saleprice']
# Feature matrix: every remaining column.
X = ames_clean.drop(columns='saleprice')

In [6]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps
# the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=.8,
    test_size=.2,
    random_state=42,
)

#### Linear Regression Model, not scaled

In [8]:
# Baseline: ordinary least squares on the raw (unscaled) features.
# fit() returns the estimator itself, so it can be chained onto the constructor.
lr = LinearRegression().fit(X_train, y_train)

# R^2 on the data the model was fit on, then on the held-out split.
print(f' Unscaled Linear Regression training R^2 score: {lr.score(X=X_train, y=y_train)}')
print(f' Unscaled Linear Regression test R^2 score: {lr.score(X_test, y_test)}')

LinearRegression()

#### Linear Regression Model, scaled (all models from here on use scaled features)

In [12]:
# Learn the scaling parameters from the training features only, then apply
# that same fitted scaler to both splits so the test set is never used for fitting.
sc = StandardScaler().fit(X_train)
Z_train = sc.transform(X_train)
Z_test = sc.transform(X_test)

In [13]:
# Same linear model as before, now fit on the standardized features.
lr_stan = LinearRegression().fit(Z_train, y_train)

# R^2 on the training split, then on the held-out split.
print(f' Scaled Linear Regression training R^2 score: {lr_stan.score(Z_train, y_train)}')
print(f' Scaled Linear Regression test R^2 score: {lr_stan.score(Z_test, y_test)}')

#### RidgeCV Regression

In [18]:
# Candidate ridge penalties: 100 values whose exponents are evenly spaced
# between 0 and 5, i.e. alphas from 10^0 to 10^5.
r_alphas = np.logspace(start=0, stop=5, num=100)

# 5-fold cross-validation over the candidate alphas, scored by R^2.
# Fitting selects the best alpha, which is then available as `ridge_cv.alpha_`.
ridge_cv = RidgeCV(alphas=r_alphas, scoring='r2', cv=5).fit(Z_train, y_train)

In [19]:
# The alpha that RidgeCV selected by cross-validation from `r_alphas`
ridge_cv.alpha_

29.150530628251758

In [20]:
# Report R^2 for the fitted RidgeCV model on the scaled training and test splits.
print(f' RidgeCV Regression training R^2 score: {ridge_cv.score(Z_train, y_train)}')
print(f' RidgeCV Regression test R^2 score: {ridge_cv.score(Z_test, y_test)}')

0.8917416153182917
0.8874376072994777


#### LassoCV Regression

In [21]:
# Candidate lasso penalties: 500 values log-spaced from 10^-3 to 10^3.
l_alphas = np.logspace(start=-3, stop=3, num=500)

# 5-fold cross-validation over the candidate alphas, then fit using the
# best lasso alpha. The iteration cap is raised to 10000.
lasso_cv = LassoCV(alphas=l_alphas, cv=5, max_iter=10000).fit(Z_train, y_train)

# Best alpha found by the cross-validation
lasso_cv.alpha_

In [23]:
# Report R^2 for the fitted LassoCV model on the scaled training and test splits.
print(f' LassoCV Regression training R^2 score: {lasso_cv.score(Z_train, y_train)}')
print(f' LassoCV Regression test R^2 score: {lasso_cv.score(Z_test, y_test)}')

0.8916711732174127
0.8869310542988934


## Models utilizing Pipeline and Gridsearch
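##### A Pipeline chains scaling, polynomial feature creation, and the model into a single estimator, so the preprocessing steps are re-fit automatically whenever the pipeline is fit.
##### GridSearchCV cross-validates every combination in a given parameter grid and keeps the best-scoring one.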

#### Lasso Model

In [31]:
# Scale-then-Lasso pipeline. This variant has no polynomial expansion step.
# NOTE: the final step is named 'lassocv' although it holds a plain Lasso;
# the grid-search keys ('lassocv__alpha', ...) are built from that step name,
# so the name must stay in sync with the parameter grid.
pipe_power = Pipeline(steps=[
    ('sc', StandardScaler()),
    ('lassocv', Lasso()),
])

In [32]:
# Parameter grid for the pipeline's 'lassocv' step:
#   - alpha: 500 values log-spaced from 10^-5 to 10^5
#   - max_iter: a single setting, 5000
lasso_params = {
    'lassocv__alpha': np.logspace(start=-5, stop=5, num=500),
    'lassocv__max_iter': [5000],
}

In [33]:
# Cross-validated search over `lasso_params`, with the whole pipeline as the
# estimator so the scaler is re-fit on each training fold.
# cv and scoring are left at GridSearchCV's defaults.
lasso_pipe_gridsearch = GridSearchCV(
    estimator=pipe_power,
    param_grid=lasso_params,
)

In [34]:
# Run the search on the unscaled training data (the pipeline does the scaling),
# then display the pipeline configured with the best parameters found.
# fit() returns the search object, so the attribute can be read off the same call.
lasso_pipe_gridsearch.fit(X_train, y_train).best_estimator_

GridSearchCV(estimator=Pipeline(steps=[('sc', StandardScaler()),
                                       ('lassocv', Lasso())]),
             param_grid={'lassocv__alpha': array([1.00000000e-05, 1.02331658e-05, 1.04717682e-05, 1.07159340e-05,
       1.09657929e-05, 1.12214777e-05, 1.14831241e-05, 1.17508713e-05,
       1.20248614e-05, 1.23052400e-05, 1.25921561e-05, 1.28857621e-05,
       1.31862140e-05, 1.34936714e-05, 1.38082977e-05, 1.41...
       5.88531578e+04, 6.02254120e+04, 6.16296626e+04, 6.30666554e+04,
       6.45371540e+04, 6.60419396e+04, 6.75818117e+04, 6.91575883e+04,
       7.07701066e+04, 7.24202233e+04, 7.41088152e+04, 7.58367791e+04,
       7.76050334e+04, 7.94145172e+04, 8.12661920e+04, 8.31610415e+04,
       8.51000725e+04, 8.70843150e+04, 8.91148232e+04, 9.11926760e+04,
       9.33189772e+04, 9.54948564e+04, 9.77214697e+04, 1.00000000e+05]),
                         'lassocv__max_iter': [5000]})

In [35]:
# Mean cross-validated score of the best parameter setting. It is computed on
# folds of the training data only, so it is not a held-out test-set score and
# is not the same quantity as the test R^2 printed for the earlier models.
lasso_pipe_gridsearch.best_score_

0.8861658589906888

#### Lasso Model with polynomial features

In [38]:
# Lasso on polynomial features: expand to degree-3 terms -> standardize -> Lasso.
# The final step keeps the name 'lassocv' (although it is a plain Lasso) because
# the parameter-grid keys below are built from that step name.
lasso_poly_pipe = Pipeline([
    ('poly', PolynomialFeatures(degree=3)),
    ('sc', StandardScaler()),
    ('lassocv', Lasso()),
])

# Parameter grid for the 'lassocv' step:
#   - alpha: 50 values log-spaced from 10^1 to 10^2.5
#   - max_iter: 10000
#   - tol: 0.01
lasso_poly_params = {
    'lassocv__alpha': np.logspace(1, 2.5, 50),
    'lassocv__max_iter': [10000],
    'lassocv__tol': [.01],
}

# The degree-3 expansion makes every fit expensive, and the search runs one fit
# per (alpha, fold) pair. n_jobs=-1 spreads those fits across all available
# cores; it changes only how the work is scheduled, not the selected parameters
# or scores.
pipe_gridsearch = GridSearchCV(
    lasso_poly_pipe,
    param_grid=lasso_poly_params,
    n_jobs=-1,
)


In [40]:
# Run the polynomial-Lasso search on the unscaled training data (the pipeline
# handles expansion and scaling), then display the best pipeline found.
# fit() returns the search object, so the attribute can be read off the same call.
pipe_gridsearch.fit(X_train, y_train).best_estimator_

GridSearchCV(estimator=Pipeline(steps=[('poly', PolynomialFeatures(degree=3)),
                                       ('sc', StandardScaler()),
                                       ('lassocv', Lasso())]),
             param_grid={'lassocv__alpha': array([ 10.        ,  10.73030941,  11.51395399,  12.35482888,
        13.25711366,  14.22529313,  15.26417967,  16.37893707,
        17.57510625,  18.85863279,  20.23589648,  21.71374303,
        23.29951811,  25.00110383,  26.82695795,  28.78615592,
        30.8884...
        54.28675439,  58.25136712,  62.50551925,  67.07035611,
        71.9685673 ,  77.22449946,  82.86427729,  88.91593339,
        95.40954763, 102.37739663, 109.8541142 , 117.87686348,
       126.48552169, 135.7228783 , 145.63484775, 156.27069765,
       167.68329368, 179.92936233, 193.06977289, 207.16983999,
       222.29964825, 238.53440064, 255.95479227, 274.64741148,
       294.70517026, 316.22776602]),
                         'lassocv__max_iter': [10000], 'lassocv_

In [41]:
# Mean cross-validated score of the best parameter setting for the polynomial
# pipeline. It is computed on folds of the training data only, so it is not a
# held-out test-set score.
pipe_gridsearch.best_score_

0.9183873600455857