In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Let pandas display up to 90 columns and 90 rows before it truncates a frame.
pd.options.display.max_columns = 90
pd.options.display.max_rows = 90

from sklearn.linear_model import Ridge, Lasso, ElasticNet, LinearRegression, RidgeCV, LassoCV, ElasticNetCV
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import PolynomialFeatures, StandardScaler

In [2]:
# Load the engineered training frame and the holdout frame.
# In both CSV files the first column is used as the row index.
df, holdout = (
    pd.read_csv('../datasets/df_engineered.csv', index_col=0),
    pd.read_csv('../datasets/holdout_engineered.csv', index_col=0),
)

In [15]:
# Sanity check: (number of rows, number of columns) of the training frame.
df.shape

(2026, 86)

## Model Prep: Create our features matrix (`X`) and target vector (`y`)

In [3]:
# Candidate predictors are every numeric (or boolean) column except the target.
# `select_dtypes` is the public pandas API for this; the previous private
# `_get_numeric_data()` call is not part of the supported interface.
numerical_columns = df.select_dtypes(include=['number', 'bool']).columns
features = [col for col in numerical_columns if col != 'saleprice']

# Features matrix and target vector used by every model below.
X = df[features]
y = df['saleprice']

## Model Prep: Train/test split

In [4]:
# Split into train and test partitions using scikit-learn's default split size;
# the fixed seed makes the partition reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

## Model Prep: Scaling

In [5]:
# Standardize the features. The per-column mean and std are learned from the
# training partition only, and that same transform is then applied to the test
# partition.
ss = StandardScaler()
X_train_scaled = ss.fit_transform(X_train)
X_test_scaled = ss.transform(X_test)

## Model Prep: Instantiate our models

In [6]:
# An unregularized linear baseline plus cross-validated Lasso and Ridge
# estimators, all created with their default settings.
lr, lasso, ridge = LinearRegression(), LassoCV(), RidgeCV()

## Cross Validation

In [7]:
# Mean 5-fold cross-validation score of plain linear regression on the scaled
# training data.
np.mean(cross_val_score(estimator=lr, X=X_train_scaled, y=y_train, cv=5))

-2.4923196311958747e+23

In [8]:
# Mean 5-fold cross-validation score of the LassoCV estimator on the scaled
# training data.
np.mean(cross_val_score(estimator=lasso, X=X_train_scaled, y=y_train, cv=5))

0.7894525849534599

In [9]:
# Mean 5-fold cross-validation score of the RidgeCV estimator on the scaled
# training data.
np.mean(cross_val_score(estimator=ridge, X=X_train_scaled, y=y_train, cv=5))

0.7705091464131102

## Model Fitting and Evaluation

In [10]:
# Fit the LassoCV estimator on the scaled training partition.
# The trailing semicolon suppresses the estimator repr in the cell output.
lasso.fit(X=X_train_scaled, y=y_train);

In [11]:
# Score of the fitted Lasso on the (scaled) training partition.
lasso.score(X=X_train_scaled, y=y_train)

0.8342159569977194

In [12]:
# Score of the fitted Lasso on the (scaled) test partition.
lasso.score(X=X_test_scaled, y=y_test)

0.8799372045181327

## Check Lasso Coefficients of Potential Features

In [19]:
# Pair each feature name with the magnitude (sign dropped) of its fitted Lasso
# coefficient, then show the 25 largest.
coefs = pd.DataFrame({
    'feature': X.columns,
    'lasso coef': np.abs(lasso.coef_),
})
coefs.sort_values(by='lasso coef', ascending=False).head(25)

Unnamed: 0,feature,lasso coef
66,overall_qual_gr_area,19169.730917
67,overall_qc_gr_area,10500.278486
63,garage_car_area,9719.735538
36,kitchen_qual,7237.97309
12,exter_qual,6073.436173
7,overall_qual,5885.740851
9,year_built,5428.452963
40,fireplace_qu,5240.097863
26,1st_flr_sf,4834.764523
16,bsmt_exposure,3749.967088


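The table above ranks features by the absolute value of their Lasso coefficient, with the model fitted on every numeric column of the dataframe. The largest coefficients mark the strongest candidate predictors.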

In [28]:
def lasso_reg_score(features, y=df['saleprice'], cv=5, random_state=42):
    """Fit a Lasso on a feature subset, print its scores, and return the model.

    Steps: train/test split -> standardize -> choose alpha with ``LassoCV`` ->
    refit a plain ``Lasso`` at that alpha -> report scores.

    Parameters
    ----------
    features : DataFrame
        Feature matrix (one column per predictor).
    y : Series, optional
        Target vector. The default, ``df['saleprice']``, is bound once when
        this function is defined, so re-run the definition cell if ``df`` is
        reloaded.
    cv : int, optional
        Number of folds used both for the alpha search and for the reported
        cross-validation score. Defaults to 5.
    random_state : int, optional
        Seed forwarded to ``train_test_split``. Defaults to 42.

    Returns
    -------
    Lasso
        Model fitted on the scaled training partition at the selected alpha.

    Notes
    -----
    Side effect: the notebook-level scaler ``ss`` is refitted on this call's
    training partition, so after the call ``ss`` reflects the columns of
    ``features``, not those of any earlier fit.

    Printed output: the selected alpha, the mean cross-validation score
    (rounded to 4 decimals), R^2 on the train and test partitions, and the
    test-partition RMSE (rounded to a whole number).
    """
    X_train, X_test, y_train, y_test = train_test_split(
        features, y, random_state=random_state
    )

    # Learn the scaling statistics from the training partition only.
    # This refits the shared notebook-level scaler (see Notes).
    ss.fit(X_train)
    X_train_sc = ss.transform(X_train)
    X_test_sc = ss.transform(X_test)

    # Let LassoCV pick the regularization strength from its default alpha grid.
    lasso_model = LassoCV(cv=cv)
    lasso_model.fit(X_train_sc, y_train)
    print('optimal alpha: ', lasso_model.alpha_)

    # Refit a plain Lasso at the chosen alpha; this is the model that gets
    # scored and returned.
    lasso = Lasso(alpha=lasso_model.alpha_)
    lasso.fit(X_train_sc, y_train)
    print('cv score: ', round(cross_val_score(lasso, X_train_sc, y_train, cv=cv).mean(), 4))

    # Generate predictions
    lasso_preds = lasso.predict(X_test_sc)
    lasso_preds_train = lasso.predict(X_train_sc)

    # Evaluate model.
    print('r2 train: ', r2_score(y_train, lasso_preds_train))
    print('r2 test: ', r2_score(y_test, lasso_preds))
    print('RMSE: ', round(mean_squared_error(y_test, lasso_preds)**0.5, 0))

    return lasso

In [21]:
# First candidate feature set: hand-picked quality, size and condition columns.
# 'exter_cond' used to be listed twice, which put a duplicated column in the
# design matrix; each feature now appears exactly once.
models = lasso_reg_score(df[[
    'overall_qual',
    'gr_liv_area',
    'exter_qual',
    'kitchen_qual',
    'garage_area',
    'garage_cars',  #increased RMSE
    'total_bsmt_sf',
    'bsmt_qual',
    'bsmt_cond',
    'exter_cond',
    'fireplace_qu',
    'garage_qual',  #increased RMSE
    'garage_cond',  #increased RMSE
    'heating_qc',  #increased RMSE
    'pool_qc',  #increased RMSE
    'totrms_abvgrd',  #increased RMSE
    'year_built',  #increased RMSE
    'full_bath',
    'lot_shape',
    'utilities',
    'functional',
]])

optimal alpha:  769.6132006541245
cv score:  0.7746
0.8481392788904798
0.8064985711991055
RMSE:  31388.0


In [58]:
# Second candidate feature set.
# Commented-out candidates (currently excluded): misc_val, bsmtfin_type_1,
# pool_qc, heating_qc, full_bath, land_slope.
models = lasso_reg_score(
    df[
        [
            'overall_qual_gr_area',
            'garage_car_area',
            'kitchen_qual',
            'exter_qual',
            'year_built',
            'fireplace_qu',
            '1st_flr_sf',
            'bsmt_exposure',
            'bsmt_qual',
            'ms_subclass',
            'mas_vnr_area',
            'bsmtfin_sf_1',
            'bsmt_full_bath',
            'screen_porch',
        ]
    ]
)

optimal alpha:  138.972976688399
cv score:  0.8005
0.8995278078433788
0.8286237503915816
RMSE:  25531.0
