In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
pd.set_option('display.max_columns', 90)
pd.set_option('display.max_rows', 90)

from sklearn.linear_model import Ridge, Lasso, ElasticNet, LinearRegression, RidgeCV, LassoCV, ElasticNetCV
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import PolynomialFeatures, StandardScaler

In [2]:
df = pd.read_csv('../datasets/df_engineered.csv', index_col=0)
holdout = pd.read_csv('../datasets/holdout_engineered.csv', index_col=0)

## Benchmark: Single Variable Linear Regression

I'll start modeling by checking the score of model trained on only one feature, Overall Quality, which is the feature with the highest Pearson coefficient.

In [3]:
X = df[['overall_qual']]
y = df['saleprice']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
print('cv score', cross_val_score(lr, X_train, y_train, cv=5).mean())
print ('train score ', lr.score(X_train, y_train))
print('test score ', lr.score(X_test, y_test))
print('RMSE: ', round(mean_squared_error(y_test, y_pred)**0.5, 0))

cv score 0.6211065229501409
train score  0.6309827113557348
test score  0.6681557808804754
RMSE:  45539.0


## Define a function to streamline this

In [4]:
def linear_reg_score(features, y = np.log(df['saleprice'])):
    X = features
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
    lr = LinearRegression()
    lr.fit(X_train, y_train)
    y_pred = lr.predict(X_test)
    print('cv score: ', round(cross_val_score(lr, X_train, y_train, cv=5).mean(), 4))
    print('train score: ', round(lr.score(X_train, y_train), 4))
    print('test score: ', round(lr.score(X_test, y_test), 4))
    print('RMSE: ', round(mean_squared_error(np.exp(y_test), np.exp(y_pred))**0.5, 0))

## Try out adding a bunch of highly-correlated features

In [5]:
linear_reg_score(df[['overall_qual', 
                    'gr_liv_area']])

cv score:  0.7337
train score:  0.7425
test score:  0.7683
RMSE:  34119.0


In [6]:
linear_reg_score(df[['overall_qual_gr_area']])

cv score:  0.6142
train score:  0.6307
test score:  0.7006
RMSE:  39261.0


In [7]:
linear_reg_score(df[['overall_qual', 
                    'gr_liv_area', 
                    'exter_qual']])

cv score:  0.7495
train score:  0.7583
test score:  0.7719
RMSE:  33167.0


In [8]:
linear_reg_score(df[['overall_qual', 
                    'gr_liv_area', 
                    'exter_qual', 
                    'kitchen_qual']])

cv score:  0.7574
train score:  0.7661
test score:  0.7825
RMSE:  31736.0


In [9]:
linear_reg_score(df[['overall_qual', 
                    'gr_liv_area', 
                    'exter_qual', 
                    'kitchen_qual',
                    'garage_area'
                   ]])

cv score:  0.7767
train score:  0.7857
test score:  0.802
RMSE:  29952.0


In [10]:
linear_reg_score(df[['overall_qual', 
                    'gr_liv_area', 
                    'exter_qual', 
                    'kitchen_qual',
                    'garage_area',
#                    'garage_cars', #increased RMSE
                    'total_bsmt_sf'
                   ]])

cv score:  0.7822
train score:  0.7949
test score:  0.8219
RMSE:  26957.0


In [11]:
# add in converted ordinal 
linear_reg_score(df[['overall_qual', 
                    'gr_liv_area', 
                    'exter_qual', 
                    'kitchen_qual',
                    'garage_area',
#                    'garage_cars', #increased RMSE
                    'total_bsmt_sf',
#                    'bsmt_qual', #increased RMSE
                    'bsmt_cond'
                   ]])

cv score:  0.7843
train score:  0.7976
test score:  0.8258
RMSE:  27095.0


In [12]:
# add in converted ordinal 
linear_reg_score(df[['overall_qual', 
                    'gr_liv_area', 
                    'exter_qual', 
                    'kitchen_qual',
                    'garage_area',
#                    'garage_cars', #increased RMSE
                    'total_bsmt_sf',
#                    'bsmt_qual', #increased RMSE
                    'bsmt_cond',
                    'exter_cond'
                   ]])

cv score:  0.7857
train score:  0.7997
test score:  0.8311
RMSE:  26978.0


In [13]:
# add in converted ordinal 
linear_reg_score(df[['overall_qual', 
                    'gr_liv_area', 
                    'exter_qual', 
                    'kitchen_qual',
                    'garage_area',
#                    'garage_cars', #increased RMSE
                    'total_bsmt_sf',
                    'bsmt_qual', 
                    'bsmt_cond',
                    'exter_cond',
                    'fireplace_qu'
                   ]])

cv score:  0.8034
train score:  0.8166
test score:  0.8321
RMSE:  26142.0


In [14]:
# add in additional highly correlated values
linear_reg_score(df[['overall_qual', 
                    'gr_liv_area', 
                    'exter_qual', 
                    'kitchen_qual',
                    'garage_area',
#                    'garage_cars', #increased RMSE
                    'total_bsmt_sf',
                    'bsmt_qual', 
                    'bsmt_cond',
                    'exter_cond',
                    'fireplace_qu',
#                    'garage_qual', #increased RMSE
#                    'garage_cond', #increased RMSE
#                    'heating_qc', #increased RMSE
#                    'pool_qc' #increased RMSE
#                    'totrms_abvgrd'#increased RMSE
#                    'year_built' #increased RMSE
                    'full_bath'
                   ]])

cv score:  0.8045
train score:  0.8184
test score:  0.8328
RMSE:  26337.0


In [15]:
# add in additional highly correlated values
linear_reg_score(df[['overall_qual', 
                    'gr_liv_area', 
                    'exter_qual', 
                    'kitchen_qual',
                    'garage_area',
#                    'garage_cars', #increased RMSE
                    'total_bsmt_sf',
                    'bsmt_qual', 
                    'bsmt_cond',
                    'exter_cond',
                    'fireplace_qu',
#                    'garage_qual', #increased RMSE
#                    'garage_cond', #increased RMSE
#                    'heating_qc', #increased RMSE
#                    'pool_qc' #increased RMSE
#                    'totrms_abvgrd'#increased RMSE
#                    'year_built' #increased RMSE
                    'full_bath',
                    'exter_cond',
                    'lot_shape'
                   ]])

cv score:  0.8049
train score:  0.8203
test score:  0.8377
RMSE:  26111.0


In [16]:
# add in additional highly correlated values
linear_reg_score(df[['overall_qual_gr_area', 
                    #'gr_liv_area', 
                    'exter_qual', 
                    'kitchen_qual',
                    'garage_area',
#                    'garage_cars', #increased RMSE
                    'total_bsmt_sf',
                    'bsmt_qual', 
                    'bsmt_cond',
                    'exter_cond',
                    'fireplace_qu',
#                    'garage_qual', #increased RMSE
#                    'garage_cond', #increased RMSE
#                    'heating_qc', #increased RMSE
#                    'pool_qc' #increased RMSE
#                    'totrms_abvgrd'#increased RMSE
#                    'year_built' #increased RMSE
                    'full_bath',
                    'lot_shape',
                   ]])

cv score:  0.777
train score:  0.796
test score:  0.818
RMSE:  27270.0


In [17]:
# add in additional highly correlated values
linear_reg_score(df[['overall_qual_gr_area', 
                    'gr_liv_area', 
                    'exter_qual', 
                    'kitchen_qual',
                    'garage_area',
                    'total_bsmt_sf',
                    'bsmt_qual', 
                    'bsmt_cond',
                    'exter_cond',
                    'fireplace_qu',
                    'full_bath',
                    'exter_cond',
                    'lot_shape',
                    'utilities'
                   ]])

cv score:  0.778
train score:  0.7972
test score:  0.8156
RMSE:  27308.0


## Linear Regression Submission

In [18]:
lr = LinearRegression()
features = ['overall_qual_gr_area', 
                    #'gr_liv_area', 
                    'exter_qual', 
                    'kitchen_qual',
                    'garage_area',
                    'total_bsmt_sf',
                    'bsmt_qual', 
                    'bsmt_cond',
                    'exter_cond',
                    'fireplace_qu',
                    'full_bath',
                    'exter_cond',
                    'lot_shape',
                    'utilities']
X = df[features]
y = df.saleprice
lr.fit(X, y)
holdout_preds = lr.predict(holdout[features])

submission = pd.DataFrame({'Id': holdout['id'], 'SalePrice': holdout_preds})
submission = submission.reset_index(drop=True)
submission.to_csv('../submissions/kaggle_sub_linear.csv', index=False)

### Ridge Regression

In [19]:
def ridge_reg_score(X, y = np.log(df['saleprice'])):
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
    
    ss = StandardScaler()
    ss.fit(X_train)
    X_train_sc = ss.transform(X_train)
    X_test_sc = ss.transform(X_test)
    
    ridge_model = RidgeCV(cv=5)
    ridge_model.fit(X_train_sc, y_train)
    print('optimal alpha: ', ridge_model.alpha_)
        
    ridge = Ridge(alpha = ridge_model.alpha_)
    ridge.fit(X_train_sc, y_train)
    print('cv score: ', round(cross_val_score(ridge, X_train_sc, y_train, cv=5).mean(), 4))

    # Generate predictions
    ridge_preds = ridge.predict(X_test_sc)
    ridge_preds_train = ridge.predict(X_train_sc)

    # Evaluate model.
    print('r2 train: ', r2_score(y_train, ridge_preds_train))
    print('r2 test: ', r2_score(np.exp(y_test), np.exp(ridge_preds)))
    print('RMSE: ', round(mean_squared_error(np.exp(y_test), np.exp(ridge_preds))**0.5, 0))
    
    coefs = pd.DataFrame({'predictor': X.columns, 'ridge coef': abs(ridge.coef_)})
    return ridge

In [20]:
ridge_reg_score(df.drop(columns='saleprice'))

optimal alpha:  10.0
cv score:  0.8715
r2 train:  0.9394803210032941
r2 test:  0.9209425199115208
RMSE:  22227.0


Ridge(alpha=10.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

#### Ridge Prediction Submission

In [21]:
matching_cols= [col for col in df.columns if col in holdout.columns]

X = df[matching_cols]
y = np.log(df.saleprice)
ridge = RidgeCV(cv=5)
ridge.fit(X, y)
holdout_preds = np.exp(ridge.predict(holdout[matching_cols]))

submission = pd.DataFrame({'Id': holdout['id'], 'SalePrice': holdout_preds})
submission = submission.reset_index(drop=True)
submission.to_csv('../submissions/ridge.csv', index=False)

### Lasso Regression

In [22]:
def lasso_reg_score(X, y = np.log(df['saleprice'])):
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
    
    ss = StandardScaler()
    ss.fit(X_train)
    X_train_sc = ss.transform(X_train)
    X_test_sc = ss.transform(X_test)
    
    lasso_model = LassoCV(cv=5)
    lasso_model.fit(X_train_sc, y_train)
    print('optimal alpha: ', lasso_model.alpha_)
        
    lasso = Lasso(alpha = lasso_model.alpha_)
    lasso.fit(X_train_sc, y_train)
    print('cv score: ', round(cross_val_score(lasso, X_train_sc, y_train, cv=5).mean(), 4))

    # Generate predictions
    lasso_preds = lasso.predict(X_test_sc)
    lasso_preds_train = lasso.predict(X_train_sc)

    # Evaluate model.
    print('r2 train: ', r2_score(y_train, lasso_preds_train))
    print('r2 test: ', r2_score(y_test, lasso_preds))
    print('RMSE: ', round(mean_squared_error(np.exp(y_test), np.exp(lasso_preds))**0.5, 0))
    
    coefs = pd.DataFrame({'predictor': X.columns, 'lasso coef': abs(lasso.coef_)})
    return lasso

In [23]:
lasso_reg_score(df.drop(columns='saleprice'))

optimal alpha:  0.0012662886799966583
cv score:  0.8697
r2 train:  0.9326467711029777
r2 test:  0.8921929283050496
RMSE:  21162.0


Lasso(alpha=0.0012662886799966583, copy_X=True, fit_intercept=True,
   max_iter=1000, normalize=False, positive=False, precompute=False,
   random_state=None, selection='cyclic', tol=0.0001, warm_start=False)

#### Lasso Predictions Submission

In [24]:
matching_cols= [col for col in df.columns if col in holdout.columns]

X = df[matching_cols]
y = np.log(df.saleprice)
lasso = LassoCV(cv=5)
lasso.fit(X, y)
holdout_preds = np.exp(lasso.predict(holdout[matching_cols]))

submission = pd.DataFrame({'Id': holdout['id'], 'SalePrice': holdout_preds})
submission = submission.reset_index(drop=True)
submission.to_csv('../submissions/lasso.csv', index=False)


### Elastic Net Regression

In [33]:
def enet_reg_score(X, y = np.log(df['saleprice'])):
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
    
    ss = StandardScaler()
    ss.fit(X_train)
    X_train_sc = ss.transform(X_train)
    X_test_sc = ss.transform(X_test)
    
    enet_model = ElasticNetCV(cv=5)
    enet_model.fit(X_train_sc, y_train)
    print('optimal alpha: ', enet_model.alpha_)
        
    enet = ElasticNet(alpha = enet_model.alpha_)
    enet.fit(X_train_sc, y_train)
    print('cv score: ', cross_val_score(enet, X_train_sc, y_train, cv=5))

    # Generate predictions
    enet_preds = enet.predict(X_test_sc)
    enet_preds_train = enet.predict(X_train_sc)

    # Evaluate model.
    print('r2 train: ', r2_score(y_train, enet_preds_train))
    print('r2 test: ', r2_score(y_test, enet_preds))
    print('RMSE: ', round(mean_squared_error(np.exp(y_test), np.exp(enet_preds))**0.5, 0))
    
    coefs = pd.DataFrame({'predictor': X.columns, 'enet coef': abs(enet.coef_)})
    return enet

In [34]:
enet_reg_score(df.drop(columns='saleprice'))

optimal alpha:  0.0023618901221704367
cv score:  [0.93057252 0.86569494 0.87749168 0.88364806 0.79025588]
r2 train:  0.9325337914838794
r2 test:  0.8921424615743792
RMSE:  21153.0


ElasticNet(alpha=0.0023618901221704367, copy_X=True, fit_intercept=True,
      l1_ratio=0.5, max_iter=1000, normalize=False, positive=False,
      precompute=False, random_state=None, selection='cyclic', tol=0.0001,
      warm_start=False)

#### ElasticNet Prediction Submission

In [27]:
matching_cols= [col for col in df.columns if col in holdout.columns]

X = df[matching_cols]
y = np.log(df.saleprice)
enet = ElasticNetCV(cv=5)
enet.fit(X, y)
holdout_preds = np.exp(enet.predict(holdout[matching_cols]))

submission = pd.DataFrame({'Id': holdout['id'], 'SalePrice': holdout_preds})
submission = submission.reset_index(drop=True)
submission.to_csv('../submissions/enet.csv', index=False)

In [None]:
pipe = Pipeline([
    ('poly', PolynomialFeatures()),
    ('scaler', StandardScaler()),
    ('lcv', LassoCV(0.00418)),
])