In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
pd.set_option('display.max_columns', 90)
pd.set_option('display.max_rows', 90)

from sklearn.linear_model import Ridge, Lasso, ElasticNet, LinearRegression, RidgeCV, LassoCV, ElasticNetCV
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import PolynomialFeatures, StandardScaler

In [2]:
df = pd.read_csv('../datasets/df_engineered.csv', index_col=0)
holdout = pd.read_csv('../datasets/holdout_engineered.csv', index_col=0)

## Benchmark: Single Variable Linear Regression

I'll start modeling by checking the score of model trained on only one feature, Overall Quality, which is the feature with the highest Pearson coefficient.

In [33]:
X = df[['overall_qual']]
y = df['saleprice']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
print('cv score', cross_val_score(lr, X_train, y_train, cv=5).mean())
print ('train score ', lr.score(X_train, y_train))
print('test score ', lr.score(X_test, y_test))
print('RMSE: ', round(mean_squared_error(y_test, y_pred)**0.5, 0))

cv score 0.6211065229501409
train score  0.6309827113557348
test score  0.6681557808804754
RMSE:  45539.0


## Define a function to streamline this

In [24]:
def linear_reg_score(features, y = df['saleprice']):
    X = features
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
    lr = LinearRegression()
    lr.fit(X_train, y_train)
    y_pred = lr.predict(X_test)
    print('cv score: ', round(cross_val_score(lr, X_train, y_train, cv=5).mean(), 4))
    print('train score: ', round(lr.score(X_train, y_train), 4))
    print('test score: ', round(lr.score(X_test, y_test), 4))
    print('RMSE: ', round(mean_squared_error(y_test, y_pred)**0.5, 0))

## Try out adding a bunch of highly-correlated features

In [25]:
linear_reg_score(df[['overall_qual', 
                    'gr_liv_area']])

cv score:  0.7081
train score:  0.7158
test score:  0.7631
RMSE:  38473.0


In [26]:
linear_reg_score(df[['overall_qual_gr_area']])

cv score:  0.6649
train score:  0.6795
test score:  0.7626
RMSE:  38518.0


In [27]:
linear_reg_score(df[['overall_qual', 
                    'gr_liv_area', 
                    'exter_qual']])

cv score:  0.7394
train score:  0.7467
test score:  0.7857
RMSE:  36599.0


In [28]:
linear_reg_score(df[['overall_qual', 
                    'gr_liv_area', 
                    'exter_qual', 
                    'kitchen_qual']])

cv score:  0.7474
train score:  0.7552
test score:  0.8016
RMSE:  35209.0


In [29]:
linear_reg_score(df[['overall_qual', 
                    'gr_liv_area', 
                    'exter_qual', 
                    'kitchen_qual',
                    'garage_area'
                   ]])

cv score:  0.7649
train score:  0.7733
test score:  0.8234
RMSE:  33222.0


In [10]:
linear_reg_score(df[['overall_qual', 
                    'gr_liv_area', 
                    'exter_qual', 
                    'kitchen_qual',
                    'garage_area',
#                    'garage_cars', #increased RMSE
                    'total_bsmt_sf'
                   ]])

cv score:  0.7718
train score:  0.7845
test score:  0.8486
RMSE:  30758.0


In [11]:
# add in converted ordinal 
linear_reg_score(df[['overall_qual', 
                    'gr_liv_area', 
                    'exter_qual', 
                    'kitchen_qual',
                    'garage_area',
#                    'garage_cars', #increased RMSE
                    'total_bsmt_sf',
#                    'bsmt_qual', #increased RMSE
                    'bsmt_cond'
                   ]])

cv score:  0.7699
train score:  0.7845
test score:  0.849
RMSE:  30721.0


In [12]:
# add in converted ordinal 
linear_reg_score(df[['overall_qual', 
                    'gr_liv_area', 
                    'exter_qual', 
                    'kitchen_qual',
                    'garage_area',
#                    'garage_cars', #increased RMSE
                    'total_bsmt_sf',
#                    'bsmt_qual', #increased RMSE
                    'bsmt_cond',
                    'exter_cond'
                   ]])

cv score:  0.7698
train score:  0.7846
test score:  0.8494
RMSE:  30677.0


In [13]:
# add in converted ordinal 
linear_reg_score(df[['overall_qual', 
                    'gr_liv_area', 
                    'exter_qual', 
                    'kitchen_qual',
                    'garage_area',
#                    'garage_cars', #increased RMSE
                    'total_bsmt_sf',
                    'bsmt_qual', 
                    'bsmt_cond',
                    'exter_cond',
                    'fireplace_qu'
                   ]])

cv score:  0.7853
train score:  0.7997
test score:  0.8552
RMSE:  30079.0


In [14]:
# add in additional highly correlated values
linear_reg_score(df[['overall_qual', 
                    'gr_liv_area', 
                    'exter_qual', 
                    'kitchen_qual',
                    'garage_area',
#                    'garage_cars', #increased RMSE
                    'total_bsmt_sf',
                    'bsmt_qual', 
                    'bsmt_cond',
                    'exter_cond',
                    'fireplace_qu',
#                    'garage_qual', #increased RMSE
#                    'garage_cond', #increased RMSE
#                    'heating_qc', #increased RMSE
#                    'pool_qc' #increased RMSE
#                    'totrms_abvgrd'#increased RMSE
#                    'year_built' #increased RMSE
                    'full_bath'
                   ]])

cv score:  0.7826
train score:  0.7997
test score:  0.8554
RMSE:  30066.0


In [15]:
# add in additional highly correlated values
linear_reg_score(df[['overall_qual', 
                    'gr_liv_area', 
                    'exter_qual', 
                    'kitchen_qual',
                    'garage_area',
#                    'garage_cars', #increased RMSE
                    'total_bsmt_sf',
                    'bsmt_qual', 
                    'bsmt_cond',
                    'exter_cond',
                    'fireplace_qu',
#                    'garage_qual', #increased RMSE
#                    'garage_cond', #increased RMSE
#                    'heating_qc', #increased RMSE
#                    'pool_qc' #increased RMSE
#                    'totrms_abvgrd'#increased RMSE
#                    'year_built' #increased RMSE
                    'full_bath',
                    'exter_cond',
                    'lot_shape'
                   ]])

cv score:  0.7809
train score:  0.8002
test score:  0.8569
RMSE:  29901.0


In [16]:
# add in additional highly correlated values
linear_reg_score(df[['overall_qual_gr_area', 
                    #'gr_liv_area', 
                    'exter_qual', 
                    'kitchen_qual',
                    'garage_area',
#                    'garage_cars', #increased RMSE
                    'total_bsmt_sf',
                    'bsmt_qual', 
                    'bsmt_cond',
                    'exter_cond',
                    'fireplace_qu',
#                    'garage_qual', #increased RMSE
#                    'garage_cond', #increased RMSE
#                    'heating_qc', #increased RMSE
#                    'pool_qc' #increased RMSE
#                    'totrms_abvgrd'#increased RMSE
#                    'year_built' #increased RMSE
                    'full_bath',
                    'lot_shape',
                   ]])

cv score:  0.7765
train score:  0.8023
test score:  0.8658
RMSE:  28963.0


In [17]:
# add in additional highly correlated values
linear_reg_score(df[['overall_qual_gr_area', 
                    'gr_liv_area', 
                    'exter_qual', 
                    'kitchen_qual',
                    'garage_area',
                    'total_bsmt_sf',
                    'bsmt_qual', 
                    'bsmt_cond',
                    'exter_cond',
                    'fireplace_qu',
                    'full_bath',
                    'exter_cond',
                    'lot_shape',
                    'utilities'
                   ]])

cv score:  0.7765
train score:  0.8031
test score:  0.8676
RMSE:  28767.0


## Linear Regression Submission

In [31]:
lr = LinearRegression()
features = ['overall_qual_gr_area', 
                    #'gr_liv_area', 
                    'exter_qual', 
                    'kitchen_qual',
                    'garage_area',
                    'total_bsmt_sf',
                    'bsmt_qual', 
                    'bsmt_cond',
                    'exter_cond',
                    'fireplace_qu',
                    'full_bath',
                    'exter_cond',
                    'lot_shape',
                    'utilities']
X = df[features]
y = df.saleprice
lr.fit(X, y)
holdout_preds = lr.predict(holdout[features])

submission = pd.DataFrame({'Id': holdout['id'], 'SalePrice': holdout_preds})
submission = submission.reset_index(drop=True)
submission.to_csv('../submissions/kaggle_sub_linear.csv', index=False)

### Ridge Regression

In [61]:
def ridge_reg_score(X, y = np.log(df['saleprice'])):
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
    
    ss = StandardScaler()
    ss.fit(X_train)
    X_train_sc = ss.transform(X_train)
    X_test_sc = ss.transform(X_test)
    
    ridge_model = RidgeCV(cv=5)
    ridge_model.fit(X_train_sc, y_train)
    print('optimal alpha: ', ridge_model.alpha_)
        
    ridge = Ridge(alpha = ridge_model.alpha_)
    ridge.fit(X_train_sc, y_train)
    print('cv score: ', round(cross_val_score(ridge, X_train_sc, y_train, cv=5).mean(), 4))

    # Generate predictions
    ridge_preds = ridge.predict(X_test_sc)
    ridge_preds_train = ridge.predict(X_train_sc)

    # Evaluate model.
    print('r2 train: ', r2_score(y_train, ridge_preds_train))
    print('r2 test: ', r2_score(np.exp(y_test), np.exp(ridge_preds)))
    print('RMSE: ', round(mean_squared_error(np.exp(y_test), np.exp(ridge_preds))**0.5, 0))
    
    coefs = pd.DataFrame({'predictor': X.columns, 'ridge coef': abs(ridge.coef_)})
    return ridge

In [62]:
ridge_reg_score(df.drop(columns='saleprice'))

optimal alpha:  10.0
cv score:  0.8715
r2 train:  0.9394803210032941
r2 test:  0.9209425199115208
RMSE:  22227.0


Ridge(alpha=10.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

#### Ridge Prediction Submission

In [41]:
matching_cols= [col for col in df.columns if col in holdout.columns]

X = df[matching_cols]
y = np.log(df.saleprice)
ridge = RidgeCV(cv=5)
ridge.fit(X, y)
holdout_preds = np.exp(ridge.predict(holdout[matching_cols]))

submission = pd.DataFrame({'Id': holdout['id'], 'SalePrice': holdout_preds})
submission = submission.reset_index(drop=True)
submission.to_csv('../submissions/ridge.csv', index=False)

### Lasso Regression

In [54]:
def lasso_reg_score(X, y = np.log(df['saleprice'])):
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
    
    ss = StandardScaler()
    ss.fit(X_train)
    X_train_sc = ss.transform(X_train)
    X_test_sc = ss.transform(X_test)
    
    lasso_model = LassoCV(cv=5)
    lasso_model.fit(X_train_sc, y_train)
    print('optimal alpha: ', lasso_model.alpha_)
        
    lasso = Lasso(alpha = lasso_model.alpha_)
    lasso.fit(X_train_sc, y_train)
    print('cv score: ', round(cross_val_score(lasso, X_train_sc, y_train, cv=5).mean(), 4))

    # Generate predictions
    lasso_preds = lasso.predict(X_test_sc)
    lasso_preds_train = lasso.predict(X_train_sc)

    # Evaluate model.
    print('r2 train: ', r2_score(y_train, lasso_preds_train))
    print('r2 test: ', r2_score(y_test, lasso_preds))
    print('RMSE: ', round(mean_squared_error(np.exp(y_test), np.exp(lasso_preds))**0.5, 0))
    
    coefs = pd.DataFrame({'predictor': X.columns, 'lasso coef': abs(lasso.coef_)})
    return lasso

In [55]:
lasso_reg_score(df.drop(columns='saleprice'))

optimal alpha:  0.0012662886799966583
cv score:  0.8697
r2 train:  0.9326467711029777
r2 test:  0.8921929283050496
RMSE:  21162.0


Lasso(alpha=0.0012662886799966583, copy_X=True, fit_intercept=True,
   max_iter=1000, normalize=False, positive=False, precompute=False,
   random_state=None, selection='cyclic', tol=0.0001, warm_start=False)

#### Lasso Predictions Submission

In [66]:
matching_cols= [col for col in df.columns if col in holdout.columns]

X = df[matching_cols]
y = np.log(df.saleprice)
lasso = LassoCV(cv=5)
lasso.fit(X, y)
holdout_preds = np.exp(lasso.predict(holdout[matching_cols]))

submission = pd.DataFrame({'Id': holdout['id'], 'SalePrice': holdout_preds})
submission = submission.reset_index(drop=True)
submission.to_csv('../submissions/lasso.csv', index=False)


LassoCV(alphas=None, copy_X=True, cv=5, eps=0.001, fit_intercept=True,
    max_iter=1000, n_alphas=100, n_jobs=1, normalize=False, positive=False,
    precompute='auto', random_state=None, selection='cyclic', tol=0.0001,
    verbose=False)

### Elastic Net Regression

In [50]:
def enet_reg_score(X, y = np.log(df['saleprice'])):
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
    
    ss = StandardScaler()
    ss.fit(X_train)
    X_train_sc = ss.transform(X_train)
    X_test_sc = ss.transform(X_test)
    
    enet_model = ElasticNetCV(cv=5)
    enet_model.fit(X_train_sc, y_train)
    print('optimal alpha: ', enet_model.alpha_)
        
    enet = ElasticNet(alpha = enet_model.alpha_)
    enet.fit(X_train_sc, y_train)
    print('cv score: ', round(cross_val_score(enet, X_train_sc, y_train, cv=5).mean(), 4))

    # Generate predictions
    enet_preds = enet.predict(X_test_sc)
    enet_preds_train = enet.predict(X_train_sc)

    # Evaluate model.
    print('r2 train: ', r2_score(y_train, enet_preds_train))
    print('r2 test: ', r2_score(y_test, enet_preds))
    print('RMSE: ', round(mean_squared_error(np.exp(y_test), np.exp(enet_preds))**0.5, 0))
    
    coefs = pd.DataFrame({'predictor': X.columns, 'enet coef': abs(enet.coef_)})
    return enet

In [51]:
enet_reg_score(df.drop(columns='saleprice'))

optimal alpha:  0.0023618901221704367
cv score:  0.8695
r2 train:  0.9325337914838794
r2 test:  0.8921424615743792
RMSE:  21153.0


ElasticNet(alpha=0.0023618901221704367, copy_X=True, fit_intercept=True,
      l1_ratio=0.5, max_iter=1000, normalize=False, positive=False,
      precompute=False, random_state=None, selection='cyclic', tol=0.0001,
      warm_start=False)

#### ElasticNet Prediction Submission

In [76]:
matching_cols= [col for col in df.columns if col in holdout.columns]

X = df[matching_cols]
y = np.log(df.saleprice)
enet = ElasticNetCV(cv=5)
enet.fit(X, y)
holdout_preds = np.exp(enet.predict(holdout[matching_cols]))

submission = pd.DataFrame({'Id': holdout['id'], 'SalePrice': holdout_preds})
submission = submission.reset_index(drop=True)
submission.to_csv('../submissions/enet.csv', index=False)