In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
pd.set_option('display.max_columns', 90)
pd.set_option('display.max_rows', 90)

from sklearn.linear_model import Ridge, Lasso, ElasticNet, LinearRegression, RidgeCV, LassoCV, ElasticNetCV
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import PolynomialFeatures, StandardScaler

In [2]:
# Read the engineered training frame and the engineered holdout frame;
# in both files the first CSV column is used as the row index.
df, holdout = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        '../datasets/df_engineered.csv',
        '../datasets/holdout_engineered.csv',
    )
)

## Start Modeling with a simple Linear Regression
#### Using only Overall Quality as a feature, CV score is 0.63, RMSE is 46702

In [3]:
# Baseline: regress `saleprice` on the single column `overall_qual`.
X = df[['overall_qual']]
y = df['saleprice']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# fit() returns the estimator itself, so construction and fitting chain.
lr = LinearRegression().fit(X_train, y_train)
y_pred = lr.predict(X_test)

# Compute every metric first, then report them together.
cv_mean = cross_val_score(lr, X_train, y_train, cv=5).mean()
train_score = lr.score(X_train, y_train)
test_score = lr.score(X_test, y_test)
rmse = mean_squared_error(y_test, y_pred) ** 0.5

print('cv score', cv_mean)
print('train score ', train_score)
print('test score ', test_score)
print('RMSE: ', round(rmse, 0))

cv score 0.6276121552873748
train score  0.6311306764237692
test score  0.6638051214676336
RMSE:  46702.0


## Define a function to streamline this

In [4]:
def linear_reg_score(features, y=None):
    """Fit a LinearRegression on a train/test split and print its scores.

    Parameters
    ----------
    features
        Feature matrix handed to ``train_test_split`` and the regressor
        (the notebook passes column selections of ``df``).
    y : optional
        Target values. When omitted, ``df['saleprice']`` is looked up at
        call time.

    Prints the mean 5-fold ``cross_val_score`` on the training split, the
    train and test ``.score()`` values (rounded to 4 places) and the RMSE on
    the test split (rounded to 0 places). Returns ``None``.
    """
    if y is None:
        # Resolve the default here, not in the signature: a default written
        # as `y = df['saleprice']` is evaluated once when the `def` runs, so
        # it would keep pointing at the old frame if `df` is later reassigned.
        y = df['saleprice']

    X = features
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
    lr = LinearRegression()
    lr.fit(X_train, y_train)
    y_pred = lr.predict(X_test)
    print('cv score: ', round(cross_val_score(lr, X_train, y_train, cv=5).mean(), 4))
    print('train score: ', round(lr.score(X_train, y_train), 4))
    print('test score: ', round(lr.score(X_test, y_test), 4))
    print('RMSE: ', round(mean_squared_error(y_test, y_pred)**0.5, 0))

## Try out adding a bunch of highly-correlated features

In [5]:
# Two features: `overall_qual` and `gr_liv_area`.
candidate_cols = ['overall_qual', 'gr_liv_area']
linear_reg_score(df[candidate_cols])

cv score:  0.7032
train score:  0.7143
test score:  0.7587
RMSE:  39566.0


In [6]:
# Score the single engineered column `overall_qual_gr_area` on its own.
candidate_cols = ['overall_qual_gr_area']
linear_reg_score(df[candidate_cols])

cv score:  0.65
train score:  0.6739
test score:  0.7699
RMSE:  38641.0


In [7]:
# Previous pair plus `exter_qual`.
candidate_cols = ['overall_qual', 'gr_liv_area', 'exter_qual']
linear_reg_score(df[candidate_cols])

cv score:  0.7291
train score:  0.7416
test score:  0.7946
RMSE:  36508.0


In [8]:
# Previous set plus `kitchen_qual`.
candidate_cols = [
    'overall_qual',
    'gr_liv_area',
    'exter_qual',
    'kitchen_qual',
]
linear_reg_score(df[candidate_cols])

cv score:  0.7392
train score:  0.7519
test score:  0.8045
RMSE:  35616.0


In [9]:
# Previous set plus `garage_area`.
candidate_cols = [
    'overall_qual',
    'gr_liv_area',
    'exter_qual',
    'kitchen_qual',
    'garage_area',
]
linear_reg_score(df[candidate_cols])

cv score:  0.7565
train score:  0.7699
test score:  0.8255
RMSE:  33650.0


In [10]:
# Previous set plus `total_bsmt_sf`.
# Tried and left out: 'garage_cars' (increased RMSE).
candidate_cols = [
    'overall_qual',
    'gr_liv_area',
    'exter_qual',
    'kitchen_qual',
    'garage_area',
    'total_bsmt_sf',
]
linear_reg_score(df[candidate_cols])

cv score:  0.761
train score:  0.7833
test score:  0.8449
RMSE:  31723.0


In [11]:
# Add a converted ordinal column: `bsmt_cond`.
# Tried and left out (each increased RMSE): 'garage_cars', 'bsmt_qual'.
candidate_cols = [
    'overall_qual',
    'gr_liv_area',
    'exter_qual',
    'kitchen_qual',
    'garage_area',
    'total_bsmt_sf',
    'bsmt_cond',
]
linear_reg_score(df[candidate_cols])

cv score:  0.7604
train score:  0.7834
test score:  0.8452
RMSE:  31689.0


In [12]:
# Add another converted ordinal column: `exter_cond`.
# Tried and left out (each increased RMSE): 'garage_cars', 'bsmt_qual'.
candidate_cols = [
    'overall_qual',
    'gr_liv_area',
    'exter_qual',
    'kitchen_qual',
    'garage_area',
    'total_bsmt_sf',
    'bsmt_cond',
    'exter_cond',
]
linear_reg_score(df[candidate_cols])

cv score:  0.7605
train score:  0.7838
test score:  0.8447
RMSE:  31737.0


In [13]:
# Add converted ordinal columns: `bsmt_qual` is included again here,
# alongside the new `fireplace_qu`.
# Tried and left out: 'garage_cars' (increased RMSE).
candidate_cols = [
    'overall_qual',
    'gr_liv_area',
    'exter_qual',
    'kitchen_qual',
    'garage_area',
    'total_bsmt_sf',
    'bsmt_qual',
    'bsmt_cond',
    'exter_cond',
    'fireplace_qu',
]
linear_reg_score(df[candidate_cols])

cv score:  0.7782
train score:  0.7996
test score:  0.8491
RMSE:  31292.0


In [14]:
# Add further highly correlated columns; this run adds `full_bath`.
# Tried and left out (each increased RMSE): 'garage_cars', 'garage_qual',
# 'garage_cond', 'heating_qc', 'pool_qc', 'totrms_abvgrd', 'year_built'.
candidate_cols = [
    'overall_qual',
    'gr_liv_area',
    'exter_qual',
    'kitchen_qual',
    'garage_area',
    'total_bsmt_sf',
    'bsmt_qual',
    'bsmt_cond',
    'exter_cond',
    'fireplace_qu',
    'full_bath',
]
linear_reg_score(df[candidate_cols])

cv score:  0.777
train score:  0.7997
test score:  0.8482
RMSE:  31380.0


In [15]:
# Add further highly correlated columns; this run adds `lot_shape`.
# Tried and left out (each increased RMSE): 'garage_cars', 'garage_qual',
# 'garage_cond', 'heating_qc', 'pool_qc', 'totrms_abvgrd', 'year_built'.
# 'exter_cond' is listed exactly once: naming a label twice in the selection
# list makes df[[...]] return that column twice, handing the regression two
# identical (perfectly collinear) inputs.
linear_reg_score(df[['overall_qual',
                    'gr_liv_area',
                    'exter_qual',
                    'kitchen_qual',
                    'garage_area',
                    'total_bsmt_sf',
                    'bsmt_qual',
                    'bsmt_cond',
                    'exter_cond',
                    'fireplace_qu',
                    'full_bath',
                    'lot_shape'
                   ]])

cv score:  0.7771
train score:  0.8004
test score:  0.8494
RMSE:  31261.0


In [16]:
# Swap `overall_qual` for the engineered column `overall_qual_gr_area`;
# 'gr_liv_area' is left out of this run.
# Tried and left out (each increased RMSE): 'garage_cars', 'garage_qual',
# 'garage_cond', 'heating_qc', 'pool_qc', 'totrms_abvgrd', 'year_built'.
candidate_cols = [
    'overall_qual_gr_area',
    'exter_qual',
    'kitchen_qual',
    'garage_area',
    'total_bsmt_sf',
    'bsmt_qual',
    'bsmt_cond',
    'exter_cond',
    'fireplace_qu',
    'full_bath',
    'lot_shape',
]
linear_reg_score(df[candidate_cols])

cv score:  0.7721
train score:  0.8007
test score:  0.8628
RMSE:  29839.0


In [17]:
# Add further highly correlated columns: `gr_liv_area` is back in next to
# `overall_qual_gr_area`, and `utilities` is new.
# 'exter_cond' is listed exactly once: naming a label twice in the selection
# list makes df[[...]] return that column twice, handing the regression two
# identical (perfectly collinear) inputs.
linear_reg_score(df[['overall_qual_gr_area',
                    'gr_liv_area',
                    'exter_qual',
                    'kitchen_qual',
                    'garage_area',
                    'total_bsmt_sf',
                    'bsmt_qual',
                    'bsmt_cond',
                    'exter_cond',
                    'fireplace_qu',
                    'full_bath',
                    'lot_shape',
                    'utilities'
                   ]])

cv score:  0.7726
train score:  0.8013
test score:  0.8652
RMSE:  29575.0


## Linear Regression Submission

In [18]:
# Refit on every row of `df` with the chosen feature set, predict the holdout
# frame, and write the predictions to a submission CSV.
lr = LinearRegression()

# Each column appears exactly once. Repeating a label (the original list named
# 'exter_cond' twice) makes df[features] / holdout[features] return that
# column twice, feeding the model two identical inputs.
features = ['overall_qual_gr_area',
            'gr_liv_area',
            'exter_qual',
            'kitchen_qual',
            'garage_area',
            'total_bsmt_sf',
            'bsmt_qual',
            'bsmt_cond',
            'exter_cond',
            'fireplace_qu',
            'full_bath',
            'lot_shape',
            'utilities']
X = df[features]
y = df.saleprice
lr.fit(X, y)
holdout_preds = lr.predict(holdout[features])

# One row per holdout record: its `id` column and the predicted sale price.
submission = pd.DataFrame({'Id': holdout['id'], 'SalePrice': holdout_preds})
submission = submission.reset_index(drop=True)
submission.to_csv('kaggle_sub_linear.csv', index=False)