In [15]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sqlalchemy import create_engine
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_absolute_error
from statsmodels.tools.eval_measures import mse, rmse
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV, ElasticNetCV


%matplotlib inline
pd.options.display.float_format = '{:.3f}'.format

import warnings
warnings.filterwarnings('ignore')

postgres_user = 'dsbc_student'
postgres_pw = '7*.8G9QH21'
postgres_host = '142.93.121.174'
postgres_port = '5432'
postgres_db = 'houseprices'

In [3]:
engine = create_engine('postgresql://{}:{}@{}:{}/{}'.format(
    postgres_user, postgres_pw, postgres_host, postgres_port, postgres_db))
house_price_df = pd.read_sql_query('select * from houseprices',con=engine)

engine.dispose()

house_price_df.head()

Unnamed: 0,id,mssubclass,mszoning,lotfrontage,lotarea,street,alley,lotshape,landcontour,utilities,...,poolarea,poolqc,fence,miscfeature,miscval,mosold,yrsold,saletype,salecondition,saleprice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [6]:
house_price_df = pd.concat([house_price_df, pd.get_dummies(house_price_df.neighborhood, prefix="neighborhood", drop_first=True)], axis=1)

house_price_df = pd.concat([house_price_df, pd.get_dummies(house_price_df.housestyle, prefix="housestyle", drop_first=True)], axis=1)

dummy_column_names = list(pd.get_dummies(house_price_df.neighborhood, prefix="neighborhood", drop_first=True).columns)

dummy_column_names = list(pd.get_dummies(house_price_df.housestyle, prefix="housestyle", drop_first=True).columns)

In [9]:
house_price_df['totalsf'] = house_price_df['totalbsmtsf'] + house_price_df['firstflrsf'] + house_price_df['secondflrsf']

house_price_df['int_over_sqf'] = house_price_df['totalsf'] * house_price_df["overallqual"]

Y = np.log1p(house_price_df["saleprice"])
X = house_price_df[["overallqual", "garagecars", "garagearea", "grlivarea", "lotarea", "totalsf", "int_over_sqf"] + dummy_column_names]

X_train, X_test, y_train, y_test = train_test_split(X,Y,test_size = .2, random_state = 465)

print("The number of observations in training set is {}".format(X_train.shape[0]))
print("The number of observations in test set is {}".format(X_test.shape[0]))

alphas = [np.power(10.0,p) for p in np.arange(-10,40,1)]

The number of observations in training set is 1168
The number of observations in test set is 292


In [12]:
lrm = LinearRegression()

lrm.fit(X_train, y_train)

# We are making predictions here
y_preds_train = lrm.predict(X_train)
y_preds_test = lrm.predict(X_test)

print("R-squared of the model in training set is: {}".format(lrm.score(X_train, y_train)))
print("-----Test set statistics-----")
print("R-squared of the model in test set is: {}".format(lrm.score(X_test, y_test)))
print("Mean absolute error of the prediction is: {}".format(mean_absolute_error(y_test, y_preds_test)))
print("Mean squared error of the prediction is: {}".format(mse(y_test, y_preds_test)))
print("Root mean squared error of the prediction is: {}".format(rmse(y_test, y_preds_test)))
print("Mean absolute percentage error of the prediction is: {}".format(np.mean(np.abs((y_test - y_preds_test) / y_test)) * 100))

R-squared of the model in training set is: 0.8188477194252737
-----Test set statistics-----
R-squared of the model in test set is: 0.8053906737009002
Mean absolute error of the prediction is: 0.13007073405734865
Mean squared error of the prediction is: 0.0324502583795627
Root mean squared error of the prediction is: 0.1801395525129412
Mean absolute percentage error of the prediction is: 1.0881946465672505


In [23]:
ridge_cv = RidgeCV(alphas=alphas, cv=5)

ridge_cv.fit(X_train, y_train)

y_preds_train = ridge_cv.predict(X_train)
y_preds_test = ridge_cv.predict(X_test)

print("The Best Alpha value is: {}".format(ridge_cv.alpha_))
print("R-squared of the model in training set is: {}".format(ridge_cv.score(X_train, y_train)))
print("-----Test set statistics-----")
print("R-squared of the model in test set is: {}".format(ridge_cv.score(X_test, y_test)))
print("Mean absolute error of the prediction is: {}".format(mean_absolute_error(y_test, y_preds_test)))
print("Mean squared error of the prediction is: {}".format(mse(y_test, y_preds_test)))
print("Root mean squared error of the prediction is: {}".format(rmse(y_test, y_preds_test)))
print("Mean absolute percentage error of the prediction is: {}".format(np.mean(np.abs((y_test - y_preds_test) / y_test)) * 100))

The Best Alpha value is: 0.1
R-squared of the model in training set is: 0.8188470712899761
-----Test set statistics-----
R-squared of the model in test set is: 0.8055135016693561
Mean absolute error of the prediction is: 0.1300131429110852
Mean squared error of the prediction is: 0.032429777350269695
Root mean squared error of the prediction is: 0.18008269586573192
Mean absolute percentage error of the prediction is: 1.087716361236828


In [24]:
lasso_cv = LassoCV(alphas=alphas, cv=5)

lasso_cv.fit(X_train, y_train)

y_preds_train = lasso_cv.predict(X_train)
y_preds_test = lasso_cv.predict(X_test)

print("The Best Alpha value is: {}".format(lasso_cv.alpha_))
print("R-squared of the model on the training set is: {}".format(lasso_cv.score(X_train, y_train)))
print("--------------Test Set Statistics------------------")
print("R-squared of the model on the test set is: {}".format(lasso_cv.score(X_test, y_test)))
print("Mean absolute error of the prediction is: {}".format(mean_absolute_error(y_test, y_preds_test)))
print("Mean squared error of the prediction is: {}".format(mse(y_test, y_preds_test)))
print("Root mean squared error of the prediction is: {}".format(rmse(y_test, y_preds_test)))
print("Mean absolute percentage error of the prediction is: {}".format(np.mean(np.abs((y_test - y_preds_test/y_test))*100)))

The Best Alpha value is: 1e-10
R-squared of the model on the training set is: 0.8188477194252736
--------------Test Set Statistics------------------
R-squared of the model on the test set is: 0.8053906743477249
Mean absolute error of the prediction is: 0.13007073374204914
Mean squared error of the prediction is: 0.032450258271707506
Root mean squared error of the prediction is: 0.18013955221357553
Mean absolute percentage error of the prediction is: 1103.6467690495408


In [21]:
elasticnet_cv = ElasticNetCV(alphas=alphas, cv=5)

elasticnet_cv.fit(X_train, y_train)

y_preds_train = elasticnet_cv.predict(X_train)
y_preds_test = elasticnet_cv.predict(X_test)

print("Best Alpha Value: {}".format(elasticnet_cv.alpha_))
print("R-squared of the model on the training set is: {}".format(elasticnet_cv.score(X_train, y_train)))
print("--------------Test Set Statistics------------------")
print("R-squared of the model on the test set is: {}".format(elasticnet_cv.score(X_test, y_test)))
print("Mean absolute error of the prediction is: {}".format(mean_absolute_error(y_test, y_preds_test)))
print("Mean squared error of the prediction is: {}".format(mse(y_test, y_preds_test)))
print("Root mean squared error of the prediction is: {}".format(rmse(y_test, y_preds_test)))
print("Mean absolute percentage error of the prediction is: {}".format(np.mean(np.abs((y_test - y_preds_test/y_test))*100)))

Best Alpha Value: 1e-10
R-squared of the model on the training set is: 0.8188477194252736
--------------Test Set Statistics------------------
R-squared of the model on the test set is: 0.8053906740976966
Mean absolute error of the prediction is: 0.13007073386556622
Mean squared error of the prediction is: 0.03245025831339864
Root mean squared error of the prediction is: 0.18013955232929454
Mean absolute percentage error of the prediction is: 1103.6467690497318


Looking at all of the models, it appears that they all score about the same. Any one of the models would be an exceptable predictor.