In [6]:
import numpy as np, pandas as pd, matplotlib.pyplot as plt, warnings
import statsmodels.formula.api as smf, statsmodels.api as sm
from sklearn import linear_model
from sklearn.linear_model import Ridge, Lasso, ElasticNet, LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import scale
from sklearn.decomposition import PCA
from sklearn.metrics import mean_absolute_error
from statsmodels.tools.eval_measures import mse, rmse
from sqlalchemy import create_engine
from scipy.stats.mstats import winsorize

# Visual Preference
%matplotlib inline
warnings.filterwarnings(action='ignore')

postgres_user = 'dsbc_student'
postgres_pw = '7*.8G9QH21'
postgres_host = '142.93.121.174'
postgres_port = '5432'
postgres_db = 'houseprices'

engine = create_engine('postgresql://{}:{}@{}:{}/{}'.format(
    postgres_user, postgres_pw, postgres_host, postgres_port, postgres_db))

house_df = pd.read_sql_query('select * from houseprices', con=engine)
engine.dispose()
house_df.head(10)

Unnamed: 0,id,mssubclass,mszoning,lotfrontage,lotarea,street,alley,lotshape,landcontour,utilities,...,poolarea,poolqc,fence,miscfeature,miscval,mosold,yrsold,saletype,salecondition,saleprice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
5,6,50,RL,85.0,14115,Pave,,IR1,Lvl,AllPub,...,0,,MnPrv,Shed,700,10,2009,WD,Normal,143000
6,7,20,RL,75.0,10084,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,307000
7,8,60,RL,,10382,Pave,,IR1,Lvl,AllPub,...,0,,,Shed,350,11,2009,WD,Normal,200000
8,9,50,RM,51.0,6120,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2008,WD,Abnorml,129900
9,10,190,RL,50.0,7420,Pave,,Reg,Lvl,AllPub,...,0,,,,0,1,2008,WD,Normal,118000


In [7]:
# Create the model
house_df = house_df.select_dtypes(exclude=['object'])
house_df = house_df.dropna()
sklearn_pca = PCA(n_components=25)
Y = house_df['saleprice']
X = sm.add_constant(sklearn_pca.fit_transform(scale(house_df)))

# Split to train/test sets
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, random_state = 465)

# Fit OLS model with sklearn
lrm = LinearRegression()
lrm.fit(X_train, y_train)

# Predictions
y_preds_train = lrm.predict(X_train)
y_preds_test = lrm.predict(X_test)

# Print results
print("R-squared of the model in the training set is: {}".format(lrm.score(X_train, y_train)))
print("-----Test set statistics-----")
print("R-squared of the model in the test set is: {}".format(lrm.score(X_test, y_test)))
print("Mean absolute error of the prediction is: {}".format(mean_absolute_error(y_test, y_preds_test)))
print("Mean squared error of the prediction is: {}".format(mse(y_test, y_preds_test)))
print("Root mean squared error of the prediction is: {}".format(rmse(y_test, y_preds_test)))
print("Mean absolute percentage error of the prediction is: {}".format(np.mean(np.abs((y_test - y_preds_test) / y_test)) * 100))

R-squared of the model in the training set is: 0.8587720455474126
-----Test set statistics-----
R-squared of the model in the test set is: 0.7353580506331248
Mean absolute error of the prediction is: 21817.376178413877
Mean squared error of the prediction is: 1744034854.817914
Root mean squared error of the prediction is: 41761.64334431673
Mean absolute percentage error of the prediction is: 11.9918705837304


In [62]:
# Ridge Regression
ridgeregr = Ridge(alpha=10**3)
ridgeregr.fit(X_train, y_train)
y_preds_train = lrm.predict(X_train)
y_preds_test = lrm.predict(X_test)

print("R-squared of the model on the training set is: {}".format(ridgeregr.score(X_train, y_train)))
print("-----Test set statistics-----")
print("R-squared of the model on the test set is: {}".format(ridgeregr.score(X_test, y_test)))
print("Mean absolute error of the prediction is: {}".format(mean_absolute_error(y_test, y_preds_test)))
print("Mean squared error of the prediction is: {}".format(mse(y_test, y_preds_test)))
print("Root mean squared error of the prediction is: {}".format(rmse(y_test, y_preds_test)))
print("Mean absolute percentage error of the prediction is: {}".format(np.mean(np.abs((y_test - y_preds_test) / y_test)) * 100))

# Check with cross validation
cv = cross_val_score(ridgeregr, X_train, y_train, cv=10)
print('cross validation, max value: ', cv.max())
print('cross validation, min value: ', cv.min())
print('range of values: ', cv.max() - cv.min())

R-squared of the model on the training set is: 0.8297145466388033
-----Test set statistics-----
R-squared of the model on the test set is: 0.7740628565432126
Mean absolute error of the prediction is: 21817.376178413877
Mean squared error of the prediction is: 1744034854.817914
Root mean squared error of the prediction is: 41761.64334431673
Mean absolute percentage error of the prediction is: 11.9918705837304
cross validation, max value:  0.8862233017207037
cross validation, min value:  0.6174061171318823
range of values:  0.2688171845888214


In [51]:
from sklearn.linear_model import Lasso

lassoregr = Lasso(alpha=10**3) 
lassoregr.fit(X_train, y_train)

# We are making predictions here
y_preds_train = lassoregr.predict(X_train)
y_preds_test = lassoregr.predict(X_test)

print("R-squared of the model on the training set is: {}".format(lassoregr.score(X_train, y_train)))
print("-----Test set statistics-----")
print("R-squared of the model on the test set is: {}".format(lassoregr.score(X_test, y_test)))
print("Mean absolute error of the prediction is: {}".format(mean_absolute_error(y_test, y_preds_test)))
print("Mean squared error of the prediction is: {}".format(mse(y_test, y_preds_test)))
print("Root mean squared error of the prediction is: {}".format(rmse(y_test, y_preds_test)))
print("Mean absolute percentage error of the prediction is: {}".format(np.mean(np.abs((y_test - y_preds_test) / y_test)) * 100))

# Check with cross validation
cv = cross_val_score(ridgeregr, X_train, y_train, cv=10)
print('cross validation, max value: ', cv.max())
print('cross validation, min value: ', cv.min())
print('range of values: ', cv.max() - cv.min())

R-squared of the model on the training set is: 0.855205898066404
-----Test set statistics-----
R-squared of the model on the test set is: 0.7602431172918325
Mean absolute error of the prediction is: 21315.882427295288
Mean squared error of the prediction is: 1580038089.6751091
Root mean squared error of the prediction is: 39749.692950702265
Mean absolute percentage error of the prediction is: 11.714635676877085
cross validation, max value:  0.8862233017207037
cross validation, min value:  0.6174061171318823
range of values:  0.2688171845888214


In [72]:
# ElasticNet Regression
elasticregr = ElasticNet(alpha=10**3, l1_ratio=0.5) 
elasticregr.fit(X_train, y_train)

# We are making predictions here
y_preds_train = elasticregr.predict(X_train)
y_preds_test = elasticregr.predict(X_test)

print("R-squared of the model on the training set is: {}".format(elasticregr.score(X_train, y_train)))
print("-----Test set statistics-----")
print("R-squared of the model on the test set is: {}".format(elasticregr.score(X_test, y_test)))
print("Mean absolute error of the prediction is: {}".format(mean_absolute_error(y_test, y_preds_test)))
print("Mean squared error of the prediction is: {}".format(mse(y_test, y_preds_test)))
print("Root mean squared error of the prediction is: {}".format(rmse(y_test, y_preds_test)))
print("Mean absolute percentage error of the prediction is: {}".format(np.mean(np.abs((y_test - y_preds_test) / y_test)) * 100))

# Check with cross validation
cv = cross_val_score(ridgeregr, X_train, y_train, cv=10)
print('cross validation, max value: ', cv.max())
print('cross validation, min value: ', cv.min())
print('range of values: ', cv.max() - cv.min())

R-squared of the model on the training set is: 0.025376938278406147
-----Test set statistics-----
R-squared of the model on the test set is: 0.025710320287323674
Mean absolute error of the prediction is: 58075.14443825707
Mean squared error of the prediction is: 6420732480.8988695
Root mean squared error of the prediction is: 80129.473234877
Mean absolute percentage error of the prediction is: 33.744130530820435
cross validation, max value:  0.8862233017207037
cross validation, min value:  0.6174061171318823
range of values:  0.2688171845888214
