In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

### Data

In [3]:
df = pd.read_csv('USA Housing Dataset Final.csv')

In [5]:
df.head()

Unnamed: 0,yr_sold,mth_sold,day_sold,price,log_price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,...,SeaTac,Seattle,Shoreline,Skykomish,Snoqualmie,Snoqualmie Pass,Tukwila,Vashon,Woodinville,Yarrow Point
0,2014,5,9,376000.0,12.837344,3,2.0,1340,1384,3.0,...,False,True,False,False,False,False,False,False,False,False
1,2014,5,9,800000.0,13.592367,4,3.25,3540,159430,2.0,...,False,False,False,False,False,False,False,False,False,False
2,2014,5,9,2238888.0,14.62149,5,6.5,7270,130017,2.0,...,False,False,False,False,False,False,False,False,False,False
3,2014,5,9,324000.0,12.688499,3,2.25,998,904,2.0,...,False,True,False,False,False,False,False,False,False,False
4,2014,5,10,549900.0,13.217492,5,2.75,3060,7015,1.0,...,False,True,False,False,False,False,False,False,False,False


### Train/Test Split & Scaling Data

In [4]:
X = df.drop(['price', 'log_price'], axis=1)
y = df['log_price']

In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=99)

In [7]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

### Model

In [8]:
from sklearn.linear_model import ElasticNet
elastic_model = ElasticNet()

In [10]:
param_grid = {'alpha': [0.01, 0.1, 0.5, 0.8, 1, 5, 10], 
              'l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9, 1]}

In [11]:
from sklearn.model_selection import GridSearchCV
grid_model = GridSearchCV(estimator=elastic_model,
                          param_grid=param_grid,
                          scoring='neg_mean_squared_error',
                          cv=5,
                          verbose=1)

In [12]:
grid_model.fit(X_train, y_train)

Fitting 5 folds for each of 42 candidates, totalling 210 fits


In [13]:
grid_model.best_estimator_

In [20]:
log_y_pred = grid_model.predict(X_test)

### Model Performance

In [45]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
log_mae = mean_absolute_error(y_test, log_y_pred)
log_mse = mean_squared_error(y_test, log_y_pred)
log_rmse = np.sqrt(log_mae)

In [46]:
log_mae

0.20176337296582503

In [47]:
log_mse

0.08380146715727589

In [48]:
log_rmse

0.4491807798268143

#### Convert from log scale to dollars

In [49]:
# predictions are off by +/- 22% on average
np.exp(log_mae)

1.2235584468472804

In [51]:
# predictions are off by +/- 56% on average
np.exp(log_rmse)

1.567027918630447

#### MAE and RMSE in Dollars

In [69]:
usd_price_pred = np.exp(log_y_pred)
usd_price_actual = np.exp(y_test)

In [73]:
mae_usd = mean_absolute_error(usd_price_actual, usd_price_pred)
mae_usd

114743.3769331774

In [74]:
rmse_usd = np.sqrt(mean_squared_error(usd_price_actual, usd_price_pred))
rmse_usd

198173.40579311692

Since the RMSE was significantly larger than the MAE, this suggests that there are large individual prediction errors, which is heavily punished by RMSE. This makes sense for this dataset as the price distribution was skewed by high-priced homes. The median house price was 461000, the 75th percentile was 657000, and the max was over two million dollars.

In [84]:
df.describe()['price']

count    4.065000e+03
mean     5.356545e+05
std      3.094762e+05
min      7.800000e+03
25%      3.250000e+05
50%      4.610000e+05
75%      6.575000e+05
max      2.300000e+06
Name: price, dtype: float64