In [1]:
# Inject a small CSS override: remove the padding above the notebook, let the
# cell container use the full browser width, and collapse the trailing spacer.
# `display` and `HTML` come from the public `IPython.display` module; importing
# them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML

display(HTML('<style>'
        '#notebook { padding-top:0px !important; } '
        '.container { width:100% !important; } '
        '.end_space { min-height:0px !important; } '
    '</style>'
))

# Housing Price Example

- Check out notebook "7_ML_00_FeatureEng_DataPrep" for the Feature Engineering and Data Preparation steps.
- Here we present a comparison between different ML models.

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Load the Ames housing data as it was saved after the feature-engineering
# and data-preparation steps, then preview the first rows.
PREPARED_DATA_CSV = "data/Ames_Housing_Data_cleaned_prepared_data.csv"
df = pd.read_csv(PREPARED_DATA_CSV)
df.head()

Unnamed: 0,Lot Frontage,Lot Area,Overall Qual,Overall Cond,Year Built,Year Remod/Add,Mas Vnr Area,BsmtFin SF 1,BsmtFin SF 2,Bsmt Unf SF,...,Sale Type_ConLw,Sale Type_New,Sale Type_Oth,Sale Type_VWD,Sale Type_WD,Sale Condition_AdjLand,Sale Condition_Alloca,Sale Condition_Family,Sale Condition_Normal,Sale Condition_Partial
0,141.0,31770,6,5,1960,1960,112.0,639.0,0.0,441.0,...,0,0,0,0,1,0,0,0,1,0
1,80.0,11622,5,6,1961,1961,0.0,468.0,144.0,270.0,...,0,0,0,0,1,0,0,0,1,0
2,81.0,14267,6,6,1958,1958,108.0,923.0,0.0,406.0,...,0,0,0,0,1,0,0,0,1,0
3,93.0,11160,7,5,1968,1968,0.0,1065.0,0.0,1045.0,...,0,0,0,0,1,0,0,0,1,0
4,74.0,13830,5,5,1997,1998,0.0,791.0,0.0,137.0,...,0,0,0,0,1,0,0,0,1,0


In [4]:
# Get X and y
# The loaded frame carries an "Unnamed: 0" column (visible in df.head()), which
# looks like the row index written out when the CSV was saved. It identifies a
# row rather than describing a house, so it is excluded from the features
# together with the target column.
index_artifacts = [col for col in df.columns if col.startswith('Unnamed:')]
X = df.drop(columns=['SalePrice', *index_artifacts])
y = df['SalePrice']

## Linear Regression with GridSearchCV

In [5]:
# Hold out 10% of the rows as a test set; the fixed random_state keeps the
# split reproducible between runs.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=101
)

In [6]:
# Standardize the features. The scaler is fitted on the training split only and
# the same fitted transform is then applied to both splits.
from sklearn.preprocessing import StandardScaler

scalar = StandardScaler().fit(X_train)
X_train = scalar.transform(X_train)
X_test = scalar.transform(X_test)

In [7]:
# Base estimator handed to the grid search: an ElasticNet with a generous
# iteration cap (one million).
from sklearn.linear_model import ElasticNet

base_model = ElasticNet(max_iter=1_000_000)

In [8]:
# Hyperparameter values to search over: regularization strength (alpha) and
# the L1/L2 mixing ratio (l1_ratio).
parameter_grid = dict(
    alpha=[10, 100, 1000, 10000],
    l1_ratio=[.1, .5, .7, .95, .99, 1],
)

In [9]:
# GridSearchCV: try every combination in parameter_grid with 5-fold
# cross-validation, scoring each candidate by negative mean squared error.
from sklearn.model_selection import GridSearchCV

grid_model = GridSearchCV(
    base_model,
    parameter_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    verbose=0,
)
# The trailing semicolon suppresses the estimator repr in the cell output.
grid_model.fit(X_train, y_train);

In [10]:
# Best model: the estimator carrying the hyperparameter combination that
# scored best in the grid search above.
grid_model.best_estimator_

ElasticNet(alpha=100, l1_ratio=1, max_iter=1000000)

In [11]:
# Final performance: evaluate the tuned model on the held-out test set.
from sklearn.metrics import mean_absolute_error,mean_squared_error

test_pred = grid_model.predict(X_test)
mae = mean_absolute_error(y_test, test_pred)
rmse = mean_squared_error(y_test, test_pred) ** 0.5

# Report both errors as a percentage of the mean sale price, where the mean is
# taken over the full target column y (not just the test split).
mean_price = y.mean()
mae_pct = round(mae / mean_price * 100, 2)
rmse_pct = round(rmse / mean_price * 100, 2)

print("Errors relative to mean house price \n")
print(f"MAE  = {mae_pct}  %")
print(f"RMSE = {rmse_pct} %")

Errors relative to mean house price 

MAE  = 7.85  %
RMSE = 11.37 %


- Notice that the optimal value is l1_ratio=1, i.e. the grid search selected a pure L1 (Lasso) penalty. L1 regularization drives the coefficients of uninformative features to exactly zero, so many features are likely unimportant and can be removed.

In [77]:
# Keep only the features whose coefficient in the best model is non-zero.
# Coefficient positions are used to index X.columns, so this relies on the
# model having been trained on the columns of X in their original order.
coeffs = pd.DataFrame(np.abs(grid_model.best_estimator_.coef_))
imp_features_idx = coeffs.index[coeffs[0] > 0.0]
imp_features = X.columns[imp_features_idx]
X_imp = X.loc[:, imp_features]
X_imp.shape

(2925, 207)