In [1]:
import pandas as pd
import numpy as np
import os
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import __version__ as sklearn_version
import time

from sklearn.preprocessing import scale
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler

from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, learning_curve, cross_val_score, RepeatedStratifiedKFold, RandomizedSearchCV
from sklearn.feature_selection import SelectKBest, f_regression

from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression, Lasso, Ridge
from sklearn.ensemble import RandomForestRegressor
import lightgbm
from catboost import CatBoostClassifier, cv, Pool
from sklearn.svm import SVC, SVR
from sklearn.neighbors import KNeighborsRegressor


from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from scipy import stats

from bayes_opt import BayesianOptimization
from skopt import BayesSearchCV


In [4]:

import sklearn

print('The scikit-learn version is {}.'.format(sklearn.__version__))

The scikit-learn version is 0.24.1.


Imput homes_onehot instead and rescale after splitting on just the training datasets to remove data leakage

In [2]:
homes_onehot = pd.read_csv('Homes_Onehot.csv')
homes_onehot = homes_onehot.drop(['Unnamed: 0'], axis=1)

A few features had some remaining nans, removed them

In [3]:
homes_onehot = homes_onehot.dropna()

Create location columns filter

In [4]:
filter_col1 = [col for col in homes_onehot if col.startswith('ZIP') or col.startswith('City') or col.startswith('Township') or col.startswith('Subdivision')]

Create X for all columns except locations, split data

In [24]:
X_big = homes_onehot.drop(columns='Sales Price')
X_big = X_big.drop(columns=filter_col1)
y = homes_onehot[['Sales Price']]
X_big_train, X_big_test, y_big_train, y_big_test = train_test_split(X_big, y, test_size=0.25, random_state=25)

In [29]:
X_big.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4595 entries, 0 to 4670
Data columns (total 134 columns):
 #    Column                          Dtype  
---   ------                          -----  
 0    List Price                      int64  
 1    Tax Amount                      float64
 2    Total Market Value              int64  
 3    Market Value Land               int64  
 4    Market Value Improvement        int64  
 5    Estimated Value                 float64
 6    Lot SQFT                        float64
 7    Beds                            float64
 8    Baths                           float64
 9    Total Building Area             int64  
 10   Living Area SQFT                float64
 11   Garage SQFT                     float64
 12   Parking spaces                  int64  
 13   County Use Code_03              int64  
 14   County Use Code_04              int64  
 15   County Use Code_05              int64  
 16   County Use Code_111             int64  
 17   County Use C

In [30]:
X_med = homes_onehot.drop(columns='Sales Price')
X_med = X_med.drop(columns=filter_col1)
X_med = X_med.drop(columns=['List Price', 'Estimated Value'])
y = homes_onehot[['Sales Price']]
X_med_train, X_med_test, y_med_train, y_med_test = train_test_split(X_med, y, test_size=0.25, random_state=25)

In [31]:
X_med2 = homes_onehot.drop(columns=['Sales Price'])
X_med2 = X_med2.drop(columns=filter_col1)
X_med2 = X_med2.drop(columns=['List Price', 'Estimated Value', 'Total Market Value', 'Market Value Land', 'Market Value Improvement'])
y = homes_onehot[['Sales Price']]
X_med2_train, X_med2_test, y_med2_train, y_med2_test = train_test_split(X_med2, y, test_size=0.25, random_state=25)

Create X for only location columns, split data

In [50]:

X_loc = homes_onehot[filter_col1]
X_loc_train, X_loc_test, y_loc_train, y_loc_test = train_test_split(X_loc, y, test_size=0.25, random_state=25)

Create filter for streamlined model columns

In [51]:
filter_col6 = [col for col in homes_onehot if col.startswith('Porch') or col.startswith('Has Pool') or col.startswith('Stories')]
filter_9 = ['Lot SQFT', 'Beds', 'Baths', 'Living Area SQFT']
filter_join = filter_col6 + filter_9

Create X for streamlined model, split data

In [52]:
X_small = homes_onehot[filter_join]
X_small_train, X_small_test, y_small_train, y_small_test = train_test_split(X_small, y, test_size=0.25, random_state=25)

Create Pipeline for logistic regression

In [24]:
y_train.shape


(3446, 1)

SVR

In [22]:
start = time.time()
gsc = GridSearchCV(
        estimator=SVR(kernel='rbf'),
        param_grid={
            'C': [0.1, 1, 100, 1000],
            'epsilon': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10],
            'gamma': [0.0001, 0.001, 0.005, 0.1, 1, 3, 5]
        },
        cv=5, scoring='neg_mean_squared_error', verbose=0, n_jobs=-1)

grid_result = gsc.fit(X_small_train, y_train)
best_params = grid_result.best_params_
best_svr = SVR(kernel='rbf', C=best_params["C"], epsilon=best_params["epsilon"], gamma=best_params["gamma"],
                   coef0=0.1, shrinking=True,
                   tol=0.001, cache_size=200, verbose=False, max_iter=-1)


print('It takes %s minutes' % ((time.time() - start)/60))

y_pred = gsc.predict(X_small_test)


  return f(*args, **kwargs)


It takes 27.238007338841758 minutes


In [25]:
y_tr_pred = gsc.predict(X_small_train)

mae = mean_absolute_error(y_train, y_tr_pred), mean_absolute_error(y_test, y_pred)
mae
r2 = r2_score(y_train, y_tr_pred), r2_score(y_test, y_te_pred)
r2

(116322.37808970716, 109472.98517081025)

Random Forest Regressor with pipeline

In [29]:
start = time.time()

steps = [('scaler', StandardScaler()), ('rfr', RandomForestRegressor())]
pipe = Pipeline(steps)
randomforest = GridSearchCV(
        estimator=pipe,
        param_grid={
            'rfr__max_depth': range(3,7),
            'rfr__n_estimators': (10, 50, 100, 1000, 1500),
         },
        cv=5, scoring='neg_mean_squared_error', verbose=0, n_jobs=-1)

grid_result = randomforest.fit(X_small_train, y_train)
best_params = randomforest.best_params_
print(best_params)


print('It takes %s minutes' % ((time.time() - start)/60))

y_pred_rf = randomforest.predict(X_small_test)


  self._final_estimator.fit(Xt, y, **fit_params_last_step)


{'rfr__max_depth': 6, 'rfr__n_estimators': 1000}
It takes 1.7904481609662375 minutes


In [30]:
y_tr_pred_rf = randomforest.predict(X_small_train)

median_mae_rf = mean_absolute_error(y_train, y_tr_pred_rf), mean_absolute_error(y_test, y_pred_rf)
median_mae_rf

(72059.48733429158, 77196.72983940732)

In [None]:
modified from: https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74

Random Forest with more hyperparameters and random search

In [35]:
start = time.time()

steps = [('scaler', StandardScaler()), ('rfr', RandomForestRegressor())]
pipe = Pipeline(steps)


n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

random_grid = {'rfr__n_estimators': n_estimators,
               'rfr__max_features': max_features,
               'rfr__max_depth': max_depth,
               'rfr__min_samples_split': min_samples_split,
               'rfr__min_samples_leaf': min_samples_leaf,
               'rfr__bootstrap': bootstrap}

randomforest2 = RandomizedSearchCV(
        estimator=pipe,
        param_distributions = random_grid, n_iter = 100, cv = 5, verbose=2, random_state=42, n_jobs = -1)

grid_result2 = randomforest2.fit(X_small_train, y_train)
best_params2 = randomforest2.best_params_
print(best_params)


print('It takes %s minutes' % ((time.time() - start)/60))

y_pred_rf2 = randomforest2.predict(X_small_test)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


  self._final_estimator.fit(Xt, y, **fit_params_last_step)


{'rfr__max_depth': 6, 'rfr__n_estimators': 1000}
It takes 27.31163688500722 minutes


In [36]:
y_tr_pred_rf2 = randomforest2.predict(X_small_train)
print(best_params2)

median_mae_rf2 = mean_absolute_error(y_train, y_tr_pred_rf2), mean_absolute_error(y_test, y_pred_rf2)
median_mae_rf2

{'rfr__max_depth': 6, 'rfr__n_estimators': 1000}


(67369.7948395221, 74960.16758888104)

Lasso Regression Model

In [38]:
start = time.time()
lasso = GridSearchCV(
        estimator=Lasso(),
        param_grid={
            'alpha': np.array([5, 0.5, 0.05, 0.005, 0.0005, 1, 0.1, 0.01, 0.001, 0.0001, 0 ]),      
        },
        cv=5, scoring='neg_mean_squared_error', verbose=0, n_jobs=-1)

grid_result_lasso = lasso.fit(X_small_train, y_train)
best_params_lasso = lasso.best_params_



print('It takes %s minutes' % ((time.time() - start)/60))

y_pred_lasso = lasso.predict(X_small_test)
y_tr_pred_lasso = lasso.predict(X_small_train)
print(best_params_lasso)

median_mae_lasso = mean_absolute_error(y_train, y_tr_pred_lasso), mean_absolute_error(y_test, y_pred_lasso)
median_mae_lasso

It takes 0.03310169378916423 minutes
{'rfr__max_depth': 6, 'rfr__n_estimators': 1000}


(77402.07323408558, 76580.30985726205)

In [48]:
start = time.time()
lasso2 = GridSearchCV(
        estimator=Lasso(),
        param_grid={
            'alpha': np.array([5, 0.5, 0.05, 0.005, 0.0005, 1, 0.1, 0.01, 0.001, 0.0001, 0 ]),      
        },
        cv=5, scoring='neg_mean_squared_error', verbose=0, n_jobs=-1)

grid_result_lasso2 = lasso2.fit(X_loc_train, y_train)
best_params_lasso2 = lasso2.best_params_



print('It takes %s minutes' % ((time.time() - start)/60))

y_pred_lasso2 = lasso2.predict(X_loc_test)
y_tr_pred_lasso2 = lasso2.predict(X_loc_train)
print(best_params_lasso2)

median_mae_lasso2 = mean_absolute_error(y_train, y_tr_pred_lasso2), mean_absolute_error(y_test, y_pred_lasso2)
median_mae_lasso2

  model = cd_fast.enet_coordinate_descent(


It takes 16.987457585334777 minutes
{'rfr__max_depth': 6, 'rfr__n_estimators': 1000}


(18660.171607341425, 100715.1710630098)

In [36]:
start = time.time()
lasso3 = GridSearchCV(
        estimator=Lasso(max_iter = 100000),
        param_grid={
            'alpha': np.array([9, 7, 5, 0.5, 0.05, 0.005, 0.0005, 1, 0.1, 0.01, 0.001, 0.0001, 0 ]),      
        },
        cv=5, scoring='neg_mean_squared_error', verbose=0, n_jobs=-1)

grid_result_lasso3 = lasso3.fit(X_big_train, y_big_train)
best_params_lasso3 = lasso3.best_params_



print('It takes %s minutes' % ((time.time() - start)/60))

y_pred_lasso3 = lasso3.predict(X_big_test)
y_tr_pred_lasso3 = lasso3.predict(X_big_train)
print(best_params_lasso3)

median_mae_lasso3 = mean_absolute_error(y_big_train, y_tr_pred_lasso3), mean_absolute_error(y_big_test, y_pred_lasso3)
print(median_mae_lasso3)
r2_score(y_big_train, y_tr_pred_lasso3), r2_score(y_big_test, y_pred_lasso3)

It takes 248.02722691694896 minutes
{'alpha': 9.0}
(64532.72864320237, 66370.50859484464)


(0.6807059395705326, 0.584563960811793)

In [6]:
start = time.time()
lasso4 = GridSearchCV(
        estimator=Lasso(),
        param_grid={
            'alpha': np.array([5, 0.5, 0.05, 0.005, 0.0005, 1, 0.1, 0.01, 0.001, 0.0001, 0 ]),      
        },
        cv=5, scoring='neg_mean_squared_error', verbose=0, n_jobs=-1)

grid_result_lasso4 = lasso4.fit(X_big_train, y_big_train)
best_params_lasso4 = lasso4.best_params_



print('It takes %s minutes' % ((time.time() - start)/60))

y_pred_lasso4 = lasso4.predict(X_big_test)
y_tr_pred_lasso4 = lasso4.predict(X_big_train)
print(best_params_lasso4)

median_mae_lasso4 = mean_absolute_error(y_big_train, y_tr_pred_lasso4), mean_absolute_error(y_big_test, y_pred_lasso4)
print(median_mae_lasso4)
r2_score(y_big_train, y_tr_pred_lasso4), r2_score(y_big_test, y_pred_lasso4)

  model = cd_fast.enet_coordinate_descent(


It takes 13.890787335236867 minutes
{'alpha': 5.0}
(17779.552013933346, 78696.5119368566)


(0.9736190822228539, 0.42084488741792847)

Lasso 4 is the best model. It doesn't have the best MAE but it is only 3% worse than lasso 3 but runs in less than 1% of the time.

In [17]:
start = time.time()
lasso5 = GridSearchCV(
        estimator=Lasso(),
        param_grid={
            'alpha': np.array([5, 0.5, 0.05, 0.005, 0.0005, 1, 0.1, 0.01, 0.001, 0.0001, 0 ]),      
        },
        cv=5, scoring='neg_mean_squared_error', verbose=0, n_jobs=-1)

grid_result_lasso5 = lasso5.fit(X_med2_train, y_med2_train)
best_params_lasso5 = lasso5.best_params_



print('It takes %s minutes' % ((time.time() - start)/60))

y_pred_lasso5 = lasso5.predict(X_med2_test)
y_tr_pred_lasso5 = lasso5.predict(X_med2_train)
print(best_params_lasso5)

median_mae_lasso5 = mean_absolute_error(y_med2_train, y_tr_pred_lasso5), mean_absolute_error(y_med2_test, y_pred_lasso5)
print(median_mae_lasso5)
r2_score(y_med2_train, y_tr_pred_lasso5), r2_score(y_med2_test, y_pred_lasso5)

It takes 10.389898284276326 minutes
{'alpha': 5.0}
(0.247553744947415, 0.27477989663850255)


(0.9999999999870326, 0.9999999999277683)

In [58]:
X_big_train.head()

Unnamed: 0,List Price,Tax Amount,Total Market Value,Market Value Land,Market Value Improvement,Estimated Value,Lot SQFT,Beds,Baths,Total Building Area,...,Porch Type_Porch covered,Has Pool_None,Has Pool_Pool & Spa (both),Has Pool_Pool (yes),Has Pool_Spa or Hot Tub (only),Stories_1 Story,Stories_1.5 Stories,Stories_2 Stories,Stories_2.5 Stories,Stories_3 Stories
1183,292900,5309.21,260119,70000,190119,303099.0,5662.8,3.0,2.0,1865,...,1,1,0,0,0,1,0,0,0,0
2431,1050000,17083.26,849500,190752,658748,685313.0,20386.08,5.0,5.1,5946,...,0,0,1,0,0,0,0,1,0,0
2876,280000,6586.17,277883,60000,217883,275393.0,1742.4,2.0,2.1,1663,...,0,1,0,0,0,0,0,1,0,0
200,359900,6938.22,306984,75000,231984,387451.0,6534.0,3.0,2.0,2218,...,1,1,0,0,0,1,0,0,0,0
3180,1200000,22770.08,1115238,250000,865238,763293.0,20037.6,5.0,5.1,5266,...,1,0,0,1,0,0,0,1,0,0


In [None]:
Ridge Regression

In [39]:
start = time.time()
ridge = GridSearchCV(
        estimator=Ridge(),
        param_grid={
            'alpha': np.array([5, 0.5, 0.05, 0.005, 0.0005, 1, 0.1, 0.01, 0.001, 0.0001, 0 ]),      
        },
        cv=5, scoring='neg_mean_squared_error', verbose=0, n_jobs=-1)

grid_result_ridge = ridge.fit(X_small_train, y_train)
best_params_ridge = ridge.best_params_



print('It takes %s minutes' % ((time.time() - start)/60))

y_pred_ridge = ridge.predict(X_small_test)
y_tr_pred_ridge = ridge.predict(X_small_train)
print(best_params_ridge)

median_mae_ridge = mean_absolute_error(y_train, y_tr_pred_ridge), mean_absolute_error(y_test, y_pred_ridge)
median_mae_ridge

It takes 0.01819832722345988 minutes
{'rfr__max_depth': 6, 'rfr__n_estimators': 1000}


(77419.21510192832, 76551.25721976442)

In [44]:
r2_score(y_train, y_tr_pred_ridge), r2_score(y_test, y_pred_ridge)

(0.5543591126489431, 0.4843397034606334)

In [19]:
start = time.time()
ridge2 = GridSearchCV(
        estimator=Ridge(),
        param_grid={
            'alpha': np.array([5, 0.5, 0.05, 0.005, 0.0005, 1, 0.1, 0.01, 0.001, 0.0001, 0 ]),      
        },
        cv=5, scoring='neg_mean_squared_error', verbose=0, n_jobs=-1)

grid_result_ridge2 = ridge2.fit(X_big_train, y_big_train)
best_params_ridge2 = ridge2.best_params_



print('It takes %s minutes' % ((time.time() - start)/60))

y_pred_ridge2 = ridge2.predict(X_big_test)
y_tr_pred_ridge2 = ridge2.predict(X_big_train)
print(best_params_ridge2)

median_mae_ridge2 = mean_absolute_error(y_big_train, y_tr_pred_ridge2), mean_absolute_error(y_big_test, y_pred_ridge2)
median_mae_ridge2

print(median_mae_ridge2)
r2_score(y_big_train, y_tr_pred_ridge2), r2_score(y_big_test, y_pred_ridge2)

It takes 0.12040959596633911 minutes
{'alpha': 5.0}
(64574.6400763234, 66138.87389433612)


(0.6800450662050445, 0.5862190599510184)

In [43]:
r2_score(y_train, y_tr_pred_ridge2), r2_score(y_test, y_pred_ridge2)

(1.0, 1.0)

In [None]:
start = time.time()
ridge2 = GridSearchCV(
        estimator=Ridge(),
        param_grid={
            'alpha': np.array([5, 0.5, 0.05, 0.005, 0.0005, 1, 0.1, 0.01, 0.001, 0.0001, 0 ]),      
        },
        cv=5, scoring='neg_mean_squared_error', verbose=0, n_jobs=-1)

grid_result_ridge2 = ridge2.fit(X_big_train, y_big_train)
best_params_ridge2 = ridge2.best_params_



print('It takes %s minutes' % ((time.time() - start)/60))

y_pred_ridge2 = ridge2.predict(X_big_test)
y_tr_pred_ridge2 = ridge2.predict(X_big_train)
print(best_params_ridge2)

median_mae_ridge2 = mean_absolute_error(y_big_train, y_tr_pred_ridge2), mean_absolute_error(y_test, y_pred_ridge2)
median_mae_ridge2

print(median_mae_ridge2)
r2_score(y_big_train, y_tr_pred_ridge2), r2_score(y_big_test, y_pred_ridge2)

Create pipeline for XGBoost

Timing function

In [None]:
start = time.time()

print('It takes %s minutes' % ((time.time() - start)/60))

In [55]:
search.score(X,y)

0.9373219373219374

In [16]:
from xgboost import XGBRegressor

In [26]:
start = time.time()

steps = [('scaler', StandardScaler()), ('xgbr', XGBRegressor())]
pipe = Pipeline(steps)

n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
max_depth = [1,2,3,4,5,6,7,8,9,10]
eta = [.001,.005,.01,.025,.05,.1,.2,.3]
subsample = [.25,.5,.75,1]
colsample_bytree = [.25,.5,.75,1]

random_grid = {'xgbr__n_estimators': n_estimators,
               'xgbr__max_depth': max_depth,
               'xgbr__eta': eta,
               'xgbr__subsample': subsample,
               'xgbr__colsample_bytree': colsample_bytree}

xgboost = RandomizedSearchCV(
        estimator=pipe,
        param_distributions = random_grid, n_iter = 100, cv = 5, verbose=2, random_state=42, n_jobs = -1)

grid_result = xgboost.fit(X_big_train, y_big_train)
best_params = xgboost.best_params_
print(best_params)

y_pred_xgboost = xgboost.predict(X_big_test)
y_tr_pred_xgboost = xgboost.predict(X_big_train)



print('It takes %s minutes' % ((time.time() - start)/60))
median_mae_xgboost = mean_absolute_error(y_big_train, y_tr_pred_xgboost), mean_absolute_error(y_big_test, y_pred_xgboost)
median_mae_xgboost

print(median_mae_xgboost)


r2_score(y_big_train, y_tr_pred_xgboost), r2_score(y_big_test, y_pred_xgboost)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
{'xgbr__subsample': 0.5, 'xgbr__n_estimators': 1800, 'xgbr__max_depth': 2, 'xgbr__eta': 0.005, 'xgbr__colsample_bytree': 0.75}
It takes 75.25825051069259 minutes
(60930.19888094893, 64929.436901653615)




(0.7200536463831246, 0.5873665760609141)

In [108]:
homes_onehot.describe(include=float)

Unnamed: 0,Tax Amount,Estimated Value,Lot SQFT,Beds,Baths,Living Area SQFT,Garage SQFT,Sales Price
count,4595.0,4595.0,4595.0,4595.0,4595.0,4595.0,4595.0,4595.0
mean,7841.613012,418842.4,8708.085319,3.688139,2.606855,2640.295756,490.321872,411620.8
std,3076.983622,146150.9,7185.751792,0.776239,0.783748,935.340892,131.812314,154518.2
min,1151.5,132179.0,1.0,2.0,0.0,819.0,12.0,81300.0
25%,5766.575,317639.0,6229.08,3.0,2.0,1930.0,420.0,306600.0
50%,7155.77,384892.0,7840.8,4.0,2.1,2440.0,450.0,385900.0
75%,9186.005,481708.0,9583.2,4.0,3.1,3244.0,528.0,495000.0
max,25076.55,1305136.0,220674.96,6.0,6.2,10282.0,2576.0,1150000.0


In [65]:
search.score(X,y)

0.9344729344729344

Saving which modules use which parameters for gridsearch

In [None]:
SVC()
params = dict()
params['C'] = (1e-6, 100.0, 'log-uniform')
params['gamma'] = (1e-6, 100.0, 'log-uniform')
params['degree'] = (1,5)
params['kernel'] = ['linear', 'poly', 'rbf', 'sigmoid']

KNeighborsRegressor()
params = dict()
params['n_neighbors'] = (2,25)
params['weights'] = ['uniform', 'distance']



SVR()
params = dict()
params['C'] = (1e-6, 100.0, 'log-uniform')
params['gamma'] = (1e-6, 100.0, 'log-uniform')
params['degree'] = (1,5)
params['kernel'] = ['linear', 'poly', 'rbf', 'sigmoid']

LinearRegression()


Ridge()
params = dict()
params['alpha'] = (0, 1)

Lasso()
params = dict()
params['alpha'] = (0, 1)

BayesianRidge()

ElasticNet()
params = dict()
params['alpha'] = (1e, 100.0)
params['l1_ratio'] = (0, 1)

XGBRegression()
params = dict()
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
max_depth = [1,2,3,4,5,6,7,8,9,10]
eta = [.001,.005,.01,.025,.05,.1,.2,.3]
subsample = [.25,.5,.75,1]
colsample_bytree = [.25,.5,.75,1]

In [13]:
X_small_test.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Porch Type_None,1149.0,0.29852,0.457808,0.0,0.0,0.0,1.0,1.0
Porch Type_Porch,1149.0,0.00087,0.029501,0.0,0.0,0.0,0.0,1.0
Porch Type_Porch - Open,1149.0,0.326371,0.469089,0.0,0.0,0.0,1.0,1.0
Porch Type_Porch covered,1149.0,0.374238,0.484136,0.0,0.0,0.0,1.0,1.0
Has Pool_None,1149.0,0.743255,0.437028,0.0,0.0,1.0,1.0,1.0
Has Pool_Pool & Spa (both),1149.0,0.046127,0.209852,0.0,0.0,0.0,0.0,1.0
Has Pool_Pool (yes),1149.0,0.199304,0.399651,0.0,0.0,0.0,0.0,1.0
Has Pool_Spa or Hot Tub (only),1149.0,0.011314,0.105811,0.0,0.0,0.0,0.0,1.0
Stories_1 Story,1149.0,0.437772,0.496329,0.0,0.0,0.0,1.0,1.0
Stories_1.5 Stories,1149.0,0.033943,0.18116,0.0,0.0,0.0,0.0,1.0
