In [1]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import Imputer

from scipy.stats import skew
import warnings

%matplotlib inline
%config InlineBackend.figure_format = 'png' #set 'png' here when working on notebook

warnings.filterwarnings('ignore') 

In [2]:
# Load the training CSV; the path is relative to the notebook's working directory.
df = pd.read_csv("../data/iowa-housing/train.csv")
# Cell output: (number of rows, number of columns).
df.shape

(1460, 81)

In [3]:
# Show every column label of the raw frame.
df.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [4]:
# Collect the names of the object-dtype columns (treated as categorical), in column order
cats = [name for name, dtype in df.dtypes.items() if dtype == 'object']

In [5]:
# Split the raw frame in two: the object-dtype columns, and everything else
df_cats = df[cats]
df_cont = df.drop(columns = cats)
# Cell output: shape of the non-categorical frame
df_cont.shape

(1460, 38)

In [6]:
# Number of rows for each distinct BedroomAbvGr value.
df_cont.BedroomAbvGr.value_counts()

3    804
2    358
4    213
1     50
5     21
6      7
0      6
8      1
Name: BedroomAbvGr, dtype: int64

In [7]:
# True when at least one cell of df_cont is missing.
df_cont.isnull().values.any()

True

In [8]:
# Missing-value count per column of df_cont.
df_cont.isnull().sum()

Id                 0
MSSubClass         0
LotFrontage      259
LotArea            0
OverallQual        0
OverallCond        0
YearBuilt          0
YearRemodAdd       0
MasVnrArea         8
BsmtFinSF1         0
BsmtFinSF2         0
BsmtUnfSF          0
TotalBsmtSF        0
1stFlrSF           0
2ndFlrSF           0
LowQualFinSF       0
GrLivArea          0
BsmtFullBath       0
BsmtHalfBath       0
FullBath           0
HalfBath           0
BedroomAbvGr       0
KitchenAbvGr       0
TotRmsAbvGrd       0
Fireplaces         0
GarageYrBlt       81
GarageCars         0
GarageArea         0
WoodDeckSF         0
OpenPorchSF        0
EnclosedPorch      0
3SsnPorch          0
ScreenPorch        0
PoolArea           0
MiscVal            0
MoSold             0
YrSold             0
SalePrice          0
dtype: int64

In [9]:
# Calculate total square footage of each house
# (first floor + second floor + basement), stored as a new column on df_cont.
df_cont['TotalSF'] = df_cont['1stFlrSF'] + df_cont['2ndFlrSF'] + df_cont['TotalBsmtSF']
# y = df_cont.SalePrice
# The target is the natural log of SalePrice, so any error computed against this `y`
# is in log-price units. executeModelCV reads this notebook-level name when no other
# target is supplied.
y = np.log(df_cont.SalePrice)

In [10]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

def executeModelCV(model, X, target=None, cv=10):
    """Return the cross-validated RMSE of `model` on feature matrix `X`.

    Parameters
    ----------
    model : estimator handed unchanged to `cross_val_score`.
    X : feature matrix (the notebook passes a DataFrame column subset).
    target : target vector. When omitted, the notebook-level `y` is looked up
        at call time, which is what the function always did before. Pass it
        explicitly to avoid depending on whichever cell last rebound `y`.
    cv : number of folds; defaults to 10, the value that used to be hard-coded.

    Returns
    -------
    The square root of the mean of the per-fold mean squared errors. The result
    is in the units of the target (log-price or price); no inverse-log
    transform is applied here.
    """
    if target is None:
        # Backward-compatible fallback to the global target.
        target = y
    # 'neg_mean_squared_error' yields negated MSE values, so flip the sign back.
    fold_mse = -cross_val_score(model, X, target, cv = cv, scoring = 'neg_mean_squared_error')
    return np.sqrt(fold_mse.mean())

In [11]:
from sklearn.linear_model import LinearRegression

# Baseline: linear regression on the two floor areas and the bedroom count only.
feature_cols = ['1stFlrSF', '2ndFlrSF', 'BedroomAbvGr']
# Cell output: the value returned by executeModelCV for this feature subset.
executeModelCV(LinearRegression(), df_cont[feature_cols])

0.26387383469757764

In [None]:
# Don't use 'YearBuilt' because in a time value problem, newer houses typically sell for more money
# Second linear model: six hand-picked numeric features, including the derived TotalSF column.
feature_cols = ['TotalSF', 'FullBath', 'OverallQual', 'OverallCond', 'TotRmsAbvGrd', 'LotArea']
executeModelCV(LinearRegression(), df_cont[feature_cols])

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Candidate tree depths: 1 through 7
max_depth_range = range(1, 8)

# Cross-validated RMSE of a DecisionTreeRegressor at each candidate depth,
# in the same order as max_depth_range
RMSE_scores = [
    executeModelCV(DecisionTreeRegressor(max_depth = depth, random_state = 1), df_cont[feature_cols])
    for depth in max_depth_range
]

In [None]:
# Draw tree depth (x-axis) against cross-validated RMSE (y-axis) on an explicit Axes
md = max_depth_range
fig, ax = plt.subplots()
ax.plot(md, RMSE_scores)
ax.set_xlabel('max_depth')
ax.set_ylabel('RMSE (lower is better)')

In [None]:
# Six hand-picked numeric features (same list as the second linear-regression cell).
feature_cols = ['TotalSF', 'FullBath', 'OverallQual', 'OverallCond', 'TotRmsAbvGrd', 'LotArea']

# Depth 6 is hard-coded (NOTE(review): presumably read off the depth-vs-RMSE plot — confirm).
treereg = DecisionTreeRegressor(max_depth = 6, random_state = 1)
# The DecisionTreeRegressor instance must be fitted to be able to call treereg.feature_importances_
treereg.fit(df_cont[feature_cols], y)
# Cell output: cross-validated RMSE for the same estimator settings and features.
executeModelCV(treereg, df_cont[feature_cols])

In [None]:
# "Gini importance" of each feature: the (normalized) total reduction of error 
# brought by that feature
# One row per entry of feature_cols, paired with the fitted tree's feature_importances_.
newdf = pd.DataFrame({'feature':feature_cols, 'importance':treereg.feature_importances_})
# Cell output: the table sorted from most to least important.
newdf.sort_values('importance', ascending = False)

In [None]:
# Remove LotFrontage and 'MasVnrArea' because of NaN values
# Wider numeric feature set for the same depth-6 tree.
feature_cols = ['OverallQual', 'GrLivArea', 'BedroomAbvGr', 'GarageCars', 'MSSubClass', 'YearBuilt', 'TotRmsAbvGrd', 
                'BsmtFullBath', 'KitchenAbvGr', 'YearRemodAdd', 'OverallCond', 'Fireplaces', 'TotalSF',
                'MoSold', 'ScreenPorch', 'LotArea']

treereg = DecisionTreeRegressor(max_depth = 6, random_state = 1)
# The DecisionTreeRegressor instance must be fitted to be able to call treereg.feature_importances_
treereg.fit(df_cont[feature_cols], y)
# Cell output: cross-validated RMSE for the same estimator settings and features.
executeModelCV(treereg, df_cont[feature_cols])

In [None]:
# "Gini importance" of each feature: the (normalized) total reduction of error 
# brought by that feature
# Rebuilt here because treereg and feature_cols were both replaced in the previous cell.
newdf = pd.DataFrame({'feature':feature_cols, 'importance':treereg.feature_importances_})
# Cell output: the table sorted from most to least important.
newdf.sort_values('importance', ascending = False)

In [None]:
# create a GraphViz file
from sklearn.tree import export_graphviz

# Easier to read the .svg file in a browser than a .png file
# Write the fitted tree to a .dot file in the working directory, labelling features with feature_cols.
export_graphviz(treereg, out_file = 'iowa_house-decision-tree.dot', feature_names = feature_cols)
# Render the .dot file to SVG and PNG through IPython shell escapes
# (assumes the Graphviz `dot` executable is installed and on PATH — TODO confirm).
! dot -Tsvg iowa_house-decision-tree.dot -o iowa_house-decision-tree.svg
! dot -Tpng iowa_house-decision-tree.dot -o iowa_house-decision-tree.png

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Candidate forest sizes: 3 through 15 trees
maxEstimatorRange = range(3, 16)
RMSE_scores = []    # list to store the average RMSE for each value of n_estimators

# For each number of estimators, instantiate a RandomForestRegressor.
# random_state is pinned (as it is for the decision trees in this notebook) so the
# curve is reproducible; without it every run draws different bootstrap samples.
# NOTE(review): an integer max_features = 1 means one candidate feature per split;
# 1.0 would mean all features — confirm which was intended.
for numEst in maxEstimatorRange:
    RMSE_scores.append(executeModelCV(
        RandomForestRegressor(max_depth = 6, n_estimators = numEst, max_features = 1, random_state = 1),
        df_cont[feature_cols]))

In [None]:
# Draw forest size (x-axis) against cross-validated RMSE (y-axis) on an explicit Axes
fig, ax = plt.subplots()
ax.plot(maxEstimatorRange, RMSE_scores)
ax.set_xlabel('max_estimators')
ax.set_ylabel('RMSE (lower is better)')

In [None]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import AdaBoostRegressor

# Display names, positionally paired with the entries of `models` below.
names = ["Linear Regression", "Decision Tree", "Nearest Neighbors", "Random Forest", "AdaBoost"]

# Regressors compared throughout the rest of the notebook. The stochastic ones
# (random forest, AdaBoost) are seeded like the decision tree so that repeated
# runs print the same scores.
models = [
    LinearRegression(),
    DecisionTreeRegressor(max_depth = 6, random_state = 1),
    KNeighborsRegressor(n_neighbors = 6),
    RandomForestRegressor(max_depth = 6, n_estimators = 12, max_features = 1, random_state = 1),
    AdaBoostRegressor(random_state = 1)]

# iterate over regressors and print each cross-validated RMSE
for modelName, model in zip(names, models):
    print(modelName, executeModelCV(model, df_cont[feature_cols]))

In [None]:
from sklearn.model_selection import train_test_split

# All Features except 'GarageYrBlt' and 'MasVnrArea' because of NaN values
# The list also omits 'LotFrontage', and uses the derived 'TotalSF' in place of
# '1stFlrSF', '2ndFlrSF' and 'TotalBsmtSF'.
# NOTE(review): 'Id' is included as a feature; it looks like a row identifier — confirm it is wanted.
feature_cols = ['TotalSF', '3SsnPorch', 'BedroomAbvGr', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtFullBath', 'BsmtHalfBath', 
                'BsmtUnfSF', 'EnclosedPorch', 'Fireplaces', 'FullBath', 'GarageArea', 'GarageCars', 
                'GrLivArea', 'HalfBath', 'Id', 'KitchenAbvGr', 'LotArea', 'LowQualFinSF', 'MSSubClass', 
                'MiscVal', 'MoSold', 'OpenPorchSF', 'OverallCond', 'OverallQual', 'PoolArea', 'ScreenPorch', 
                'TotRmsAbvGrd', 'WoodDeckSF', 'YearBuilt', 'YearRemodAdd', 'YrSold']

X = df_cont[feature_cols]
# Rebinds the notebook-level `y` from log(SalePrice) to the raw SalePrice. From here on,
# anything that reads `y` (including executeModelCV when no target is passed) works in
# price units rather than log units.
y = df_cont.SalePrice
    
# Single hold-out split with a fixed seed; the default test size of train_test_split is used.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 1)
linreg = LinearRegression()
linreg.fit(X_train, y_train)
y_pred = linreg.predict(X_test)
# Cell output: RMSE on the held-out rows, in the units of `y`.
np.sqrt(metrics.mean_squared_error(y_test, y_pred))

In [None]:
# The coef_ variable doesn't exist until the fit() method is invoked
# Cell output: one coefficient per entry of the feature list the model was fitted on.
linreg.coef_

In [None]:
# Coefficient table for the fitted linear model, ranked by absolute size.
# Stored under its own name so the raw housing frame `df` loaded at the top of the
# notebook is not overwritten (re-running the early cells would otherwise break).
linreg_coefs = pd.DataFrame({"Feature": feature_cols, "coef":linreg.coef_})
linreg_coefs['coefmagnitude'] = linreg_coefs['coef'].abs()
# Cell output: largest-magnitude coefficients first.
linreg_coefs.sort_values('coefmagnitude', ascending = False)

In [None]:
import eli5
from eli5.sklearn import PermutationImportance

# Permutation importance of the already-fitted linreg, computed on the held-out split.
perm = PermutationImportance(linreg, random_state = 1).fit(X_test, y_test)
# Cell output: eli5's weight table, labelled with the X_test column names.
eli5.show_weights(perm, feature_names = X_test.columns.tolist())

In [None]:
# Feature subset picked from the linear model's own ranking (feature importance)
feature_cols = ['KitchenAbvGr', 'GarageCars', 'OverallQual', 'BsmtFullBath', 'BedroomAbvGr', 'Fireplaces', 'OverallCond', 
                'TotRmsAbvGrd', 'FullBath', 'YearBuilt', 'MoSold', 'MSSubClass', 'YearRemodAdd', 'PoolArea', 'TotalSF', 
                'LotArea']
# Target is the raw sale price; features come from the numeric frame
y = df_cont.SalePrice
X = df_cont[feature_cols]

# Seeded hold-out split, fit, predict, then report RMSE on the held-out rows
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 1)
linreg = LinearRegression().fit(X_train, y_train)
y_pred = linreg.predict(X_test)
np.sqrt(metrics.mean_squared_error(y_test, y_pred))

In [None]:
# Same ranking as before, but with the negatively-weighted features left out:  'KitchenAbvGr', 
# 'BedroomAbvGr', 'YrSold', 'HalfBath', 'MoSold', 'MSSubClass', 'PoolArea', 'BsmtFinSF2', 'Id', 'BsmtUnfSF', 'MiscVal'
feature_cols = ['OverallQual', 'GarageCars', 'BsmtFullBath', 'TotRmsAbvGrd', 'BsmtHalfBath', 'FullBath', 'Fireplaces', 
                'OverallCond', 'YearBuilt', 'YearRemodAdd', 'ScreenPorch', 'WoodDeckSF', 'EnclosedPorch', 'GrLivArea', 
                'TotalSF', '3SsnPorch', 'LowQualFinSF', 'BsmtFinSF1', 'OpenPorchSF', 'GarageArea', 'LotArea']
# Target is the raw sale price; features come from the numeric frame
y = df_cont['SalePrice']
X = df_cont[feature_cols]

# Seeded hold-out split, fit, predict, then report RMSE on the held-out rows
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 1)
linreg = LinearRegression().fit(X_train, y_train)
y_pred = linreg.predict(X_test)
np.sqrt(metrics.mean_squared_error(y_test, y_pred))

In [None]:
# Feature subset picked from the eli5 permutation-importance table
feature_cols = ['OverallQual', 'GrLivArea', 'TotalSF', 'TotRmsAbvGrd', 'YearBuilt', 'GarageCars', 'BedroomAbvGr', 'BsmtFinSF1', 
                'MSSubClass', 'BsmtFullBath', 'LotArea', 'Fireplaces', 'WoodDeckSF', 'FullBath', 'KitchenAbvGr', 'YearRemodAdd', 
                'OverallCond', 'ScreenPorch', 'GarageArea', 'BsmtFinSF2']
# `y` is not reassigned in this cell; whatever the notebook-level `y` currently holds is reused
X = df_cont[feature_cols]

# Seeded hold-out split, fit, predict, then report RMSE on the held-out rows
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 1)
linreg = LinearRegression().fit(X_train, y_train)
y_pred = linreg.predict(X_test)
np.sqrt(metrics.mean_squared_error(y_test, y_pred))

In [None]:
# preprocessing using zero mean and unit variance scaling
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply them to both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_test_scaled = scaler.transform(X_train), scaler.transform(X_test)

# Refit the linear model on the standardized features and report held-out RMSE
linreg = LinearRegression().fit(X_train_scaled, y_train)
y_pred = linreg.predict(X_test_scaled)
np.sqrt(metrics.mean_squared_error(y_test, y_pred))

In [None]:
# Numeric feature list with the negatively-weighted linear-regression features left out
# (same 21 columns as the earlier hold-out cell that used this selection).
feature_cols = ['OverallQual', 'GarageCars', 'BsmtFullBath', 'TotRmsAbvGrd', 'BsmtHalfBath', 'FullBath', 'Fireplaces', 
                'OverallCond', 'YearBuilt', 'YearRemodAdd', 'ScreenPorch', 'WoodDeckSF', 'EnclosedPorch', 'GrLivArea', 
                'TotalSF', '3SsnPorch', 'LowQualFinSF', 'BsmtFinSF1', 'OpenPorchSF', 'GarageArea', 'LotArea']

# Iterate over classifiers using new feature list
# (the entries of `models` are regressors; each line printed is name + cross-validated RMSE.
# The target is whatever the notebook-level `y` holds when this cell runs.)
for modelName, model in zip(names, models):
    print(modelName, executeModelCV(model, df_cont[feature_cols]))

In [None]:
# Numeric feature list taken from the eli5 permutation-importance selection
# (same 20 columns as the earlier hold-out cell that used this selection).
feature_cols = ['OverallQual', 'GrLivArea', 'TotalSF', 'TotRmsAbvGrd', 'YearBuilt', 'GarageCars', 'BedroomAbvGr', 'BsmtFinSF1', 
                'MSSubClass', 'BsmtFullBath', 'LotArea', 'Fireplaces', 'WoodDeckSF', 'FullBath', 'KitchenAbvGr', 'YearRemodAdd', 
                'OverallCond', 'ScreenPorch', 'GarageArea', 'BsmtFinSF2']

# Print name + cross-validated RMSE for every model in `models`.
for modelName, model in zip(names, models):
    print(modelName, executeModelCV(model, df_cont[feature_cols]))

## Now work on the Categorical Data

In [None]:
# Preview the first rows of the categorical frame.
df_cats.head()

In [None]:
# (rows, columns) of the categorical frame.
df_cats.shape

In [None]:
# Column labels of the categorical frame.
df_cats.columns

In [None]:
# Missing-value count per categorical column.
df_cats.isnull().sum()

In [None]:
# Row index of the categorical frame (it is reused as the index of the imputed frame later).
df_cats.index

In [None]:
# One horizontal count plot per categorical column, bars ordered by descending frequency
for feature in df_cats.columns:
    category_order = df_cats[feature].value_counts().index
    sns.countplot(y = feature, data = df_cats, order = category_order)
    plt.show()

In [None]:
from sklearn.impute import SimpleImputer

# Keep every categorical feature except the ones with the most NaNs, which are dropped:
# 'Alley', 'FireplaceQu', 'PoolQC', 'Fence', and 'MiscFeature'.
# NOTE(review): the threshold was originally written as "over 5000 NaNs", which cannot be
# literal for a 1,460-row frame — confirm the intended cut-off.
feature_cols = ['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 
                'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 
                'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 
                'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'GarageType', 
                'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'SaleType', 'SaleCondition']

df_cats = df_cats[feature_cols]

# The remaining features with NaNs each have fewer than 100 of them. All 1,460 rows must be
# kept so the result can later be joined with df_cont, so the gaps are filled with each
# column's most frequent value instead of dropping rows. The imputer returns a bare array,
# so the original column labels and row index are put back when the frame is rebuilt.
imp = SimpleImputer(missing_values = np.nan, strategy = 'most_frequent')
df_cats_imputed = pd.DataFrame(imp.fit_transform(df_cats), columns = df_cats.columns, index = df_cats.index)

# Cell output: whether any missing value is left after imputation
df_cats_imputed.isnull().values.any()

In [None]:
# Original code used to develop encodeCategoricalData()
# Kept only as a record: the whole snippet is a triple-quoted string, so none of it
# executes; the cell merely evaluates (and displays) the string itself.
'''
MSZoning = df_cats_imputed.MSZoning.value_counts()
count = MSZoning.count()
MSZoning

mapping = {}
i = 0
j = MSZoning.count() - 1
# Create a dictionary of key:value value pairs where the value count of the most frequent has the highest encoded value
# featureCount - 1
for feature in MSZoning:
    mapping.update({MSZoning.index[i]:j})
    i += 1
    j -= 1
    
mapping
'''

In [None]:
# Encode the categorical data as numerical values based on value counts, where the most frequent feature values have the 
# largest encoded value such that machine learning algorithms interpret a feature category encoded to 8 to be given 8x more 
# weight than a category within the same feature encoded to 1
def encodeCategoricalDataByValueCounts(feature, data=None):
    """Build a {category: code} mapping for one column, ranked by frequency.

    The most frequent category receives the highest code (number of distinct
    categories minus one) and the least frequent receives 0.

    Parameters
    ----------
    feature : name of the column to encode.
    data : frame holding that column. When omitted, the notebook-level
        `df_cats_imputed` is used, which is what the function always did before.

    Returns
    -------
    dict mapping each distinct value of the column to its integer code.
    """
    if data is None:
        # Backward-compatible fallback to the global imputed frame.
        data = df_cats_imputed
    # value_counts() lists categories from most to least frequent.
    ranked_categories = data[feature].value_counts().index
    top_code = len(ranked_categories) - 1
    return {category: top_code - rank for rank, category in enumerate(ranked_categories)}

In [None]:
# Preview the imputed categorical frame before the labels are replaced with codes.
df_cats_imputed.head(10)

In [None]:
# Replace the labels of every column with their frequency-rank codes, one column at a time
for feature in df_cats_imputed.columns:
    mapping = encodeCategoricalDataByValueCounts(feature)
    encoded_column = df_cats_imputed[feature].map(mapping)
    df_cats_imputed[feature] = encoded_column

# Cell output: first rows of the now-numeric frame
df_cats_imputed.head()

In [None]:
# Candidate tree depths: 1 through 7
max_depth_range = range(1, 8)

# Cross-validated RMSE of a DecisionTreeRegressor on the encoded categorical features,
# one score per candidate depth, in the same order as max_depth_range
RMSE_scores = [
    executeModelCV(DecisionTreeRegressor(max_depth = depth, random_state = 1), df_cats_imputed[feature_cols])
    for depth in max_depth_range
]

In [None]:
# Draw tree depth (x-axis) against cross-validated RMSE (y-axis) on an explicit Axes
md = max_depth_range
fig, ax = plt.subplots()
ax.plot(md, RMSE_scores)
ax.set_xlabel('max_depth')
ax.set_ylabel('RMSE (lower is better)')

In [None]:
# Depth-6 tree on the encoded categorical features currently named in feature_cols.
treereg = DecisionTreeRegressor(max_depth = 6, random_state = 1)
# The DecisionTreeRegressor instance must be fitted to be able to call treereg.feature_importances_
treereg.fit(df_cats_imputed[feature_cols], y)
# Cell output: cross-validated RMSE for the same estimator settings and features.
executeModelCV(treereg, df_cats_imputed[feature_cols])

In [None]:
# "Gini importance" of each feature: the (normalized) total reduction of error 
# brought by that feature
# One row per entry of feature_cols, paired with the fitted tree's feature_importances_.
newdf = pd.DataFrame({'feature':feature_cols, 'importance':treereg.feature_importances_})
# Cell output: the table sorted from most to least important.
newdf.sort_values('importance', ascending = False)

In [None]:
# Based on output from model itself, with values over 0.018
# (experiment record only: this list is replaced by the assignment further down)
feature_cols = ['ExterQual', 'BsmtQual', 'Neighborhood', 'GarageFinish', 'KitchenQual', 'BsmtExposure', 'RoofMatl', 'BldgType',
                'SaleCondition']
# Results:
# Linear Regression 55570.426480754395
# Decision Tree 51178.67443758195
# Nearest Neighbors 48393.65693873129
# Random Forest 48739.97880164021
# AdaBoost 54129.10737023764

# Active list. 'ExterQual' used to appear twice here, which selected the same column
# twice; the repeat has been removed. The results recorded below were obtained with
# the repeated entry still present.
# NOTE(review): if the second 'ExterQual' was a typo for another column, add that column instead.
feature_cols = ['ExterQual', 'BsmtQual', 'Neighborhood', 'GarageFinish', 'KitchenQual', 'BsmtExposure', 'MSZoning',
               'CentralAir']
# Results
# Linear Regression 54290.36003260593
# Decision Tree 49684.99993616181
# Nearest Neighbors 48379.85671631011
# Random Forest 47040.59819859272
# AdaBoost 55463.76504232442

# iterate over regressors using new feature list
for modelName, model in zip(names, models):
    print(modelName, executeModelCV(model, df_cats_imputed[feature_cols]))

In [None]:
# Hold-out evaluation of a linear model on the selected encoded categorical features
X = df_cats_imputed[feature_cols]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 1)
linreg = LinearRegression().fit(X_train, y_train)
y_pred = linreg.predict(X_test)
# Cell output: RMSE on the held-out rows
np.sqrt(metrics.mean_squared_error(y_test, y_pred))

In [None]:
# The coef_ variable doesn't exist until the fit() method is invoked
# Cell output: one coefficient per column the model was fitted on.
linreg.coef_

In [None]:
# Coefficient table for the linear model fitted on the categorical features, ranked by
# absolute size. Stored under its own name so the notebook-level `df` is not overwritten
# by a small summary table.
cat_linreg_coefs = pd.DataFrame({"Feature": feature_cols, "coef":linreg.coef_})
cat_linreg_coefs['coefmagnitude'] = cat_linreg_coefs['coef'].abs()
# Cell output: largest-magnitude coefficients first.
cat_linreg_coefs.sort_values('coefmagnitude', ascending = False)

In [None]:
# Permutation importance of the already-fitted linreg, computed on the held-out split.
perm = PermutationImportance(linreg, random_state = 1).fit(X_test, y_test)
# Cell output: eli5's weight table, labelled with the X_test column names.
eli5.show_weights(perm, feature_names = X_test.columns.tolist())

In [None]:
# Based on output from model itself, but remove features with a negative coefficient:  'ExterQual', 'KitchenQual', 'GarageQual', 
# 'BsmtQual', 'LandSlope', 'RoofStyle', 'MasVnrType', 'GarageFinish', 'BsmtExposure', 'LotShape', 'Heating', 'SaleType', 
# 'RoofMatl', 'Exterior1st', 'HouseStyle', 'GarageType', 'BsmtFinType2', 'Neighborhood', 'BsmtFinType1'
# (experiment record only: this list is replaced by the assignment further down)
feature_cols = ['Street', 'Utilities', 'CentralAir', 'GarageCond', 'BldgType', 'PavedDrive', 'Electrical', 'MSZoning', 
                'Functional', 'LandContour', 'Foundation', 'HeatingQC', 'Condition2', 'ExterCond', 'SaleCondition', 'BsmtCond', 
                'Condition1', 'Exterior2nd', 'LotConfig']
# Results:
# Linear Regression 66955.98763433268
# Decision Tree 65237.218708631925
# Nearest Neighbors 66935.53467424955
# Random Forest 66180.04137507977
# AdaBoost 70924.95613788051

# Active list: all 38 categorical features that were kept for imputation.
feature_cols = ['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 
                'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 
                'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 
                'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'GarageType', 
                'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'SaleType', 'SaleCondition']
# Results:
# Linear Regression 51750.23690084067
# Decision Tree 55737.72912119848
# Nearest Neighbors 51738.37060932254
# Random Forest 57250.380129610916
# AdaBoost 52913.70551792658

# iterate over classifiers using new feature list
# (the entries of `models` are regressors; each line printed is name + cross-validated RMSE)
for modelName, model in zip(names, models):
    print(modelName, executeModelCV(model, df_cats_imputed[feature_cols]))

In [None]:
# Print the shapes of the numeric and the encoded categorical frames for comparison.
print(df_cont.shape)
print(df_cats_imputed.shape)

In [None]:
# Preview the numeric frame (it now also carries the derived TotalSF column).
df_cont.head()

In [None]:
# Put the numeric and the encoded categorical columns side by side, aligned on row labels
# and ordered like df_cont. The `join_axes` argument no longer exists in pd.concat
# (removed in pandas 1.0); reindexing the result on df_cont.index gives the same alignment.
# Note that this rebinds `df`, which until now held the raw CSV frame.
df = pd.concat([df_cont, df_cats_imputed], axis = 1, join = 'outer').reindex(df_cont.index)
df.head()

In [None]:
# (rows, columns) of whatever `df` currently refers to.
df.shape

In [None]:
# Final comparison on the combined frame: the selected numeric columns plus the selected
# encoded categorical columns.
# 'ExterQual' used to appear twice in the categorical part, which selected the same column
# twice; the repeat has been removed.
# NOTE(review): if the second 'ExterQual' was a typo for another column, add that column instead.
feature_cols = [# df_cont features
    'OverallQual', 'GrLivArea', 'TotalSF', 'TotRmsAbvGrd', 'YearBuilt', 'GarageCars', 'BedroomAbvGr', 'BsmtFinSF1', 
    'MSSubClass', 'BsmtFullBath', 'LotArea', 'Fireplaces', 'WoodDeckSF', 'FullBath', 'KitchenAbvGr', 'YearRemodAdd', 
    'OverallCond', 'ScreenPorch', 'GarageArea', 'BsmtFinSF2',
    # df_cats_imputed features
    'ExterQual', 'BsmtQual', 'Neighborhood', 'GarageFinish', 'KitchenQual', 'BsmtExposure', 'MSZoning', 
    'CentralAir']

# iterate over regressors using new feature list
for modelName, model in zip(names, models):
    print(modelName, executeModelCV(model, df[feature_cols]))