In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read the data
X_full = pd.read_csv('train.csv', index_col='Id')
X_test_full = pd.read_csv('test.csv', index_col='Id')

# Remove rows with missing target, separate target from predictors
X_full.dropna(axis=0, subset=['SalePrice'], inplace=True)
y = X_full.SalePrice
X_full.drop(['SalePrice'], axis=1, inplace=True)

categorical_cols = [cname for cname in X_full.columns if X_full[cname].dtype == "object"]
numerical_cols = [cname for cname in X_full.columns if X_full[cname].dtype in ['int64', 'float64']]

In [3]:
# Shape
X_full.shape

(1460, 79)

In [4]:
# Missing values, categorical
mis_cat = X_full[categorical_cols].isnull().sum()
mis_cat[mis_cat > 0]

Alley           1369
MasVnrType         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64

In [5]:
# Missing values, continueous
mis_num = X_full[numerical_cols].isnull().sum()
mis_num[mis_num > 0]

LotFrontage    259
MasVnrArea       8
GarageYrBlt     81
dtype: int64

# Treatment of missing features

- Missing categorical features (except MiscFeature, Electrical) seem to be caused by simply not having that part of the house, in this case the 'NaN' for feature X can be interpreted as 'does not have feature X in the house' and so deserves a separate category. So it makes sense to impute the NaNs with a new category.
- MiscFeature: Is mostly empty, however 49 of the values are Shed, prompting me to create a new feature HasShed out of this.
- Electrical: Impute by most common category (SBrkr)

- Missing continuous features, we use median for LotFrontage, 0 for MasVnrArea and the median for GarageYrBlt

In [6]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# We create an explicit for feature for 'HasShed' from 'MiscFeature'
X_full['Has_shed'] = np.where(X_full['MiscFeature']=='Shed',1,0)
X_full_final = X_full.drop('MiscFeature',axis=1)

# We need a bunch of transformers based on the above
# Preprocessing for numerical data
numerical_transformer_median = SimpleImputer(strategy='median') 

numerical_transformer_constant = SimpleImputer(strategy='constant',fill_value=0) 

# Preprocessing for categorical data
categorical_transformer_constant = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='NotPresentInTheHouse')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

categorical_transformer_mode = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

categorical_transformer_other = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])


# Column categories
num_col_median = ['LotFrontage','GarageYrBlt']

num_col_constant = ['MasVnrArea']

cat_col_constant = (['Alley',
                         'MasVnrType',
                         'BsmtQual',
                         'BsmtCond',
                         'BsmtExposure',
                         'BsmtFinType1',
                         'BsmtFinType2',
                         'FireplaceQu',
                         'GarageType',
                         'GarageFinish',
                         'GarageQual',
                         'GarageCond',
                         'PoolQC',
                         'Fence'])

cat_col_mode = ['Electrical']

cat_col_other = (['MSZoning',
             'Street',
             'LotShape',
             'LandContour',
             'Utilities',
             'LotConfig',
             'LandSlope',
             'Neighborhood',
             'Condition1',
             'Condition2',
             'BldgType',
             'HouseStyle',
             'RoofStyle',
             'RoofMatl',
             'Exterior1st',
             'Exterior2nd',
             'ExterQual',
             'ExterCond',
             'Foundation',
             'Heating',
             'HeatingQC',
             'CentralAir',
             'KitchenQual',
             'Functional',
             'PavedDrive',
             'SaleType',
             'SaleCondition'])

In [7]:
# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num_median', numerical_transformer_median, num_col_median),
        ('num_constant', numerical_transformer_constant, num_col_constant),
        ('cat_constant', categorical_transformer_constant, cat_col_constant),
        ('cat_mode', categorical_transformer_mode, cat_col_mode),
        ('cat_other', categorical_transformer_other, cat_col_other),
    ], remainder = 'passthrough' )

In [9]:
from sklearn.model_selection import GridSearchCV

rfr = RandomForestRegressor(random_state=0)

# param grid
param_grid = {
    'n_estimators': [100,200,500],#[2,4,8,16,32,64,128],
    'max_depth' : [2,3,4,5],#,8,10],#[20,10,5], #[200,100,50,20,10],
    'criterion' : ['mae'],
    'min_samples_split' : [2],#,4,8,16], #[0.1,0.3,0.5,0.75,1.0],
    'min_samples_leaf' : [1],#,2,4,8], #np.linspace(0.1,0.5,num=5),
    'max_features' : [0.2,0.4,0.6,0.8,1.0]
}

gscv = GridSearchCV( estimator = rfr, param_grid = param_grid, scoring = 'neg_mean_absolute_error', cv = 5, n_jobs = 10, verbose = 10 )

# Bundle preprocessing and modeling code in a pipeline
my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('gscv', gscv)
                             ])

my_pipeline.fit(X_full_final,y);

Fitting 5 folds for each of 60 candidates, totalling 300 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   5 tasks      | elapsed:   11.5s
[Parallel(n_jobs=10)]: Done  12 tasks      | elapsed:   38.1s
[Parallel(n_jobs=10)]: Done  21 tasks      | elapsed:  1.1min
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:  1.8min
[Parallel(n_jobs=10)]: Done  41 tasks      | elapsed:  3.1min
[Parallel(n_jobs=10)]: Done  52 tasks      | elapsed:  4.6min
[Parallel(n_jobs=10)]: Done  65 tasks      | elapsed:  7.0min
[Parallel(n_jobs=10)]: Done  78 tasks      | elapsed:  7.8min
[Parallel(n_jobs=10)]: Done  93 tasks      | elapsed:  9.8min
[Parallel(n_jobs=10)]: Done 108 tasks      | elapsed: 11.7min
[Parallel(n_jobs=10)]: Done 125 tasks      | elapsed: 14.2min
[Parallel(n_jobs=10)]: Done 142 tasks      | elapsed: 17.7min
[Parallel(n_jobs=10)]: Done 161 tasks      | elapsed: 19.7min
[Parallel(n_jobs=10)]: Done 180 tasks      | elapsed: 22.7min
[Parallel(n_jobs=10)]: Done 201 tasks      | elapsed: 2

In [10]:
gscv.best_score_

-19645.382488013696

In [11]:
gscv.best_params_

{'criterion': 'mae',
 'max_depth': 5,
 'max_features': 0.4,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 200}