In [92]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from category_encoders import MEstimateEncoder

# Read the two competition files from the working directory.
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

# Separate the target from the predictors; `Id` is dropped together with it.
y = train_df['SalePrice']
X = train_df.drop(columns=['SalePrice', 'Id'])

# Hold out 30% of the labelled rows; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)

First, load the data and split the training data into a train set and a test (validation) set.

In [93]:
# Partition the feature columns by dtype: `object` columns are treated as
# text, everything else as numeric.
is_text_col = X.dtypes == 'object'
num_cols = X.columns[~is_text_col].tolist()
str_cols = X.columns[is_text_col].tolist()

# Within each group, keep the columns that contain at least one missing value
# (original column order is preserved).
num_has_na = X[num_cols].isnull().any()
str_has_na = X[str_cols].isnull().any()
num_na_cols = num_has_na[num_has_na].index.tolist()
str_na_cols = str_has_na[str_has_na].index.tolist()

Find which variables in the training data have missing values, and record if the data is numeric or non-numeric.

In [94]:
# Learn per-column means from the training split only, then apply the same
# means to both splits.
num_na_fill = SimpleImputer(strategy='mean')
num_na_fill.fit(X_train[num_na_cols])

X_na_fill_train = num_na_fill.transform(X_train[num_na_cols])
X_na_fill_test = num_na_fill.transform(X_test[num_na_cols])

# Work on copies so the raw splits stay untouched.
X_fill_train, X_fill_test = X_train.copy(), X_test.copy()

# Write the imputed values back over the numeric columns that had gaps.
X_fill_train.loc[:, num_na_cols] = X_na_fill_train
X_fill_test.loc[:, num_na_cols] = X_na_fill_test

For the numeric variables, replace missing values with a mean over observed values.

In [95]:
# Replacement value for each text column with missing entries. The list is
# positional: entry i is the fill value for str_na_cols[i].
str_na_fill = ['NA','None','NA','NA','NA','NA','NA','Mix','NA','NA','NA','NA','NA','NA','NA','NA']

# zip() silently stops at the shorter input, which would leave columns
# unfilled (or hide a misalignment) if the data ever changes. Fail loudly.
if len(str_na_fill) != len(str_na_cols):
    raise ValueError(
        f"str_na_fill has {len(str_na_fill)} entries but str_na_cols has "
        f"{len(str_na_cols)} columns: {str_na_cols}"
    )

# Column name -> fill value, in the form DataFrame.fillna expects.
str_na_dict = dict(zip(str_na_cols, str_na_fill))

# Columns that are not keys of the mapping are left as they are.
X_fill_train = X_fill_train.fillna(value=str_na_dict)
X_fill_test = X_fill_test.fillna(value=str_na_dict)

For the non-numeric variables, replace missing values with an appropriate default value using the description.txt file.

In [96]:
# Each entry pairs an ordered category scale (lowest level first) with the
# columns that use that scale. The position of a level in its list becomes
# the encoded value.
ordinal_specs = [
    (['Po', 'Fa', 'TA', 'Gd', 'Ex'],
     ['ExterQual','ExterCond','HeatingQC','KitchenQual']),
    (['NA','Po', 'Fa', 'TA', 'Gd', 'Ex'],
     ['BsmtQual','BsmtCond','FireplaceQu','GarageQual','GarageCond','PoolQC']),
    (['NA','No','Mn','Av','Gd'],
     ['BsmtExposure']),
    (['NA','Unf','LwQ','Rec','BLQ','ALQ','GLQ'],
     ['BsmtFinType1','BsmtFinType2']),
    (['Sal','Sev','Maj2','Maj1','Mod','Min2','Min1','Typ'],
     ['Functional']),
    (['NA','MnWw','GdWo','MnPrv','GdPrv'],
     ['Fence']),
]

# Flat list of every column encoded here; later cells use it to decide which
# text columns still need a different encoding.
ord_encoded_col = []

for levels, cols in ordinal_specs:
    # One encoder per scale, given the same category list for each column.
    encoder = OrdinalEncoder(categories=[levels] * len(cols))
    # Fit on the training frame only; reuse the fitted mapping on the test frame.
    X_fill_train[cols] = encoder.fit_transform(X_fill_train[cols])
    X_fill_test[cols] = encoder.transform(X_fill_test[cols])
    ord_encoded_col.extend(cols)

For categorical variables with some natural order, transform as ordinal.

In [97]:
# Text columns that were not ordinally encoded are target-encoded instead.
target_encode_cols = [col for col in str_cols if col not in ord_encoded_col]

# `cols` takes the list of column names to encode, not a DataFrame slice.
target_encoder = MEstimateEncoder(cols=target_encode_cols)

# Fit against the training target only, then apply the learned mapping to the
# held-out frame.
X_train_final = target_encoder.fit_transform(X_fill_train, y_train)

X_test_final = target_encoder.transform(X_fill_test)

For the rest of the categorical variables, use target encoding.

In [112]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

# Candidate values for each hyperparameter; the search tries every combination.
parameters = {
    'n_estimators': [100, 250, 500],
    'learning_rate': [0.04, 0.05],
    'max_depth': [7, 8],
    'subsample': [0.25, 0.5],
}

# Base estimator with library defaults; the grid overrides the listed settings.
xgb_test = xgb.XGBRegressor()

param_search = GridSearchCV(estimator=xgb_test, param_grid=parameters)
param_search.fit(X_train_final, y_train)

print(param_search.best_params_)

{'learning_rate': 0.04, 'max_depth': 7, 'n_estimators': 500, 'subsample': 0.5}


Run a grid search to find the best hyperparameter settings for XGBoost. If we had also been selecting which features to feed into the model, we would have needed to set aside a separate subset of the training data just for tuning XGBoost; in this instance, because we simply use all of the covariates, this additional partition is not needed.

In [115]:
# Refit with the settings printed by the grid search.
xgbr = xgb.XGBRegressor(
    learning_rate=0.04,
    max_depth=7,
    n_estimators=500,
    subsample=0.5,
)
xgbr.fit(X_train_final, y_train)

# Despite the name, these are predictions for the held-out split.
train_pred = xgbr.predict(X_test_final)

# Mean absolute error on the held-out split.
np.abs(train_pred - y_test).mean()

14866.092305222603

Perform a sanity check to see whether the fitted model produces reasonable results. The mean absolute error is around $15,000 on the test (validation) data set.

In [129]:
# Reload both files so this stage starts from the raw data.
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

# This time the whole labelled file is the training set and the competition
# file is the test set.
X_train = train_df.drop(columns=['SalePrice', 'Id'])
y_train = train_df['SalePrice']
X_test = test_df.drop(columns=['Id'])

# Partition the training columns by dtype: `object` columns are treated as
# text, everything else as numeric.
is_text_col = X_train.dtypes == 'object'
num_cols = X_train.columns[~is_text_col].tolist()
str_cols = X_train.columns[is_text_col].tolist()

# Columns with at least one missing value in the TRAINING data. Test-only
# gaps are not detected here.
num_has_na = X_train[num_cols].isnull().any()
str_has_na = X_train[str_cols].isnull().any()
num_na_cols = num_has_na[num_has_na].index.tolist()
str_na_cols = str_has_na[str_has_na].index.tolist()

# Numeric gaps: learn column means from the training data, apply to both frames.
num_na_fill = SimpleImputer(strategy='mean')

X_na_fill_train = num_na_fill.fit_transform(X_train[num_na_cols])
X_na_fill_test = num_na_fill.transform(X_test[num_na_cols])

# Work on copies so the raw frames stay untouched.
X_fill_train = X_train.copy()
X_fill_test = X_test.copy()

X_fill_train.loc[:, num_na_cols] = X_na_fill_train
X_fill_test.loc[:, num_na_cols] = X_na_fill_test

# Text gaps: positional list, entry i is the fill value for str_na_cols[i].
str_na_fill = ['NA','None','NA','NA','NA','NA','NA','Mix','NA','NA','NA','NA','NA','NA','NA','NA']

# zip() silently stops at the shorter input, which would leave columns
# unfilled (or hide a misalignment) if the data ever changes. Fail loudly.
if len(str_na_fill) != len(str_na_cols):
    raise ValueError(
        f"str_na_fill has {len(str_na_fill)} entries but str_na_cols has "
        f"{len(str_na_cols)} columns: {str_na_cols}"
    )

str_na_dict = dict(zip(str_na_cols, str_na_fill))

# Columns that are not keys of the mapping are left as they are.
X_fill_train = X_fill_train.fillna(value=str_na_dict)
X_fill_test = X_fill_test.fillna(value=str_na_dict)

Now that the parameters have been chosen, use those same parameters to train the model on the entirety of the training data.

However, we notice that applying the same pre-processing steps to the test data still leaves some missing values that we did not previously encounter. Also, the pre-processing steps could be written as a function so that we do not repeat code so often.

In [130]:
# True if any cell of the test frame is still missing.
X_fill_test.isna().to_numpy().any()

True

In [131]:
# Re-scan the partially filled test frame: `object` columns are text,
# everything else is numeric.
is_text_col = X_fill_test.dtypes == 'object'
num_cols = X_fill_test.columns[~is_text_col].tolist()
str_cols = X_fill_test.columns[is_text_col].tolist()

# Columns that still have gaps in the test frame after the earlier fills.
test_has_na = X_fill_test.isnull().any()
num_na_cols = [col for col in num_cols if test_has_na[col]]
str_na_cols = [col for col in str_cols if test_has_na[col]]

# Fit the imputers on the TRAINING frame and only transform the test frame,
# matching how the earlier numeric imputer is used. The fill values then do
# not depend on the test data itself. Skip a step when there is nothing to
# fill, because the imputer cannot be fitted on zero columns.
if num_na_cols:
    num_fill = SimpleImputer(strategy='mean')
    num_fill.fit(X_fill_train[num_na_cols])
    X_na_num_fill_test = num_fill.transform(X_fill_test[num_na_cols])
    X_fill_test[num_na_cols] = X_na_num_fill_test

if str_na_cols:
    str_fill = SimpleImputer(strategy='most_frequent')
    str_fill.fit(X_fill_train[str_na_cols])
    X_na_str_fill_test = str_fill.transform(X_fill_test[str_na_cols])
    X_fill_test[str_na_cols] = X_na_str_fill_test

Fill the missing values with a mean of the variable if it is numeric and the most frequent value if the variable is non-numeric.

Now the test data set no longer has any missing values.

In [133]:
# Re-check: True if any cell of the test frame is still missing.
X_fill_test.isna().to_numpy().any()

False

In [134]:
# Each entry pairs an ordered category scale (lowest level first) with the
# columns that use that scale. The position of a level in its list becomes
# the encoded value.
ordinal_specs = [
    (['Po', 'Fa', 'TA', 'Gd', 'Ex'],
     ['ExterQual','ExterCond','HeatingQC','KitchenQual']),
    (['NA','Po', 'Fa', 'TA', 'Gd', 'Ex'],
     ['BsmtQual','BsmtCond','FireplaceQu','GarageQual','GarageCond','PoolQC']),
    (['NA','No','Mn','Av','Gd'],
     ['BsmtExposure']),
    (['NA','Unf','LwQ','Rec','BLQ','ALQ','GLQ'],
     ['BsmtFinType1','BsmtFinType2']),
    (['Sal','Sev','Maj2','Maj1','Mod','Min2','Min1','Typ'],
     ['Functional']),
    (['NA','MnWw','GdWo','MnPrv','GdPrv'],
     ['Fence']),
]

# Flat list of every column encoded here; the next cell uses it to decide
# which text columns still need a different encoding.
ord_encoded_col = []

for levels, cols in ordinal_specs:
    # One encoder per scale, given the same category list for each column.
    encoder = OrdinalEncoder(categories=[levels] * len(cols))
    # Fit on the training frame only; reuse the fitted mapping on the test frame.
    X_fill_train[cols] = encoder.fit_transform(X_fill_train[cols])
    X_fill_test[cols] = encoder.transform(X_fill_test[cols])
    ord_encoded_col.extend(cols)

In [135]:
# Text columns that were not ordinally encoded are target-encoded instead.
target_encode_cols = [col for col in str_cols if col not in ord_encoded_col]

# `cols` takes the list of column names to encode, not a DataFrame slice.
target_encoder = MEstimateEncoder(cols=target_encode_cols)

# Fit against the training target, then apply the learned mapping to the
# test frame.
X_train_final = target_encoder.fit_transform(X_fill_train, y_train)

X_test_final = target_encoder.transform(X_fill_test)

In [144]:
# Same hyperparameters as the validation run, now fitted on the full
# training frame.
xgbr_final = xgb.XGBRegressor(
    learning_rate=0.04,
    max_depth=7,
    n_estimators=500,
    subsample=0.5,
)
xgbr_final.fit(X_train_final, y_train)

# Predictions for the encoded test frame.
final_pred = xgbr_final.predict(X_test_final)

In [145]:
# Pair each test-row Id with its predicted price and write the result
# without the index column.
final_pred_df = pd.DataFrame(
    {
        'Id': test_df['Id'],
        'SalePrice': final_pred,
    }
)
final_pred_df.to_csv('submission.csv', index=False)