# Ames Housing Predictions

In this Notebook I made a model using XGBBoost introduced in https://www.kaggle.com/alexisbcook/xgboost, which results in top 40% accurate predictions.

In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import pandas as pd
import numpy as np

In [2]:
X = pd.read_csv('./ames_housing_train.csv', index_col='Id')
X_test = pd.read_csv('./ames_housing_test.csv', index_col='Id')

# we set our target to be the log of the SalePrice
X.SalePrice = np.log(X.SalePrice)

# drop the row with NaN in 'Electrical'
X = X[pd.notnull(X['Electrical'])]

# create target vector
y = X.SalePrice
X.dropna(axis=0, subset=['SalePrice'], inplace=True)
X.drop(['SalePrice'], axis=1, inplace=True)

print(X.shape, X_test.shape)

(1459, 79) (1459, 79)


In [3]:
#### What are the names of the columns with cardinality larger than 10?
high_cardinality_cols = [cname for cname in X.columns if X[cname].nunique() >= 10 and 
                        X[cname].dtype == "object"]
high_cardinality_cols.remove('Neighborhood')

# Columns like 'PoolQC', 'MiscFeature', 'Alley', 'GarageQual', 'GarageCars', will be removed
rem = ['PoolQC', 'MiscFeature', 'Alley', 'GarageCars', 'PoolArea', 'MiscVal']
remove = rem + high_cardinality_cols
print('these columns will be dropped', remove)

replace = ['Fence', 'FireplaceQu', 'GarageCond', 'GarageQual', 'GarageType', 'GarageFinish', 'BsmtCond', 'BsmtQual', 
         'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2']
print('these columns will be replaced', replace)

these columns will be dropped ['PoolQC', 'MiscFeature', 'Alley', 'GarageCars', 'PoolArea', 'MiscVal', 'Exterior1st', 'Exterior2nd']
these columns will be replaced ['Fence', 'FireplaceQu', 'GarageCond', 'GarageQual', 'GarageType', 'GarageFinish', 'BsmtCond', 'BsmtQual', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2']


In [4]:
def transform_dataset(data_X, to_rep, to_rem):
    
    # replace NaNs with No's
    for i in to_rep:
        data_X[i].fillna("No", inplace = True)
        
    # some features can be label encoded
    data_X.ExterQual.replace({"Ex":5, "Gd":4, "TA":3, "Fa":2, "Po":1})
    data_X.ExterCond.replace({"Ex":5, "Gd":4, "TA":3, "Fa":2, "Po":1})
    data_X.BsmtFinType1.replace({'GLQ':6, 'ALQ':5, 'BLQ':4, 'Rec':3, 'LwQ':2, 'Unf':1, 'No':0})
    data_X.BsmtFinType2.replace({'GLQ':6, 'ALQ':5, 'BLQ':4, 'Rec':3, 'LwQ':2, 'Unf':1, 'No':0})
    data_X.HeatingQC.replace({"Ex":5, "Gd":4, "TA":3, "Fa":2, "Po":1})
    data_X.KitchenQual.replace({"Ex":5, "Gd":4, "TA":3, "Fa":2, "Po":1})
    data_X.Functional.replace({"Typ":7, "Min1":6, "Min2":5, "Mod":4, "Maj1":3, "Maj2":2, "Sev":1, "Sal":0})
    data_X.FireplaceQu.replace({"Ex":5, "Gd":4, "TA":3, "Fa":2, "Po":1, "No":0})
    data_X.GarageCond.replace({"Ex":5, "Gd":4, "TA":3, "Fa":2, "Po":1, "No":0})
    data_X.GarageQual.replace({"Ex":5, "Gd":4, "TA":3, "Fa":2, "Po":1, "No":0})
    data_X.BsmtQual.replace({"Ex":5, "Gd":4, "TA":3, "Fa":2, "Po":1, "No":0})
    data_X.BsmtCond.replace({"Ex":5, "Gd":4, "TA":3, "Fa":2, "Po":1, "No":0})
    
    # fill the NaNs
    data_X['LotFrontage'].fillna((data_X['LotFrontage'].median()), inplace=True)

    LotArea_median = data_X['LotArea'].median()
    func = lambda x: x['LotArea'] > 50000 and LotArea_median or x['LotArea']
    data_X['LotArea'] = data_X.apply(func,axis=1).astype(float)
    
    LotFrontage_median = data_X['LotFrontage'].median()
    func = lambda x: x['LotFrontage'] > 300 and LotFrontage_median or x['LotFrontage']
    data_X['LotFrontage'] = data_X.apply(func,axis=1).astype(float)

    GrLivArea_median = data_X['GrLivArea'].median()
    func = lambda x: x['GrLivArea'] > 4000 and GrLivArea_median or x['GrLivArea']
    data_X['GrLivArea'] = data_X.apply(func,axis=1).astype(float)
    
    data_X['MasVnrType'].fillna("None", inplace = True)
    data_X['GarageYrBlt'].fillna(data_X['GarageYrBlt'].median(), inplace = True)
    data_X['MasVnrArea'].fillna(0, inplace = True)
    
    # drop the features we wanted to drop
    data_X = data_X.drop(to_rem, axis=1)
    
    # define new features
    data_X['Age'] = data_X['YrSold'] - data_X['YearBuilt']
    data_X['AgeRemodel'] = data_X['YrSold'] - data_X['YearRemodAdd']
    data_X['AgeGarage'] = data_X['YrSold'] - data_X['GarageYrBlt']
    
    # drop the features we used to create new ones
    data_X = data_X.drop(['YrSold', 'YearBuilt', 'YearRemodAdd', 'GarageYrBlt'], axis=1)
    
    return data_X

For GridSearchCV we can use the complete training set. Splitting it into '_train' and '_validation' is not necessary.

In [5]:
X_train = transform_dataset(X, replace, remove)
y_train = y
X_test = transform_dataset(X_test, replace, remove)

In [6]:
# One-hot encode the data (to shorten the code, we use pandas)
X_train = pd.get_dummies(X_train)
X_test = pd.get_dummies(X_test)

X_train, X_test = X_train.align(X_test, join='left', axis=1)

In [7]:
from xgboost import XGBRegressor

# the inital model
xg_reg = XGBRegressor()
xg_reg.fit(X_train, y_train)



XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, verbosity=1)

Since the submissions are evaluated on 'Root-Mean-Squared-Error (RMSE)' we need to introduce this loss function to train our model efficiently.

In [8]:
from sklearn.metrics import make_scorer
from sklearn.metrics import mean_squared_error

def my_custom_loss_func(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

rmse_scorer = make_scorer(my_custom_loss_func, greater_is_better = False)

In [9]:
from sklearn.model_selection import GridSearchCV
test_params = {'max_depth': range(3,5), 'min_child_weight':range(3,5), 'eta': [.3, .2, .35],
               'subsample':[1.0, 0.8, 0.9], 'colsample_bytree':[0.3, 0.4], 
               'n_estimators':[800, 1000, 1200]}

model = GridSearchCV(estimator = xg_reg, param_grid = test_params, cv = 5, n_jobs = 4, scoring = rmse_scorer)
model.fit(X_train,y_train)



GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=XGBRegressor(base_score=0.5, booster='gbtree',
                                    colsample_bylevel=1, colsample_bynode=1,
                                    colsample_bytree=1, gamma=0,
                                    importance_type='gain', learning_rate=0.1,
                                    max_delta_step=0, max_depth=3,
                                    min_child_weight=1, missing=None,
                                    n_estimators=100, n_jobs=1, nthread=None,
                                    objective='reg:linear', random_state=0,...
                                    subsample=1, verbosity=1),
             iid='warn', n_jobs=4,
             param_grid={'colsample_bytree': [0.3, 0.4],
                         'eta': [0.3, 0.2, 0.35], 'max_depth': range(3, 5),
                         'min_child_weight': range(3, 5),
                         'n_estimators': [800, 1000, 1200],
              

In [10]:
new_params = model.best_params_
print(new_params)

{'colsample_bytree': 0.3, 'eta': 0.3, 'max_depth': 3, 'min_child_weight': 3, 'n_estimators': 800, 'subsample': 0.8}


In [11]:
# here the model automatically uses the best parameters
preds_test = model.predict(X_test)

output = pd.DataFrame({'Id': X_test.index,
                       'SalePrice': np.exp(preds_test)})
output.to_csv('submission.csv', index=False)