In [13]:
from xgboost import XGBRegressor
import xgboost as xgb

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score
import seaborn as sns

In [14]:
X = pd.read_csv('./ames_housing_train.csv', index_col='Id')
# we set our target to be the log of the SalePrice
X.SalePrice = np.log(X.SalePrice)
# drop the row with NaN in 'Electrical'
X = X[pd.notnull(X['Electrical'])]
# create target vector
y = X.SalePrice
X.dropna(axis=0, subset=['SalePrice'], inplace=True)
X.drop(['SalePrice'], axis=1, inplace=True)


X_test = pd.read_csv('./ames_housing_test.csv', index_col='Id')

print(X.shape, X_test.shape)

(1459, 79) (1459, 79)


In [15]:
#### What are the names of the columns with cardinality larger than 10?
high_cardinality_cols = [cname for cname in X.columns if X[cname].nunique() >= 10 and 
                        X[cname].dtype == "object"]
high_cardinality_cols.remove('Neighborhood')


# Columns like 'PoolQC', 'MiscFeature', 'Alley', 'GarageQual', 'GarageCars', will be removed
rem = ['PoolQC', 'MiscFeature', 'Alley', 'GarageCars', 'Heating', 'PoolArea', 'MiscVal']
remove = rem + high_cardinality_cols
print('these columns will be dropped', remove)

replace = ['Fence', 'FireplaceQu', 'GarageCond', 'GarageQual', 'GarageType', 'GarageFinish', 'BsmtCond', 'BsmtQual', 
         'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2']

these columns will be dropped ['PoolQC', 'MiscFeature', 'Alley', 'GarageCars', 'Heating', 'PoolArea', 'MiscVal', 'Exterior1st', 'Exterior2nd']


Add ages instead of years, Encode Functional, remove poolarea, miscval

In [16]:
def transform_dataset(data_X, to_rep, to_rem):
    
    for i in to_rep:
        data_X[i].fillna("No", inplace = True)

    data_X.ExterQual.replace({"Ex":5, "Gd":4, "TA":3, "Fa":2, "Po":1})
    data_X.ExterCond.replace({"Ex":5, "Gd":4, "TA":3, "Fa":2, "Po":1})
    data_X.BsmtFinType1.replace({'GLQ':6, 'ALQ':5, 'BLQ':4, 'Rec':3, 'LwQ':2, 'Unf':1, 'No':0})
    data_X.BsmtFinType2.replace({'GLQ':6, 'ALQ':5, 'BLQ':4, 'Rec':3, 'LwQ':2, 'Unf':1, 'No':0})
    data_X.HeatingQC.replace({"Ex":5, "Gd":4, "TA":3, "Fa":2, "Po":1})
    data_X.KitchenQual.replace({"Ex":5, "Gd":4, "TA":3, "Fa":2, "Po":1})
    data_X.Functional.replace({"Typ":7, "Min1":6, "Min2":5, "Mod":4, "Maj1":3, "Maj2":2, "Sev":1, "Sal":0})
    data_X.FireplaceQu.replace({"Ex":5, "Gd":4, "TA":3, "Fa":2, "Po":1, "No":0})
    data_X.GarageCond.replace({"Ex":5, "Gd":4, "TA":3, "Fa":2, "Po":1, "No":0})
    data_X.GarageQual.replace({"Ex":5, "Gd":4, "TA":3, "Fa":2, "Po":1, "No":0})
    data_X.BsmtQual.replace({"Ex":5, "Gd":4, "TA":3, "Fa":2, "Po":1, "No":0})
    data_X.BsmtCond.replace({"Ex":5, "Gd":4, "TA":3, "Fa":2, "Po":1, "No":0})
    
    data_X['LotFrontage'].fillna((data_X['LotFrontage'].median()), inplace=True)

    LotArea_median = data_X['LotArea'].median()
    func = lambda x: x['LotArea'] > 50000 and LotArea_median or x['LotArea']
    data_X['LotArea'] = data_X.apply(func,axis=1).astype(float)
    
    LotFrontage_median = data_X['LotFrontage'].median()
    func = lambda x: x['LotFrontage'] > 300 and LotFrontage_median or x['LotFrontage']
    data_X['LotFrontage'] = data_X.apply(func,axis=1).astype(float)

    GrLivArea_median = data_X['GrLivArea'].median()
    func = lambda x: x['GrLivArea'] > 4000 and GrLivArea_median or x['GrLivArea']
    data_X['GrLivArea'] = data_X.apply(func,axis=1).astype(float)
    
    data_X['MasVnrType'].fillna("None", inplace = True)
    data_X['GarageYrBlt'].fillna(data_X['GarageYrBlt'].median(), inplace = True)
    data_X['MasVnrArea'].fillna(0, inplace = True)
    
    data_X = data_X.drop(to_rem, axis=1)
    
    data_X['Age'] = data_X['YrSold'] - data_X['YearBuilt']
    data_X['AgeRemodel'] = data_X['YrSold'] - data_X['YearRemodAdd']
    data_X['AgeGarage'] = data_X['YrSold'] - data_X['GarageYrBlt']
    
    data_X = data_X.drop(['YrSold', 'YearBuilt', 'YearRemodAdd', 'GarageYrBlt'], axis=1)
    
    return data_X

In [17]:
X = transform_dataset(X, replace, remove)
X_test = transform_dataset(X_test, replace, remove)

In [18]:
# break off validation set from training data
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.67, test_size=0.33, random_state=7)

In [19]:
# One-hot encode the data (to shorten the code, we use pandas)
X = pd.get_dummies(X)
X_train = pd.get_dummies(X_train)
X_test = pd.get_dummies(X_test)
X_valid = pd.get_dummies(X_valid)

X_train, X_valid = X_train.align(X_valid, join='left', axis=1)

## First attempt

We first use the XGBoost algorithm, introduced in https://www.kaggle.com/alexisbcook/xgboost.

In [20]:
first_model = XGBRegressor(n_estimators=1000, learning_rate=0.05)
first_model.fit(X_train, y_train, 
             early_stopping_rounds=5, 
             eval_set=[(X_valid, y_valid)], 
             verbose=False)
preds = first_model.predict(X_valid)
rmse = np.sqrt(mean_squared_error(y_valid, preds))
print("RMSE: %f" % (rmse))

RMSE: 0.140743


Now we can get more fancy and try to improve the error

In [21]:
# we define an initial set of parameters
params = {
    'max_depth':5,
    'objective':'reg:squarederror',
    'eval_metric':'rmse',
    'learning_rate':0.05,
    'n_jobs':4
}

# maximum number of boosting rounds we allow
num_boost_round = 2000

# number of rounds without improvements after which we should stop
early_stopping_rounds = 300

model = xgb.train(
    params,
    dtrain,
    num_boost_round = num_boost_round,
    evals = [(dvalid, 'SalePrice')],
    early_stopping_rounds = early_stopping_rounds
)

print("Best RMSE: {:.3f} with {} rounds".format(model.best_score, model.best_iteration+1))

We now introduce cross-validation to tune other parameters

grid_search_params = [
    (max_depth, min_child_weight)
    for max_depth in range(2,6)
    for min_child_weight in range(2,8)
]

# Define initial best params and rmse
min_rmse = float("Inf")
best_params = None
for max_depth, min_child_weight in grid_search_params:
    # print("CV with max_depth={}, min_child_weight={}".format(max_depth, min_child_weight))
    # update parameters
    params['max_depth'] = max_depth
    params['min_child_weight'] = min_child_weight
    # CV
    cv_results = xgb.cv(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        seed=42,
        nfold=5,
        metrics={'rmse'},
        early_stopping_rounds=early_stopping_rounds
    )
    # update best rmse
    mean_rmse = cv_results['test-rmse-mean'].min()
    boost_rounds = cv_results['test-rmse-mean'].argmin()
    #print("\tMAE {} for {} rounds".format(mean_rmse, boost_rounds))
    if mean_rmse < min_rmse:
        min_rmse = mean_rmse
        best_params = (max_depth,min_child_weight)
print("Best params: {}, {}, MAE: {}".format(best_params[0], best_params[1], min_rmse))

In [22]:
##### 
xg_reg = xgb.XGBRegressor(**params)
xg_reg.fit(X, y)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, eval_metric='rmse',
             gamma=0, importance_type='gain', learning_rate=0.05,
             max_delta_step=0, max_depth=5, min_child_weight=1, missing=None,
             n_estimators=100, n_jobs=4, nthread=None,
             objective='reg:squarederror', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
             subsample=1, verbosity=1)

In [23]:
from sklearn.metrics import make_scorer
def my_custom_loss_func(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

rmse_scorer = make_scorer(my_custom_loss_func, greater_is_better = False)

In [24]:
from sklearn.model_selection import GridSearchCV
test_params = {'max_depth': range(3,6), 'min_child_weight':range(3,6), 'eta': [.3, .2, .35],
               'subsample':[1.0, 0.8, 0.9], 'colsample_bytree':[0.3, 0.4], 
               'n_estimators':[1000, 1500]}

model = GridSearchCV(estimator = xg_reg, param_grid = test_params, cv = 5, n_jobs = 4, scoring = rmse_scorer)
model.fit(X_train,y_train)
print(model.best_params_)

KeyboardInterrupt: 

In [26]:
new_params = model.best_params_
print(new_params)

{'colsample_bytree': 0.4, 'eta': 0.3, 'max_depth': 5, 'min_child_weight': 3, 'subsample': 1.0, 'n_estimators': 1000}


In [None]:
preds = model.predict(X_valid)
rmse = np.sqrt(mean_squared_error(y_valid, preds))
print("RMSE: %f" % (rmse))

In [71]:
best_model.save_model("my_model.model")

loaded_model = xgb.Booster()
loaded_model.load_model("my_model.model")
preds_test = loaded_model.predict(dtest)

In [159]:
output = pd.DataFrame({'Id': X_test.index,
                       'SalePrice': np.exp(preds_test)})
output.to_csv('submission.csv', index=False)