In [12]:
from xgboost import XGBRegressor
import xgboost as xgb

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score
import seaborn as sns

In [13]:
X = pd.read_csv('./ames_housing_train.csv', index_col='Id')
# we set our target to be the log of the SalePrice
X.SalePrice = np.log(X.SalePrice)
# drop the row with NaN in 'Electrical'
X = X[pd.notnull(X['Electrical'])]
# create target vector
y = X.SalePrice
X.dropna(axis=0, subset=['SalePrice'], inplace=True)
X.drop(['SalePrice'], axis=1, inplace=True)


X_test = pd.read_csv('./ames_housing_test.csv', index_col='Id')

print(X.shape, X_test.shape)

(1459, 79) (1459, 79)


In [14]:
#### What are the names of the columns with cardinality larger than 10?
high_cardinality_cols = [cname for cname in X.columns if X[cname].nunique() >= 10 and 
                        X[cname].dtype == "object"]
high_cardinality_cols.remove('Neighborhood')


# Columns like 'PoolQC', 'MiscFeature', 'Alley', 'GarageQual', 'GarageCars', 'GarageYrBlt', will be removed
rem = ['PoolQC', 'MiscFeature', 'Alley', 'GarageCars', 'GarageYrBlt', 'Heating']
remove = rem + high_cardinality_cols
print('these columns will be dropped', remove)

replace = ['Fence', 'FireplaceQu', 'GarageCond', 'GarageQual', 'GarageType', 'GarageFinish', 'BsmtCond', 'BsmtQual', 
         'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2']

these columns will be dropped ['PoolQC', 'MiscFeature', 'Alley', 'GarageCars', 'GarageYrBlt', 'Heating', 'Exterior1st', 'Exterior2nd']


In [15]:
def transform_dataset(data_X, to_rep, to_rem):
    
    for i in to_rep:
        data_X[i].fillna("No", inplace = True)

    data_X.ExterQual.replace({"Ex":5, "Gd":4, "TA":3, "Fa":2, "Po":1})
    data_X.ExterCond.replace({"Ex":5, "Gd":4, "TA":3, "Fa":2, "Po":1})
    data_X.BsmtFinType1.replace({'GLQ':6, 'ALQ':5, 'BLQ':4, 'Rec':3, 'LwQ':2, 'Unf':1, 'No':0})
    data_X.BsmtFinType2.replace({'GLQ':6, 'ALQ':5, 'BLQ':4, 'Rec':3, 'LwQ':2, 'Unf':1, 'No':0})
    data_X.HeatingQC.replace({"Ex":5, "Gd":4, "TA":3, "Fa":2, "Po":1})
    data_X.KitchenQual.replace({"Ex":5, "Gd":4, "TA":3, "Fa":2, "Po":1})

    data_X.FireplaceQu.replace({"Ex":5, "Gd":4, "TA":3, "Fa":2, "Po":1, "No":0})
    data_X.GarageCond.replace({"Ex":5, "Gd":4, "TA":3, "Fa":2, "Po":1, "No":0})
    data_X.GarageQual.replace({"Ex":5, "Gd":4, "TA":3, "Fa":2, "Po":1, "No":0})
    data_X.BsmtQual.replace({"Ex":5, "Gd":4, "TA":3, "Fa":2, "Po":1, "No":0})
    data_X.BsmtCond.replace({"Ex":5, "Gd":4, "TA":3, "Fa":2, "Po":1, "No":0})
    
    data_X['LotFrontage'].fillna((data_X['LotFrontage'].median()), inplace=True)

    LotArea_median = data_X['LotArea'].median()
    func = lambda x: x['LotArea'] > 50000 and LotArea_median or x['LotArea']
    data_X['LotArea'] = data_X.apply(func,axis=1).astype(float)
    
    LotFrontage_median = data_X['LotFrontage'].median()
    func = lambda x: x['LotFrontage'] > 300 and LotFrontage_median or x['LotFrontage']
    data_X['LotFrontage'] = data_X.apply(func,axis=1).astype(float)

    GrLivArea_median = data_X['GrLivArea'].median()
    func = lambda x: x['GrLivArea'] > 4000 and GrLivArea_median or x['GrLivArea']
    data_X['GrLivArea'] = data_X.apply(func,axis=1).astype(float)
    
    data_X['MasVnrType'].fillna("None", inplace = True)
    data_X['GarageYrBlt'].fillna(data_X['GarageYrBlt'].median(), inplace = True)
    data_X['MasVnrArea'].fillna(0, inplace = True)
    
    data_X = data_X.drop(to_rem, axis=1)
    
    return data_X

In [16]:
X = transform_dataset(X, replace, remove)
X_test = transform_dataset(X_test, replace, remove)

In [69]:
# break off validation set from training data
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.67, test_size=0.33, random_state=7)

In [80]:
# One-hot encode the data (to shorten the code, we use pandas)
X = pd.get_dummies(X)
X_train = pd.get_dummies(X_train)
X_test = pd.get_dummies(X_test)
X_valid = pd.get_dummies(X_valid)

X_train, X_valid = X_train.align(X_valid, join='left', axis=1)
X_train, X_test = X_train.align(X_test, join='left', axis=1)
#X_train_copy = X_train.copy()
#X_valid_copy = X_valid.copy()

## First attempt

We first use the XGBoost algorithm, introduced in https://www.kaggle.com/alexisbcook/xgboost.

In [168]:
my_model = XGBRegressor(n_estimators=1000, learning_rate=0.05)
my_model.fit(X_train, y_train, 
             early_stopping_rounds=5, 
             eval_set=[(X_valid, y_valid)], 
             verbose=False)
preds = my_model.predict(X_valid)
rmse = np.sqrt(mean_squared_error(y_valid, preds))
print("RMSE: %f" % (rmse))

RMSE: 0.141253


Now we can get more fancy and try to improve the error

In [169]:
# load the data into the matrices
dtrain = xgb.DMatrix(X_train, label=y_train)
dvalid = xgb.DMatrix(X_valid, label=y_valid)
dtest = xgb.DMatrix(X_test)

In [132]:
# we define an initial set of parameters
params = {
    'max_depth':5,
    #'min_child_weight': 1,
    #'subsample': 1,
    #'colsample_bytree': 1,
    'objective':'reg:squarederror',
    'eval_metric':'rmse',
    'learning_rate':0.05,
    'n_jobs':4
}
# maximum number of boosting rounds we allow
num_boost_round = 2000
# number of rounds without improvements after which we should stop
early_stopping_rounds = 10

In [133]:
model = xgb.train(
    params,
    dtrain,
    num_boost_round = num_boost_round,
    evals = [(dvalid, 'SalePrice')],
    early_stopping_rounds = early_stopping_rounds
)

#print("Best RMSE: {:.3f} with {} rounds".format(model.best_score, model.best_iteration+1))

[0]	SalePrice-rmse:10.9579
Will train until SalePrice-rmse hasn't improved in 10 rounds.
[1]	SalePrice-rmse:10.4115
[2]	SalePrice-rmse:9.89241
[3]	SalePrice-rmse:9.3993
[4]	SalePrice-rmse:8.93087
[5]	SalePrice-rmse:8.48587
[6]	SalePrice-rmse:8.06316
[7]	SalePrice-rmse:7.66123
[8]	SalePrice-rmse:7.27939
[9]	SalePrice-rmse:6.91662
[10]	SalePrice-rmse:6.57236
[11]	SalePrice-rmse:6.24556
[12]	SalePrice-rmse:5.93449
[13]	SalePrice-rmse:5.63966
[14]	SalePrice-rmse:5.3589
[15]	SalePrice-rmse:5.0925
[16]	SalePrice-rmse:4.83974
[17]	SalePrice-rmse:4.59903
[18]	SalePrice-rmse:4.3706
[19]	SalePrice-rmse:4.15385
[20]	SalePrice-rmse:3.9469
[21]	SalePrice-rmse:3.75112
[22]	SalePrice-rmse:3.56512
[23]	SalePrice-rmse:3.38844
[24]	SalePrice-rmse:3.2202
[25]	SalePrice-rmse:3.05998
[26]	SalePrice-rmse:2.90779
[27]	SalePrice-rmse:2.76393
[28]	SalePrice-rmse:2.62731
[29]	SalePrice-rmse:2.49753
[30]	SalePrice-rmse:2.37426
[31]	SalePrice-rmse:2.25644
[32]	SalePrice-rmse:2.14476
[33]	SalePrice-rmse:2.03916
[3

[278]	SalePrice-rmse:0.144415
[279]	SalePrice-rmse:0.144396
[280]	SalePrice-rmse:0.144394
[281]	SalePrice-rmse:0.144373
[282]	SalePrice-rmse:0.144335
[283]	SalePrice-rmse:0.144291
[284]	SalePrice-rmse:0.144252
[285]	SalePrice-rmse:0.144233
[286]	SalePrice-rmse:0.144238
[287]	SalePrice-rmse:0.144205
[288]	SalePrice-rmse:0.144148
[289]	SalePrice-rmse:0.144192
[290]	SalePrice-rmse:0.144215
[291]	SalePrice-rmse:0.144192
[292]	SalePrice-rmse:0.144175
[293]	SalePrice-rmse:0.144134
[294]	SalePrice-rmse:0.144122
[295]	SalePrice-rmse:0.144144
[296]	SalePrice-rmse:0.144136
[297]	SalePrice-rmse:0.144142
[298]	SalePrice-rmse:0.144152
[299]	SalePrice-rmse:0.144114
[300]	SalePrice-rmse:0.144088
[301]	SalePrice-rmse:0.144062
[302]	SalePrice-rmse:0.144089
[303]	SalePrice-rmse:0.144112
[304]	SalePrice-rmse:0.144121
[305]	SalePrice-rmse:0.144096
[306]	SalePrice-rmse:0.144094
[307]	SalePrice-rmse:0.144107
[308]	SalePrice-rmse:0.144085
[309]	SalePrice-rmse:0.144078
[310]	SalePrice-rmse:0.144072
[311]	Sale

We now introduce cross-validation to tune other parameters

In [141]:
cv_results = xgb.cv(
    params,
    # in cv we use all available data
    dmatrix,
    num_boost_round=num_boost_round,
    seed=42,
    nfold=5,
    metrics={'rmse'},
    early_stopping_rounds=early_stopping_rounds
)
cv_results.head()

Unnamed: 0,train-rmse-mean,train-rmse-std,test-rmse-mean,test-rmse-std
0,10.955624,0.003084,10.955615,0.013119
1,10.409056,0.002943,10.409049,0.013289
2,9.889837,0.002809,9.889827,0.013456
3,9.396594,0.002681,9.396585,0.013614
4,8.928036,0.002561,8.928028,0.013767


In [142]:
cv_results['test-rmse-mean'].min()

0.13557740000000001

In [123]:
grid_search_params = [
    (max_depth, min_child_weight)
    for max_depth in range(2,15)
    for min_child_weight in range(2,10)
]

# Define initial best params and rmse
min_rmse = float("Inf")
best_params = None
for max_depth, min_child_weight in grid_search_params:
    #print("CV with max_depth={}, min_child_weight={}".format(max_depth, min_child_weight))
    # update parameters
    params['max_depth'] = max_depth
    params['min_child_weight'] = min_child_weight
    # CV
    cv_results = xgb.cv(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        seed=42,
        nfold=5,
        metrics={'rmse'},
        early_stopping_rounds=early_stopping_rounds
    )
    # update best rmse
    mean_rmse = cv_results['test-rmse-mean'].min()
    boost_rounds = cv_results['test-rmse-mean'].argmin()
    #print("\tMAE {} for {} rounds".format(mean_rmse, boost_rounds))
    if mean_rmse < min_rmse:
        min_rmse = mean_rmse
        best_params = (max_depth,min_child_weight)
print("Best params: {}, {}, MAE: {}".format(best_params[0], best_params[1], min_rmse))

Best params: 3, 3, MAE: 0.12076120000000001


In [167]:
xg_reg = xgb.XGBRegressor(**params)
print(params)
xg_reg.fit(X_train, y_train)

{'max_depth': 5, 'objective': 'reg:squarederror', 'eval_metric': 'rmse', 'learning_rate': 0.05, 'n_jobs': 4}


XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, eval_metric='rmse',
             gamma=0, importance_type='gain', learning_rate=0.05,
             max_delta_step=0, max_depth=5, min_child_weight=1, missing=None,
             n_estimators=100, n_jobs=4, nthread=None,
             objective='reg:squarederror', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
             subsample=1, verbosity=1)

In [147]:
from sklearn.model_selection import GridSearchCV
test_params = {'max_depth': range(2,5), 'min_child_weight':range(2,5), 'eta': [.3, .2, .1, .05, .01],
              'subsample':[0.3, 0.5, 0.7, 0.9], 'colsample_bytree':[0.3, 0.5, 0.7, 0.9]}

model = GridSearchCV(estimator = xg_reg, param_grid = test_params, cv = 5, n_jobs = 4)
model.fit(X_train,y_train)
print(model.best_params_)

{'colsample_bytree': 0.3, 'eta': 0.3, 'max_depth': 4, 'min_child_weight': 4, 'subsample': 0.9}


In [149]:
new_params = {'colsample_bytree': 0.3, 'eta': 0.3, 'max_depth': 4, 'min_child_weight': 4, 'subsample': 0.9, 
              'eval_metric': 'rmse', 'objective': 'reg:squarederror', 'n_jobs':4}

In [164]:
best_model = xgb.train(
    new_params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=[(dvalid, 'SalePrice')],
    early_stopping_rounds=10
)

[0]	SalePrice-rmse:8.08277
Will train until SalePrice-rmse hasn't improved in 10 rounds.
[1]	SalePrice-rmse:5.66878
[2]	SalePrice-rmse:3.97844
[3]	SalePrice-rmse:2.79965
[4]	SalePrice-rmse:1.97609
[5]	SalePrice-rmse:1.39892
[6]	SalePrice-rmse:0.996055
[7]	SalePrice-rmse:0.718739
[8]	SalePrice-rmse:0.526113
[9]	SalePrice-rmse:0.398425
[10]	SalePrice-rmse:0.312468
[11]	SalePrice-rmse:0.253592
[12]	SalePrice-rmse:0.219841
[13]	SalePrice-rmse:0.200648
[14]	SalePrice-rmse:0.188358
[15]	SalePrice-rmse:0.181899
[16]	SalePrice-rmse:0.17706
[17]	SalePrice-rmse:0.174024
[18]	SalePrice-rmse:0.172705
[19]	SalePrice-rmse:0.170325
[20]	SalePrice-rmse:0.169362
[21]	SalePrice-rmse:0.167665
[22]	SalePrice-rmse:0.166161
[23]	SalePrice-rmse:0.165289
[24]	SalePrice-rmse:0.163265
[25]	SalePrice-rmse:0.162085
[26]	SalePrice-rmse:0.161942


Exception ignored in: <function Booster.__del__ at 0x1a197ff1e0>
Traceback (most recent call last):
  File "/Users/iuliiaskobleva/anaconda3/lib/python3.7/site-packages/xgboost/core.py", line 957, in __del__
    if self.handle is not None:
AttributeError: 'Booster' object has no attribute 'handle'


[27]	SalePrice-rmse:0.161849
[28]	SalePrice-rmse:0.161858
[29]	SalePrice-rmse:0.161109
[30]	SalePrice-rmse:0.160359
[31]	SalePrice-rmse:0.159947
[32]	SalePrice-rmse:0.158869
[33]	SalePrice-rmse:0.158676
[34]	SalePrice-rmse:0.158418
[35]	SalePrice-rmse:0.158631
[36]	SalePrice-rmse:0.15872
[37]	SalePrice-rmse:0.158394
[38]	SalePrice-rmse:0.158315
[39]	SalePrice-rmse:0.158065
[40]	SalePrice-rmse:0.157195
[41]	SalePrice-rmse:0.157131
[42]	SalePrice-rmse:0.156106
[43]	SalePrice-rmse:0.156085
[44]	SalePrice-rmse:0.155751
[45]	SalePrice-rmse:0.155588
[46]	SalePrice-rmse:0.154855
[47]	SalePrice-rmse:0.155025
[48]	SalePrice-rmse:0.155435
[49]	SalePrice-rmse:0.155157
[50]	SalePrice-rmse:0.155151
[51]	SalePrice-rmse:0.155365
[52]	SalePrice-rmse:0.155232
[53]	SalePrice-rmse:0.155097
[54]	SalePrice-rmse:0.154758
[55]	SalePrice-rmse:0.15476
[56]	SalePrice-rmse:0.15455
[57]	SalePrice-rmse:0.154582
[58]	SalePrice-rmse:0.154758
[59]	SalePrice-rmse:0.154928
[60]	SalePrice-rmse:0.154954
[61]	SalePrice-rm

In [158]:
best_model.save_model("my_model.model")

loaded_model = xgb.Booster()
loaded_model.load_model("my_model.model")
preds_test = loaded_model.predict(dtest)

In [159]:
output = pd.DataFrame({'Id': X_test.index,
                       'SalePrice': np.exp(preds_test)})
output.to_csv('submission.csv', index=False)