This kernel adopts the stacking approach from [Eliot Barril](https://www.kaggle.com/eliotbarr/house-prices-advanced-regression-techniques/stacking-starter/code)'s kernel. For background on this kind of ensemble model, see this [article](http://mlwave.com/kaggle-ensembling-guide/).

In [1]:
import pandas as pd
import numpy as np
import pickle
import os

from scipy.stats import skew
import xgboost as xgb
from sklearn.cross_validation import KFold
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn import svm
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

from sklearn.metrics import mean_squared_error
from sklearn.linear_model import BayesianRidge, Ridge, RidgeCV, LinearRegression, ElasticNet, LassoCV, Lasso

from math import sqrt

%matplotlib inline
import warnings
warnings.filterwarnings('ignore')



# Load data

In [2]:
# The pickle file holds four objects written back-to-back; they are read in
# the same order: train features, test features, test ids, training labels.
# NOTE: pickle.load can execute arbitrary code — only load files you trust.
with open("input/preprocessed_data_2.pkl", "rb") as f:
    train_data, test_data, ids, labels = (pickle.load(f) for _ in range(4))

In [3]:
# Sanity check: show which container type the pickle load produced for the training features.
type(train_data)

pandas.core.frame.DataFrame

In [4]:
# Sanity check: number of label rows loaded from the pickle.
len(labels)

1458

# Build model

In [5]:
# --- Run configuration -------------------------------------------------
TARGET = 'SalePrice'                           # name of the target column
NFOLDS = 5                                     # folds for the out-of-fold predictions
SEED = 3                                       # shared random seed
NROWS = None
SUBMISSION_FILE = 'output/xgstacker_try.csv'

# Row counts of the two feature tables, used to size prediction buffers.
ntrain, ntest = (frame.shape[0] for frame in (train_data, test_data))

In [6]:
# Targets are the labels loaded from the pickle; features become plain NumPy
# arrays so they can be sliced with integer fold indices.
y_train = labels
x_train, x_test = (np.array(frame) for frame in (train_data, test_data))

In [7]:
# Give the target a fresh 0..n-1 index (the old index lands in an 'index'
# column, which is discarded) and keep only the SalePrice values.
y_train = y_train.reset_index().drop('index', axis=1)['SalePrice']

In [8]:
# Shuffled K-fold splitter over the ntrain training rows; get_oof iterates it
# directly to obtain (train_index, test_index) pairs.
# NOTE(review): this is the legacy sklearn.cross_validation.KFold call form
# (n, n_folds=...) — confirm the installed scikit-learn still provides it.
kf = KFold(ntrain, n_folds=NFOLDS, shuffle=True, random_state=SEED)

In [9]:
class SklearnWrapper(object):
    """Adapter exposing the ``train`` / ``predict`` interface on an estimator class.

    Parameters
    ----------
    clf : callable
        Estimator class (or factory); it is called with ``params`` as keyword
        arguments to build the wrapped estimator, stored as ``self.clf``.
    params : dict or None
        Keyword arguments for ``clf``. ``None`` (the default) means "use the
        estimator's own defaults".
    """

    def __init__(self, clf, params=None):
        # The declared default ``params=None`` used to crash on ``clf(**None)``;
        # fall back to an empty mapping so the default is actually usable.
        self.clf = clf(**(params or {}))

    def train(self, x_train, y_train):
        """Fit the wrapped estimator on ``x_train`` / ``y_train``."""
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        """Return the wrapped estimator's predictions for ``x``."""
        return self.clf.predict(x)

In [10]:
class XgbWrapper(object):
    """Adapter exposing the ``train`` / ``predict`` interface on the native xgboost API.

    Parameters
    ----------
    seed : int
        Stored under the ``'seed'`` key of the booster parameters, overriding
        any ``'seed'`` entry in ``params``.
    params : dict or None
        Booster parameters passed to ``xgb.train``. An optional ``'nrounds'``
        entry is removed from them and used as the number of boosting rounds
        (250 when absent). ``None`` means "no extra parameters".
    """

    def __init__(self, seed=0, params=None):
        # Work on a private copy. The original mutated the caller's dict
        # (setting 'seed' and popping 'nrounds'), so building a second wrapper
        # from the same dict silently fell back to 250 rounds.
        self.param = dict(params) if params is not None else {}
        self.param['seed'] = seed
        self.nrounds = self.param.pop('nrounds', 250)

    def train(self, x_train, y_train):
        """Train a booster for ``self.nrounds`` rounds on ``x_train`` / ``y_train``."""
        dtrain = xgb.DMatrix(x_train, label=y_train)
        self.gbdt = xgb.train(self.param, dtrain, self.nrounds)

    def predict(self, x):
        """Return the trained booster's predictions for ``x``."""
        return self.gbdt.predict(xgb.DMatrix(x))

In [11]:
def get_oof(clf):
    """Compute out-of-fold predictions for one base model.

    For every split yielded by the module-level ``kf``, ``clf`` is trained on
    the fold's training rows of ``x_train`` / ``y_train`` and then predicts
    (a) the held-out rows and (b) all of ``x_test``.

    Returns a pair of column vectors: the held-out predictions for the
    training rows, shape ``(ntrain, 1)``, and the per-fold test predictions
    averaged over the folds, shape ``(ntest, 1)``.
    """
    train_preds = np.zeros((ntrain,))
    fold_test_preds = np.empty((NFOLDS, ntest))

    for fold, (fit_idx, holdout_idx) in enumerate(kf):
        clf.train(x_train[fit_idx], y_train[fit_idx])
        train_preds[holdout_idx] = clf.predict(x_train[holdout_idx])
        fold_test_preds[fold, :] = clf.predict(x_test)

    # Average the test predictions of the per-fold models.
    test_preds = np.zeros((ntest,))
    test_preds[:] = fold_test_preds.mean(axis=0)
    return train_preds.reshape(-1, 1), test_preds.reshape(-1, 1)

In [12]:
#for i, (train_index, test_index) in enumerate(kf):
    #print(i)
    #print(train_index, test_index)

In [13]:
# Hyper-parameters for the first-level (base) models.

# BayesianRidge
br_params = dict(alpha_1=0.05, alpha_2=0.05)

# Lasso
ls_params = dict(alpha=0.005)

# Ridge
rd_params = dict(alpha=10)

# LinearRegression
ln_params = dict(normalize=False)

"""
#RandomForestRegressor
rf_params = {
    'n_jobs': 16,
    'n_estimators': 100,
    'max_features': 0.2,
    'max_depth': 12,
    'min_samples_leaf': 2,
}
"""

# XGBoost booster parameters; 'nrounds' is not a booster parameter — it is
# popped by XgbWrapper and used as the number of boosting rounds.
xgb_params = dict(
    seed=0,
    colsample_bytree=0.7,
    silent=1,
    subsample=0.7,
    learning_rate=0.075,
    objective='reg:linear',
    max_depth=4,
    num_parallel_tree=1,
    min_child_weight=1,
    eval_metric='rmse',
    nrounds=500,
)

In [14]:
# Wrap each base estimator so that all of them share the train/predict
# interface expected by get_oof.
br, ls, rd, ln = (
    SklearnWrapper(clf=estimator, params=settings)
    for estimator, settings in (
        (BayesianRidge, br_params),
        (Lasso, ls_params),
        (Ridge, rd_params),
        (LinearRegression, ln_params),
    )
)
xg = XgbWrapper(seed=SEED, params=xgb_params)

"""
et = SklearnWrapper(clf=ExtraTreesRegressor, seed=SEED, params=et_params)
rf = SklearnWrapper(clf=RandomForestRegressor, seed=SEED, params=rf_params)
"""

'\net = SklearnWrapper(clf=ExtraTreesRegressor, seed=SEED, params=et_params)\nrf = SklearnWrapper(clf=RandomForestRegressor, seed=SEED, params=rf_params)\n'

In [15]:
# Out-of-fold train predictions and fold-averaged test predictions for every
# base model, computed in the order xg, br, ls, rd, ln.
(
    (xg_oof_train, xg_oof_test),
    (br_oof_train, br_oof_test),
    (ls_oof_train, ls_oof_test),
    (rd_oof_train, rd_oof_test),
    (ln_oof_train, ln_oof_test),
) = [get_oof(model) for model in (xg, br, ls, rd, ln)]

In [16]:
# RMSE of each base model's out-of-fold predictions against the training target.
for template, oof_pred in (
    ("BR-CV: {}", br_oof_train),
    ("LS-CV: {}", ls_oof_train),
    ("RD-CV: {}", rd_oof_train),
    ("LN-CV: {}", ln_oof_train),
    ("XG-CV: {}", xg_oof_train),
):
    print(template.format(sqrt(mean_squared_error(y_train, oof_pred))))

BR-CV: 0.1106361242763708
LS-CV: 0.12607783885714882
RD-CV: 0.11064132573776658
LN-CV: 0.11858291727477233
XG-CV: 0.11695101975189966


In [17]:
# Second-level features: one column per base model's predictions.
# Caution: this rebinds x_train / x_test, which get_oof reads as globals, so
# the get_oof cells cannot be re-run after this one without reloading the data.
x_train = np.hstack([xg_oof_train, br_oof_train, ls_oof_train, rd_oof_train, ln_oof_train])
x_test = np.hstack([xg_oof_test, br_oof_test, ls_oof_test, rd_oof_test, ln_oof_test])

In [18]:
# Show the shapes of the stacked train and test feature matrices.
print("{},{}".format(x_train.shape, x_test.shape))

(1458, 5),(1459, 5)


In [19]:
# xgboost containers for the stacked features; only the training matrix carries labels.
dtest = xgb.DMatrix(x_test)
dtrain = xgb.DMatrix(data=x_train, label=y_train)

In [20]:
# Booster parameters for the second-level (stacking) model. This rebinds
# xgb_params, replacing the base-model settings defined earlier.
xgb_params = dict(
    seed=0,
    colsample_bytree=0.8,
    silent=1,
    subsample=0.6,
    learning_rate=0.01,
    objective='reg:linear',
    max_depth=1,
    num_parallel_tree=1,
    min_child_weight=1,
    eval_metric='rmse',
)

In [21]:
# Cross-validate the stacking model: up to 1000 rounds over 4 folds, stopping
# early after 25 rounds without improvement and logging every 10th round.
res = xgb.cv(
    xgb_params,
    dtrain,
    num_boost_round=1000,
    nfold=4,
    seed=SEED,
    stratified=False,
    early_stopping_rounds=25,
    verbose_eval=10,
    show_stdv=True,
)

[0]	train-rmse:11.4162+0.00276865	test-rmse:11.4162+0.0084551
[10]	train-rmse:10.3275+0.00274975	test-rmse:10.3275+0.00854742
[20]	train-rmse:9.34329+0.00240024	test-rmse:9.34329+0.00896903
[30]	train-rmse:8.45315+0.00239675	test-rmse:8.45315+0.00900525
[40]	train-rmse:7.64802+0.00247651	test-rmse:7.64813+0.00894958
[50]	train-rmse:6.91951+0.00222154	test-rmse:6.91956+0.00913904
[60]	train-rmse:6.26075+0.00192904	test-rmse:6.26067+0.0093442
[70]	train-rmse:5.66496+0.00175648	test-rmse:5.66469+0.00964097
[80]	train-rmse:5.12599+0.00172636	test-rmse:5.12577+0.00982131
[90]	train-rmse:4.6384+0.0016244	test-rmse:4.63831+0.0102032
[100]	train-rmse:4.19742+0.00151585	test-rmse:4.19735+0.0106315
[110]	train-rmse:3.79843+0.00145737	test-rmse:3.79825+0.0108589
[120]	train-rmse:3.43766+0.00150784	test-rmse:3.43749+0.011204
[130]	train-rmse:3.11131+0.00141747	test-rmse:3.11119+0.0114555
[140]	train-rmse:2.81611+0.00117272	test-rmse:2.81598+0.0116987
[150]	train-rmse:2.54928+0.00100227	test-rmse:2

In [22]:
# Summarise the cross-validation history returned by xgb.cv.
#
# With early stopping the history is truncated so that its last row is the
# best iteration; the number of boosting rounds to train is therefore the
# number of rows (the previous `- 1` trained one round fewer than the
# iteration whose score is reported below).
best_nrounds = res.shape[0]

# Select the held-out metric by column name rather than by position: the
# column order of the xgb.cv frame differs between xgboost versions, so
# positional access can silently report the *training* RMSE instead.
cv_mean = res['test-rmse-mean'].iloc[-1]
cv_std = res['test-rmse-std'].iloc[-1]

In [23]:
# Report the stacking model's cross-validated score (mean and standard deviation taken from the last row of res).
print('Ensemble-CV: {0} (+/- {1})'.format(cv_mean, cv_std))

Ensemble-CV: 0.1141095 (+/- 0.0047181826215185844)


In [24]:
# Train the final stacking model on all stacked training rows for the number of rounds derived from the CV run.
gbdt = xgb.train(xgb_params, dtrain, best_nrounds)

In [25]:
# Disabled legacy submission code, kept as a bare string literal so it never
# executes (the cell merely displays the string).
# NOTE(review): modelSubmit appears to supersede this — confirm before deleting.
"""
submission = pd.read_csv(SUBMISSION_FILE)
submission.iloc[:, 1] = gbdt.predict(dtest)
saleprice = np.exp(submission['SalePrice'])-1
submission['SalePrice'] = saleprice
submission.to_csv('output/xgstacker_try.csv', index=None)
"""

"\nsubmission = pd.read_csv(SUBMISSION_FILE)\nsubmission.iloc[:, 1] = gbdt.predict(dtest)\nsaleprice = np.exp(submission['SalePrice'])-1\nsubmission['SalePrice'] = saleprice\nsubmission.to_csv('output/xgstacker_try.csv', index=None)\n"

In [26]:
def modelSubmit(model, dtest, name, out_dir='output'):
    """Predict with ``model`` and write the result as a submission CSV.

    Parameters
    ----------
    model : object
        Anything with a ``predict(dtest)`` method.
    dtest : object
        Test data in whatever form ``model.predict`` accepts.
    name : str
        File name without extension; ``".csv"`` is appended.
    out_dir : str
        Directory the file is written to (default ``'output'``, as before).
        It is created if it does not exist.

    The predictions are passed through ``np.expm1`` before being written
    (presumably the target was log1p-transformed upstream — confirm). The
    ``Id`` column comes from the module-level ``ids``.
    """
    log_pred = model.predict(dtest)
    submission = pd.DataFrame({"Id": ids, "SalePrice": np.expm1(log_pred)})

    # Previously the path was built with a single-argument os.path.join (a
    # no-op) plus string concatenation, and the write failed outright when the
    # output directory was missing.
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    submission.to_csv(os.path.join(out_dir, name + ".csv"), index=False)

In [27]:
# Write the stacking model's test predictions to output/xgstacker_3.csv.
modelSubmit(gbdt, dtest, 'xgstacker_3')