This kernel builds on the stacking idea from [Eliot Barril](https://www.kaggle.com/eliotbarr/house-prices-advanced-regression-techniques/stacking-starter/code). For background on this kind of ensemble model, see this [article](http://mlwave.com/kaggle-ensembling-guide/).

In [1]:
import pandas as pd
import numpy as np

from scipy.stats import skew
import xgboost as xgb
import pickle
from sklearn.cross_validation import KFold
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge, RidgeCV, ElasticNet, LassoCV, Lasso

from math import sqrt

%matplotlib inline
import warnings
warnings.filterwarnings('ignore')



In [2]:
# --- Run configuration ---
# Seed handed to the fold splitter, every base model and the final xgb.cv call.
SEED = 0
# Number of folds used for the out-of-fold predictions.
NFOLDS = 4
# Name of the target column.
TARGET = 'SalePrice'
# Not referenced by any cell visible in this notebook.
NROWS = None
# Path read with pd.read_csv in the final cell.
# NOTE(review): 'output' is also the directory the result is written to;
# this probably should point at a sample-submission CSV -- confirm.
SUBMISSION_FILE = 'output'

# Load data

In [3]:
# The pickle file holds four objects written back-to-back, so they have to be
# read with four consecutive loads from the same handle, in this order.
# SECURITY: pickle.load can execute arbitrary code -- only open trusted files.
with open("input/preprocessed_data.pkl", "rb") as f:
    train_data, test_data, ids, labels = [pickle.load(f) for _ in range(4)]

In [4]:
# Feature tables as plain NumPy arrays; the labels are kept as loaded for now.
x_train, x_test = np.array(train_data), np.array(test_data)
y_train = labels

In [5]:
# Renumber the rows 0..n-1 and discard the old index, which reset_index()
# materialises as an 'index' column.
y_train = y_train.reset_index().drop('index', axis=1)

In [23]:
# Reduce the labels frame to the target column (TARGET == 'SalePrice').
# Guarded so the cell can be re-run: once y_train is already a Series,
# selecting the column again would raise a KeyError.
if isinstance(y_train, pd.DataFrame):
    y_train = y_train[TARGET]

In [24]:
# Show the target Series. NOTE(review): the values look log1p-transformed
# (the final cell applies exp(x) - 1 to the predictions) -- confirm upstream.
y_train

0       12.247699
1       12.109016
2       12.317171
3       11.849405
4       12.429220
5       11.870607
6       12.634606
7       12.206078
8       11.774528
9       11.678448
10      11.771444
11      12.751303
12      11.877576
13      12.540761
14      11.964007
15      11.790565
16      11.911708
17      11.407576
18      11.976666
19      11.842236
20      12.692506
21      11.845110
22      12.345839
23      11.774528
24      11.944714
25      12.454108
26      11.811555
27      12.631344
28      12.242891
29      11.134604
          ...    
1428    12.165985
1429    11.875838
1430    11.074436
1431    12.136192
1432    11.982935
1433    12.066816
1434    11.699413
1435    12.885673
1436    11.916395
1437    12.190964
1438    12.160034
1439    11.913720
1440    12.644331
1441    11.703554
1442    12.098493
1443    11.767575
1444    11.969724
1445    12.388398
1446    11.626263
1447    11.429555
1448    11.820418
1449    12.567555
1450    11.884496
1451    11.344519
1452    12

In [25]:
#y_train.shape

# Stacking setup: folds, model wrappers and out-of-fold predictions

In [26]:
# Row counts of the training and test tables.
ntrain, ntest = train_data.shape[0], test_data.shape[0]

In [27]:
# Shuffled fold splitter over the training row positions. This is the
# sklearn.cross_validation signature, which takes the sample count up front.
kf = KFold(
    ntrain,
    n_folds=NFOLDS,
    shuffle=True,
    random_state=SEED,
)

In [28]:
class SklearnWrapper(object):
    """Adapter giving a scikit-learn style estimator a train/predict interface.

    Parameters
    ----------
    clf : callable
        Estimator class (or factory); it is invoked as ``clf(**params)``.
    seed : int, optional
        Passed to the estimator as ``random_state``.
    params : dict or None, optional
        Keyword arguments for the estimator. The dict is copied, so the
        caller's object is never modified; ``None`` means no extra arguments.
    """

    def __init__(self, clf, seed=0, params=None):
        # Work on a private copy: writing 'random_state' into the caller's
        # dict would leak into every later use of that dict.
        params = dict(params) if params is not None else {}
        params['random_state'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        """Fit the wrapped estimator on the given features and targets."""
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        """Return the wrapped estimator's predictions for ``x``."""
        return self.clf.predict(x)

In [29]:
class XgbWrapper(object):
    """Adapter giving the native xgboost booster a train/predict interface.

    Parameters
    ----------
    seed : int, optional
        Stored in the booster parameters under ``'seed'``.
    params : dict or None, optional
        Booster parameters. An optional ``'nrounds'`` entry is taken out and
        used as the number of boosting rounds (250 when absent). The dict is
        copied, so the caller's object is never modified.
    """

    def __init__(self, seed=0, params=None):
        # Copy first: popping 'nrounds' / setting 'seed' on the caller's dict
        # would silently change it for any later reuse.
        params = dict(params) if params is not None else {}
        self.nrounds = params.pop('nrounds', 250)
        params['seed'] = seed
        self.param = params

    def train(self, x_train, y_train):
        """Train a booster for ``self.nrounds`` rounds on the given data."""
        dtrain = xgb.DMatrix(x_train, label=y_train)
        self.gbdt = xgb.train(self.param, dtrain, self.nrounds)

    def predict(self, x):
        """Return the trained booster's predictions for ``x``."""
        return self.gbdt.predict(xgb.DMatrix(x))

In [30]:
def get_oof(clf):
    """Compute out-of-fold predictions for one base model.

    For every fold in ``kf`` the model is re-trained on the in-fold rows; it
    then predicts the held-out rows (filling the train-side vector) and the
    complete test set. The per-fold test predictions are averaged.

    Reads the module-level ``x_train``, ``y_train``, ``x_test``, ``kf``,
    ``ntrain``, ``ntest`` and ``NFOLDS``. ``x_train``/``x_test`` are rebound
    to the stacked features later in the notebook, so this must be called
    before that cell runs.

    Parameters
    ----------
    clf : object
        Wrapper exposing ``train(x, y)`` and ``predict(x)``.

    Returns
    -------
    tuple of ndarray
        ``(oof_train, oof_test)`` with shapes ``(ntrain, 1)`` and ``(ntest, 1)``.
    """
    # Fold indices are row positions. Indexing a pandas Series with them is a
    # label lookup, which is only right while the index is exactly 0..n-1, so
    # take a positional array view of the targets instead.
    targets = np.asarray(y_train)

    oof_train = np.zeros((ntrain,))
    oof_test_skf = np.empty((NFOLDS, ntest))

    for i, (train_index, test_index) in enumerate(kf):
        clf.train(x_train[train_index], targets[train_index])

        oof_train[test_index] = clf.predict(x_train[test_index])
        oof_test_skf[i, :] = clf.predict(x_test)

    # Average the test-set predictions of the per-fold models.
    oof_test = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

In [31]:
#for i, (train_index, test_index) in enumerate(kf):
    #print(i)
    #print(train_index, test_index)

In [32]:
# Hyper-parameters of the five base models, one dict per model.

# Ridge
rd_params = dict(alpha=10)

# Lasso
ls_params = dict(alpha=0.005)

# ExtraTreesRegressor
et_params = dict(
    n_jobs=16,
    n_estimators=100,
    max_features=0.5,
    max_depth=12,
    min_samples_leaf=2,
)

# RandomForestRegressor
rf_params = dict(
    n_jobs=16,
    n_estimators=100,
    max_features=0.2,
    max_depth=12,
    min_samples_leaf=2,
)

# XGBoost; 'nrounds' is not a booster parameter, XgbWrapper takes it out
# and uses it as the number of boosting rounds.
xgb_params = dict(
    seed=0,
    colsample_bytree=0.7,
    silent=1,
    subsample=0.7,
    learning_rate=0.075,
    objective='reg:linear',
    max_depth=4,
    num_parallel_tree=1,
    min_child_weight=1,
    eval_metric='rmse',
    nrounds=500,
)

In [33]:
# One wrapper per base model, all sharing the same seed.
rd, ls, et, rf = [
    SklearnWrapper(clf=estimator, seed=SEED, params=estimator_params)
    for estimator, estimator_params in (
        (Ridge, rd_params),
        (Lasso, ls_params),
        (ExtraTreesRegressor, et_params),
        (RandomForestRegressor, rf_params),
    )
]
xg = XgbWrapper(seed=SEED, params=xgb_params)

In [34]:
# Out-of-fold predictions for every base model, computed in the order
# xgboost, extra trees, random forest, ridge, lasso.
(
    (xg_oof_train, xg_oof_test),
    (et_oof_train, et_oof_test),
    (rf_oof_train, rf_oof_test),
    (rd_oof_train, rd_oof_test),
    (ls_oof_train, ls_oof_test),
) = map(get_oof, (xg, et, rf, rd, ls))

In [35]:
# RMSE of each base model's out-of-fold predictions against the training target.
for _template, _oof_pred in (
    ("RD-CV: {}", rd_oof_train),
    ("LS-CV: {}", ls_oof_train),
    ("ET-CV: {}", et_oof_train),
    ("RF-CV: {}", rf_oof_train),
    ("XG-CV: {}", xg_oof_train),
):
    print(_template.format(sqrt(mean_squared_error(y_train, _oof_pred))))

RD-CV: 0.11375430249684435
LS-CV: 0.14006360540917803
ET-CV: 0.142537948821056
RF-CV: 0.13774711107505816
XG-CV: 0.11841654652236004


In [36]:
# Second-level features: one column per base model, in the same column order
# for train and test. Note that this rebinds x_train / x_test.
x_train = np.concatenate(
    [xg_oof_train, et_oof_train, rf_oof_train, rd_oof_train, ls_oof_train],
    axis=1,
)
x_test = np.concatenate(
    [xg_oof_test, et_oof_test, rf_oof_test, rd_oof_test, ls_oof_test],
    axis=1,
)

In [37]:
# Sanity check on the stacked matrices' shapes (train, then test).
print("{},{}".format(*(stacked.shape for stacked in (x_train, x_test))))

(1458, 5),(1459, 5)


In [38]:
# xgboost containers for the stacked features; only the training one carries labels.
dtrain = xgb.DMatrix(data=x_train, label=y_train)
dtest = xgb.DMatrix(data=x_test)

In [39]:
# Booster settings for the second-level (stacking) model: depth-1 trees and
# a small learning rate. Rebinds the xgb_params name used for the base model.
xgb_params = dict(
    seed=0,
    colsample_bytree=0.8,
    silent=1,
    subsample=0.6,
    learning_rate=0.01,
    objective='reg:linear',
    max_depth=1,
    num_parallel_tree=1,
    min_child_weight=1,
    eval_metric='rmse',
)

In [None]:
# Cross-validate the stacking model with early stopping to pick the number
# of boosting rounds. The fold count comes from NFOLDS (== 4) rather than a
# second hard-coded 4, so the notebook has one place to change it.
res = xgb.cv(
    xgb_params,
    dtrain,
    num_boost_round=1000,
    nfold=NFOLDS,
    seed=SEED,
    stratified=False,
    early_stopping_rounds=25,
    verbose_eval=10,
    show_stdv=True,
)

[0]	train-rmse:11.4159+0.00662062	test-rmse:11.4159+0.0200578
[10]	train-rmse:10.3273+0.00572085	test-rmse:10.3273+0.0210188
[20]	train-rmse:9.34276+0.00517384	test-rmse:9.34274+0.0216428
[30]	train-rmse:8.45237+0.00479396	test-rmse:8.45235+0.0220878
[40]	train-rmse:7.64725+0.004685	test-rmse:7.6473+0.0220515
[50]	train-rmse:6.91917+0.00419865	test-rmse:6.91933+0.0202726
[60]	train-rmse:6.26046+0.00370524	test-rmse:6.26058+0.0187389
[70]	train-rmse:5.66454+0.00325087	test-rmse:5.6646+0.0173257
[80]	train-rmse:5.12565+0.00291116	test-rmse:5.12557+0.0162067
[90]	train-rmse:4.63814+0.00271316	test-rmse:4.63796+0.0153115
[100]	train-rmse:4.19711+0.00255786	test-rmse:4.19689+0.0145295
[110]	train-rmse:3.79832+0.00231825	test-rmse:3.79812+0.0140298
[120]	train-rmse:3.43747+0.00205918	test-rmse:3.43702+0.013716
[130]	train-rmse:3.11121+0.00188217	test-rmse:3.11085+0.0132946
[140]	train-rmse:2.81582+0.00172974	test-rmse:2.81543+0.0129332
[150]	train-rmse:2.54883+0.00158316	test-rmse:2.54843+0.

In [None]:
# NOTE(review): with early stopping the result table appears to end at the
# best iteration, so res.shape[0] itself may be the intended round count and
# the "- 1" trains one round fewer -- confirm before changing.
best_nrounds = res.shape[0] - 1

# Select the held-out metric by column name. The column order of the xgb.cv
# result differs between xgboost versions (train-first vs. test-first), so
# positional access can silently report the training error instead.
cv_mean = res['test-rmse-mean'].iloc[-1]
cv_std = res['test-rmse-std'].iloc[-1]

In [None]:
# Cross-validated score of the stacked model at the last retained round.
ensemble_cv_report = 'Ensemble-CV: {0} (+/- {1})'.format(cv_mean, cv_std)
print(ensemble_cv_report)

In [None]:
# Fit the final stacking model on all stacked training rows, using the round
# count chosen from the cross-validation run.
gbdt = xgb.train(
    params=xgb_params,
    dtrain=dtrain,
    num_boost_round=best_nrounds,
)

In [None]:
# Template with the id column and the target column to fill in.
submission = pd.read_csv(SUBMISSION_FILE)

# Predictions of the stacking model; cast to float64 so the inverse transform
# below is not evaluated in reduced precision if predict() returns float32.
log_pred = np.asarray(gbdt.predict(dtest), dtype=np.float64)

# Undo the log(1 + price) scale. expm1 is the numerically accurate form of
# exp(x) - 1. The column is addressed by name (TARGET) instead of mixing a
# positional write with a by-name read.
saleprice = pd.Series(np.expm1(log_pred), index=submission.index, name=TARGET)
submission[TARGET] = saleprice

# index=False is the documented way to omit the row index from the CSV.
submission.to_csv('output/xgstacker_starter.csv', index=False)