This kernel adapts the stacking approach from [Eliot Barril's stacking starter](https://www.kaggle.com/eliotbarr/house-prices-advanced-regression-techniques/stacking-starter/code). For background on this kind of ensemble model, see the [Kaggle ensembling guide](http://mlwave.com/kaggle-ensembling-guide/).

In [1]:
import pandas as pd
import numpy as np
import pickle
import os
from math import sqrt

from scipy.stats import skew
import xgboost as xgb
from sklearn.cross_validation import KFold

from sklearn.linear_model import BayesianRidge, ElasticNet, HuberRegressor, Lasso, Ridge
from sklearn import svm
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, BaggingRegressor
#from sklearn.ensemble import ExtraTreesRegressor, DecisionTreeRegressor
from sklearn import svm
from xgboost import XGBRegressor

from sklearn.metrics import mean_squared_error

%matplotlib inline
import warnings
warnings.filterwarnings('ignore')



# Load data

In [2]:
# NOTE(security): pickle.load can execute arbitrary code while deserializing;
# only run this cell on a pickle file you produced yourself or otherwise trust.
# The file is read as four consecutive pickled objects, in this order:
# training features, test features, ids, labels.
with open("input/preprocessed_data_5.pkl", "rb") as f:
    train_data, test_data, ids, labels = [pickle.load(f) for _ in range(4)]

In [3]:
# Sanity check: show the concrete type of the unpickled training features.
type(train_data)

pandas.core.frame.DataFrame

In [4]:
# Sanity check: number of target values that were loaded.
len(labels)

1420

# Build model

In [5]:
# Experiment-wide settings.
SEED = 3        # seed for the K-fold shuffle and the xgboost runs
NFOLDS = 5      # number of folds used to build out-of-fold predictions
NROWS = None
TARGET = 'SalePrice'
SUBMISSION_FILE = 'output/xgstacker_try.csv'

# Row counts of the two feature tables; they size the out-of-fold arrays.
ntrain, ntest = train_data.shape[0], test_data.shape[0]

In [6]:
# Convert both feature tables to plain arrays so rows can be picked with the
# positional indices produced by the fold splitter; targets are bound as-is.
x_train, x_test = np.array(train_data), np.array(test_data)
y_train = labels

In [7]:
# Replace the existing index with a fresh 0..n-1 one, discard the old index
# (which reset_index materialises as an 'index' column) and keep only the
# target column.
y_train = y_train.reset_index().drop('index', axis=1)['SalePrice']

In [8]:
# Shuffled K-fold splitter shared by every base model, so all out-of-fold
# predictions are built from the same partition of the training rows.
# NOTE: this is the legacy sklearn.cross_validation.KFold signature (row count
# first, `n_folds` keyword, the object itself is iterated); it is not the
# sklearn.model_selection.KFold API.
kf = KFold(
    n=ntrain,
    n_folds=NFOLDS,
    shuffle=True,
    random_state=SEED,
)

In [9]:
class SklearnWrapper(object):
    """Uniform train/predict facade over a scikit-learn style estimator.

    Parameters
    ----------
    clf : callable
        Estimator class (or factory). It is called with ``params`` expanded
        as keyword arguments to build the wrapped estimator.
    params : dict, optional
        Constructor keyword arguments. ``None`` (the default) or an empty
        dict means "use the estimator's own defaults".
    """

    def __init__(self, clf, params=None):
        # `clf(**None)` raises TypeError, so map the default to an empty dict.
        # No seed is injected here: estimators that accept one receive it
        # through `params`.
        self.clf = clf(**(params or {}))

    def train(self, x_train, y_train):
        """Fit the wrapped estimator on the given features and targets."""
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        """Return the wrapped estimator's predictions for ``x``."""
        return self.clf.predict(x)

In [10]:
class XgbWrapper(object):
    """Train/predict facade over the native ``xgb.train`` API.

    Parameters
    ----------
    seed : int, optional
        Stored under the ``'seed'`` key of the booster parameters, replacing
        any ``'seed'`` entry present in ``params``.
    params : dict, optional
        Booster parameters. An optional ``'nrounds'`` entry is taken out and
        used as the number of boosting rounds (default 250); everything else
        is passed to ``xgb.train``. The caller's dict is not modified.
    """

    def __init__(self, seed=0, params=None):
        # Work on a private copy: popping 'nrounds' from the caller's dict
        # would make a second construction from the same dict silently fall
        # back to the 250-round default.
        self.param = dict(params) if params is not None else {}
        self.param['seed'] = seed
        self.nrounds = self.param.pop('nrounds', 250)

    def train(self, x_train, y_train):
        """Fit a booster on ``x_train`` / ``y_train`` for ``self.nrounds`` rounds."""
        dtrain = xgb.DMatrix(x_train, label=y_train)
        self.gbdt = xgb.train(self.param, dtrain, self.nrounds)

    def predict(self, x):
        """Return the fitted booster's predictions for ``x``."""
        return self.gbdt.predict(xgb.DMatrix(x))

In [11]:
#out of folds
def get_oof(clf):
    """Build out-of-fold predictions for one base model.

    For every fold yielded by the module-level ``kf``, ``clf`` is trained on
    the fold's training rows of ``x_train`` / ``y_train`` and then predicts
    both the held-out rows and the whole of ``x_test``.

    Parameters
    ----------
    clf : object
        Model exposing ``train(x, y)`` and ``predict(x)``.

    Returns
    -------
    tuple of ndarray
        ``(oof_train, oof_test)`` as column vectors with ``ntrain`` and
        ``ntest`` rows: the held-out predictions for the training rows, and
        the test predictions averaged over the folds.
    """
    held_out_preds = np.zeros((ntrain,))
    per_fold_test_preds = np.empty((NFOLDS, ntest))

    for fold_no, (fit_rows, holdout_rows) in enumerate(kf):
        clf.train(x_train[fit_rows], y_train[fit_rows])
        held_out_preds[holdout_rows] = clf.predict(x_train[holdout_rows])
        per_fold_test_preds[fold_no, :] = clf.predict(x_test)

    # Average the per-fold test predictions into a single vector.
    mean_test_preds = per_fold_test_preds.mean(axis=0)
    return held_out_preds.reshape(-1, 1), mean_test_preds.reshape(-1, 1)

In [12]:
#for i, (train_index, test_index) in enumerate(kf):
    #print(i)
    #print(train_index, test_index)

In [13]:
# Constructor keyword arguments for the linear base models.

# BayesianRidge
br_params = dict(alpha_1=0.05, alpha_2=0.05)

# Lasso
ls_params = dict(alpha=0.005)

# Ridge
rd_params = dict(alpha=10)

# HuberRegressor
hr_params = dict(alpha=1, epsilon=1.2)
"""
#SVM RBF
svr_params = {'C': 1.0, 
              'epsilon': 2, 
              'tol': 0.0001
    
}
"""

#RandomForestRegressor
rf_params = {
    'n_jobs': 16,
    'n_estimators': 100,
    'max_features': 0.2,
    'max_depth': 12,
    'min_samples_leaf': 2,
    # Seed the forest's row/feature sampling; without it the RF out-of-fold
    # predictions (and everything stacked on top of them) change on every run.
    'random_state': SEED,
}


# xgboost base model. These are passed to XgbWrapper, which takes 'nrounds'
# out as the number of boosting rounds and overrides 'seed' with its own
# `seed` argument.
xgb_params = dict(
    seed=0,
    colsample_bytree=0.7,
    silent=1,
    subsample=0.7,
    learning_rate=0.075,
    objective='reg:linear',
    max_depth=4,
    num_parallel_tree=1,
    min_child_weight=1,
    eval_metric='rmse',
    nrounds=500,
)
#AdaBoostRegressor
ab_params = {
    'learning_rate': 0.001,
    'n_estimators': 50,
    # Seed the boosting so the AdaBoost out-of-fold predictions are
    # reproducible from one run to the next.
    'random_state': SEED,
}

In [14]:
# Put every base learner behind the common train/predict interface.
br, ls, rd, rf, ab = [
    SklearnWrapper(clf=estimator, params=kwargs)
    for estimator, kwargs in (
        (BayesianRidge, br_params),
        (Lasso, ls_params),
        (Ridge, rd_params),
        (RandomForestRegressor, rf_params),
        (AdaBoostRegressor, ab_params),
    )
]
xg = XgbWrapper(seed=SEED, params=xgb_params)


#et = SklearnWrapper(clf=ExtraTreesRegressor, seed=SEED, params=et_params)
#rf = SklearnWrapper(clf=RandomForestRegressor, seed=SEED, params=rf_params)

In [15]:
# Out-of-fold predictions for each base model: xgboost first, then the
# scikit-learn models. Each get_oof call returns a (train, test) pair.
(
    (xg_oof_train, xg_oof_test),
    (br_oof_train, br_oof_test),
    (ls_oof_train, ls_oof_test),
    (rd_oof_train, rd_oof_test),
    (rf_oof_train, rf_oof_test),
    (ab_oof_train, ab_oof_test),
) = [get_oof(model) for model in (xg, br, ls, rd, rf, ab)]

In [16]:
# Root-mean-squared error of each base model's out-of-fold predictions
# against the training targets.
for report_format, oof_pred in (
    ("XG-CV: {}", xg_oof_train),
    ("BR-CV: {}", br_oof_train),
    ("LS-CV: {}", ls_oof_train),
    ("RD-CV: {}", rd_oof_train),
    ("RF-CV: {}", rf_oof_train),
    ("AB-CV: {}", ab_oof_train),
):
    print(report_format.format(sqrt(mean_squared_error(y_train, oof_pred))))

XG-CV: 0.117698934661447
BR-CV: 0.11169755822112706
LS-CV: 0.12520341935253773
RD-CV: 0.11160359084744062
RF-CV: 0.13453672277723452
AB-CV: 0.1986166247675586


In [17]:
# Second-level design matrices: one column per base model, in the order
# xg, br, ls, rd, rf, ab.
# NOTE: this rebinds x_train / x_test, which get_oof reads as globals, so
# get_oof must not be re-run after this cell unless the original feature
# arrays are rebuilt first.
x_train = np.hstack(
    (xg_oof_train, br_oof_train, ls_oof_train, rd_oof_train, rf_oof_train, ab_oof_train)
)
x_test = np.hstack(
    (xg_oof_test, br_oof_test, ls_oof_test, rd_oof_test, rf_oof_test, ab_oof_test)
)

In [18]:
# Show the shapes of the stacked train and test matrices.
print("{},{}".format(*(matrix.shape for matrix in (x_train, x_test))))

(1420, 6),(1459, 6)


In [19]:
# xgboost containers for the second-level model; only the training matrix
# carries labels.
dtrain, dtest = xgb.DMatrix(data=x_train, label=y_train), xgb.DMatrix(data=x_test)

In [20]:
# Parameters for the second-level (stacking) xgboost model.
# NOTE: this rebinds the name xgb_params, replacing the base-model settings
# that were bound to it earlier in the notebook.
xgb_params = dict(
    seed=0,
    colsample_bytree=0.8,
    silent=1,
    subsample=0.6,
    learning_rate=0.01,
    objective='reg:linear',
    max_depth=3,
    num_parallel_tree=1,
    min_child_weight=5,
    eval_metric='rmse',
)

In [21]:
# 4-fold cross-validation of the second-level model for up to 1000 rounds,
# with early stopping after 25 rounds; progress is logged every 10 rounds.
res = xgb.cv(
    params=xgb_params,
    dtrain=dtrain,
    num_boost_round=1000,
    nfold=4,
    stratified=False,
    early_stopping_rounds=25,
    verbose_eval=10,
    show_stdv=True,
    seed=SEED,
)

[0]	train-rmse:11.4031+0.00473003	test-rmse:11.4031+0.0142113
[10]	train-rmse:10.3157+0.00445397	test-rmse:10.3157+0.0144785
[20]	train-rmse:9.3323+0.00387658	test-rmse:9.33229+0.0150352
[30]	train-rmse:8.44287+0.00360863	test-rmse:8.44286+0.0152838
[40]	train-rmse:7.63841+0.00324941	test-rmse:7.6384+0.0156389
[50]	train-rmse:6.91108+0.0030138	test-rmse:6.91107+0.0156107
[60]	train-rmse:6.25293+0.00276954	test-rmse:6.25296+0.0153456
[70]	train-rmse:5.6577+0.00259847	test-rmse:5.6577+0.0147551
[80]	train-rmse:5.11911+0.00236158	test-rmse:5.11923+0.0139281
[90]	train-rmse:4.63217+0.00203154	test-rmse:4.63232+0.0132733
[100]	train-rmse:4.19166+0.00179689	test-rmse:4.19173+0.012631
[110]	train-rmse:3.79319+0.00172125	test-rmse:3.79329+0.0117889
[120]	train-rmse:3.4326+0.00152884	test-rmse:3.43271+0.0110269
[130]	train-rmse:3.10652+0.00125546	test-rmse:3.10687+0.0103919
[140]	train-rmse:2.81152+0.00116311	test-rmse:2.81179+0.00954027
[150]	train-rmse:2.5445+0.0010476	test-rmse:2.54488+0.008

In [22]:
# xgb.cv returns one row per boosting round and, when early stopping fires,
# trims the history to the best iteration. The row count therefore *is* the
# best number of rounds; subtracting 1 trained the final model one round short.
best_nrounds = res.shape[0]

# Read the final held-out scores by column name rather than by position: the
# column order of the xgb.cv frame differs between xgboost versions, and
# positional indexing can silently pick up the train-* columns instead.
# The names follow from eval_metric='rmse' in xgb_params.
cv_mean = res['test-rmse-mean'].iloc[-1]
cv_std = res['test-rmse-std'].iloc[-1]

In [23]:
# Report the stacked model's cross-validated score and its spread.
ensemble_report = 'Ensemble-CV: {0} (+/- {1})'.format(cv_mean, cv_std)
print(ensemble_report)

Ensemble-CV: 0.11483025000000001 (+/- 0.00760544030201408)


In [24]:
# Fit the final second-level model on the full stacked training matrix, using
# the round count chosen from the cross-validation run.
gbdt = xgb.train(
    params=xgb_params,
    dtrain=dtrain,
    num_boost_round=best_nrounds,
)

In [25]:
"""
submission = pd.read_csv(SUBMISSION_FILE)
submission.iloc[:, 1] = gbdt.predict(dtest)
saleprice = np.exp(submission['SalePrice'])-1
submission['SalePrice'] = saleprice
submission.to_csv('output/xgstacker_try.csv', index=None)
"""

"\nsubmission = pd.read_csv(SUBMISSION_FILE)\nsubmission.iloc[:, 1] = gbdt.predict(dtest)\nsaleprice = np.exp(submission['SalePrice'])-1\nsubmission['SalePrice'] = saleprice\nsubmission.to_csv('output/xgstacker_try.csv', index=None)\n"

In [26]:
def modelSubmit(model, dtest, name):
    """Write a submission CSV built from ``model``'s predictions.

    The predictions are passed through ``np.expm1`` before being written
    (presumably the model is trained on log1p-transformed prices -- confirm
    against the preprocessing step). The ``Id`` column comes from the
    module-level ``ids``.

    Parameters
    ----------
    model : object
        Fitted model exposing ``predict(dtest)``.
    dtest : object
        Test data in whatever form ``model.predict`` accepts.
    name : str
        File name without extension; the file is written to
        ``output/<name>.csv``.
    """
    ln_pred = model.predict(dtest)
    pred = np.expm1(ln_pred)
    submission = pd.DataFrame({"Id": ids, "SalePrice": pred})

    # Build the path with os.path.join (the old single-argument join was a
    # no-op) and create the directory so a fresh checkout does not fail here.
    out_dir = 'output'
    os.makedirs(out_dir, exist_ok=True)
    submission.to_csv(os.path.join(out_dir, name + ".csv"), index=False)

In [27]:
# Write the stacked model's test predictions to output/xgstacker_5_2.csv.
modelSubmit(model=gbdt, dtest=dtest, name='xgstacker_5_2')