This kernel builds on the stacking starter by [Eliot Barril](https://www.kaggle.com/eliotbarr/house-prices-advanced-regression-techniques/stacking-starter/code). For background on the stacked-ensemble approach used here, see the [Kaggle ensembling guide](http://mlwave.com/kaggle-ensembling-guide/).

In [1]:
import warnings
from math import sqrt

import numpy as np
import pandas as pd
import xgboost as xgb
from scipy.stats import skew
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge, RidgeCV, ElasticNet, LassoCV, Lasso
from sklearn.metrics import mean_squared_error

try:
    # Legacy location; kept first so environments that still ship it behave as before.
    from sklearn.cross_validation import KFold
except ImportError:
    # sklearn.cross_validation was removed in scikit-learn 0.20.
    from sklearn.model_selection import KFold

%matplotlib inline
warnings.filterwarnings('ignore')



In [2]:
# Name of the label column in the training data; it is log-transformed into y_train.
TARGET = 'SalePrice'
# Number of cross-validation folds used when building out-of-fold predictions.
NFOLDS = 4
# Random seed shared by the fold split and the models.
SEED = 0
# Optional row cap for the input files (None = no cap).
# NOTE(review): confirm the data-loading cell actually passes this to read_csv.
NROWS = None
# Path of the submission CSV handled by the final cell.
SUBMISSION_FILE = 'output/xgstacker_starter.csv'

# Load data

In [3]:
## Load the data ##
# NROWS comes from the config cell: None reads each file in full, an integer
# caps the number of rows (useful for quick experiments).
train = pd.read_csv("input/train.csv", nrows=NROWS)
test = pd.read_csv("input/test.csv", nrows=NROWS)

# Row counts are reused later to size the out-of-fold prediction arrays.
ntrain = train.shape[0]
ntest = test.shape[0]

# Data preprocessing

In [4]:
# pop() returns the label column and removes it from `train` in the same step;
# the target is modelled on a log(1 + price) scale.
y_train = np.log(train.pop(TARGET) + 1)

# Stack the train and test feature columns so both get identical preprocessing.
feature_span = slice('MSSubClass', 'SaleCondition')
all_data = pd.concat((train.loc[:, feature_span], test.loc[:, feature_span]))

In [5]:
#log transform skewed numeric features:
numeric_feats = all_data.dtypes[all_data.dtypes != "object"].index

# Skewness is measured on the training rows only, ignoring missing values.
train_skewness = train[numeric_feats].apply(lambda column: skew(column.dropna()))
# Keep the names of the columns whose skewness exceeds 0.75.
skewed_feats = train_skewness[train_skewness > 0.75].index

all_data[skewed_feats] = np.log1p(all_data[skewed_feats])

# One-hot encode the remaining (object-typed) columns.
all_data = pd.get_dummies(all_data)

In [6]:
# Display the columns that were selected for the log1p transform.
skewed_feats

Index(['MSSubClass', 'LotFrontage', 'LotArea', 'MasVnrArea', 'BsmtFinSF1',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtHalfBath', 'KitchenAbvGr',
       'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch',
       'ScreenPorch', 'PoolArea', 'MiscVal'],
      dtype='object')

In [7]:
#filling NA's with the mean of the column:
# The means are taken over the combined train + test frame.
column_means = all_data.mean()
all_data = all_data.fillna(column_means)

In [8]:
#creating matrices for sklearn:
# all_data holds the train rows first and the test rows after them,
# so the train row count is the split point.
split_at = train.shape[0]
x_train, x_test = [np.array(part) for part in (all_data[:split_at], all_data[split_at:])]

In [9]:
# Cross-validation folds shared by every base model in get_oof.
try:
    # Legacy sklearn.cross_validation signature: the object itself iterates
    # over (train_index, test_index) pairs.
    kf = KFold(ntrain, n_folds=NFOLDS, shuffle=True, random_state=SEED)
except TypeError:
    # sklearn.model_selection.KFold rejects `n_folds` and only yields folds via
    # .split(); materialise them once so `enumerate(kf)` keeps working and every
    # model is evaluated on exactly the same folds.
    kf = list(KFold(n_splits=NFOLDS, shuffle=True, random_state=SEED).split(x_train))

In [10]:
class SklearnWrapper(object):
    """Adapter exposing the train/predict interface over a scikit-learn style estimator.

    Args:
        clf: Estimator class (not an instance); it is called as ``clf(**params)``.
        seed: Value passed to the estimator as ``random_state``.
        params: Optional dict of constructor keyword arguments. The dict is
            copied, so the caller's object is never modified; ``None`` means
            no extra arguments.
    """

    def __init__(self, clf, seed=0, params=None):
        # Work on a copy: writing 'random_state' into the caller's dict would
        # leak state between models, and the None default must not crash.
        estimator_kwargs = dict(params) if params is not None else {}
        estimator_kwargs['random_state'] = seed
        self.clf = clf(**estimator_kwargs)

    def train(self, x_train, y_train):
        """Fit the wrapped estimator on the given features and targets."""
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        """Return the wrapped estimator's predictions for ``x``."""
        return self.clf.predict(x)

In [11]:
class XgbWrapper(object):
    """Adapter exposing the train/predict interface over ``xgb.train``.

    Args:
        seed: Value stored under the ``'seed'`` key of the booster parameters.
        params: Optional dict of booster parameters. An ``'nrounds'`` entry, if
            present, is taken as the number of boosting rounds (default 250)
            and is not forwarded to xgboost. The dict is copied, so the
            caller's object is never modified; ``None`` means no parameters.
    """

    def __init__(self, seed=0, params=None):
        # Copy first: setting 'seed' and popping 'nrounds' on the caller's dict
        # would silently change it for any later reuse, and None must not crash.
        booster_params = dict(params) if params is not None else {}
        booster_params['seed'] = seed
        self.nrounds = booster_params.pop('nrounds', 250)
        self.param = booster_params

    def train(self, x_train, y_train):
        """Train a booster for ``self.nrounds`` rounds on the given data."""
        dtrain = xgb.DMatrix(x_train, label=y_train)
        self.gbdt = xgb.train(self.param, dtrain, self.nrounds)

    def predict(self, x):
        """Return the trained booster's predictions for ``x``."""
        return self.gbdt.predict(xgb.DMatrix(x))

In [12]:
#out of folds
def get_oof(clf):
    """Compute out-of-fold predictions for one wrapped model.

    Reads the module-level ``kf``, ``x_train``, ``y_train``, ``x_test``,
    ``ntrain``, ``ntest`` and ``NFOLDS``. For every fold the model is re-trained
    on the fold's training rows, then predicts the held-out rows and the whole
    test matrix.

    Args:
        clf: Object exposing ``train(x, y)`` and ``predict(x)``.

    Returns:
        Tuple ``(oof_train, oof_test)`` of column vectors: the held-out
        prediction for each training row, and the per-fold test predictions
        averaged over the folds.
    """
    holdout_preds = np.zeros((ntrain,))
    test_preds_per_fold = np.empty((NFOLDS, ntest))

    for fold_no, (fit_rows, holdout_rows) in enumerate(kf):
        clf.train(x_train[fit_rows], y_train[fit_rows])

        # Each training row is predicted only by the model that did not see it.
        holdout_preds[holdout_rows] = clf.predict(x_train[holdout_rows])
        test_preds_per_fold[fold_no, :] = clf.predict(x_test)

    averaged_test_preds = np.zeros((ntest,))
    averaged_test_preds[:] = test_preds_per_fold.mean(axis=0)
    return holdout_preds.reshape(-1, 1), averaged_test_preds.reshape(-1, 1)

In [21]:
#for i, (train_index, test_index) in enumerate(kf):
    #print(i)
    #print(train_index, test_index)

In [13]:
# Hyper-parameters for the base models; each dict is handed to a wrapper below.

#Ridge
rd_params = dict(alpha=10)

#Lasso
ls_params = dict(alpha=0.005)

#ExtraTreesRegressor
et_params = dict(
    n_jobs=16,
    n_estimators=100,
    max_features=0.5,
    max_depth=12,
    min_samples_leaf=2,
)

#RandomForestRegressor
rf_params = dict(
    n_jobs=16,
    n_estimators=100,
    max_features=0.2,
    max_depth=12,
    min_samples_leaf=2,
)

#XGBRegressor
# 'nrounds' is not an xgboost parameter: XgbWrapper pops it and uses it as the
# number of boosting rounds.
xgb_params = dict(
    seed=0,
    colsample_bytree=0.7,
    silent=1,
    subsample=0.7,
    learning_rate=0.075,
    objective='reg:linear',
    max_depth=4,
    num_parallel_tree=1,
    min_child_weight=1,
    eval_metric='rmse',
    nrounds=500,
)

In [14]:
# Wrap every base learner behind the same train/predict interface, all seeded with SEED.
sklearn_specs = (
    (Ridge, rd_params),
    (Lasso, ls_params),
    (ExtraTreesRegressor, et_params),
    (RandomForestRegressor, rf_params),
)
rd, ls, et, rf = [
    SklearnWrapper(clf=estimator_cls, seed=SEED, params=estimator_params)
    for estimator_cls, estimator_params in sklearn_specs
]
xg = XgbWrapper(seed=SEED, params=xgb_params)

In [15]:
# Out-of-fold train predictions and fold-averaged test predictions per base model,
# computed in the order xg, et, rf, rd, ls.
base_models = (xg, et, rf, rd, ls)
(
    (xg_oof_train, xg_oof_test),
    (et_oof_train, et_oof_test),
    (rf_oof_train, rf_oof_test),
    (rd_oof_train, rd_oof_test),
    (ls_oof_train, ls_oof_test),
) = [get_oof(model) for model in base_models]

In [16]:
# RMSE of each base model's out-of-fold predictions against the log-scale target.
cv_reports = (
    ("RD-CV: {}", rd_oof_train),
    ("LS-CV: {}", ls_oof_train),
    ("ET-CV: {}", et_oof_train),
    ("RF-CV: {}", rf_oof_train),
    ("XG-CV: {}", xg_oof_train),
)
for report_template, oof_pred in cv_reports:
    oof_rmse = sqrt(mean_squared_error(y_train, oof_pred))
    print(report_template.format(oof_rmse))

RD-CV: 0.12948165396809874
LS-CV: 0.1434706431519249
ET-CV: 0.1448541734413712
RF-CV: 0.14109927488749133
XG-CV: 0.12423630369301894


In [17]:
# One column per base model. Note that this rebinds x_train / x_test from the
# raw feature matrices to the stacked predictions, so get_oof must not be
# re-run after this cell without rebuilding the matrices.
stacked_train_cols = (xg_oof_train, et_oof_train, rf_oof_train, rd_oof_train, ls_oof_train)
stacked_test_cols = (xg_oof_test, et_oof_test, rf_oof_test, rd_oof_test, ls_oof_test)
x_train = np.hstack(stacked_train_cols)
x_test = np.hstack(stacked_test_cols)

In [18]:
# Show the shapes of x_train and x_test as a sanity check.
print("{},{}".format(x_train.shape, x_test.shape))

(1460, 5),(1459, 5)


In [19]:
# Wrap x_train / x_test for xgboost; only the training matrix carries labels.
dtrain = xgb.DMatrix(x_train, label=y_train)
dtest = xgb.DMatrix(x_test)

In [22]:
# Booster parameters for the second-level model. This rebinds xgb_params, which
# previously held the base XGBoost model's settings.
xgb_params = dict(
    seed=0,
    colsample_bytree=0.8,
    silent=1,
    subsample=0.6,
    learning_rate=0.01,
    objective='reg:linear',
    max_depth=1,
    num_parallel_tree=1,
    min_child_weight=1,
    eval_metric='rmse',
)

In [23]:
# Cross-validate the second-level model with early stopping; progress is
# printed every 10 rounds.
cv_settings = dict(
    num_boost_round=1000,
    nfold=4,
    seed=SEED,
    stratified=False,
    early_stopping_rounds=25,
    verbose_eval=10,
    show_stdv=True,
)
res = xgb.cv(xgb_params, dtrain, **cv_settings)

[0]	train-rmse:11.4159+0.00727902	test-rmse:11.4159+0.0221769
[10]	train-rmse:10.3275+0.00647517	test-rmse:10.3275+0.0230641
[20]	train-rmse:9.3429+0.00554988	test-rmse:9.34287+0.024078
[30]	train-rmse:8.45265+0.00513113	test-rmse:8.45262+0.0246051
[40]	train-rmse:7.64738+0.00467982	test-rmse:7.64739+0.0251128
[50]	train-rmse:6.91914+0.00420257	test-rmse:6.91935+0.0248506
[60]	train-rmse:6.26005+0.00376315	test-rmse:6.26036+0.0244044
[70]	train-rmse:5.66419+0.0032681	test-rmse:5.66445+0.023992
[80]	train-rmse:5.12536+0.00307779	test-rmse:5.12549+0.023425
[90]	train-rmse:4.63793+0.00261757	test-rmse:4.63797+0.0230213
[100]	train-rmse:4.19691+0.00230221	test-rmse:4.19705+0.022602
[110]	train-rmse:3.79807+0.00209661	test-rmse:3.79819+0.0221507
[120]	train-rmse:3.43723+0.00190219	test-rmse:3.43743+0.0218567
[130]	train-rmse:3.11101+0.00184395	test-rmse:3.11112+0.0213777
[140]	train-rmse:2.81563+0.00157335	test-rmse:2.81592+0.0211086
[150]	train-rmse:2.54879+0.00151812	test-rmse:2.54916+0.0

In [24]:
# Number of boosting rounds for the final fit, derived from the row count of
# the xgb.cv result.
# NOTE(review): confirm whether the "- 1" is intended; it trains one round
# fewer than the number of rows xgb.cv returned.
best_nrounds = res.shape[0] - 1

# Pick the held-out score columns by name. Positional access (iloc[-1, 0] and
# iloc[-1, 1]) silently reports the *training* score whenever the frame lists
# the train-* columns first.
eval_metric = xgb_params.get('eval_metric', 'rmse')
cv_mean = res['test-{}-mean'.format(eval_metric)].iloc[-1]
cv_std = res['test-{}-std'.format(eval_metric)].iloc[-1]

In [28]:
# Report cv_mean and cv_std, both taken from the last row of the xgb.cv result.
print('Ensemble-CV: {0} (+/- {1})'.format(cv_mean, cv_std))

Ensemble-CV: 0.12152400000000001 (+/- 0.010640178593425961)


In [29]:
# Fit the final second-level booster on all of dtrain for best_nrounds rounds.
gbdt = xgb.train(xgb_params, dtrain, best_nrounds)

In [30]:
# The model was trained on log(1 + price), so expm1 maps predictions back to prices.
saleprice = np.expm1(gbdt.predict(dtest))

# Build the submission from the test ids instead of reading a pre-existing
# copy of the output file, which fails on a fresh run.
# NOTE(review): assumes test.csv carries the competition's 'Id' column — confirm.
submission = pd.DataFrame(
    {'Id': test['Id'], 'SalePrice': saleprice},
    columns=['Id', 'SalePrice'],
)
submission.to_csv(SUBMISSION_FILE, index=False)