In [40]:
import pandas as pd
import numpy as np
import pickle
import os
from math import sqrt
from scipy.stats import skew

%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [41]:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

In [42]:
from sklearn.linear_model import BayesianRidge, Ridge, RidgeCV, LinearRegression, ElasticNet, LassoCV, Lasso
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn import svm
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
import xgboost as xgb
from xgboost import XGBRegressor

# Load data

In [43]:
with open("../input/preprocessed_data.pkl", "rb") as f:
    train_data = pickle.load(f)
    test_data = pickle.load(f)
    ids = pickle.load(f)
    log_train_lables = pickle.load(f)
    test_labels = pickle.load(f)

In [44]:
print('There are {0} instances in training data'.format(train_data.shape[0]))
print('There are {0} instances in testing data'.format(test_data.shape[0]))
print('There are {0} features'.format(test_data.shape[1]))

There are 1304 instances in training data
There are 146 instances in testing data
There are 282 features


# Build model

In [45]:
# Set-up
TARGET = 'SalePrice'
NFOLDS = 5
SEED = 3
NROWS = None
ntrain = train_data.shape[0]
ntest = test_data.shape[0]

In [46]:
# Convert from df to np array
x_train = np.array(train_data)
x_test = np.array(test_data)
y_train = log_train_lables

In [56]:
y_train

0       11.198228
1       12.735968
2       12.546871
3       11.385103
4       12.103492
5       12.449023
6       11.856522
7       11.794345
8       12.287657
9       11.648339
10      11.944714
11      12.384223
12      12.596404
13      11.964007
14      12.445093
15      12.013707
16      12.660331
17      12.323416
18      12.834684
19      12.672950
20      11.512935
21      11.728045
22      11.931642
23      11.775297
24      11.695255
25      11.849405
26      11.898195
27      12.072547
28      12.421188
29      12.692506
          ...    
1274    12.727841
1275    11.686887
1276    12.193499
1277    12.206078
1278    11.691080
1279    11.711785
1280    11.927687
1281    11.982935
1282    11.245059
1283    11.845827
1284    11.964007
1285    11.924379
1286    12.278398
1287    11.824087
1288    12.066816
1289    11.740069
1290    12.373708
1291    12.556733
1292    11.976666
1293    11.850832
1294    11.585255
1295    11.824087
1296    12.577640
1297    12.521981
1298    12

In [47]:
# Reset index. If not do this one, there is a bug later
y_train = y_train.reset_index()
del y_train['index']
y_train = y_train['SalePrice']

In [48]:
# Create k-fold
kf = KFold(n_splits=NFOLDS, shuffle=True, random_state=SEED).split(x_train)

In [49]:
class SklearnWrapper(object):
    def __init__(self, clf, params=None):
        #params['random_state'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        return self.clf.predict(x)

In [50]:
class XgbWrapper(object):
    def __init__(self, seed=0, params=None):
        self.param = params
        self.param['seed'] = seed
        self.nrounds = params.pop('nrounds', 250)

    def train(self, x_train, y_train):
        dtrain = xgb.DMatrix(x_train, label=y_train)
        self.gbdt = xgb.train(self.param, dtrain, self.nrounds)

    def predict(self, x):
        return self.gbdt.predict(xgb.DMatrix(x))

In [51]:
#out of folds
def get_oof(clf):
    oof_train = np.zeros((ntrain,))
    oof_test = np.zeros((ntest,))
    oof_test_skf = np.empty((NFOLDS, ntest))

    for i, (train_index, test_index) in enumerate(kf):
        x_tr = x_train[train_index]
        y_tr = y_train[train_index]
        x_te = x_train[test_index]

        clf.train(x_tr, y_tr)

        oof_train[test_index] = clf.predict(x_te)
        oof_test_skf[i, :] = clf.predict(x_test)

    oof_test[:] = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

In [52]:
#Bayesian Ridge
br_params = {
    'alpha_1' : 0.05, 
    'alpha_2' : 0.05
}

#Lasso
ls_params = {
    'alpha': 0.005
}


#Ridge
rd_params = {
    'alpha': 10
}

#Linear
ln_params = {
   'normalize': False 
}

"""
#RandomForestRegressor
rf_params = {
    'n_jobs': 16,
    'n_estimators': 100,
    'max_features': 0.2,
    'max_depth': 12,
    'min_samples_leaf': 2,
}
"""

#XGBRegressor
xgb_params = {
    'seed': 0,
    'colsample_bytree': 0.7,
    'silent': 1,
    'subsample': 0.7,
    'learning_rate': 0.075,
    'objective': 'reg:linear',
    'max_depth': 4,
    'num_parallel_tree': 1,
    'min_child_weight': 1,
    'eval_metric': 'rmse',
    'nrounds': 500
}

In [53]:
br = SklearnWrapper(clf=BayesianRidge, params=br_params)
ls = SklearnWrapper(clf=Lasso, params=ls_params)
rd = SklearnWrapper(clf=Ridge, params=rd_params)
ln = SklearnWrapper(clf=LinearRegression, params=ln_params)
xg = XgbWrapper(seed=SEED, params=xgb_params)

In [54]:
xg_oof_train, xg_oof_test = get_oof(xg)
br_oof_train, br_oof_test = get_oof(br)
ls_oof_train, ls_oof_test = get_oof(ls)
rd_oof_train, rd_oof_test = get_oof(rd)
ln_oof_train, ln_oof_test = get_oof(ln)

In [55]:
xg_oof_train

array([[ 11.33780193],
       [ 12.6322794 ],
       [ 12.55568504],
       ..., 
       [ 11.67632294],
       [ 11.63257027],
       [ 12.32003117]])

In [16]:
print("BR-CV: {}".format(sqrt(mean_squared_error(y_train, br_oof_train))))
print("LS-CV: {}".format(sqrt(mean_squared_error(y_train, ls_oof_train))))
print("RD-CV: {}".format(sqrt(mean_squared_error(y_train, rd_oof_train))))
print("LN-CV: {}".format(sqrt(mean_squared_error(y_train, ln_oof_train))))
print("XG-CV: {}".format(sqrt(mean_squared_error(y_train, xg_oof_train))))

BR-CV: 12.016625713494248
LS-CV: 12.016625713494248
RD-CV: 12.016625713494248
LN-CV: 12.016625713494248
XG-CV: 0.12011081772882488


In [17]:
x_train = np.concatenate((xg_oof_train, br_oof_train, ls_oof_train, rd_oof_train, ln_oof_train), axis=1)
x_test = np.concatenate((xg_oof_test, br_oof_test, ls_oof_test, rd_oof_test, ln_oof_test), axis=1)

In [18]:
print("{},{}".format(x_train.shape, x_test.shape))

(1304, 5),(146, 5)


In [19]:
dtrain = xgb.DMatrix(x_train, label=y_train)
dtest = xgb.DMatrix(x_test)

In [20]:
xgb_params = {
    'seed': 0,
    'colsample_bytree': 0.8,
    'silent': 1,
    'subsample': 0.6,
    'learning_rate': 0.01,
    'objective': 'reg:linear',
    'max_depth': 1,
    'num_parallel_tree': 1,
    'min_child_weight': 1,
    'eval_metric': 'rmse',
}

In [30]:
res = xgb.cv(xgb_params, dtrain, num_boost_round=1500, nfold=NFOLDS, seed=SEED, stratified=False,
             early_stopping_rounds=100, verbose_eval=10, show_stdv=True)

[0]	train-rmse:11.4019+0.0076195	test-rmse:11.4019+0.0308878
[10]	train-rmse:10.3148+0.00686126	test-rmse:10.3147+0.0316798
[20]	train-rmse:9.33147+0.00642421	test-rmse:9.33142+0.0321502
[30]	train-rmse:8.442+0.00578244	test-rmse:8.44195+0.0328378
[40]	train-rmse:7.63787+0.00524816	test-rmse:7.63783+0.0334055
[50]	train-rmse:6.9106+0.00467674	test-rmse:6.91076+0.0334286
[60]	train-rmse:6.25276+0.00424265	test-rmse:6.25288+0.0330725
[70]	train-rmse:5.65766+0.00378794	test-rmse:5.65761+0.0328235
[80]	train-rmse:5.1195+0.00349112	test-rmse:5.11938+0.0322148
[90]	train-rmse:4.63269+0.00319562	test-rmse:4.63274+0.0316225
[100]	train-rmse:4.19244+0.00284032	test-rmse:4.19246+0.0310339
[110]	train-rmse:3.7942+0.00267005	test-rmse:3.79435+0.0301588
[120]	train-rmse:3.43385+0.00260083	test-rmse:3.43399+0.0291548
[130]	train-rmse:3.10802+0.00240649	test-rmse:3.10838+0.0283365
[140]	train-rmse:2.81329+0.00214099	test-rmse:2.81345+0.0274562
[150]	train-rmse:2.54685+0.00202091	test-rmse:2.54688+0.0

In [31]:
best_nrounds = res.shape[0] - 1
cv_mean = res.iloc[-1, 0]
cv_std = res.iloc[-1, 1]

In [32]:
best_nrounds

1499

In [33]:
print('Ensemble-CV: {0} (+/- {1})'.format(cv_mean, cv_std))

Ensemble-CV: 0.12371940000000001 (+/- 0.014839649707456033)


In [34]:
gbdt = xgb.train(xgb_params, dtrain, best_nrounds)

In [35]:
def modelSubmit(model, dtest, name):
    ln_pred = model.predict(dtest)
    pred = np.expm1(ln_pred)
    submission = pd.DataFrame({"Id": ids, "SalePrice": pred})
    filename = os.path.join(name + ".csv")
    submission.to_csv('../output/' + filename, index=False)

In [36]:
modelSubmit(gbdt, dtest, 'xgstacker_3')

In [37]:
def scoreCalculate(model):
    ln_pred = model.predict(dtest)
    pred = np.expm1(ln_pred)
    log_pred = np.log(pred)
    log_labels = np.log(test_labels)
    return sqrt(mean_squared_error(log_labels, log_pred))

In [38]:
scoreCalculate(gbdt)

0.10678209856551041

# Check accuracy of other models