### Stacking with multiple regressors

In [163]:
import importlib
import helper_functions
import pandas as pd
importlib.reload(helper_functions)
from helper_functions import *
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.pipeline import Pipeline, FeatureUnion, _transform_one
from sklearn.externals.joblib import Parallel, delayed
from sklearn.ensemble import BaggingRegressor

In [164]:
# Defining pipeline as in other examples
trans_pipeline = Pipeline([
    ('impute_numerical', DFTransform(lambda X: fill_numerical_nans(X))),
    ('impute_categorical', DFTransform(lambda X: impute_categorical(X))),
    ('impute_special_cases', DFTransform(lambda X: impute_special_cases(X))),
    ('ordinal_features', DFTransform(lambda X: encode_ordinals(X))),
    ('encode_dummies', DFTransform(lambda X: create_dummies(X))),
    ('check_nans', DFTransform(lambda X: check_nans(X))),
    ('create_sellingage', DFTransform(lambda X: create_sellingage(X))),
    ('combined_livingspace', DFTransform(lambda X: combined_livingspace(X)))
])

In [165]:
# Load the data
train_df =  pd.read_csv('data/train.csv')
X_train = train_df.drop(['SalePrice','Id'], axis=1)
y_train = train_df['SalePrice']
X_test = pd.read_csv('data/test.csv').drop(['Id'], axis=1)
X_train, y_train = prepare_inputs(X_train, y_train)

# Transforming the input
X_combined = pd.concat((X_train, X_test)).reset_index(drop=True) 
X_tranformed = trans_pipeline.fit_transform(X_combined)

# Split the transformed input back
X_train_trans = X_tranformed[:X_train.shape[0]] 
X_test_trans = X_tranformed[X_train.shape[0]:]

Creating dummies...
Starting with input of shape: (2915, 79)
Returning output of shape: (2915, 221)


In [166]:
from sklearn.model_selection import train_test_split

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_train_trans, y_train, test_size=.33, random_state=42)

In [167]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn.linear_model import Ridge

# Define first estimator
adaboost_estimator = AdaBoostRegressor(base_estimator=Ridge(alpha=10, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=42, solver='cholesky', tol=0.001),
         learning_rate=0.0001, loss='square', n_estimators=500,
         random_state=42)

In [168]:
# Train the first estimator on the first subset
adaboost_estimator.fit(X_train, y_train)

AdaBoostRegressor(base_estimator=Ridge(alpha=10, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=42, solver='cholesky', tol=0.001),
         learning_rate=0.0001, loss='square', n_estimators=500,
         random_state=42)

In [169]:
# Predict with the first estimator on the second subset
prediction_adaboost = adaboost_estimator.predict(X_train)

In [170]:
from sklearn.linear_model import Lasso

# Define second regressor
lasso_estimator = BaggingRegressor(
        Pipeline(memory=None, steps=[('lasso_regression', Lasso(alpha=0.0001, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=True, positive=False, precompute=False, random_state=42,
   selection='cyclic', tol=0.0001, warm_start=False))]), 
        n_estimators=500, bootstrap=True, n_jobs=4
)

In [171]:
# Train the second estimator on the first subset
lasso_estimator.fit(X_train, y_train)

BaggingRegressor(base_estimator=Pipeline(memory=None,
     steps=[('lasso_regression', Lasso(alpha=0.0001, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=True, positive=False, precompute=False, random_state=42,
   selection='cyclic', tol=0.0001, warm_start=False))]),
         bootstrap=True, bootstrap_features=False, max_features=1.0,
         max_samples=1.0, n_estimators=500, n_jobs=4, oob_score=False,
         random_state=None, verbose=0, warm_start=False)

In [172]:
# Predict with the second estimator on the second subset
prediction_lasso = lasso_estimator.predict(X_train)

In [173]:
from sklearn.ensemble import BaggingRegressor
from sklearn.model_selection import GridSearchCV   
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import make_scorer

param_dict = {'robust_scaler__with_scaling' : [True, False],
              'robust_scaler__with_centering' : [True, False],
              'ridge_regression__alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10]
             }
     
scorer = make_scorer(rmse_score, greater_is_better=False)
parameter_estimator = GridSearchCV(Pipeline([
        ('robust_scaler', RobustScaler()),
        ('ridge_regression', Ridge(solver='cholesky'))
            ]), param_dict, scoring=scorer)

parameter_estimator.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('robust_scaler', RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
       with_scaling=True)), ('ridge_regression', Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='cholesky', tol=0.001))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'robust_scaler__with_scaling': [True, False], 'robust_scaler__with_centering': [True, False], 'ridge_regression__alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=make_scorer(rmse_score, greater_is_better=False), verbose=0)

In [174]:
# Not we define a bagging ensemble for the blending
bagging_estimator = parameter_estimator.best_estimator_

In [175]:
# This blender now we train on the predictions of the first layer
X_blended = np.column_stack((prediction_lasso, prediction_adaboost))

bagging_estimator.fit(X_blended, y_train)

Pipeline(memory=None,
     steps=[('robust_scaler', RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=False,
       with_scaling=True)), ('ridge_regression', Ridge(alpha=10, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='cholesky', tol=0.001))])

In [176]:
# And now we use the stack to make a prediction on unseen data
test_prediction_ada = adaboost_estimator.predict(X_test)
test_prediction_lasso = lasso_estimator.predict(X_test)

X_test_blended = np.column_stack((test_prediction_lasso, test_prediction_ada))
y_predicted = bagging_estimator.predict(X_test_blended)

In [177]:
print_benchmark(y_test, y_predicted)

R2-score: 0.904038380484
RMSE (log): 0.12244306131555652


In [178]:
# Now we train and execute the stacked model on complete data set
train_df =  pd.read_csv('data/train.csv')
X_train = train_df.drop(['SalePrice','Id'], axis=1)
y_train = train_df['SalePrice']
X_test = pd.read_csv('data/test.csv').drop(['Id'], axis=1)
X_train, y_train = prepare_inputs(X_train, y_train)

# Transforming the input
X_combined = pd.concat((X_train, X_test)).reset_index(drop=True) 
X_tranformed = trans_pipeline.fit_transform(X_combined)

# Split the transformed input back
X_train_trans = X_tranformed[:X_train.shape[0]] 
X_test_trans = X_tranformed[X_train.shape[0]:]

Creating dummies...
Starting with input of shape: (2915, 79)
Returning output of shape: (2915, 221)


In [186]:
# First we train on all the training data
train_df =  pd.read_csv('data/train.csv')
X_train = train_df.drop(['SalePrice','Id'], axis=1)
y_train = train_df['SalePrice']
X_test = pd.read_csv('data/test.csv').drop(['Id'], axis=1)
X_train, y_train = prepare_inputs(X_train, y_train)

# Transforming the input
X_combined = pd.concat((X_train, X_test)).reset_index(drop=True) 
X_tranformed = trans_pipeline.fit_transform(X_combined)

# Split the transformed input back
X_train_trans = X_tranformed[:X_train.shape[0]] 
X_test_trans = X_tranformed[X_train.shape[0]:]

Creating dummies...
Starting with input of shape: (2915, 79)
Returning output of shape: (2915, 221)


In [187]:
# First we fit the layer 1 estimators to all the training data
adaboost_estimator.fit(X_train_trans, y_train)
lasso_estimator.fit(X_train_trans, y_train)

# We also train the blender on all the training data
predictions_train_ada = adaboost_estimator.predict(X_train_trans)
predictions_train_lasso = lasso_estimator.predict(X_train_trans)
X_train_blended = np.column_stack((predictions_train_lasso, predictions_train_ada))
bagging_estimator.fit(X_train_blended, y_train)

Pipeline(memory=None,
     steps=[('robust_scaler', RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=False,
       with_scaling=True)), ('ridge_regression', Ridge(alpha=10, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='cholesky', tol=0.001))])

In [189]:
# To check we first evaluate the predictions on the whole test set
y_predicted_test = bagging_estimator.predict(X_train_blended)
print_benchmark(y_train, y_predicted_test)

R2-score: 0.933802274265
RMSE (log): 0.1018713919828518


In [188]:
# Now we predict on the test set

# Make predictions in the first layer
predictions_test_ada = adaboost_estimator.predict(X_test_trans)
predictions_test_lasso = lasso_estimator.predict(X_test_trans)

# And put them in the blender
X_test_blended = np.column_stack((test_prediction_lasso, test_prediction_ada))
y_predicted = bagging_estimator.predict(X_test_blended)

In [192]:
write_submission(y_predicted, False)

File written to C:\Source\predicting-house-prices\submissions\20180823091142.csv


Kaggle score: 0.12111