In [1]:
from __future__ import division
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_california_housing
#From within an ipython notebook
%matplotlib inline
cali_housing = fetch_california_housing()
X = cali_housing.data
y = cali_housing.target

In [2]:
bins = np.arange(6)
binned_y = np.digitize(y, bins)

In [3]:
from sklearn.model_selection import train_test_split
X_train_prin, X_test_prin, y_train_prin, y_test_prin = train_test_split(X,y,test_size=0.2,stratify=binned_y,random_state=7)
binned_y_train_prin = np.digitize(y_train_prin, bins)
X_1, X_stack, y_1, y_stack =train_test_split(X_train_prin,y_train_prin,test_size=0.33,stratify=binned_y_train_prin,random_state=7)


### First base model neural network

In [4]:
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
mlp_pipe = Pipeline(steps=[('scale', StandardScaler()),
('neural_net', MLPRegressor())])
param_grid = {'neural_net__alpha': [0.02,0.01,0.005],
               'neural_net__hidden_layer_sizes' : [(50,50,50)],
               'neural_net__activation': ['relu'],
               'neural_net__solver' : ['adam']
               }
neural_net_gs = GridSearchCV(mlp_pipe, param_grid = param_grid,cv=3, n_jobs=-1)
neural_net_gs.fit(X_1, y_1)

GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('scale', StandardScaler()),
                                       ('neural_net', MLPRegressor())]),
             n_jobs=-1,
             param_grid={'neural_net__activation': ['relu'],
                         'neural_net__alpha': [0.02, 0.01, 0.005],
                         'neural_net__hidden_layer_sizes': [(50, 50, 50)],
                         'neural_net__solver': ['adam']})

In [5]:
neural_net_gs.best_params_

{'neural_net__activation': 'relu',
 'neural_net__alpha': 0.02,
 'neural_net__hidden_layer_sizes': (50, 50, 50),
 'neural_net__solver': 'adam'}

In [6]:
neural_net_gs.best_score_

0.7816385770866708

In [7]:
nn_best = neural_net_gs.best_estimator_

In [8]:
import pickle
f = open('nn_best.save', 'wb')
pickle.dump(nn_best, f, protocol = pickle.HIGHEST_PROTOCOL)
f.close()

### Second base model – gradient boost ensemble

In [9]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import GradientBoostingRegressor

In [10]:
param_grid = {'learning_rate': [0.1,0.05,0.03,0.01],
           'loss': ['huber'],
           'max_depth': [5,7,10],
           'max_features': [0.4,0.6,0.8,1.0],
           'min_samples_leaf': [2,3,5],
           'n_estimators': [100],
           'warm_start': [True], 'random_state':[7]
           }
boost_gs = RandomizedSearchCV(GradientBoostingRegressor(),param_distributions = param_grid,cv=3, n_jobs=-1,n_iter=25)
boost_gs.fit(X_1, y_1)
#注意这一步耗时很久，大概有15分钟左右

RandomizedSearchCV(cv=3, estimator=GradientBoostingRegressor(), n_iter=25,
                   n_jobs=-1,
                   param_distributions={'learning_rate': [0.1, 0.05, 0.03,
                                                          0.01],
                                        'loss': ['huber'],
                                        'max_depth': [5, 7, 10],
                                        'max_features': [0.4, 0.6, 0.8, 1.0],
                                        'min_samples_leaf': [2, 3, 5],
                                        'n_estimators': [100],
                                        'random_state': [7],
                                        'warm_start': [True]})

In [11]:
boost_gs.best_score_

0.8250732670290075

In [12]:
boost_gs.best_params_

{'warm_start': True,
 'random_state': 7,
 'n_estimators': 100,
 'min_samples_leaf': 5,
 'max_features': 0.4,
 'max_depth': 10,
 'loss': 'huber',
 'learning_rate': 0.1}

In [13]:
gbt_inst = GradientBoostingRegressor(**{'learning_rate': 0.1,
             'loss': 'huber',
             'max_depth': 10,
             'max_features': 0.4,
             'min_samples_leaf': 5,
             'n_estimators': 4000,
             'warm_start': True, 'random_state':7}).fit(X_1, y_1)

In [14]:
def pickle_func(filename, saved_object):
     import pickle
     f = open(filename, 'wb')
     pickle.dump(saved_object, f, protocol = pickle.HIGHEST_PROTOCOL)
     f.close()
     return None
pickle_func('grad_boost.save', gbt_inst)

### Third base model – bagging regressor of gradient boost ensembles

In [15]:
from sklearn.ensemble import BaggingRegressor,GradientBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV
param_dist = {
  'max_samples': [0.5,1.0],
  'max_features' : [0.5,1.0],
  'oob_score' : [True, False],
  'base_estimator__min_samples_leaf': [4,5],
  'n_estimators': [20]
}
single_estimator = GradientBoostingRegressor(**{'learning_rate':0.1,
    'loss': 'huber',
    'max_depth': 10,
    'max_features': 0.4,
    'n_estimators': 20,
    'warm_start': True, 'random_state':7})
ensemble_estimator = BaggingRegressor(base_estimator =single_estimator)
pre_gs_inst_bag = RandomizedSearchCV(ensemble_estimator,param_distributions = param_dist,
    cv=3,
    n_iter = 5,
    n_jobs=-1)
pre_gs_inst_bag.fit(X_1, y_1)

RandomizedSearchCV(cv=3,
                   estimator=BaggingRegressor(base_estimator=GradientBoostingRegressor(loss='huber',
                                                                                       max_depth=10,
                                                                                       max_features=0.4,
                                                                                       n_estimators=20,
                                                                                       random_state=7,
                                                                                       warm_start=True)),
                   n_iter=5, n_jobs=-1,
                   param_distributions={'base_estimator__min_samples_leaf': [4,
                                                                             5],
                                        'max_features': [0.5, 1.0],
                                        'max_samples': [0.5, 1.0],
                   

  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "


In [16]:
 pre_gs_inst_bag.best_score_

0.7735076481942181

In [17]:
pre_gs_inst_bag.best_params_

{'oob_score': True,
 'n_estimators': 20,
 'max_samples': 0.5,
 'max_features': 1.0,
 'base_estimator__min_samples_leaf': 4}

In [18]:
pickle_func('bag_gbm.save', pre_gs_inst_bag.best_estimator_)

### Some functions of the stacker

In [19]:
def handle_X_set(X_train_set_in):
    X_train_set = X_train_set_in.copy()
    y_pred_nn = neural_net.predict(X_train_set)
    y_pred_gbt = gbt.predict(X_train_set)
    y_pred_bag = bag_gbm.predict(X_train_set)
    preds_df = pd.DataFrame(columns = ['nn', 'gbt','bag'])
    preds_df['nn'] = y_pred_nn
    preds_df['gbt'] = y_pred_gbt
    preds_df['bag'] = y_pred_bag
    return preds_df
def predict_from_X_set(X_train_set_in):
    X_train_set = X_train_set_in.copy()
    return final_etr.predict(handle_X_set(X_train_set))

In [20]:
def pickle_load_func(filename):
    f = open(filename, 'rb')
    to_return = pickle.load(f)
    f.close()
    return to_return
neural_net = pickle_load_func('nn_best.save')
gbt = pickle_load_func('grad_boost.save')
bag_gbm = pickle_load_func('bag_gbm.save')

In [21]:
preds_df = handle_X_set(X_stack)
print (preds_df.corr())

           nn       gbt       bag
nn   1.000000  0.954076  0.959999
gbt  0.954076  1.000000  0.978489
bag  0.959999  0.978489  1.000000


### Meta-learner – extra trees regressor

In [22]:
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import RandomizedSearchCV
param_dist = {'max_features' : ['sqrt','log2',1.0],
    'min_samples_leaf' : [1, 2, 3, 7, 11],
    'n_estimators': [50, 100],
    'oob_score': [True, False]}
pre_gs_inst = RandomizedSearchCV(ExtraTreesRegressor(warm_start=True,bootstrap=True,random_state=7),
param_distributions = param_dist,
    cv=3,
    n_iter = 15,random_state=7)
pre_gs_inst.fit(preds_df.values, y_stack)

RandomizedSearchCV(cv=3,
                   estimator=ExtraTreesRegressor(bootstrap=True, random_state=7,
                                                 warm_start=True),
                   n_iter=15,
                   param_distributions={'max_features': ['sqrt', 'log2', 1.0],
                                        'min_samples_leaf': [1, 2, 3, 7, 11],
                                        'n_estimators': [50, 100],
                                        'oob_score': [True, False]},
                   random_state=7)