In [3]:
import pandas as pd
import numpy as np
import pickle
import altair as alt
from aoc import timeit
%load_ext autoreload
%autoreload 2

def load(filename):
    """Load and return the object pickled at ``filename``.

    The path is used exactly as given; no ``output/`` prefix is added.

    NOTE: unpickling can execute arbitrary code, so only call this on files
    this project wrote itself.
    """
    # The context manager closes the handle even when unpickling raises; the
    # handle used to be left open until garbage collection.
    with open(filename, "rb") as handle:
        return pickle.load(handle)
    
def save(model, filename='bestmodel.pickle'):
    """Pickle ``model`` to ``output/<filename>`` with the highest protocol.

    The ``output/`` directory has to exist already; a file of the same name
    is overwritten.
    """
    target_path = 'output/' + filename
    with open(target_path, 'wb') as out_file:
        pickle.dump(model, out_file, protocol=pickle.HIGHEST_PROTOCOL)

def save_feature_selection(cols, filename='feat_selection.pickle'):
    """Pickle the selected feature columns ``cols`` to ``output/<filename>``.

    Delegates to ``save`` so the output location and pickle protocol are
    defined in one place instead of being duplicated here.
    """
    save(cols, filename)

def submit(model):
    """Predict with ``model`` on ``final_test`` and write ``output/submission.csv``.

    Predictions go through ``np.exp`` before being stored (presumably the
    models are fitted on log prices -- confirm against the feature-engineering
    step). The notebook-level ``final_test`` frame gains or overwrites a
    ``SalePrice`` column, and the CSV holds the ``Id`` and ``SalePrice``
    columns only.
    """
    # An earlier call (or the blending cell) may already have written
    # 'SalePrice' into final_test. Drop it from the model input so the model
    # never sees a previously written prediction as a feature.
    features = final_test.drop(columns=['SalePrice'], errors='ignore')
    pred = model.predict(features)
    final_test['SalePrice'] = np.exp(pred)
    final_test[['Id','SalePrice']].to_csv('output/submission.csv', index=False)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Import data

In [4]:
# Load the engineered datasets and the selected feature columns.
# NOTE: pickle.load can execute arbitrary code; these files are expected to be
# artefacts written by this project.
# Context managers close each handle as soon as its pickle has been read.
with open("output/engineered_datasets.pickle","rb") as f:
    train_x, train_y, final_test, num_x, cat_x, cat_x_ind = pickle.load(f)
with open("output/feat_selection.pickle","rb") as f:
    cols = pickle.load(f)

## Import preprocessing pipelines & models

In [5]:

from utils.sklearn_custom_steps import DFSimpleImputer, DFOneHotEncoder,DFMinMaxScaler,DFColumnTransformer,DFOutlierExtractor,DFOutlierExtractor,DFStandardScaler,DFRobustScaler,DFSmartImputer, DFUnSkewer, DFPowerTransformer
from utils.sklearn_custom_steps import get_pipeline
from utils.model_hyperparameters import models

from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV, ElasticNet,SGDRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn import svm
from sklearn.neural_network import MLPRegressor
from sklearn.kernel_ridge import KernelRidge
import lightgbm as lgb
import xgboost as xgb
from sklearn.model_selection import cross_validate
from catboost import CatBoostRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler,StandardScaler,RobustScaler
from utils.model_hyperparameters import AutoCatBoostRegressor

## CV methods

In [6]:
def cross_val_models(to_test,train_x=train_x,**kwargs):
    """Cross-validate every model named in ``to_test`` and print its scores.

    Each name is looked up in ``models``. A pipeline is built from that
    entry's ``model`` and ``preprocess`` settings plus any extra ``kwargs``,
    then handed to ``test_pipeline``. Nothing is returned.

    Note: the ``train_x`` default is bound when this function is defined.
    """
    for model_name in to_test:
        # Print the label without a newline; test_pipeline prints the scores
        # on the same line.
        print(model_name.ljust(20), end=': ')
        spec = models[model_name]
        candidate = get_pipeline(spec.model, **spec.preprocess, **kwargs)
        test_pipeline(candidate, train_x=train_x)
         
def test_model(model,train_x = train_x,param=None):
    """Wrap ``model`` in a pipeline and cross-validate it.

    ``param`` is an optional dict of keyword arguments for ``get_pipeline``;
    a missing or empty value means no extra arguments. Returns whatever
    ``test_pipeline`` returns.
    """
    pipeline_kwargs = param or {}
    candidate = get_pipeline(model, **pipeline_kwargs)
    return test_pipeline(candidate, train_x=train_x)

def test_pipeline(pipe,train_x = train_x, num_fold=5):
    """Cross-validate ``pipe`` against ``train_y`` and print mean RMSE scores.

    Parameters
    ----------
    pipe : estimator or pipeline accepted by ``cross_validate``.
    train_x : feature frame to validate on (the default is bound when this
        function is defined).
    num_fold : passed to ``cross_validate`` as ``cv``. Defaults to 5, the
        value that used to be hard-coded here.

    Returns
    -------
    The same ``pipe`` object that was passed in.
    """
    scores = cross_validate(
        pipe,
        train_x,
        train_y,
        scoring='neg_root_mean_squared_error',
        cv=num_fold,
        return_train_score=True,
    )
    # Average over the scores actually returned, so ``num_fold`` does not have
    # to be an integer for the mean to be correct.
    n_scores = len(scores['test_score'])
    # The scorer is negated RMSE; flip the sign back for reporting.
    mean_test = -1 * sum(scores['test_score'])/n_scores
    mean_train = -1 * sum(scores['train_score'])/n_scores
    print(f"test {mean_test:.7f}, train {mean_train:.7f}")
    return pipe

## Hyperparameter search over the preprocessing pipeline

In [7]:
# Load the saved hyperparameter-tuning results. The context manager closes the
# handle once the pickle has been read.
with open("output/hyperparam_tuning.pickle","rb") as f:
    results = pickle.load(f)
def get_estimator(model_name, results):
    """Rebuild the pipeline for ``model_name`` with its tuned parameters.

    The pipeline is created from the ``models`` registry entry and then
    configured with ``results[model_name].best_params_``.
    """
    spec = models[model_name]
    tuned_pipeline = get_pipeline(spec.model, **spec.preprocess)
    tuned_pipeline.set_params(**results[model_name].best_params_)
    return tuned_pipeline

In [11]:
def hyperparam_search_pipeline(model_name, pipe):
    """Grid-search preprocessing choices for ``pipe`` on ``train_x[cols]``.

    ``model_name`` is only used for the progress message. The search covers
    the numeric scaler and the numeric/categorical imputation strategies with
    5-fold CV scored by negated RMSE.

    Returns ``cv_results_`` as a DataFrame sorted by ``rank_test_score``.
    """
    print('start', model_name)
    # Search dimensions that are currently disabled, kept for reference:
    #   'preprocess__col_trans__category_cat_to_num': [DFOneHotEncoder(handle_unknown="ignore")]
    #   'preprocess__col_trans__numeric__unskew_num': [DFUnSkewer(), 'passthrough']
    param_grid = {
        'preprocess__col_trans__numeric__scale_num': [
            DFStandardScaler(),
            DFRobustScaler(),
            DFMinMaxScaler(),
            DFPowerTransformer(),
        ],
        'preprocess__col_trans__numeric__impute_num__strategy': ['mean', 'median', 'most_frequent'],
        'preprocess__col_trans__category__impute_cat__strategy': ['most_frequent', 'constant'],
    }
    grid = GridSearchCV(
        pipe,
        param_grid,
        cv=5,
        scoring='neg_root_mean_squared_error',
        verbose=1,
    )
    grid.fit(train_x[cols], train_y)
    return pd.DataFrame(grid.cv_results_).sort_values(by='rank_test_score')
# Run the preprocessing grid search for every tuned model. The dict is
# re-saved inside the loop, after each model finishes.
pipe_search = {}
for model_name in results.keys():
    pipe_search[model_name] = hyperparam_search_pipeline(
        model_name,
        get_estimator(model_name, results),
    )
    save(pipe_search, 'hyperparam_pipe.pickle')


start AutoCatBoostRegressor
Fitting 5 folds for each of 24 candidates, totalling 120 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 120 out of 120 | elapsed: 22.1min finished
start ElasticNet
Fitting 5 folds for each of 24 candidates, totalling 120 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 120 out of 120 | elapsed:  1.0min finished
start KernelRidge
Fitting 5 folds for each of 24 candidates, totalling 120 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 120 out of 120 | elapsed:  1.2min finished
start Lasso
Fitting 5 folds for each of 24 candidates, totalling 120 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 120 out of 120 | elapsed:   37.2s finished
start xgb.XGBRegressor
Fitting 5 folds for each of 24 candidates, totalli

In [6]:
# Report the top-ranked preprocessing configuration per model. Each frame was
# stored sorted by rank, so after reset_index() row 0 is the best candidate.
pipe_search = load('output/hyperparam_pipe.pickle')
BEST_PARAM_COLUMNS = (
    'param_preprocess__col_trans__category__impute_cat__strategy',
    'param_preprocess__col_trans__numeric__impute_num__strategy',
    'param_preprocess__col_trans__numeric__scale_num',
)
for model_name, res in pipe_search.items():
    res = res.reset_index()
    print(model_name)
    print('best score', res.loc[0, 'mean_test_score'])
    print('best params', *[res.loc[0, column] for column in BEST_PARAM_COLUMNS])

AutoCatBoostRegressor
best score -0.1153902521105318
best params constant mean DFMinMaxScaler
ElasticNet
best score -0.11587168950921545
best params most_frequent median DFStandardScaler
KernelRidge
best score -0.11674511912812666
best params most_frequent median DFStandardScaler
Lasso
best score -0.1158660391027462
best params most_frequent median DFStandardScaler
xgb.XGBRegressor
best score -0.11712434284626357
best params most_frequent most_frequent DFMinMaxScaler
lgb.LGBMRegressor
best score -0.11844457036420222
best params constant median DFRobustScaler


## Test of all models

In [11]:
# Full dataset: cross-validate every tuned pipeline on all columns of train_x.
for model_name in results.keys():
    print(model_name)
    test_pipeline(
        get_estimator(model_name, results),
        train_x=train_x,
    )

AutoCatBoostRegressor
test 0.1167024, train 0.0267797
ElasticNet
test 0.1176089, train 0.0997127
KernelRidge
test 0.1211799, train 0.1044793
Lasso
test 0.1175548, train 0.0994643
xgb.XGBRegressor
test 0.1217188, train 0.0448069
lgb.LGBMRegressor
test 0.1217541, train 0.0508283


In [12]:
# Selected columns: same check, restricted to the feature subset in `cols`.
for model_name in results.keys():
    print(model_name)
    test_pipeline(
        get_estimator(model_name, results),
        train_x=train_x[cols],
    )

AutoCatBoostRegressor
test 0.1153903, train 0.0287469
ElasticNet
test 0.1158717, train 0.1040336
KernelRidge
test 0.1167451, train 0.1066567
Lasso
test 0.1158660, train 0.1038959
xgb.XGBRegressor
test 0.1171243, train 0.0469092
lgb.LGBMRegressor
test 0.1184446, train 0.0548421


## Hyperparameter tuning

In [9]:
from skopt import BayesSearchCV, callbacks
from skopt.space import Real, Categorical, Integer

# Time budget handed to skopt's DeadlineStopper for each model's search
# (seconds, per the skopt documentation).
TRAIN_TIME = 1500
# Iteration count attached to a model's search space by gen_opt_settings when
# that model has tunable hyperparameters.
NUM_ITERATIONS = 150
# Number of consecutive callback calls without a better best score after which
# optimize_model's callback asks the search to stop.
NO_IMPROVEMENT_STOP_THRES = 25

def gen_opt_settings(model_name):
    """Build the ``(search_space, n_iterations)`` tuple for ``model_name``.

    The search space always holds the estimator under ``'model'`` and one
    ``'model__<name>'`` entry per hyperparameter in the registry entry's
    ``hyper`` mapping. Models without hyperparameters get a single iteration;
    all others get ``NUM_ITERATIONS``.
    """
    spec = models[model_name]
    search_space = {'model': [spec.model]}
    for hyper_name, hyper_space in spec.hyper.items():
        search_space['model__' + hyper_name] = hyper_space
    n_iterations = NUM_ITERATIONS if spec.hyper else 1
    return (search_space, n_iterations)

def optimize_model(model_name,train_x = train_x):
    """Run a Bayesian hyperparameter search for ``model_name``.

    A ``BayesSearchCV`` is built around the model's pipeline and fitted on
    ``train_x`` / ``train_y`` with three callbacks: a no-improvement detector,
    a checkpoint saver writing ``output/<model_name>_skopt.pkl`` and a
    deadline stopper using ``TRAIN_TIME``.

    Returns the fitted search object. It is created with ``refit=False`` and
    carries an extra ``train_status`` dict used by the detector.
    """
    print('running', model_name)

    def no_improvement_detector(optim_result):
        """Return True after ``stop_thres`` consecutive calls without a new best score."""
        best_so_far = opt.best_score_
        print(f"{'best score':15}{best_so_far}")
        status = opt.train_status
        if best_so_far > status['current_score']:
            # New best: remember it and restart the patience counter.
            status['current_score'] = best_so_far
            status['not_improving'] = 0
            return None
        status['not_improving'] += 1
        if status['not_improving'] == status['stop_thres']:
            return True
        return None

    checkpointsaver = callbacks.CheckpointSaver("output/" + model_name + "_skopt.pkl")
    deadlinestopper = callbacks.DeadlineStopper(TRAIN_TIME)

    spec = models[model_name]
    base_pipeline = get_pipeline(spec.model, **spec.preprocess)
    search_spaces = [gen_opt_settings(model_name)]
    opt = BayesSearchCV(
        base_pipeline,
        search_spaces,
        cv=5,
        scoring='neg_root_mean_squared_error',
        return_train_score=True,
        random_state=112,
        refit=False,
    )
    # Bookkeeping for the detector lives on the search object itself.
    opt.train_status = {
        'current_score': -100,
        'not_improving': 0,
        'stop_thres': NO_IMPROVEMENT_STOP_THRES,
    }
    stop_callbacks = [no_improvement_detector, checkpointsaver, deadlinestopper]
    opt.fit(train_x, train_y, callback=stop_callbacks)
    return opt

def hashing(self):
    """Return the same fixed integer for every instance (a constant ``__hash__``)."""
    constant_hash = 8398398478478
    return constant_hash
# Give CatBoostRegressor a constant hash; without it skopt fails on this
# estimator (presumably because the estimator sits in the 'model' search
# dimension and must be hashable -- confirm).
CatBoostRegressor.__hash__ = hashing

# Explicit list of models to tune. Use `list(models)` here instead to tune
# every registered model. The former `to_test = [k for k in models]` line was
# a dead assignment, overwritten by this list straight away.
to_test = [
    'AutoCatBoostRegressor',
    'ElasticNet',
    'KernelRidge',
    'Lasso',
    'xgb.XGBRegressor',
    'lgb.LGBMRegressor']

# Tune each model on the selected feature columns, then persist all search
# objects together in output/hyperparam_tuning.pickle.
results = {}
for name in to_test:
    results[name] = optimize_model(name, train_x[cols])
save(results,'hyperparam_tuning.pickle')

running AutoCatBoostRegressor
best score     -0.11642536345507444
best score     -0.11642536345507444
best score     -0.11642536345507444
best score     -0.11642536345507444
best score     -0.11642536345507444
best score     -0.11541756067751201
best score     -0.11541756067751201
best score     -0.11541756067751201
best score     -0.11541756067751201
best score     -0.11541756067751201
best score     -0.11541756067751201
best score     -0.1153902521105318
best score     -0.1153902521105318
best score     -0.1153902521105318
best score     -0.1153902521105318
best score     -0.1153902521105318
best score     -0.1153902521105318
best score     -0.1153902521105318
best score     -0.1153902521105318
best score     -0.1153902521105318
best score     -0.1153902521105318
best score     -0.1153902521105318
best score     -0.1153902521105318
best score     -0.1153902521105318
running ElasticNet
best score     -0.15391917297445676
best score     -0.13207174365947225
best score     -0.1199795307

In [5]:
# Reload the saved hyperparameter-tuning results. The context manager closes
# the handle once the pickle has been read.
with open("output/hyperparam_tuning.pickle","rb") as f:
    results = pickle.load(f)

In [10]:
# Summarize tuning results: one line per model describing its best-ranked run.
# The loop variable is `model_name` rather than `model`, so this cell no longer
# overwrites the notebook-level `model` that later cells pass to submit().
for model_name in results:
    cv_results = results[model_name].cv_results_
    # Position of the rank-1 candidate. list() keeps .index() usable when the
    # ranks are stored as an array rather than a plain list.
    best_run = list(cv_results['rank_test_score']).index(1)
    # Scores are negated RMSE; flip the sign for display.
    mean_test_score = -1 * cv_results['mean_test_score'][best_run]
    std_test_score = cv_results['std_test_score'][best_run]
    mean_train_score = -1 * cv_results['mean_train_score'][best_run]
    mean_score_time = cv_results['mean_score_time'][best_run]
    print(f"{model_name:<30} Best score {mean_test_score:.4f} std {std_test_score:.4f} train {mean_train_score:.4f} time {mean_score_time:.4f}")

AutoCatBoostRegressor          Best score 0.1154 std 0.0062 train 0.0287 time 0.0950
ElasticNet                     Best score 0.1159 std 0.0120 train 0.1038 time 0.1147
KernelRidge                    Best score 0.1167 std 0.0104 train 0.1067 time 0.1265
Lasso                          Best score 0.1159 std 0.0119 train 0.1039 time 0.0852
xgb.XGBRegressor               Best score 0.1171 std 0.0109 train 0.0469 time 0.0592
lgb.LGBMRegressor              Best score 0.1183 std 0.0088 train 0.0675 time 0.0702


In [None]:
AutoCatBoostRegressor          Best score 0.1154 std 0.0062 train 0.0287 time 0.0474
ElasticNet                     Best score 0.1159 std 0.0119 train 0.1040 time 0.1059
KernelRidge                    Best score 0.1168 std 0.0104 train 0.1067 time 0.1179
Lasso                          Best score 0.1159 std 0.0119 train 0.1039 time 0.0895
xgb.XGBRegressor               Best score 0.1177 std 0.0090 train 0.0468 time 0.1028
lgb.LGBMRegressor              Best score 0.1190 std 0.0085 train 0.0544 time 0.0870

## Stacking the best models from hyperparameter tuning

In [11]:
from sklearn.ensemble import StackingRegressor

def get_estimator(model_name, search_results=None):
    """Rebuild the pipeline for ``model_name`` with its tuned parameters.

    Parameters
    ----------
    model_name : key into the ``models`` registry and the tuning results.
    search_results : optional mapping of model name to fitted search object.
        When omitted, the notebook-level ``results`` is used. Accepting it
        keeps two-argument calls such as ``get_estimator(name, results)``
        working after this cell redefines the function.

    Returns
    -------
    A new pipeline configured with the search's ``best_params_``.
    """
    if search_results is None:
        search_results = results
    model = get_pipeline(models[model_name].model, **models[model_name].preprocess)
    model.set_params(**search_results[model_name].best_params_)
    return model

# Names of the tuned models that feed the stacking ensemble.
to_stack_list = [
    'AutoCatBoostRegressor',
    'ElasticNet',
    'KernelRidge',
    'Lasso',
    'xgb.XGBRegressor',
    'lgb.LGBMRegressor',
]

# Each base estimator is a pipeline rebuilt from its tuned parameters.
to_stack = []
for model_name in to_stack_list:
    to_stack.append((model_name, get_estimator(model_name)))
model = StackingRegressor(to_stack, passthrough=False)

# Cross-validated estimate for the stack on the selected feature columns.
num_fold = 5
scores = cross_validate(
    model,
    train_x[cols],
    train_y,
    scoring='neg_root_mean_squared_error',
    cv=num_fold,
    return_train_score=True,
)
print(f"stacking model train {-1 * sum(scores['train_score'])/num_fold:.4f}, test {-1 * sum(scores['test_score'])/num_fold:.4f}")

# Fit on all training rows and persist the ensemble.
model.fit(train_x[cols], train_y)
save(model, 'ensemble.pickle')

stacking model train 0.0697, test 0.1108


## Blending, to get rid of some of the overfitting

In [14]:
# Load the fitted stacking ensemble. The context manager closes the handle
# once the pickle has been read.
with open("output/ensemble.pickle","rb") as f:
    model = pickle.load(f)

In [18]:
# Blend: every base estimator of the stack gets a fixed weight of 1/7 and the
# stacked regressor itself gets the remainder, so the weights sum to 1.
BASE_WEIGHT = 1/7

# A previous run of this cell or of submit() may already have written
# 'SalePrice' into final_test; keep that column out of the model input.
blend_features = final_test.drop(columns=['SalePrice'], errors='ignore')

preds = [estimator.predict(blend_features) for estimator in model.estimators_]
weights = np.array([BASE_WEIGHT]*len(preds) + [(1-len(preds)*BASE_WEIGHT)])
print(weights)
preds.append(model.predict(blend_features))
print(len(preds))

# Stack the predictions explicitly into (n_models, n_rows) before weighting
# instead of relying on implicit list-to-array coercion.
weighted_preds = np.asarray(preds) * weights[:, None]
final_preds = np.sum(weighted_preds,axis=0)

# Predictions go through np.exp before being written out.
final_test['SalePrice'] = np.exp(final_preds)
final_test[['Id','SalePrice']].to_csv('output/blend_submission.csv', index=False)

[0.14285714 0.14285714 0.14285714 0.14285714 0.14285714 0.14285714
 0.14285714]
7


In [22]:
# Display the parameters chosen by the search for the XGBoost pipeline.
results['xgb.XGBRegressor'].best_params_

OrderedDict([('model', XGBRegressor(base_score=None, booster=None,
                           colsample_bylevel=0.8796707319903774, colsample_bynode=None,
                           colsample_bytree=0.9954194772260316, gamma=0.009380863327905845,
                           gpu_id=None, importance_type='gain', interaction_constraints=None,
                           learning_rate=0.032237466373952015,
                           max_delta_step=4.320822620313757, max_depth=4, min_child_weight=1,
                           missing=nan, monotone_constraints=None, n_estimators=3827,
                           n_jobs=None, nthread=-1, num_parallel_tree=None,
                           objective='reg:squarederror', random_state=7, reg_alpha=0.464,
                           reg_lambda=0.4614615602391834, scale_pos_weight=None, silent=True,
                           subsample=0.6426784642167161, tree_method=None,
                           validate_parameters=False, verbosity=None)),
         

## Saving

In [13]:
# Alternative kept for reference: fit a single CatBoost pipeline instead of
# using the ensemble currently held in `model`.
# model = get_pipeline(CatBoostRegressor(silent=True,cat_features=cat_x),onehot=False)
# model = model.fit(train_x,train_y)
# Write output/submission.csv from the current `model`, then pickle that model
# to output/ensemble.pickle.
submit(model)
save(model,'ensemble.pickle')