In [1]:
import pandas as pd
import numpy as np
import pickle
import altair as alt
from aoc import timeit
%load_ext autoreload
%autoreload 2
np.random.seed(112)
 
def load(filename):
    """Unpickle and return the object stored in ``filename``.

    Parameters
    ----------
    filename : str or path-like
        Path of the pickle file; it is used as given (no directory is
        prepended, unlike ``save``).

    Returns
    -------
    object
        Whatever object was pickled into the file.
    """
    # NOTE: pickle.load can execute arbitrary code - only load files you trust.
    # The context manager closes the handle even if unpickling raises.
    with open(filename, "rb") as handle:
        return pickle.load(handle)
    
def save(model, filename='bestmodel.pickle'):
    """Pickle ``model`` into the ``output/`` directory.

    Parameters
    ----------
    model : object
        Any picklable object.
    filename : str
        File name appended to ``'output/'``.
    """
    destination = 'output/' + filename
    with open(destination, 'wb') as out_file:
        # Highest protocol available in the running interpreter.
        pickle.dump(model, out_file, protocol=pickle.HIGHEST_PROTOCOL)

def save_feature_selection(cols, filename='feat_selection.pickle'):
    """Pickle the selected feature columns into the ``output/`` directory.

    Parameters
    ----------
    cols : object
        The column selection to persist (any picklable object).
    filename : str
        File name appended to ``'output/'``.
    """
    destination = 'output/' + filename
    with open(destination, 'wb') as out_file:
        pickle.dump(cols, out_file, protocol=pickle.HIGHEST_PROTOCOL)

def submit(model, filename='submission.csv'):
    """Predict on the global ``final_test`` frame and write a submission CSV.

    The CSV has the columns ``Id`` and ``SalePrice`` and is written to
    ``'output/' + filename``. ``np.exp`` is applied to the raw predictions
    (presumably because the model was trained on a log-transformed target -
    confirm against the feature-engineering step).

    Unlike the previous version, the global ``final_test`` is not modified:
    attaching ``SalePrice`` to it meant every later ``predict(final_test)``
    call was fed the prediction column as an extra input.

    Parameters
    ----------
    model : fitted estimator
        Object exposing ``predict``.
    filename : str
        File name appended to ``'output/'``.
    """
    # Ignore a SalePrice column left behind by an earlier in-place submission.
    features = final_test.drop(columns=['SalePrice'], errors='ignore')
    pred = model.predict(features)
    submission = pd.DataFrame({
        'Id': final_test['Id'],
        'SalePrice': np.ravel(np.exp(pred)),
    })
    submission.to_csv('output/'+filename, index=False)

## Import data

In [2]:
# Load the artefacts written by earlier notebooks / cells.
# The engineered-datasets pickle unpacks into six objects.
train_x, train_y, final_test, num_x, cat_x, cat_x_ind = load("output/engineered_datasets.pickle")
# Column selection used below as `train_x[cols]`.
cols = load("output/feat_selection.pickle")
# Fitted feature selector; the experiment cell reads `named_steps['model'].runs` from it.
feat_selector = load('output/feat_selector.pickle')
# Per-model hyperparameter search results (dict keyed by model name).
# NOTE(review): the tuning cell saves this file WITHOUT a `.pickle` suffix
# ('hyperparam_tuning' + run + strictness + time) and another cell loads it
# without the suffix - confirm this path actually exists.
results = load("output/hyperparam_tuning41selected1800.pickle")

## Import preprocessing pipelines & models

In [4]:

from utils.sklearn_custom_steps import DFSimpleImputer, DFOneHotEncoder,DFMinMaxScaler,DFColumnTransformer,DFOutlierExtractor,DFOutlierExtractor,DFStandardScaler,DFRobustScaler,DFSmartImputer, DFUnSkewer, DFPowerTransformer
from utils.sklearn_custom_steps import get_pipeline
from utils.model_hyperparameters import models,AutoCatBoostRegressor

from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV, ElasticNet,SGDRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn import svm
from sklearn.neural_network import MLPRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.model_selection import cross_validate,cross_val_score,KFold,GridSearchCV
from sklearn.preprocessing import MinMaxScaler,StandardScaler,RobustScaler

import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor

## CV methods

In [5]:
def cross_val_models(to_test,train_x=train_x,**kwargs):
    """Cross-validate every model named in ``to_test`` and print its scores.

    Parameters
    ----------
    to_test : iterable of str
        Keys into the global ``models`` registry.
    train_x : DataFrame
        Training features; defaults to the global ``train_x`` as it was
        when this function was defined.
    **kwargs
        Extra keyword arguments forwarded to ``get_pipeline``.
    """
    for model_name in to_test:
        label = f"{model_name.ljust(20)}"
        print(label, end = ': ')
        spec = models[model_name]
        candidate = get_pipeline(spec.model, **spec.preprocess, **kwargs)
        test_pipeline(candidate, train_x = train_x)
         
def test_model(model,train_x = train_x,param=None):
    """Wrap ``model`` in a pipeline, cross-validate it and return the pipeline.

    Parameters
    ----------
    model : estimator
        Passed as first argument to ``get_pipeline``.
    train_x : DataFrame
        Training features; defaults to the global ``train_x``.
    param : dict, optional
        Keyword arguments for ``get_pipeline``; a falsy value means none.
    """
    pipeline_kwargs = param if param else {}
    return test_pipeline(get_pipeline(model, **pipeline_kwargs), train_x=train_x)

def test_pipeline(pipe,train_x = train_x):
    """Run 5-fold cross-validation of ``pipe`` and print mean test/train RMSE.

    Scores come from sklearn's ``neg_root_mean_squared_error`` scorer and are
    sign-flipped before printing. The pipeline is validated against the
    global ``train_y`` and returned unchanged for chaining.
    """
    n_fold = 5
    splitter = KFold(n_splits=n_fold, shuffle=True, random_state=112)
    scores = cross_validate(
        pipe,
        train_x,
        train_y,
        scoring='neg_root_mean_squared_error',
        cv=splitter,
        return_train_score=True,
    )
    mean_test = -1 * sum(scores['test_score'])/n_fold
    mean_train = -1 * sum(scores['train_score'])/n_fold
    print(f"test {mean_test:.7f}, train {mean_train:.7f}")
    return pipe

## Hyperparameter search over the preprocessing pipeline

In [6]:
def get_estimator(model_name, results):
    """Build the pipeline for ``model_name`` configured with its tuned parameters.

    Parameters
    ----------
    model_name : str
        Key into both the global ``models`` registry and ``results``.
    results : mapping
        Maps model names to search objects exposing ``best_params_``.

    Returns
    -------
    The (unfitted) pipeline with ``best_params_`` applied via ``set_params``.
    """
    spec = models[model_name]
    tuned = get_pipeline(spec.model, **spec.preprocess)
    best = results[model_name].best_params_
    tuned.set_params(**best)
    return tuned

In [7]:
def hyperparam_search_pipeline(model_name, pipe):
    """Grid-search the preprocessing options of ``pipe`` and return ranked results.

    The grid covers the numeric scaler, the numeric imputation strategy and
    the categorical imputation strategy. The search runs on the selected
    columns of the global training data (``train_x[cols]``, ``train_y``).

    Parameters
    ----------
    model_name : str
        Only used for the progress message.
    pipe : estimator
        Pipeline whose preprocessing steps are searched.

    Returns
    -------
    pandas.DataFrame
        ``cv_results_`` of the search, sorted by ``rank_test_score``.
    """
    print('start', model_name)
    scaler_options = [DFStandardScaler(),DFRobustScaler(),DFMinMaxScaler(),DFPowerTransformer()]
    param_grid = {
        'preprocess__col_trans__numeric__scale_num' : scaler_options,
        'preprocess__col_trans__numeric__impute_num__strategy': ['mean','median','most_frequent'],
        'preprocess__col_trans__category__impute_cat__strategy': ['most_frequent','constant'],
    }
    folds = KFold(n_splits=5,shuffle=True,random_state=112)
    search = GridSearchCV(
        pipe,
        param_grid,
        cv=folds,
        scoring='neg_root_mean_squared_error',
        verbose=1,
    )
    search = search.fit(train_x[cols], train_y)
    ranked = pd.DataFrame(search.cv_results_).sort_values(by='rank_test_score')
    return ranked
# Run the preprocessing grid search for every tuned model in `results`.
# Maps model name -> ranked cv_results_ DataFrame.
pipe_search = dict()
for model_name in results:
   pipe_search[model_name] = hyperparam_search_pipeline(model_name, get_estimator(model_name, results))
   # The whole dict is re-saved after each model (inside the loop),
   # presumably so partial progress survives an interrupted run.
   save(pipe_search,  'hyperparam_pipe.pickle')


start AutoCatBoostRegressor
Fitting 5 folds for each of 24 candidates, totalling 120 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 120 out of 120 | elapsed: 11.6min finished
start ElasticNet
Fitting 5 folds for each of 24 candidates, totalling 120 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 120 out of 120 | elapsed:  1.8min finished
start KernelRidge
Fitting 5 folds for each of 24 candidates, totalling 120 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 120 out of 120 | elapsed:  1.7min finished
start Lasso
Fitting 5 folds for each of 24 candidates, totalling 120 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 120 out of 120 | elapsed:  1.3min finished
start xgb.XGBRegressor
Fitting 5 folds for each of 24 candidates, totalli

In [8]:
# Reload the saved preprocessing search and print the top candidate per model.
pipe_search = load('output/hyperparam_pipe.pickle')
for model_name, res in pipe_search.items():
    # Frames were sorted by rank_test_score before saving, so after
    # reset_index the row labelled 0 is the best-ranked candidate.
    res = res.reset_index()
    print(model_name)
    print('best score', res['mean_test_score'][0])
    # Printed order: categorical imputer strategy, numeric imputer strategy, numeric scaler.
    print('best params', 
        res['param_preprocess__col_trans__category__impute_cat__strategy'][0],
        res['param_preprocess__col_trans__numeric__impute_num__strategy'][0],
        res['param_preprocess__col_trans__numeric__scale_num'][0],
    )

AutoCatBoostRegressor
best score -0.12049625587082247
best params most_frequent median DFStandardScaler
ElasticNet
best score -0.12908311231449535
best params most_frequent median DFStandardScaler
KernelRidge
best score -0.12911514458159468
best params most_frequent median DFRobustScaler
Lasso
best score -0.12758131071494794
best params most_frequent median DFMinMaxScaler
xgb.XGBRegressor
best score -0.12316985608032516
best params most_frequent mean DFStandardScaler
lgb.LGBMRegressor
best score -0.12499353543393807
best params constant median DFRobustScaler


In [None]:
AutoCatBoostRegressor
best score -0.1153902521105318
best params constant mean DFMinMaxScaler
            most frequent, median, DFStandard
ElasticNet
best score -0.11587168950921545
best params most_frequent median DFStandardScaler


KernelRidge
best score -0.11674511912812666
best params most_frequent median DFStandardScaler
                                 DFRobust
Lasso
best score -0.1158660391027462
best params most_frequent median DFStandardScaler
                                 MinMax
xgb.XGBRegressor
best score -0.11712434284626357
best params most_frequent most_frequent DFMinMaxScaler
                          mean          DFstandard
lgb.LGBMRegressor
best score -0.11844457036420222
best params constant median DFRobustScaler


In [9]:
# Show the ranked preprocessing-search table for AutoCatBoostRegressor.
pipe_search['AutoCatBoostRegressor']

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_preprocess__col_trans__category__impute_cat__strategy,param_preprocess__col_trans__numeric__impute_num__strategy,param_preprocess__col_trans__numeric__scale_num,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
4,6.65898,1.222525,0.0486,0.004751,most_frequent,median,DFStandardScaler,{'preprocess__col_trans__category__impute_cat_...,-0.109297,-0.099115,-0.165491,-0.103332,-0.125246,-0.120496,0.024184,1
5,7.309394,1.064434,0.0708,0.022595,most_frequent,median,DFRobustScaler,{'preprocess__col_trans__category__impute_cat_...,-0.109297,-0.099119,-0.165494,-0.103332,-0.125244,-0.120497,0.024184,2
6,7.119582,1.049703,0.071397,0.039832,most_frequent,median,DFMinMaxScaler,{'preprocess__col_trans__category__impute_cat_...,-0.109297,-0.099116,-0.165495,-0.103331,-0.125256,-0.120499,0.024186,3
18,7.75468,0.535469,0.059803,0.011988,constant,median,DFMinMaxScaler,{'preprocess__col_trans__category__impute_cat_...,-0.110001,-0.096812,-0.166659,-0.105505,-0.12461,-0.120717,0.024672,4
17,8.01748,1.090508,0.070892,0.029152,constant,median,DFRobustScaler,{'preprocess__col_trans__category__impute_cat_...,-0.110001,-0.096815,-0.166657,-0.105507,-0.12461,-0.120718,0.024671,5
16,7.190931,0.872688,0.085801,0.016415,constant,median,DFStandardScaler,{'preprocess__col_trans__category__impute_cat_...,-0.110001,-0.096812,-0.166667,-0.105503,-0.12461,-0.120719,0.024675,6
8,6.977011,0.587341,0.076999,0.04077,most_frequent,most_frequent,DFStandardScaler,{'preprocess__col_trans__category__impute_cat_...,-0.109272,-0.098369,-0.165912,-0.104519,-0.125627,-0.12074,0.024329,7
9,8.363065,0.46497,0.087203,0.031453,most_frequent,most_frequent,DFRobustScaler,{'preprocess__col_trans__category__impute_cat_...,-0.109272,-0.098369,-0.165923,-0.104523,-0.125625,-0.120743,0.024332,8
10,6.715616,0.873299,0.058599,0.015968,most_frequent,most_frequent,DFMinMaxScaler,{'preprocess__col_trans__category__impute_cat_...,-0.109272,-0.09837,-0.165922,-0.104521,-0.125634,-0.120744,0.024332,9
20,6.962792,0.761446,0.0534,0.008162,constant,most_frequent,DFStandardScaler,{'preprocess__col_trans__category__impute_cat_...,-0.112492,-0.097233,-0.165408,-0.107448,-0.122473,-0.121011,0.023644,10


## Test of all models

In [32]:
# Cross-validate every tuned model on the selected-columns dataset
# (train_x restricted to `cols`); test_pipeline prints mean test/train RMSE.
for model_name in results:
    print(model_name)
    test_pipeline(get_estimator(model_name, results),train_x=train_x[cols])

AutoCatBoostRegressor
test 0.1210077, train 0.0497454
ElasticNet
test 0.1294588, train 0.1032053
KernelRidge
test 0.1306891, train 0.1062492
Lasso
test 0.1321611, train 0.1046623
xgb.XGBRegressor
test 0.1240356, train 0.0472714
lgb.LGBMRegressor
test 0.1256687, train 0.0652470


## Hyperparameter tuning

In [33]:
from skopt import BayesSearchCV, callbacks
from skopt.space import Real, Categorical, Integer

# Iteration count that gen_opt_settings pairs with the search space of any
# model that defines hyperparameters (models without any get 1).
NUM_ITERATIONS = 150
# Number of consecutive non-improving callback invocations after which
# optimize_model's no_improvement_detector asks the search to stop.
NO_IMPROVEMENT_STOP_THRES = 25

def gen_opt_settings(model_name):
    """Build the ``(search_space, n_iterations)`` tuple for ``model_name``.

    The search space always contains the estimator itself under ``'model'``;
    every entry of the model's ``hyper`` mapping is added with a ``model__``
    prefix. Models without hyperparameters get a single iteration, all
    others ``NUM_ITERATIONS``.
    """
    spec = models[model_name]
    space = {'model': [spec.model]}
    space.update({'model__' + key: value for key, value in spec.hyper.items()})
    n_iter = NUM_ITERATIONS if spec.hyper else 1
    return (space, n_iter)

def optimize_model(model_name,train_x = train_x, train_time = 600):
    """Run a Bayesian hyperparameter search for ``model_name``.

    Parameters
    ----------
    model_name : str
        Key into the global ``models`` registry.
    train_x : DataFrame
        Training features; defaults to the global ``train_x``. The target is
        always the global ``train_y``.
    train_time : int
        Time budget handed to skopt's ``DeadlineStopper``.

    Returns
    -------
    BayesSearchCV
        The search object (``refit=False``, so it holds no refitted
        estimator). Its ad-hoc ``train_status`` attribute carries the
        early-stopping bookkeeping.
    """
    print('running', model_name)

    def no_improvement_detector(optim_result):
        # Callback: returning True tells the optimizer to stop.
        # `opt` is resolved at call time, after it has been created below.
        status = opt.train_status
        best_so_far = opt.best_score_
        # print(optim_result.x)
        print(f"{'best score':15}{best_so_far}")
        if best_so_far > status['current_score']:
            status['current_score'] = best_so_far
            status['not_improving'] = 0
            return None
        status['not_improving'] += 1
        if status['not_improving'] == status['stop_thres']:
            return True
        return None

    checkpointsaver = callbacks.CheckpointSaver("output/" + model_name + "_skopt.pkl")
    deadlinestopper = callbacks.DeadlineStopper(train_time)

    base_pipeline = get_pipeline(models[model_name].model, **models[model_name].preprocess)
    search_spaces = [gen_opt_settings(model_name)]
    folds = KFold(n_splits=5,shuffle=True,random_state=112)
    opt = BayesSearchCV(
        base_pipeline,
        search_spaces,
        cv=folds,
        scoring = 'neg_root_mean_squared_error',
        return_train_score = True,
        random_state = 112,
        refit=False,
    )
    opt.train_status = {
        'current_score': -100,
        'not_improving': 0,
        'stop_thres' :NO_IMPROVEMENT_STOP_THRES,
    }
    search_callbacks = [no_improvement_detector,checkpointsaver,deadlinestopper]
    opt.fit(train_x,train_y, callback = search_callbacks)
    return opt

# Constant-valued __hash__ replacement: every instance hashes to the same number.
def hashing(self): return 8398398478478 
# Monkey-patch applied to the CatBoostRegressor class itself, i.e. to all instances.
CatBoostRegressor.__hash__ = hashing # workaround from the original author: "otherwise skopt flips" (presumably skopt needs the estimator to be hashable - confirm)

In [17]:
# Reload the tuning results written by the experiment cell (saved without a
# file extension) and print the per-model summary.
# NOTE: print_results is defined in a later cell of this notebook, so that
# cell must be executed before this one.
results = load("output/hyperparam_tuning41selected1800")
print_results(results)

AutoCatBoostRegressor          Best score 0.1129 std 0.0077 train 0.0496 time 0.0332
ElasticNet                     Best score 0.1159 std 0.0111 train 0.1034 time 0.1005
KernelRidge                    Best score 0.1162 std 0.0100 train 0.1065 time 0.1085
Lasso                          Best score 0.1153 std 0.0112 train 0.1050 time 0.1251
xgb.XGBRegressor               Best score 0.1160 std 0.0105 train 0.0474 time 0.0592
lgb.LGBMRegressor              Best score 0.1184 std 0.0088 train 0.0665 time 0.0650


In [10]:
#summarize tuning results
def print_results(results):
    """Print a one-line summary of the best run of every hyperparameter search.

    For each model the rank-1 entry of ``cv_results_`` is reported: mean test
    score and mean train score (sign-flipped, since the searches in this
    notebook use a negated error scorer), the test-score standard deviation
    and the mean scoring time.

    Parameters
    ----------
    results : mapping
        Maps model names to search objects exposing ``cv_results_``.

    Returns
    -------
    None
        The summary is printed, not returned.
    """
    for model, search in results.items():
        cv = search.cv_results_
        # list(...) lets this work for plain lists as well as for the numpy
        # arrays sklearn searches store (ndarray has no .index method).
        best_run = list(cv['rank_test_score']).index(1)
        mean_test_score = -1 * cv['mean_test_score'][best_run]
        std_test_score = cv['std_test_score'][best_run]
        mean_train_score = -1 * cv['mean_train_score'][best_run]
        mean_score_time = cv['mean_score_time'][best_run]
        print(f"{model:<30} Best score {mean_test_score:.4f} std {std_test_score:.4f} train {mean_train_score:.4f} time {mean_score_time:.4f}")

## Stacking best models from hyperparam

In [34]:
from sklearn.ensemble import StackingRegressor

def get_estimator(model_name, results):
    """Return the pipeline for ``model_name`` with its tuned parameters applied.

    NOTE: this repeats the ``get_estimator`` definition from the earlier
    preprocessing-search cell; whichever cell runs last wins.

    Parameters
    ----------
    model_name : str
        Key into both the global ``models`` registry and ``results``.
    results : mapping
        Maps model names to search objects exposing ``best_params_``.
    """
    spec = models[model_name]
    tuned = get_pipeline(spec.model, **spec.preprocess)
    best = results[model_name].best_params_
    tuned.set_params(**best)
    return tuned

def get_stacked_model(results,train_x=train_x):
    """Stack the six tuned base models, cross-validate the stack and fit it.

    Each base model is rebuilt through ``get_estimator`` and combined in a
    ``StackingRegressor`` with a ``LinearRegression`` final estimator (no
    passthrough). The mean 5-fold train/test RMSE is printed, then the stack
    is fitted on ``train_x`` and the global ``train_y``.

    Parameters
    ----------
    results : mapping
        Maps model names to search objects exposing ``best_params_``.
    train_x : DataFrame
        Training features; defaults to the global ``train_x``.

    Returns
    -------
    StackingRegressor
        The fitted stacked model.
    """
    base_names = (
        'AutoCatBoostRegressor',
        'ElasticNet',
        'KernelRidge',
        'Lasso',
        'xgb.XGBRegressor',
        'lgb.LGBMRegressor',
    )
    to_stack = []
    for base_name in base_names:
        to_stack.append((base_name, get_estimator(base_name, results)))

    model = StackingRegressor(to_stack, final_estimator=LinearRegression(), passthrough=False)
    folds = KFold(n_splits=5,shuffle=True,random_state=112)
    scores = cross_validate(
        model,
        train_x,
        train_y,
        scoring='neg_root_mean_squared_error',
        cv=folds,
        return_train_score=True,
    )
    train_rmse = -1 * sum(scores['train_score'])/5
    test_rmse = -1 * sum(scores['test_score'])/5
    print(f"stacking model train {train_rmse:.4f}, test {test_rmse:.4f}")
    model.fit(train_x,train_y)
    return model

## Blending, to get rid of some of the overfitting

In [35]:
def blend(model, filename):
    """Write a submission that uniformly averages the stacked model's base estimators.

    Every fitted base estimator in ``model.estimators_`` predicts on the
    global ``final_test`` frame; the predictions are averaged with equal
    weights, passed through ``np.exp`` and written with the ``Id`` column to
    ``'output/' + filename``.

    Changes compared with the previous version:
    * the weight is ``1 / number_of_estimators`` instead of a hard-coded
      ``1/6``, so the weights sum to 1 for any number of base models;
    * the global ``final_test`` is no longer modified, and a ``SalePrice``
      column left on it by an earlier in-place submission is not fed to the
      estimators.

    Parameters
    ----------
    model : fitted StackingRegressor-like object
        Must expose ``estimators_`` whose items have ``predict``.
    filename : str
        File name appended to ``'output/'``.

    Raises
    ------
    ValueError
        If ``model.estimators_`` is empty.
    """
    features = final_test.drop(columns=['SalePrice'], errors='ignore')
    preds = np.array([np.ravel(estimator.predict(features)) for estimator in model.estimators_])
    if len(preds) == 0:
        raise ValueError("model.estimators_ is empty; nothing to blend")
    weights = np.full(len(preds), 1 / len(preds))
    print(weights)
    print(len(preds))
    # Row i of `preds` holds estimator i's predictions; scale each row by its weight.
    final_preds = np.sum(preds * weights[:, None], axis=0)
    submission = pd.DataFrame({
        'Id': final_test['Id'],
        'SalePrice': np.exp(final_preds),
    })
    submission.to_csv('output/'+filename, index=False)

## Experiment with combinations of feature-selection run count and hyperparameter-tuning time budget

In [36]:
# End-to-end experiment: for each (feature-selection run, strictness) and
# tuning time budget, tune every model, stack the tuned models and write the
# stacked and blended submissions.
feat_selector = load('output/feat_selector.pickle')
for run_amount, strictness in [(41,'selected')]:
    for train_time in [1800]:
        print(f'starting,{run_amount},{strictness},{train_time}')
        # De-duplicated *list* in a stable order: pandas >= 2.0 rejects a set
        # as a column indexer, and set iteration order depends on the hash seed.
        cols = list(dict.fromkeys(feat_selector.named_steps['model'].runs[run_amount][strictness]))
        print('len col', len(cols))
        to_test = [
            'AutoCatBoostRegressor',
            'ElasticNet',
            'KernelRidge',
            'Lasso',
            'xgb.XGBRegressor',
            'lgb.LGBMRegressor']
        results = {}
        for name in to_test:
            results[name] = optimize_model(name, train_x[cols], train_time)
        # Saved without a file extension; the results-summary cell loads it under this exact name.
        save(results,'hyperparam_tuning'+str(run_amount)+strictness+str(train_time))
        print('hyperparameter tuning done')
        # print_results prints its own table and returns None, so it is not wrapped in print().
        print_results(results)
        model = get_stacked_model(results,train_x=train_x[cols])
        # Fixed: the '.' before 'csv' was missing, producing a file without a usable extension.
        submit(model, 'submission'+str(run_amount)+strictness+str(train_time)+'.csv')
        # NOTE(review): this name also lacks a '.' before 'pickle'; left unchanged
        # because code outside this notebook may load the file by this name.
        save(model, 'ensemble'+str(run_amount)+strictness+str(train_time)+'pickle')
        blend(model,'blend'+str(run_amount)+strictness+str(train_time)+'.csv')

starting,41,selected,1800
len col 100
running AutoCatBoostRegressor
best score     -0.12098699789403443
best score     -0.12098699789403443
best score     -0.12098699789403443
best score     -0.12098699789403443
best score     -0.12098699789403443
best score     -0.12098699789403443
best score     -0.12098699789403443
best score     -0.12098699789403443
best score     -0.12035311172818344
best score     -0.12035311172818344
best score     -0.12035311172818344
best score     -0.12035311172818344
best score     -0.12035311172818344
best score     -0.12035311172818344
best score     -0.12035311172818344
best score     -0.12035311172818344
best score     -0.12035311172818344
best score     -0.12035311172818344
best score     -0.12035311172818344
best score     -0.12035311172818344
best score     -0.12035311172818344
best score     -0.12035311172818344
best score     -0.12035311172818344
best score     -0.12024815523158199
best score     -0.12024815523158199
best score     -0.12024815523158