In [2]:
import pandas as pd
import numpy as np
import pickle

def save(model, filename='bestmodel.pickle'):
    """Pickle ``model`` into the ``output`` directory.

    Args:
        model: Any picklable object (for example a fitted pipeline).
        filename: Name of the file to create below ``output/``.
    """
    import os  # local import so this cell does not depend on another import cell

    path = 'output/' + filename
    # Create the target directory on first use instead of failing with
    # FileNotFoundError when ``output/`` does not exist yet.
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, 'wb') as handle:
        pickle.dump(model, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
def submit(model):
    """Fit ``model`` on the full training data and write the submission CSV.

    Uses the notebook-level ``train_x``, ``train_y`` and ``final_test``.
    As a side effect a ``SalePrice`` column is added to (or overwritten on)
    ``final_test``, and ``output/submission.csv`` is written with the
    ``Id`` and ``SalePrice`` columns.

    Args:
        model: Estimator exposing ``fit(X, y)`` (returning the fitted
            estimator) and ``predict(X)``.
    """
    import os  # local import so this cell does not depend on another import cell

    model = model.fit(train_x,train_y)
    pred = model.predict(final_test)
    # The target is transformed with np.log1p when the data is loaded, so the
    # exact inverse is expm1; np.exp would overstate every price by 1.
    final_test['SalePrice'] = np.expm1(pred)
    os.makedirs('output', exist_ok=True)
    final_test[['Id','SalePrice']].to_csv('output/submission.csv', index=False)

In [3]:
# Load the train and test datasets; labels and features are separated at the end of this cell.
path_train = "https://raw.githubusercontent.com/jvanelteren/housing/master/datasets/train.csv"
path_test = "https://raw.githubusercontent.com/jvanelteren/housing/master/datasets/test.csv"

train = pd.read_csv(path_train)
final_test = pd.read_csv(path_test)
print(train.shape, final_test.shape)

# The target is the column that exists in train but not in the test set;
# it is modelled on a log1p scale.
y_col = next(iter(set(train.columns).difference(final_test.columns)))
train[y_col] = np.log1p(train[y_col])

# Cross-validation is used further down, so no separate hold-out split is made here.
train_x = train.drop(columns=[y_col])
train_y = train[y_col]
train_x.shape, train_y.shape


(1460, 81) (1459, 80)


((1460, 80), (1460,))

### Preprocessing

In [4]:
def correlation_ratio(categories, measurements):
    """Return the correlation ratio (eta) between a categorical and a numeric variable.

    Eta is the square root of the between-category sum of squares divided by
    the total sum of squares of ``measurements``.

    Rows whose category is missing (``pd.factorize`` code -1) do not form a
    category and do not contribute to the category means or the grand mean,
    but they are still part of the total sum of squares in the denominator.

    Args:
        categories: 1-D array-like of category labels.
        measurements: 1-D array-like of numeric values, aligned by position
            with ``categories``.

    Returns:
        float: eta, or 0.0 when there is no between-category variance
        (including empty or all-missing input).
    """
    codes, _ = pd.factorize(categories)
    # Work on a plain ndarray so that indexing is positional; indexing a
    # pandas Series with integer positions is label-based and breaks for any
    # non-default index.
    values = np.asarray(measurements, dtype=float)

    known = codes >= 0
    if not known.any():
        return 0.0

    n_categories = codes.max() + 1
    # Per-category counts and sums in one vectorized pass; every code in
    # 0..max occurs at least once, so the counts are never zero.
    counts = np.bincount(codes[known], minlength=n_categories)
    sums = np.bincount(codes[known], weights=values[known], minlength=n_categories)
    means = sums / counts
    grand_mean = np.sum(means * counts) / np.sum(counts)

    numerator = np.sum(counts * (means - grand_mean) ** 2)
    if numerator == 0:
        return 0.0
    denominator = np.sum((values - grand_mean) ** 2)
    return np.sqrt(numerator / denominator)


In [5]:
from sklearn.impute import SimpleImputer
# todo multivariate imputation, possibly with pipelines for numeric and categorical data

from sklearn.preprocessing import MinMaxScaler
# https://towardsdatascience.com/scale-standardize-or-normalize-with-scikit-learn-6ccc7d176a02
# https://stackoverflow.com/questions/51237635/difference-between-standard-scaler-and-minmaxscaler/51237727
# don't know whether the features are normally distributed, so just going with MinMaxScaler for now

from sklearn.preprocessing import OneHotEncoder
#https://stackoverflow.com/questions/36631163/what-are-the-pros-and-cons-between-get-dummies-pandas-and-onehotencoder-sciki
#The crux of it is that the sklearn encoder creates a function which persists and can then be applied to new data sets which use the same categorical variables, with consistent results.
# So don't use pandas get dummies, but a OneHotEncoder

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression

# We create the preprocessing pipelines for both numeric and categorical data.

# Numeric features, ordered by their correlation with the target.
# The numeric/bool columns are selected explicitly: recent pandas versions raise
# in DataFrame.corr() when object columns are present instead of dropping them.
corr = train.select_dtypes(include=['number', 'bool']).corr()[y_col]
corr = corr.sort_values(ascending=False)
# Drop the target by name rather than relying on it being sorted first.
# NOTE(review): this keeps every numeric column of train, presumably including the
# 'Id' row identifier that submit() writes out — consider excluding it.
num_x = [col for col in corr.index if col != y_col]

numeric_transformer = Pipeline(steps=[
    ('impute_num', SimpleImputer(strategy='median')),
    ('scale_num', MinMaxScaler())])

# Categorical features (object dtype), ordered by their correlation ratio with the target.
cat_x = [col for col in final_test.columns if final_test[col].dtype == 'object']
cat_x.sort(key = lambda x: -correlation_ratio(train[x],train[y_col]))

categorical_transformer = Pipeline(steps=[
    ('impute_cat', SimpleImputer(strategy='most_frequent')),
    ('onehot_cat', OneHotEncoder(handle_unknown="ignore"))])

# NOTE(review): the one-hot output is presumably sparse, which would explain the
# nan scores of HistGradientBoostingRegressor in the baseline run — TODO confirm.
preprocessor = ColumnTransformer(
    transformers=[
        ('numeric', numeric_transformer, num_x),
        ('category', categorical_transformer, cat_x)])

### Modeling

In [None]:

# from catboost import CatBoostRegressor
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV, ElasticNet,SGDRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn import svm
from sklearn.neural_network import MLPRegressor
from sklearn.kernel_ridge import KernelRidge
import lightgbm as lgb
import xgboost as xgb
from sklearn.model_selection import cross_validate
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

def inv_y(transformed_y):
    """Map values from the log1p-transformed target scale back to the original scale.

    The target is transformed with ``np.log1p`` when the data is loaded, so the
    exact inverse is ``np.expm1`` (``np.exp`` would be off by one).

    Args:
        transformed_y: Scalar or array-like of values on the log1p scale.

    Returns:
        Values on the original target scale.
    """
    return np.expm1(transformed_y)

from collections import namedtuple
# Record pairing an estimator with its hyperparameter search space
# (an empty dict means nothing is tuned for that estimator).
p = namedtuple('params', ['model','hyper'])

# Candidate models, keyed by display name. Insertion order is the order in
# which the baseline comparison evaluates them.
models = {
    'RidgeCV': p(model=RidgeCV(), hyper={}),
    'svm.SVR': p(
        model=svm.SVR(),
        hyper={
            'gamma': (1e-4,0.9,'log-uniform'),
            'C': [1, 10, 100, 1000, 10000],
        },
    ),
    'LinearRegression': p(model=LinearRegression(), hyper={}),
    'Lasso': p(
        model=Lasso(),
        hyper={'alpha':(0.005,1.0,'log-uniform')},
    ),
    'ElasticNet': p(model=ElasticNet(), hyper={}),
    'KNeighborsRegressor': p(model=KNeighborsRegressor(), hyper={}),
    'RandomForestRegressor': p(model=RandomForestRegressor(), hyper={}),
    'SGDRegressor': p(model=SGDRegressor(), hyper={}),
    'CatBoostRegressor': p(model=CatBoostRegressor(silent=True), hyper={}),
    'xgb.XGBRegressor': p(model=xgb.XGBRegressor(), hyper={}),
    'lgb.LGBMRegressor': p(
        model=lgb.LGBMRegressor(num_leaves=31,
                                learning_rate=0.05,
                                n_estimators=20),
        hyper={},
    ),
    'HistGradientBoostingRegressor': p(model=HistGradientBoostingRegressor(), hyper={}),
    'MLPRegressor': p(model=MLPRegressor(), hyper={}),
}

def get_pipeline(model):
    """Return a Pipeline that runs the shared ``preprocessor`` followed by ``model``.

    The two steps are named ``'preprocess'`` and ``'model'``.
    """
    steps = [('preprocess', preprocessor), ('model', model)]
    return Pipeline(steps)



In [10]:
# Baseline comparison: cross-validated RMSE (on the transformed target) of every
# candidate model with its default settings. cross_validate is used instead of
# cross_val_score because the train scores are wanted as well.
num_fold = 3
# Pad names to the longest one so the columns line up; a fixed width of 25 is
# too short for e.g. 'HistGradientBoostingRegressor' and fuses it with 'train'.
name_width = max(25, max(map(len, models), default=0) + 1)
for name, spec in models.items():
    pipe = get_pipeline(spec.model)
    scores = cross_validate(pipe, train_x, train_y, scoring='neg_root_mean_squared_error', cv=num_fold, return_train_score=True)
    # The scorer returns negated RMSE, so flip the sign for reporting.
    train_rmse = -np.mean(scores['train_score'])
    test_rmse = -np.mean(scores['test_score'])
    print(f"{name:<{name_width}}train {train_rmse:.3f}, test {test_rmse:.3f}")

RidgeCV                  train 0.115, test 0.143
svm.SVR                  train 0.088, test 0.141
LinearRegression         train 0.089, test 0.151
Lasso                    train 0.399, test 0.399
ElasticNet               train 0.399, test 0.399
KNeighborsRegressor      train 0.169, test 0.215
RandomForestRegressor    train 0.055, test 0.147
SGDRegressor             train 0.196, test 0.234
CatBoostRegressor        train 0.032, test 0.122
xgb.XGBRegressor         train 0.005, test 0.150
lgb.LGBMRegressor        train 0.190, test 0.211
HistGradientBoostingRegressortrain nan, test nan
MLPRegressor             train 0.136, test 0.201


In [14]:
# List the tunable parameter names of a full pipeline; the 'model__<param>' keys
# are the ones addressed by the hyperparameter search spaces.
pipe = get_pipeline(Lasso())
pipe.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'preprocess', 'model', 'preprocess__n_jobs', 'preprocess__remainder', 'preprocess__sparse_threshold', 'preprocess__transformer_weights', 'preprocess__transformers', 'preprocess__verbose', 'preprocess__numeric', 'preprocess__category', 'preprocess__numeric__memory', 'preprocess__numeric__steps', 'preprocess__numeric__verbose', 'preprocess__numeric__impute_num', 'preprocess__numeric__scale_num', 'preprocess__numeric__impute_num__add_indicator', 'preprocess__numeric__impute_num__copy', 'preprocess__numeric__impute_num__fill_value', 'preprocess__numeric__impute_num__missing_values', 'preprocess__numeric__impute_num__strategy', 'preprocess__numeric__impute_num__verbose', 'preprocess__numeric__scale_num__copy', 'preprocess__numeric__scale_num__feature_range', 'preprocess__category__memory', 'preprocess__category__steps', 'preprocess__category__verbose', 'preprocess__category__impute_cat', 'preprocess__category__onehot_cat', 'preprocess__category__impu

## Hyperparameter tuning

In [24]:
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
from aoc import timeit

def model_instance(model_name, n_iter=50):
    """Build the ``(search space, number of evaluations)`` pair for one model.

    The search space pins the pipeline's ``'model'`` step to the estimator
    registered under ``model_name`` in ``models`` and exposes each of its
    hyperparameters under the ``'model__<name>'`` key.

    Args:
        model_name: Key into the notebook-level ``models`` dict.
        n_iter: Number of evaluations for this search space.

    Returns:
        tuple: ``(search_space_dict, n_iter)``.
    """
    spec = models[model_name]
    search_space = {'model': [spec.model]}
    search_space.update(
        {'model__' + key: value for key, value in spec.hyper.items()})
    return (search_space, n_iter)

In [47]:

# callback handler
# Early-stopping state shared across calls of on_step:
#   current_score - best score seen so far
#   not_improving - consecutive steps without improvement
#   stop_thres    - number of such steps after which a stop is requested
train_status = { 'current_score': -100, 'not_improving': 0, 'stop_thres' : 2}
def on_step(optim_result):
    """Search callback: log progress and request a stop when the score stalls.

    Reads the best score so far from the notebook-level ``opt`` and compares
    it with the best value recorded in ``train_status``.

    Args:
        optim_result: Optimizer result passed in by the search; only its
            ``x`` attribute is printed.

    Returns:
        bool: True once ``stop_thres`` consecutive steps brought no
        improvement (presumably this ends the current search space — TODO
        confirm against the scikit-optimize docs), otherwise False.
    """
    score = opt.best_score_
    print(optim_result.x)
    print(f"{'best score':15}{score}")

    if score > train_status['current_score']:
        train_status['current_score'] = score
        train_status['not_improving'] = 0
        return False

    train_status['not_improving'] += 1
    if train_status['not_improving'] >= train_status['stop_thres']:
        # Reset the counter so the next search space gets its full patience
        # again. Previously it stayed at the threshold, the `==` test never
        # matched again, and later search spaces were never stopped early.
        train_status['not_improving'] = 0
        return True
    return False

# The estimator inside this pipeline is only a placeholder: every search space
# built by model_instance sets the 'model' step itself.
pipe = get_pipeline(svm.SVR())
to_test = ['Lasso','svm.SVR']
opt = BayesSearchCV(
    pipe,
    # one (parameter space, # of evaluations) tuple per model under test
    list(map(model_instance, to_test)),
    # cv=3,
    scoring='neg_root_mean_squared_error',
    return_train_score=True,
    random_state=112,
)

# Fit on the first 1000 rows only; on_step logs each step and may request an early stop.
with timeit('2x2 20 iter cv=3'):
    opt.fit(train_x[:1000], train_y[:1000], callback=on_step)
print("val. score: %s" % opt.best_score_)

[Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False), 0.006038940085472412]
best score     -0.16007995002732295
[Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False), 0.006038940085472412]
best score     -0.16007995002732295
[Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False), 0.006038940085472412]
best score     -0.16007995002732295
[SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False), 10, 0.6794963133193531]
best score     -0.16007995002732295
[SVR(C=1.0, cache_

In [44]:
# Raw per-split scores of every evaluated candidate (large output).
opt.cv_results_

defaultdict(list,
            {'split0_test_score': [-0.14298408221965533,
              -0.16767656330500283,
              -0.16538811396019848,
              -0.39098486470207544,
              -0.1762191966166481,
              -0.19436378887283012,
              -0.19436378887283012,
              -0.3840473270569187,
              -0.27165193046884645,
              -0.37638295597408544,
              -0.3762939490111361,
              -0.19436378887283012,
              -0.39098486470207544],
             'split1_test_score': [-0.14077728538417983,
              -0.1685965805852505,
              -0.1662729954917832,
              -0.40153503739948354,
              -0.16782158150934912,
              -0.17997399798881863,
              -0.17997399798881863,
              -0.39357019526097203,
              -0.26781046573339584,
              -0.38560468291299443,
              -0.3855578562207297,
              -0.17997399798881863,
              -0.40153503739948354],
        

In [None]:
# NOTE(review): stray scratch cell — `a` is not defined in any visible cell, so this
# presumably raises NameError on a fresh run; remove it if it is a leftover.
a

In [10]:
# Best hyperparameter combination found by the search.
opt.best_params_

OrderedDict([('model__C', 2885), ('model__gamma', 0.0001)])

## Saving

In [None]:
# Persist the training data together with the ordered feature lists, using the
# same helper (and therefore the same output/ location and pickle protocol) as
# for models.
train_info = (train_x, train_y, num_x, cat_x)
save(train_info, filename='train_info.pickle')

In [9]:
# Final model: CatBoost behind the shared preprocessing.
model = get_pipeline(CatBoostRegressor(silent=True))
# submit() fits on the full training set and writes output/submission.csv.
submit(model)
# Pickle the pipeline to output/bestmodel.pickle (presumably already fitted in place by submit — confirm).
save(model)

In [0]:
# NOTE(review): stray scratch cell — `m` is not defined in any visible cell, so this
# presumably raises NameError on a fresh run; remove it if it is a leftover.
m

In [0]:
ridge, 	        train 0.115, test 0.143
svm, 	        train 0.088, test 0.141
lin, 	        train 0.089, test 0.151
lasso, 	        train 0.109, test 0.137
ElasticNet, 	train 0.399, test 0.399
GammaRegressor, train 0.248, test 0.251
KNeighborsRegrestrain 0.169, test 0.215
RandomForestRegrtrain 0.056, test 0.147
SGDRegressor, 	train 0.196, test 0.233
CatBoostRegressotrain 0.032, test 0.122
xgb.XGBRegressortrain 0.005, test 0.150
lgb.LGBMRegressotrain 0.190, test 0.211
HistGradientBoostrain nan, test nan
MLPRegressor, 	train 0.138, test 0.196