In [1]:
import pandas as pd
import numpy as np
import pickle

def save(model, filename='bestmodel.pickle'):
    with open('output/'+filename, 'wb') as handle:
        pickle.dump(model, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
def submit(model):
    model = model.fit(train_x,train_y)
    pred = model.predict(final_test)
    final_test['SalePrice'] = np.exp(pred)
    final_test[['Id','SalePrice']].to_csv('output/submission.csv', index=False)

In [2]:
#make train and test datasets. Splitting labels and features happens later
path_train = "https://raw.githubusercontent.com/jvanelteren/housing/master/datasets/train.csv"
path_test = "https://raw.githubusercontent.com/jvanelteren/housing/master/datasets/test.csv"

train = pd.read_csv(path_train)
final_test = pd.read_csv(path_test)
print(train.shape, final_test.shape)

y_col = (set(train.columns) - set(final_test.columns)).pop()
train[y_col] = np.log1p(train[y_col])

# since we use cross validation the train set does not have to be split anymore
# from sklearn.model_selection import train_test_split
# train_x, test_x, train_y, test_y = train_test_split(
#     train.drop([y_col], axis=1),train.loc[:,y_col], test_size=0.33, random_state=42) 
# ds = (train_x,test_x,train_y,test_y)
# for d in ds: print(d.shape)
train_x = train.drop([y_col], axis=1)
train_y = train.loc[:,y_col]
train_x.shape, train_y.shape


(1460, 81) (1459, 80)


((1460, 80), (1460,))

### Preprocessing

In [3]:
def correlation_ratio(categories, measurements):
    fcat, _ = pd.factorize(categories)
    cat_num = np.max(fcat)+1
    y_avg_array = np.zeros(cat_num)
    n_array = np.zeros(cat_num)
    for i in range(0,cat_num):
        cat_measures = measurements[np.argwhere(fcat == i).flatten()]
        n_array[i] = len(cat_measures)
        y_avg_array[i] = np.average(cat_measures)
    y_total_avg = np.sum(np.multiply(y_avg_array,n_array))/np.sum(n_array)
    numerator = np.sum(np.multiply(n_array,np.power(np.subtract(y_avg_array,y_total_avg),2)))
    denominator = np.sum(np.power(np.subtract(measurements,y_total_avg),2))
    if numerator == 0:
        eta = 0.0
    else:
        eta = np.sqrt(numerator/denominator)
    return eta


In [51]:
from sklearn.impute import SimpleImputer
# todo multivariate imputation, possibly with pipelines for numeric and categorical data

from sklearn.preprocessing import MinMaxScaler
# https://towardsdatascience.com/scale-standardize-or-normalize-with-scikit-learn-6ccc7d176a02
# https://stackoverflow.com/questions/51237635/difference-between-standard-scaler-and-minmaxscaler/51237727
# don't know features are normal so just going with minmax scalar atm

from sklearn.preprocessing import OneHotEncoder
#https://stackoverflow.com/questions/36631163/what-are-the-pros-and-cons-between-get-dummies-pandas-and-onehotencoder-sciki
#The crux of it is that the sklearn encoder creates a function which persists and can then be applied to new data sets which use the same categorical variables, with consistent results.
# So don't use pandas get dummies, but a OneHotEncoder

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression

# We create the preprocessing pipelines for both numeric and categorical data.
corr = train.corr()[y_col]
corr = corr.sort_values(ascending=False)
num_x = list(corr.index)[1:]

numeric_transformer = Pipeline(steps=[
    ('impute_num', SimpleImputer(strategy='median')),
    ('scale_num', MinMaxScaler())])

cat_x = [col for col in final_test.columns if final_test[col].dtype == 'object']
cat_x.sort(key = lambda x: -correlation_ratio(train[x],train[y_col]))

categorical_transformer = Pipeline(steps=[
    ('impute_cat', SimpleImputer(strategy='most_frequent')),
    ('onehot_cat', OneHotEncoder(handle_unknown="ignore"))
    ])

preprocessor = ColumnTransformer(
    transformers=[
        ('numeric', numeric_transformer, num_x),
        ('category', categorical_transformer, cat_x)])

### Modeling

In [80]:

# from catboost import CatBoostRegressor
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV, ElasticNet,SGDRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn import svm
from sklearn.neural_network import MLPRegressor
from sklearn.kernel_ridge import KernelRidge
import lightgbm as lgb
import xgboost as xgb
from sklearn.model_selection import cross_validate
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

def inv_y(transformed_y):
    return np.exp(transformed_y)

from collections import namedtuple
p = namedtuple('params', ['model','hyper'])

cat_x_ind = [ind for ind,name in enumerate(train_x.columns) if name in cat_x ]
cat_x_ind = list(range(37,80))


from skopt.space import Real, Categorical, Integer
models = {'Ridge': p(Ridge(),
                {'alpha':(0.00001,1.0,'log-uniform')}),
            'svm.SVR': p(svm.SVR(),
                {'gamma': (1e-4,0.9,'log-uniform'),
                'C': [1, 10, 100, 1000, 10000]}),
            'LinearRegression':p(LinearRegression(),
                {}),
            'Lasso':p(Lasso(),
                {'alpha':(0.00001,1.0,'log-uniform')}),
            'ElasticNet':p(ElasticNet(),
                {}),
            'KNeighborsRegressor':p(KNeighborsRegressor(),
                {}),
            'RandomForestRegressor':p(RandomForestRegressor(),
                {'n_estimators' : (1e-4,0.9,'log-uniform'),
                'max_depth' : [3, 10, 20, 40]}),
            'SGDRegressor':p(SGDRegressor(),
                {'loss' : ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'],
                'penalty' : ['l1', 'l2', 'elasticnet'],
                'alpha' : (0.0,1000,'log-uniform'),
                'learning_rate' : ['constant', 'optimal', 'invscaling', 'adaptive'],
                'class_weight' : [{1:0.5, 0:0.5}, {1:0.4, 0:0.6}, {1:0.6, 0:0.4}, {1:0.7, 0:0.3}],
                'eta0' : [1, 10, 100]}),
            'CatBoostRegressor':p(CatBoostRegressor(silent=True,one_hot_max_size=20, cat_features = cat_x_ind,iterations=1500),
                {'depth': Integer(1, 8)}),


                # {'iterations': Integer(10, 1000),
                #  'depth': Integer(4, 10),
                #  'learning_rate': Real(0.01, 1.0, 'log-uniform'),
                #  'random_strength': Real(1e-9, 10, 'log-uniform'),
                #  'bagging_temperature': Real(0.0, 1.0),
                #  'border_count': Integer(1, 255),
                #  'l2_leaf_reg': Integer(2, 30),
                #  'scale_pos_weight':Real(0.01, 1.0, 'uniform')}),
            'xgb.XGBRegressor':p(xgb.XGBRegressor(silent=True),
                {'max_depth': (4, 10), #9,12
                'min_child_weight': (0, 10), # if leaves with small amount of observations are allowed?
                'gamma' : (0,), # these 3 for model complexity. gemma is a threshold for gain of the new split. 
                'subsample': (1,),
                'colsample_bytree' : (0.5,1.0), # these 3 for making model more robust to noise
                'reg_lambda' : (0.0,2.0), # regularization lambda, reduces similarity scores and therefore lowers gain. reduces sensitivity to individual observations
                'colsample_bylevel': (0.7,1.0),
                'learning_rate': (0.01,0.4), # 0.3 is default
                'max_delta_step': (0.0,10.0),
                'n_estimators': (10,100),
            'lgb.LGBMRegressor':
                p(lgb.LGBMRegressor(num_leaves=31,
                        learning_rate=0.05,
                        n_estimators=20),
                {}),
            'HistGradientBoostingRegressor':p(HistGradientBoostingRegressor(),
                {}),
            'CatBoostRegressorVanilla':p(CatBoostRegressor(silent=True),
                {}),
            'LassoCV':p(LassoCV(),
                {}),
            'MLPRegressor':p(MLPRegressor(),
                {})
}

# Logistic
# penalty = ['l1', 'l2']
# C = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]
# class_weight = [{1:0.5, 0:0.5}, {1:0.4, 0:0.6}, {1:0.6, 0:0.4}, {1:0.7, 0:0.3}]

def get_pipeline(model):
    return Pipeline([('preprocess', preprocessor),
                   ('model',model)])



## Getting catboost to work

In [71]:
for name in train_x.columns:
    if name in cat_x:
        train_x[name] = train_x[name].astype(str)
        train_x[name].apply(str)
cat_x_ind = [ind for ind,name in enumerate(train_x.columns) if name in cat_x ]
cat_x_ind = list(range(37,80))
pipe = get_pipeline(CatBoostRegressor(silent=True,one_hot_max_size=20, cat_features = cat_x_ind,iterations=1500))
num_fold = 3
scores = cross_validate(pipe, train_x, train_y, scoring='neg_root_mean_squared_error', cv=num_fold, return_train_score=True)

print(f"{name:<25}train {-1 * sum(scores['train_score'])/num_fold:.3f}, test {-1 * sum(scores['test_score'])/num_fold:.3f}")

SaleCondition            train nan, test nan


In [129]:
fitted = pipe.named_steps['model'].fit(pipe[:-1].transform(train_x[:1000]),train_y[:1000],eval_set=(
    pipe[:-1].transform(train_x[1000:]),train_y[1000:]
))

In [130]:
fits = {}
fits['learn'] = list(fitted.get_evals_result()['learn'].values())[0]
fits['validation'] = list(fitted.get_evals_result()['validation'].values())[0]
df = pd.DataFrame.from_dict(fits)
df.reset_index(inplace=True)
df =  df.melt('index',var_name= 'type',value_name='score')
fitted.get_evals_result()
import altair as alt

alt.Chart(df).mark_line().encode(
    x='index',
    y='score',
    color='type',
)

In [138]:
list(fitted.get_evals_result()['validation'].values())[0][350]

0.12308042395982173

In [None]:
# test_results['Lasso'].cv_results_
test_results['Lasso'].best_params_
test_results['Lasso'].best_score_
# test_results['Lasso'].total_iterations

## Rough test of all models

In [81]:
def cross_val_models(to_test):
    for name in to_test:
        pipe = get_pipeline(models[name].model)
        # score = -1 * cross_val_score(pipe, train_x, train_y,cv=3,scoring='neg_root_mean_squared_error')
        # below is necessary for looking at train scores
        num_fold = 3
        scores = cross_validate(pipe, train_x, train_y, scoring='neg_root_mean_squared_error', cv=num_fold, return_train_score=True)
        # scoring is identical to 
        # make_scorer(mean_squared_error,greater_is_better=False, root=False,squared=False)
        # neg_mean_squared_log_error
        print(f"{name:<25}train {-1 * sum(scores['train_score'])/num_fold:.3f}, test {-1 * sum(scores['test_score'])/num_fold:.3f}")
cross_val_models(models)

Ridge                    train 0.097, test 0.143
svm.SVR                  train 0.087, test 0.143
LinearRegression         train 0.088, test 0.152
Lasso                    train 0.399, test 0.399
ElasticNet               train 0.399, test 0.399
KNeighborsRegressor      train 0.164, test 0.205
RandomForestRegressor    train 0.055, test 0.148
SGDRegressor             train 0.200, test 0.232
CatBoostRegressor        train nan, test nan
xgb.XGBRegressor         train 0.006, test 0.146
lgb.LGBMRegressor        train 0.190, test 0.212
HistGradientBoostingRegressortrain nan, test nan
CatBoostRegressorVanilla train 0.031, test 0.123
LassoCV                  train 0.110, test 0.137
MLPRegressor             train 0.130, test 0.203


## Hyperparameter tuning

In [78]:
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
from aoc import timeit

def model_instance(model_name):
    model = {'model': [models[model_name].model]}
    for k,v in models[model_name].hyper.items():
        model['model__'+k] = v
    return (model, 50)

def run_model(name):
    def on_step(optim_result):
        score = opt.best_score_
        print(f"{'best score':15}{score}")
        if score > opt.train_status['current_score']:
            opt.train_status['current_score'] = score
            opt.train_status['not_improving'] = 0
        else:
            opt.train_status['not_improving'] += 1
            if opt.train_status['not_improving'] == opt.train_status['stop_thres']: return True
    pipe = get_pipeline(LinearRegression())


    opt = BayesSearchCV(
    pipe,
    [model_instance(name)],

    cv=5, 
    scoring = 'neg_root_mean_squared_error',
    return_train_score = True,
    random_state = 112 
    )
    opt.train_status = { 'current_score': -100, 'not_improving': 0, 'stop_thres' : 20}

    with timeit('fitting'+name): opt.fit(train_x,train_y, callback = on_step)
    return opt

pipe = get_pipeline(svm.SVR())
to_test = ['CatBoostRegressor']
run_model('xgb.XGBRegressor')
# test_results = {name: run_model(name) for name in to_test}

best score     -0.13509325166348696
best score     -0.13460671266583757
best score     -0.13460671266583757
best score     -0.13460671266583757
best score     -0.13460671266583757
best score     -0.13460671266583757
best score     -0.13128804907177494
best score     -0.13128804907177494
best score     -0.13128804907177494
best score     -0.13128804907177494
best score     -0.13128804907177494
best score     -0.13128804907177494
best score     -0.13128804907177494
best score     -0.13128804907177494
best score     -0.13128804907177494
best score     -0.13128804907177494
best score     -0.13128804907177494
best score     -0.13128804907177494
best score     -0.13128804907177494
best score     -0.13128804907177494
best score     -0.13128804907177494
best score     -0.13128804907177494
best score     -0.13128804907177494
best score     -0.13128804907177494
best score     -0.13128804907177494
best score     -0.13128804907177494
best score     -0.13128804907177494
INFO:root:fittingxgb.XGBRegr

BayesSearchCV(cv=5, error_score='raise',
              estimator=Pipeline(memory=None,
                                 steps=[('preprocess',
                                         ColumnTransformer(n_jobs=None,
                                                           remainder='drop',
                                                           sparse_threshold=0.3,
                                                           transformer_weights=None,
                                                           transformers=[('numeric',
                                                                          Pipeline(memory=None,
                                                                                   steps=[('impute_num',
                                                                                           SimpleImputer(add_indicator=False,
                                                                                                         copy=True,
               

In [42]:
from catboost import CatBoostRegressor
opt = BayesSearchCV(
    pipe,
    # [model_instance(name)],
    [({'model': [xgb.XGBRegressor()]},
 50)])

In [48]:
    [({'model': [CatBoostRegressor()],
  'model__depth': Integer(1,8)},
 50)]

[({'model': [<catboost.core.CatBoostRegressor at 0x1b7fcea36d8>],
   'model__depth': Integer(low=1, high=8, prior='uniform', transform='identity')},
  50)]

In [41]:
import catboost
clf = catboost.CatBoostRegressor()
search_spaces = {'iterations': (10, 1000),
                 'depth': (1, 10),
                 'learning_rate': (0.001, 0.5),
                 'random_strength': (1e-9, 10)}
pt = BayesSearchCV(clf,
                    search_spaces,
                    n_iter=40)

In [55]:
# test_results['Lasso'].cv_results_
test_results['Lasso'].best_params_
test_results['Lasso'].best_score_
# test_results['Lasso'].total_iterations

-0.13479551731346873

In [64]:
pipe = get_pipeline(Lasso(alpha=0.0006016868735786635))
# score = -1 * cross_val_score(pipe, train_x, train_y,cv=3,scoring='neg_root_mean_squared_error')
# below is necessary for looking at train scores
num_fold = 5
scores = cross_validate(pipe, train_x, train_y, scoring='neg_root_mean_squared_error', cv=num_fold, return_train_score=True)
# scoring is identical to 
# make_scorer(mean_squared_error,greater_is_better=False, root=False,squared=False)
# neg_mean_squared_log_error
print(f"{name:<25}train {-1 * sum(scores['train_score'])/num_fold:.4f}, test {-1 * sum(scores['test_score'])/num_fold:.4f}")

MLPRegressor             train 0.1090, test 0.1348


In [61]:
pipe = get_pipeline(Lasso(alpha=0.0006607161323268261))
# score = -1 * cross_val_score(pipe, train_x, train_y,cv=3,scoring='neg_root_mean_squared_error')
# below is necessary for looking at train scores
num_fold = 5
scores = cross_validate(pipe, train_x, train_y, scoring='neg_root_mean_squared_error', cv=num_fold, return_train_score=True)
# scoring is identical to 
# make_scorer(mean_squared_error,greater_is_better=False, root=False,squared=False)
# neg_mean_squared_log_error
print(f"{name:<25}train {-1 * sum(scores['train_score'])/num_fold:.4f}, test {-1 * sum(scores['test_score'])/num_fold:.4f}")

MLPRegressor             train 0.1107, test 0.1350


## Saving

In [None]:
train_info = (train_x, train_y, num_x, cat_x)
with open('output/train_info.pickle', 'wb') as handle:
    pickle.dump(train_info, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [9]:
model = get_pipeline(CatBoostRegressor(silent=True))
submit(model)
save(model)

In [65]:
[for subsample in [i/10. for i in range(7,11)]

SyntaxError: invalid syntax (<ipython-input-65-1d6f7fb5eef4>, line 1)

In [0]:
m