In [4]:
import pandas as pd
import numpy as np
import pickle

def save(model, filename='bestmodel.pickle'):
    from sklearn.externals import joblib
    joblib.dump(model, 'output/'+filename, compress = 1)

def submit(model):
    pred = model.predict(final_test)
    final_test['SalePrice'] = np.exp(pred)
    final_test[['Id','SalePrice']].to_csv('output/submission.csv', index=False)

In [5]:
#make train and test datasets. Splitting labels and features happens later
path_train = "https://raw.githubusercontent.com/jvanelteren/housing/master/datasets/train.csv"
path_test = "https://raw.githubusercontent.com/jvanelteren/housing/master/datasets/test.csv"

train = pd.read_csv(path_train)
final_test = pd.read_csv(path_test)
print(train.shape, final_test.shape)

y_col = (set(train.columns) - set(final_test.columns)).pop()
train[y_col] = np.log1p(train[y_col])

# since we use cross validation the train set does not have to be split anymore
# from sklearn.model_selection import train_test_split
# train_x, test_x, train_y, test_y = train_test_split(
#     train.drop([y_col], axis=1),train.loc[:,y_col], test_size=0.33, random_state=42) 
# ds = (train_x,test_x,train_y,test_y)
# for d in ds: print(d.shape)
train_x = train.drop([y_col], axis=1)
train_y = train.loc[:,y_col]
train_x.shape, train_y.shape


(1460, 81) (1459, 80)


((1460, 80), (1460,))

### Preprocessing

In [6]:
def correlation_ratio(categories, measurements):
    fcat, _ = pd.factorize(categories)
    cat_num = np.max(fcat)+1
    y_avg_array = np.zeros(cat_num)
    n_array = np.zeros(cat_num)
    for i in range(0,cat_num):
        cat_measures = measurements[np.argwhere(fcat == i).flatten()]
        n_array[i] = len(cat_measures)
        y_avg_array[i] = np.average(cat_measures)
    y_total_avg = np.sum(np.multiply(y_avg_array,n_array))/np.sum(n_array)
    numerator = np.sum(np.multiply(n_array,np.power(np.subtract(y_avg_array,y_total_avg),2)))
    denominator = np.sum(np.power(np.subtract(measurements,y_total_avg),2))
    if numerator == 0:
        eta = 0.0
    else:
        eta = np.sqrt(numerator/denominator)
    return eta


In [7]:
from sklearn.impute import SimpleImputer
# todo multivariate imputation, possibly with pipelines for numeric and categorical data

from sklearn.preprocessing import MinMaxScaler
# https://towardsdatascience.com/scale-standardize-or-normalize-with-scikit-learn-6ccc7d176a02
# https://stackoverflow.com/questions/51237635/difference-between-standard-scaler-and-minmaxscaler/51237727
# don't know features are normal so just going with minmax scalar atm

from sklearn.preprocessing import OneHotEncoder
#https://stackoverflow.com/questions/36631163/what-are-the-pros-and-cons-between-get-dummies-pandas-and-onehotencoder-sciki
#The crux of it is that the sklearn encoder creates a function which persists and can then be applied to new data sets which use the same categorical variables, with consistent results.
# So don't use pandas get dummies, but a OneHotEncoder

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression

# We create the preprocessing pipelines for both numeric and categorical data.
corr = train.corr()[y_col]
corr = corr.sort_values(ascending=False)
num_x = list(corr.index)[1:]

numeric_transformer = Pipeline(steps=[
    ('impute_num', SimpleImputer(strategy='median')),
    ('scale_num', MinMaxScaler())])

cat_x = [col for col in final_test.columns if final_test[col].dtype == 'object']
cat_x.sort(key = lambda x: -correlation_ratio(train[x],train[y_col]))

categorical_transformer = Pipeline(steps=[
    ('impute_cat', SimpleImputer(strategy='most_frequent')),
    ('onehot_cat', OneHotEncoder(handle_unknown="ignore"))
    ])

preprocessor = ColumnTransformer(
    transformers=[
        ('numeric', numeric_transformer, num_x),
        ('category', categorical_transformer, cat_x)])

### Modeling

In [8]:

# from catboost import CatBoostRegressor
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV, ElasticNet,SGDRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn import svm
from sklearn.neural_network import MLPRegressor
from sklearn.kernel_ridge import KernelRidge
import lightgbm as lgb
import xgboost as xgb
from sklearn.model_selection import cross_validate
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

def inv_y(transformed_y):
    return np.exp(transformed_y)

from collections import namedtuple
p = namedtuple('params', ['model','hyper'])

cat_x_ind = [ind for ind,name in enumerate(train_x.columns) if name in cat_x ]
cat_x_ind = list(range(37,80))

from skopt.space import Real, Categorical, Integer

from sklearn.preprocessing import FunctionTransformer

def densify(x): # needs to use a function, lambda gives problems with pickling
    return x.todense()
def get_pipeline(model):
    return Pipeline([('preprocess', preprocessor),
                        ('to_dense',FunctionTransformer(densify, accept_sparse=True)), # only for HistGradientBoostingRegressor and randomforestregressor
                        ('model',model)])

In [19]:
models = {'Ridge': p(Ridge(),
                {'alpha':(0.00001,1.0,'log-uniform')}),
            'svm.SVR': p(svm.SVR(), # https://scikit-optimize.github.io/stable/auto_examples/sklearn-gridsearchcv-replacement.html#advanced-example
                {'C': (1e-6, 1e+6, 'log-uniform'),
                'gamma': (1e-6, 1e+1, 'log-uniform'),
                'degree': (1, 8),  # integer valued parameter
                'kernel': ['linear', 'poly', 'rbf']}),
            'LinearRegression':p(LinearRegression(),
                {}),
            'Lasso':p(Lasso(),
                {'alpha':(0.00001,1.0,'log-uniform')}),
            'ElasticNet':p(ElasticNet(),
                {'l1_ratio': (0.01,1.1,'log-uniform'),
                'alpha':(0.00001,0.5,1.0,'log-uniform')
                }),
            'KNeighborsRegressor':p(KNeighborsRegressor(),
                {'n_neighbors': (2,3,4,5,6), 
                'weights': ['uniform','distance']}),
            'RandomForestRegressor':p(RandomForestRegressor(),
                {'n_estimators' : (1,100, 'log-uniform'), # gamble
                'max_depth' : (3, 40,'log-uniform')}),
            # SGD does not work atm with bayessearch (not hashable)
            # 'SGDRegressor':p(SGDRegressor(),
            #     {'loss' : ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'],
            #     'penalty' : ['l1', 'l2', 'elasticnet'],
            #     'alpha' : (0.0,1000,'log-uniform'),
            #     'learning_rate' : ['constant', 'optimal', 'invscaling', 'adaptive'],
            #     'class_weight' : [{1:0.5, 0:0.5}, {1:0.4, 0:0.6}, {1:0.6, 0:0.4}, {1:0.7, 0:0.3}],
            #     'eta0' : [1, 10, 100]}),
            # Catboost does not work atm with bayessearch (not hashable)
            # 'CatBoostRegressor':p(CatBoostRegressor(silent=True,one_hot_max_size=20, cat_features = cat_x_ind,iterations=1500),
            #     {'iterations': Integer(10, 1000),
            #      'depth': Integer(4, 10),
            #      'learning_rate': Real(0.01, 1.0, 'log-uniform'),
            #      'random_strength': Real(1e-9, 10, 'log-uniform'),
            #      'bagging_temperature': Real(0.0, 1.0),
            #      'border_count': Integer(1, 255),
            #      'l2_leaf_reg': Integer(2, 30),
            #      'scale_pos_weight':Real(0.01, 1.0, 'uniform')}),
            'xgb.XGBRegressor':p(xgb.XGBRegressor(silent=True),
                {'max_depth': (4, 10), #9,12
                'min_child_weight': (0, 10), # if leaves with small amount of observations are allowed?
                'gamma' : (0,), # these 3 for model complexity. gemma is a threshold for gain of the new split. 
                'subsample': (1,),
                'colsample_bytree' : (0.5,1.0), # these 3 for making model more robust to noise
                'reg_lambda' : (0.0,2.0), # regularization lambda, reduces similarity scores and therefore lowers gain. reduces sensitivity to individual observations
                'colsample_bylevel': (0.7,1.0),
                'learning_rate': (0.01,0.4), # 0.3 is default
                'max_delta_step': (0.0,10.0),
                'n_estimators': (10,100)}),
            #https://www.kaggle.com/c/LANL-Earthquake-Prediction/discussion/89994
            'lgb.LGBMRegressor': p(lgb.LGBMRegressor(objective='huber',boostingtype= "gbdt"),
                        # num_leaves=31,
                        # learning_rate=0.05,
                        # n_estimators=20,
                {'num_leaves': (8, 92),
                'min_data_in_leaf': (10, 100),
                'max_depth': [3, 4, 5, 6, 8, 12, 16],
                'learning_rate': (0.01, 0,1), #'log_uniform'
                'bagging_freq': (3, 7),
                'bagging_fraction': (0.6, 0.95, 'uniform'),
                'reg_alpha': (0.1, 0.95, 'uniform'),
                'reg_lambda': (0.1, 0.95, 'uniform')
            }),
            # A sparse matrix was passed, but dense data is required. Use X.toarray() to convert to a dense numpy array
            'HistGradientBoostingRegressor':p(HistGradientBoostingRegressor(),
                {}),
            'LassoCV':p(LassoCV(),
                {}),
            'MLPRegressor':p(MLPRegressor(),
                {})
                }

## Getting catboost to work

In [None]:
for name in train_x.columns:
    if name in cat_x:
        train_x[name] = train_x[name].astype(str)
        train_x[name].apply(str)
cat_x_ind = [ind for ind,name in enumerate(train_x.columns) if name in cat_x ]
cat_x_ind = list(range(37,80))
pipe = get_pipeline(CatBoostRegressor(silent=True,one_hot_max_size=20, cat_features = cat_x_ind,iterations=1500))
num_fold = 3
scores = cross_validate(pipe, train_x, train_y, scoring='neg_root_mean_squared_error', cv=num_fold, return_train_score=True)

print(f"{name:<25}train {-1 * sum(scores['train_score'])/num_fold:.3f}, test {-1 * sum(scores['test_score'])/num_fold:.3f}")

In [None]:
fitted = pipe.named_steps['model'].fit(pipe[:-1].transform(train_x[:1000]),train_y[:1000],eval_set=(
    pipe[:-1].transform(train_x[1000:]),train_y[1000:]
))

In [None]:
fits = {}
fits['learn'] = list(fitted.get_evals_result()['learn'].values())[0]
fits['validation'] = list(fitted.get_evals_result()['validation'].values())[0]
df = pd.DataFrame.from_dict(fits)
df.reset_index(inplace=True)
df =  df.melt('index',var_name= 'type',value_name='score')
fitted.get_evals_result()
import altair as alt

alt.Chart(df).mark_line().encode(
    x='index',
    y='score',
    color='type',
)

In [None]:
list(fitted.get_evals_result()['validation'].values())[0][350]

## Rough test of all models

In [10]:
def cross_val_models(to_test):
    for name in to_test:
        pipe = get_pipeline(models[name].model)
        # score = -1 * cross_val_score(pipe, train_x, train_y,cv=3,scoring='neg_root_mean_squared_error')
        # below is necessary for looking at train scores
        num_fold = 3
        scores = cross_validate(pipe, train_x, train_y, scoring='neg_root_mean_squared_error', cv=num_fold, return_train_score=True)
        # scoring is identical to 
        # make_scorer(mean_squared_error,greater_is_better=False, root=False,squared=False)
        # neg_mean_squared_log_error
        print(f"{name:<25}train {-1 * sum(scores['train_score'])/num_fold:.3f}, test {-1 * sum(scores['test_score'])/num_fold:.3f}")
cross_val_models(models)

Ridge                    train 0.097, test 0.143
svm.SVR                  train 0.088, test 0.141
LinearRegression         train 0.089, test 1733916935.547
Lasso                    train 0.399, test 0.399
ElasticNet               train 0.399, test 0.399
KNeighborsRegressor      train 0.169, test 0.215
RandomForestRegressor    train 0.056, test 0.147
xgb.XGBRegressor         train 0.006, test 0.151
lgb.LGBMRegressor        train 0.042, test 0.136
HistGradientBoostingRegressortrain 0.041, test 0.137
LassoCV                  train 0.109, test 0.137
MLPRegressor             train 0.117, test 0.186


## Hyperparameter tuning

In [21]:
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
from aoc import timeit

def model_instance(model_name):
    model = {'model': [models[model_name].model]}
    for k,v in models[model_name].hyper.items():
        model['model__'+k] = v
    # print(model)
    return (model, 100)

def run_model(name):
    print('running', name)
    def on_step(optim_result):
        score = opt.best_score_
        # print(optim_result.x)
        print(f"{'best score':15}{score}")
        if score > opt.train_status['current_score']:
            opt.train_status['current_score'] = score
            opt.train_status['not_improving'] = 0
        else:
            opt.train_status['not_improving'] += 1
            if opt.train_status['not_improving'] == opt.train_status['stop_thres']: return True
    
    pipe = get_pipeline(LinearRegression())

    opt = BayesSearchCV(
    pipe,
    [model_instance(name)],

    cv=5, 
    scoring = 'neg_root_mean_squared_error',
    return_train_score = True,
    random_state = 112 
    )
    opt.train_status = { 'current_score': -100, 'not_improving': 0, 'stop_thres' :15}

    with timeit('fitting'+name): opt.fit(train_x,train_y, callback = on_step)
    return opt

pipe = get_pipeline(svm.SVR())

to_test = [k for k in models]
to_test = ['lgb.LGBMRegressor']
results = {name : run_model(name) for name in to_test}
# test_results = {name: run_model(name) for name in to_test}

running lgb.LGBMRegressor
[LGBMRegressor(boosting_type='gbdt', boostingtype='gbdt', class_weight=None,
              colsample_bytree=1.0, importance_type='split', learning_rate=0.1,
              max_depth=-1, min_child_samples=20, min_child_weight=0.001,
              min_split_gain=0.0, n_estimators=100, n_jobs=-1, num_leaves=31,
              objective='huber', random_state=None, reg_alpha=0.0,
              reg_lambda=0.0, silent=True, subsample=1.0,
              subsample_for_bin=200000, subsample_freq=0), 0.8241917138702464, 7, 0.01, 8, 49, 33, 0.2771673357535492, 0.9131841620044345]
best score     -0.2278452383232711
[LGBMRegressor(boosting_type='gbdt', boostingtype='gbdt', class_weight=None,
              colsample_bytree=1.0, importance_type='split', learning_rate=0.1,
              max_depth=-1, min_child_samples=20, min_child_weight=0.001,
              min_split_gain=0.0, n_estimators=100, n_jobs=-1, num_leaves=31,
              objective='huber', random_state=None, reg_a

LightGBMError: Check failed: learning_rate >0.0 at D:\a\1\s\python-package\compile\src\io\config_auto.cpp, line 301 .


In [78]:
model = results['LinearRegression'].best_estimator_

In [56]:
# test_results['Lasso'].cv_results_
results['MLPRegressor'].best_params_
# results['MLPRegressor'].best_score_
# test_results['Lasso'].total_iterations

KeyError: 'MLPRegressor'

In [None]:
pipe = get_pipeline(Lasso(alpha=0.0006607161323268261))
# score = -1 * cross_val_score(pipe, train_x, train_y,cv=3,scoring='neg_root_mean_squared_error')
# below is necessary for looking at train scores
num_fold = 5
scores = cross_validate(pipe, train_x, train_y, scoring='neg_root_mean_squared_error', cv=num_fold, return_train_score=True)
# scoring is identical to 
# make_scorer(mean_squared_error,greater_is_better=False, root=False,squared=False)
# neg_mean_squared_log_error
print(f"{name:<25}train {-1 * sum(scores['train_score'])/num_fold:.4f}, test {-1 * sum(scores['test_score'])/num_fold:.4f}")

## Saving

In [None]:
train_info = (train_x, train_y, num_x, cat_x)
with open('output/train_info.pickle', 'wb') as handle:
    pickle.dump(train_info, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [81]:
# model = get_pipeline(CatBoostRegressor(silent=True))
model = model.fit(train_x,train_y)
submit(model)
save(model)

In [None]:
train_x.head()

In [None]:
train_y.head()

In [96]:
a = CatBoostRegressor()
CatBoostRegressor().

AttributeError: 'CatBoostRegressor' object has no attribute 'keys'

In [89]:
b = LinearRegression()

In [93]:
{a:1}

{SGDRegressor(alpha=0.0001, average=False, early_stopping=False, epsilon=0.1,
              eta0=0.01, fit_intercept=True, l1_ratio=0.15,
              learning_rate='invscaling', loss='squared_loss', max_iter=1000,
              n_iter_no_change=5, penalty='l2', power_t=0.25, random_state=None,
              shuffle=True, tol=0.001, validation_fraction=0.1, verbose=0,
              warm_start=False): 1}