In [1]:
import pandas as pd
import numpy as np
import pickle

def save(model, filename='bestmodel.pickle'):
    with open('output/'+filename, 'wb') as handle:
        pickle.dump(model, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
def submit(model):
    model = model.fit(train_x,train_y)
    pred = model.predict(final_test)
    final_test['SalePrice'] = np.exp(pred)
    final_test[['Id','SalePrice']].to_csv('output/submission.csv', index=False)

In [2]:
#make train and test datasets. Splitting labels and features happens later
path_train = "https://raw.githubusercontent.com/jvanelteren/housing/master/datasets/train.csv"
path_test = "https://raw.githubusercontent.com/jvanelteren/housing/master/datasets/test.csv"

train = pd.read_csv(path_train)
final_test = pd.read_csv(path_test)
print(train.shape, final_test.shape)

y_col = (set(train.columns) - set(final_test.columns)).pop()
train[y_col] = np.log1p(train[y_col])

# since we use cross validation the train set does not have to be split anymore
# from sklearn.model_selection import train_test_split
# train_x, test_x, train_y, test_y = train_test_split(
#     train.drop([y_col], axis=1),train.loc[:,y_col], test_size=0.33, random_state=42) 
# ds = (train_x,test_x,train_y,test_y)
# for d in ds: print(d.shape)
train_x = train.drop([y_col], axis=1)
train_y = train.loc[:,y_col]
train_x.shape, train_y.shape


(1460, 81) (1459, 80)


((1460, 80), (1460,))

### Preprocessing

In [3]:
def correlation_ratio(categories, measurements):
    fcat, _ = pd.factorize(categories)
    cat_num = np.max(fcat)+1
    y_avg_array = np.zeros(cat_num)
    n_array = np.zeros(cat_num)
    for i in range(0,cat_num):
        cat_measures = measurements[np.argwhere(fcat == i).flatten()]
        n_array[i] = len(cat_measures)
        y_avg_array[i] = np.average(cat_measures)
    y_total_avg = np.sum(np.multiply(y_avg_array,n_array))/np.sum(n_array)
    numerator = np.sum(np.multiply(n_array,np.power(np.subtract(y_avg_array,y_total_avg),2)))
    denominator = np.sum(np.power(np.subtract(measurements,y_total_avg),2))
    if numerator == 0:
        eta = 0.0
    else:
        eta = np.sqrt(numerator/denominator)
    return eta


In [4]:
from sklearn.impute import SimpleImputer
# todo multivariate imputation, possibly with pipelines for numeric and categorical data

from sklearn.preprocessing import MinMaxScaler
# https://towardsdatascience.com/scale-standardize-or-normalize-with-scikit-learn-6ccc7d176a02
# https://stackoverflow.com/questions/51237635/difference-between-standard-scaler-and-minmaxscaler/51237727
# don't know features are normal so just going with minmax scalar atm

from sklearn.preprocessing import OneHotEncoder
#https://stackoverflow.com/questions/36631163/what-are-the-pros-and-cons-between-get-dummies-pandas-and-onehotencoder-sciki
#The crux of it is that the sklearn encoder creates a function which persists and can then be applied to new data sets which use the same categorical variables, with consistent results.
# So don't use pandas get dummies, but a OneHotEncoder

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression

# We create the preprocessing pipelines for both numeric and categorical data.
corr = train.corr()[y_col]
corr = corr.sort_values(ascending=False)
num_x = list(corr.index)[1:]

numeric_transformer = Pipeline(steps=[
    ('impute_num', SimpleImputer(strategy='median')),
    ('scale_num', MinMaxScaler())])

cat_x = [col for col in final_test.columns if final_test[col].dtype == 'object']
cat_x.sort(key = lambda x: -correlation_ratio(train[x],train[y_col]))

categorical_transformer = Pipeline(steps=[
    ('impute_cat', SimpleImputer(strategy='most_frequent')),
    ('onehot_cat', OneHotEncoder(handle_unknown="ignore"))])

preprocessor = ColumnTransformer(
    transformers=[
        ('numeric', numeric_transformer, num_x),
        ('category', categorical_transformer, cat_x)])

### Modeling

In [47]:

# from catboost import CatBoostRegressor
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV, ElasticNet,SGDRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn import svm
from sklearn.neural_network import MLPRegressor
from sklearn.kernel_ridge import KernelRidge
import lightgbm as lgb
import xgboost as xgb
from sklearn.model_selection import cross_validate
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

def inv_y(transformed_y):
    return np.exp(transformed_y)

from collections import namedtuple
p = namedtuple('params', ['model','hyper'])

models = {'RidgeCV': p(RidgeCV(),
                {}),
            'svm.SVR': p(svm.SVR(),
                {'gamma': [1e-4, 1e-3, 0.01, 0.1, 0.2, 0.5, 0.6, 0.9],
                'C': [1, 10, 100, 1000, 10000]}),
            'LinearRegression':p(LinearRegression(),
                {}),
            'Lasso':p(Lasso(),
                {}),
            'ElasticNet':p(ElasticNet(),
                {}),
            'KNeighborsRegressor':p(KNeighborsRegressor(),
                {}),
            'RandomForestRegressor':p(RandomForestRegressor(),
                {}),
            'SGDRegressor':p(SGDRegressor(),
                {}),
            'CatBoostRegressor':p(CatBoostRegressor(silent=True),
                {}),
            'xgb.XGBRegressor':p(xgb.XGBRegressor(),
                {}),
            'lgb.LGBMRegressor':
                p(lgb.LGBMRegressor(num_leaves=31,
                        learning_rate=0.05,
                        n_estimators=20),
                {}),
            'HistGradientBoostingRegressor':p(HistGradientBoostingRegressor(),
                {}),
            'MLPRegressor':p(MLPRegressor(),
                {})
}

def get_pipeline(model):
    return Pipeline([('preprocess', preprocessor),
                   ('model',model)])



In [None]:
for name, model in models:
    pipe = get_pipeline(model)
    # score = -1 * cross_val_score(pipe, train_x, train_y,cv=3,scoring='neg_root_mean_squared_error')
    # below is necessary for looking at train scores
    num_fold = 3
    scores = cross_validate(pipe, train_x, train_y, scoring='neg_root_mean_squared_error', cv=num_fold, return_train_score=True)
    # scoring is identical to 
    # make_scorer(mean_squared_error,greater_is_better=False, root=False,squared=False)
    # neg_mean_squared_log_error
    print(f"{name}, \ttrain {-1 * sum(scores['train_score'])/num_fold:.3f}, test {-1 * sum(scores['test_score'])/num_fold:.3f}")

In [25]:
for name in ['svm.SVR']:
    pipe = get_pipeline(svm.SVR())
    pipe.get_params().keys()

## Hyperparameter tuning

In [48]:
models['svm.SVR'].model

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [7]:
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
from aoc import timeit

pipe = get_pipeline(svm.SVR())

In [49]:
def model_instance(model_name):
    model = {'model': [models[model_name].model]}
    for k,v in models[model_name].hyper.items():
        model['model__'+k] = v
    return (model, 50)
a = model_instance('svm.SVR')
a[0]

{'model': [SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
      kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)],
 'model__gamma': [0.0001, 0.001, 0.01, 0.1, 0.2, 0.5, 0.6, 0.9],
 'model__C': [1, 10, 100, 1000, 10000]}

In [50]:
models['svm.SVR'].hyper

{'gamma': [0.0001, 0.001, 0.01, 0.1, 0.2, 0.5, 0.6, 0.9],
 'C': [1, 10, 100, 1000, 10000]}

In [44]:
(parameters_svr, 50)[0]

{'model': [SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
      kernel='rbf', max_iter=1000, shrinking=True, tol=0.001, verbose=False)],
 'model__gamma': [0.0001, 0.001, 0.01, 0.1, 0.2, 0.5, 0.6, 0.9],
 'model__C': [1, 10, 100, 1000, 10000]}

In [51]:

# callback handler
train_status = { 'current_score': -100, 'not_improving': 0, 'stop_thres' : 6}
def on_step(optim_result):
    score = opt.best_score_
    print("best score: %s" % score)
    if score > train_status['current_score']:
        train_status['current_score'] = score
        train_status['not_improving'] = 0
    else:
        train_status['not_improving'] += 1
        if train_status['not_improving'] == train_status['stop_thres']: return True

opt = BayesSearchCV(
    pipe,
    [model_instance('svm.SVR')],
    # cv=3, 
    scoring = 'neg_root_mean_squared_error',
    return_train_score = True # (parameter space, # of evaluations)
)

with timeit('2x2 20 iter cv=3'): opt.fit(train_x[:200],train_y[:200], callback = on_step)
print("val. score: %s" % opt.best_score_)

best score: -0.3887180566689996
best score: -0.25282517460241544
best score: -0.15595175374839695
best score: -0.15595175374839695
best score: -0.15595175374839695
best score: -0.15595175374839695
best score: -0.15595175374839695
best score: -0.15595175374839695
best score: -0.15595175374839695
INFO:root:2x2 20 iter cv=3, time: 0.16215632359186807
val. score: -0.15595175374839695


In [10]:
opt.best_params_

OrderedDict([('model__C', 2885), ('model__gamma', 0.0001)])

## Saving

In [None]:
train_info = (train_x, train_y, num_x, cat_x)
with open('output/train_info.pickle', 'wb') as handle:
    pickle.dump(train_info, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [9]:
model = get_pipeline(CatBoostRegressor(silent=True))
submit(model)
save(model)

In [0]:
m

In [0]:
ridge, 	        train 0.115, test 0.143
svm, 	        train 0.088, test 0.141
lin, 	        train 0.089, test 0.151
lasso, 	        train 0.109, test 0.137
ElasticNet, 	train 0.399, test 0.399
GammaRegressor, train 0.248, test 0.251
KNeighborsRegrestrain 0.169, test 0.215
RandomForestRegrtrain 0.056, test 0.147
SGDRegressor, 	train 0.196, test 0.233
CatBoostRegressotrain 0.032, test 0.122
xgb.XGBRegressortrain 0.005, test 0.150
lgb.LGBMRegressotrain 0.190, test 0.211
HistGradientBoostrain nan, test nan
MLPRegressor, 	train 0.138, test 0.196