In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
import lightgbm as lgb
from catboost import CatBoostRegressor
import xgboost as xgb
import sklearn.model_selection as model_selection
import contextily as cx
import geopandas as gpd
from scipy.stats import skew
from scipy.stats.stats import pearsonr
from scipy import stats
import optuna
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

# Enable the "KML" driver in fiona's supported-driver table (reached through geopandas) with mode "rw".
# NOTE(review): this reaches into geopandas internals (`gpd.io.file.fiona`); presumably tied to the
# installed geopandas/fiona versions — verify before upgrading either package.
gpd.io.file.fiona.drvsupport.supported_drivers["KML"] = "rw"
#pd.set_option('display.max_columns', 500)

  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)
  import pandas.util.testing as tm
  return f(*args, **kwds)


In [4]:
# Number of folds used when building out-of-fold predictions for stacking.
NFOLDS = 5
# Random seed shared by the splitters and the base-model parameter dicts.
SEED = 42

In [5]:
# Utils
def rmlse(y_true, y_pred):
    """Root mean squared logarithmic error between targets and predictions.

    Both inputs are converted to plain float arrays first, so the comparison
    is strictly positional: two pandas Series carrying different index labels
    are no longer aligned by label (which silently produced NaN), and plain
    Python lists are accepted as well.

    Args:
        y_true: array-like of non-negative target values.
        y_pred: array-like of non-negative predictions, same shape as ``y_true``.

    Returns:
        ``sqrt(mean((log1p(y_pred) - log1p(y_true)) ** 2))`` as a float.

    Raises:
        AssertionError: if the shapes differ or any value is negative.
    """
    # Alternatively: sklearn.metrics.mean_squared_log_error(y_true, y_pred) ** 0.5
    true_values = np.asarray(y_true, dtype=float)
    pred_values = np.asarray(y_pred, dtype=float)
    # Guard against silent broadcasting, e.g. (n,) against (n, 1).
    assert true_values.shape == pred_values.shape, "y_true and y_pred must have the same shape"
    assert (true_values >= 0).all(), "y_true must be non-negative"
    assert (pred_values >= 0).all(), "y_pred must be non-negative"
    log_error = np.log1p(pred_values) - np.log1p(true_values)  # Note: log1p(x) = log(1 + x)
    return np.mean(log_error ** 2) ** 0.5

def get_oof(clf, kf, x_train, y_train, x_test):
    """Build out-of-fold (OOF) predictions of one base model for stacking.

    For every fold produced by ``kf`` the model is fitted on the in-fold rows,
    then used to predict the held-out rows of ``x_train`` and all of ``x_test``.
    The number of folds is taken from ``kf`` itself, so the splitter may use any
    ``n_splits`` without relying on a module-level constant.

    Args:
        clf: estimator exposing ``fit(X, y)`` and ``predict(X)``. It is refitted
            in place for each fold, so after the call it holds the fit from the
            last fold.
        kf: splitter exposing ``split(X, y)`` and ``get_n_splits(X, y)``.
        x_train: training features; rows are selected positionally with ``.iloc``.
        y_train: training targets; rows are selected positionally with ``.iloc``.
        x_test: features to predict with every fold's model.

    Returns:
        Tuple ``(oof_train, oof_test)`` of column vectors: ``oof_train`` has shape
        ``(len(x_train), 1)`` and holds each row's held-out prediction;
        ``oof_test`` has shape ``(len(x_test), 1)`` and holds the mean of the
        per-fold predictions on ``x_test``.
    """
    n_folds = kf.get_n_splits(x_train, y_train)
    ntrain = x_train.shape[0]
    ntest = x_test.shape[0]
    oof_train = np.zeros((ntrain,))
    # One row of test predictions per fold; averaged after the loop.
    oof_test_skf = np.empty((n_folds, ntest))

    for i, (train_index, test_index) in enumerate(kf.split(x_train, y_train)):
        x_tr = x_train.iloc[train_index]
        y_tr = y_train.iloc[train_index]
        x_te = x_train.iloc[test_index]

        clf.fit(x_tr, y_tr)

        # ravel() tolerates estimators that return (n, 1) instead of (n,).
        oof_train[test_index] = np.ravel(clf.predict(x_te))
        oof_test_skf[i, :] = np.ravel(clf.predict(x_test))

    oof_test = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

In [6]:
# Load the combined dataset; its 'split' column is used further down to separate train and test rows.
all_data = pd.read_csv('resources/data_position_poi.csv')

In [7]:
# Engineered feature: private + shared bathrooms. A missing value in either column yields NaN here.
all_data['bathrooms_total'] = all_data['bathrooms_private'] + all_data['bathrooms_shared']

In [8]:
# Engineered feature: total area divided by the room count.
# NOTE(review): a row with rooms == 0 would give inf (or NaN for 0/0) — confirm the data has none.
all_data['area_per_room'] = all_data['area_total']/all_data['rooms']

In [9]:
# Engineered feature: combined count of loggias and balconies.
all_data['ballog'] = all_data['loggias'] + all_data['balconies']

In [10]:
# One-hot encode the three categorical columns; each is replaced by indicator columns named
# <column>_<value> (e.g. 'district_0.0'), which the feature list later picks up by prefix.
all_data = pd.get_dummies(all_data, columns = ["heating", "district", "condition"])

In [11]:
# Impute the remaining missing values with the per-column mean.
# numeric_only=True keeps the string 'split' column out of the mean: pandas >= 2.0 raises a
# TypeError when asked to average a non-numeric column (older versions silently skipped it).
# Note: the means are computed over train and test rows together, and the missing 'price' of
# the test rows is filled too — that column is dropped from the test frame afterwards.
all_data = all_data.fillna(all_data.mean(numeric_only=True))

In [12]:
# Training rows: keep the 'price' target, drop only the bookkeeping 'split' column.
data = all_data[all_data['split'] == 'train'].drop(columns=['split'])

# Test rows: drop 'price' together with 'split', leaving features only.
data_test = all_data[all_data['split'] == 'test'].drop(columns=['split', 'price'])

In [17]:
# Settings for the RandomForestRegressor base learner.
# NOTE(review): 'criterion': 'mse' and 'max_features': 'auto' are presumably spellings that newer
# scikit-learn releases renamed/removed — confirm against the installed version before upgrading.
rf_params = {
    'n_estimators' : 100,
    'criterion': 'mse',
    'max_depth': None,
    'min_weight_fraction_leaf': 0.0,
    'max_features': 'auto',
    'max_leaf_nodes': None,
    'min_impurity_decrease': 0.0,
    'bootstrap':True,
    'oob_score': False,
    'n_jobs': None,
    'random_state': SEED,
    'verbose': 0,
    'warm_start': False,
    'ccp_alpha': 0.0,
    'max_samples': None
}

# Settings for the LGBMRegressor base learner.
# NOTE(review): no subsample_freq is set here; presumably LightGBM only applies 'subsample'
# when subsample_freq > 0 — verify whether the 0.9 row subsampling is actually in effect.
lgb_params = {
    'num_leaves': 10,
    'max_depth': 5, 
    'random_state':SEED, 
    'silent' : True, 
    'metric': 'mse',
    'n_jobs': 4, 
    'n_estimators': 2000,
    'colsample_bytree': 0.95,
    'subsample': 0.9,
    'learning_rate': 0.05
}

# Settings for the CatBoostRegressor base learner.
cb_params = {
    'n_estimators': 500,
    'learning_rate': 0.1,
    'thread_count': -1,
    'depth': 7,
    'silent': True,
    'random_seed': SEED,
    'bagging_temperature': 0.2
}

# Settings for the AdaBoostRegressor base learner.
ada_params = {
    'n_estimators': 1000,
    'learning_rate':0.05,
    'loss': 'square',
    'random_state': SEED
}

# Settings for the single DecisionTreeRegressor base learner.
# NOTE(review): same 'criterion': 'mse' caveat as for rf_params — confirm against the installed scikit-learn.
tree_params = {
    'criterion': 'mse',
    'max_depth': 5,
    'min_samples_split': 4,
    'min_samples_leaf': 2,
    'random_state': SEED
}

In [26]:
def objective(trial, X, y, area):
    """Optuna objective: cross-validated RMSLE of the stacked ensemble.

    Samples hyper-parameters for the XGBoost meta-model from ``trial``. For each
    outer CV fold it then builds out-of-fold predictions of the five base models
    (trained on ``log(price / area_total)``), fits the meta-model on those
    predictions with early stopping, and scores the back-transformed price
    predictions against the raw prices of the held-out fold.

    Args:
        trial: Optuna trial used to sample the meta-model parameters.
        X: feature DataFrame; must contain an ``area_total`` column.
        y: pandas Series of raw prices, row-aligned with ``X``.
        area: unused; kept in the signature so existing callers keep working.

    Returns:
        Mean RMSLE over the outer folds (lower is better).
    """
    # Search space of the XGBoost meta-model.
    param = {
        'booster': 'gbtree',
        'max_depth': trial.suggest_int('max_depth', 1, 11),
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-8, 1.0),
        'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-8, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 0, 5),
        'gamma': trial.suggest_int('gamma', 0, 5),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.5),
        'colsample_bytree': trial.suggest_discrete_uniform('colsample_bytree', 0.1, 1, 0.01),
        'nthread': -1,
    }

    # Base learners; get_oof refits them for every fold, so one instance each is enough.
    model1 = RandomForestRegressor(**rf_params)
    model2 = lgb.LGBMRegressor(**lgb_params)
    model3 = CatBoostRegressor(**cb_params)
    model4 = AdaBoostRegressor(**ada_params)
    model5 = DecisionTreeRegressor(**tree_params)

    # Outer CV used to score the whole stack.
    cv = KFold(n_splits=5, shuffle=True, random_state=SEED)
    cv_scores = []

    for train_idx, test_idx in cv.split(X, y):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        # KFold yields positions, not index labels, so the targets must be taken with .iloc.
        # Training target is the log price per unit of area; the held-out target stays a raw price.
        y_train = np.log(y.iloc[train_idx] / X_train['area_total'])
        y_test = y.iloc[test_idx]

        # Inner CV used to build the out-of-fold meta-features.
        kf = KFold(
            n_splits=NFOLDS,
            shuffle=True,
            random_state=SEED
        )

        rf_oof_train, rf_oof_test = get_oof(model1, kf, X_train, y_train, X_test)
        lgbm_oof_train, lgbm_oof_test = get_oof(model2, kf, X_train, y_train, X_test)
        cb_oof_train, cb_oof_test = get_oof(model3, kf, X_train, y_train, X_test)
        ada_oof_train, ada_oof_test = get_oof(model4, kf, X_train, y_train, X_test)
        tree_oof_train, tree_oof_test = get_oof(model5, kf, X_train, y_train, X_test)

        # One column of meta-features per base model.
        x_train = np.concatenate((
            rf_oof_train,
            lgbm_oof_train,
            cb_oof_train,
            ada_oof_train,
            tree_oof_train
        ), axis=1)

        x_test = np.concatenate((
            rf_oof_test,
            lgbm_oof_test,
            cb_oof_test,
            ada_oof_test,
            tree_oof_test
        ), axis=1)

        # Hold back 10% of the meta-features to drive early stopping.
        x_tr, x_val, y_tr, y_val = train_test_split(x_train, y_train, test_size=0.1, random_state=SEED)
        dtrain = xgb.DMatrix(x_tr, label=y_tr)
        dval = xgb.DMatrix(x_val, label=y_val)
        # No label here: the matrix is only used for prediction, and y_test is on a different scale.
        dtest = xgb.DMatrix(x_test)
        META_MODEL = xgb.train(param, dtrain,
                               num_boost_round=3000,
                               evals=[(dval, "validation")],
                               early_stopping_rounds=50,
                               verbose_eval=200)

        # Undo the log(price / area) transform. Plain arrays keep the comparison positional
        # instead of aligning on index labels.
        preds = np.exp(META_MODEL.predict(dtest)) * X_test['area_total'].to_numpy()
        # y_test is already a raw price; exponentiating it again would overflow to inf.
        y_true = y_test.to_numpy()
        cv_scores.append(rmlse(y_true, preds))

    return np.mean(cv_scores)

In [24]:
# Indicator columns created by the one-hot encoding, grouped by their source column.
district = [name for name in data.columns if name.startswith('district')]
heating = [name for name in data.columns if name.startswith('heating')]
condition = [name for name in data.columns if name.startswith('condition')]

# Hand-picked numeric features first, followed by the three indicator groups.
FEATURES = [
    'area_total', 'rooms', 'longitude', 'latitude', 'area_per_room',
    'square_distance', 'park_distance', 'ceiling', 'stories', 'constructed',
    'bathrooms_total', 'ballog',
] + district + heating + condition
print(FEATURES)

['area_total', 'rooms', 'longitude', 'latitude', 'area_per_room', 'square_distance', 'park_distance', 'ceiling', 'stories', 'constructed', 'bathrooms_total', 'ballog', 'district_0.0', 'district_1.0', 'district_2.0', 'district_3.0', 'district_4.0', 'district_5.0', 'district_6.0', 'district_7.0', 'district_8.0', 'district_9.0', 'district_10.0', 'district_11.0', 'heating_0.0', 'heating_1.0', 'heating_2.0', 'heating_3.0', 'condition_0.0', 'condition_1.0', 'condition_2.0', 'condition_3.0']


In [27]:

import warnings

# Model inputs: feature matrix, raw price target, and the area column expected by objective().
X = data[FEATURES]
y = data['price']
area = data.area_total

# Blanket-silence library warnings for the duration of the search.
warnings.filterwarnings("ignore")

# Seed the sampler so the sequence of suggested hyper-parameters is repeatable, in line with
# every other random component of this notebook (TPE is also Optuna's default sampler).
study = optuna.create_study(
    direction="minimize",
    study_name="Stack",
    sampler=optuna.samplers.TPESampler(seed=SEED),
)


def func(trial):
    """Bind the training data to the Optuna objective."""
    return objective(trial, X, y, area)


study.optimize(func, n_trials=10)


[32m[I 2021-11-07 15:08:25,148][0m A new study created in memory with name: Stack[0m


[0]	validation-rmse:11.75405
[200]	validation-rmse:5.29163
[400]	validation-rmse:2.38620
[600]	validation-rmse:1.08344
[800]	validation-rmse:0.50539
[1000]	validation-rmse:0.26107
[1200]	validation-rmse:0.17215
[1400]	validation-rmse:0.14687
[1600]	validation-rmse:0.14080
[1800]	validation-rmse:0.13946
[2000]	validation-rmse:0.13914
[2200]	validation-rmse:0.13906
[2400]	validation-rmse:0.13903
[2600]	validation-rmse:0.13902
[2682]	validation-rmse:0.13902
[0]	validation-rmse:11.75045
[200]	validation-rmse:5.28883
[400]	validation-rmse:2.38209
[600]	validation-rmse:1.07817
[800]	validation-rmse:0.49957
[1000]	validation-rmse:0.25413
[1200]	validation-rmse:0.16446
[1400]	validation-rmse:0.13956
[1600]	validation-rmse:0.13401
[1800]	validation-rmse:0.13294
[2000]	validation-rmse:0.13276
[2200]	validation-rmse:0.13274
[2205]	validation-rmse:0.13274
[0]	validation-rmse:11.72540
[200]	validation-rmse:5.27133
[400]	validation-rmse:2.36998
[600]	validation-rmse:1.06815
[800]	validation-rmse:0.4

KeyboardInterrupt: 

In [54]:
# Stacking with train-test split
# 67/33 hold-out split, stratified on the rounded log-price.
# Targets stay on the raw price scale here (no log(price / area) transform).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    stratify=np.log(y).round(),
    random_state=SEED,
)

In [55]:
# Fresh, unfitted base learners for the hold-out stacking run, built from the parameter dicts.
model1 = RandomForestRegressor(**rf_params)
model2 = lgb.LGBMRegressor(**lgb_params)
model3 = CatBoostRegressor(**cb_params)
model4 = AdaBoostRegressor(**ada_params)
model5 = DecisionTreeRegressor(**tree_params)

In [57]:
# Shared splitter for the out-of-fold predictions of every base model.
kf = KFold(n_splits=NFOLDS, shuffle=True, random_state=SEED) 

# Each call returns (held-out predictions on X_train, fold-averaged predictions on X_test) as
# column vectors. get_oof refits the model on every fold, so each model is left in the state
# of its last fold's fit afterwards.
rf_oof_train, rf_oof_test = get_oof(model1, kf, X_train, y_train, X_test)
lgbm_oof_train, lgbm_oof_test = get_oof(model2, kf, X_train, y_train, X_test)
cb_oof_train, cb_oof_test = get_oof(model3, kf, X_train, y_train, X_test)
ada_oof_train, ada_oof_test = get_oof(model4, kf, X_train, y_train, X_test)
tree_oof_train, tree_oof_test = get_oof(model5, kf, X_train, y_train, X_test)

In [58]:
# Meta-features for the second-level model: one column per base learner, side by side.
x_train = np.hstack((rf_oof_train, lgbm_oof_train, cb_oof_train, ada_oof_train, tree_oof_train))

# Same column order for the hold-out part.
x_test = np.hstack((rf_oof_test, lgbm_oof_test, cb_oof_test, ada_oof_test, tree_oof_test))

In [59]:
# XGB meta model.
# Fixed hyper-parameters for the second-level XGBoost model.
# NOTE(review): presumably taken from an earlier tuning run — confirm where these values came from.
param = {
        'max_depth': 3, 
        'reg_alpha': 0.0012, 'reg_lambda': 0.003, 
        'min_child_weight': 0, 'gamma': 2, 
        'learning_rate': 0.0132, 'colsample_bytree': 0.45
        }

# Train on the base models' out-of-fold predictions with y_train as the label.
# A fixed 2000 boosting rounds are used; no validation set or early stopping is passed here.
dtrain = xgb.DMatrix(x_train, label= y_train)
META_MODEL = xgb.train(param, dtrain, num_boost_round=2000)

In [65]:
# Score the XGBoost meta-model on the hold-out part.
dtest = xgb.DMatrix(x_test)
preds = META_MODEL.predict(dtest)
# rmlse asserts non-negative predictions, so clamp negative outputs to zero in place first.
preds[preds < 0] = 0
rmlse(y_test, preds)

0.2474842323687101

In [None]:
# Feature matrix of the real test set (replaces the hold-out X_test from the split above).
X_test = data_test[FEATURES]

# Column-vector predictions of every base learner, in the same order as the training meta-features.
# The models are used in whatever fitted state the earlier get_oof calls left them.
rf_pred, lgb_pred, cb_pred, ada_pred, tree_pred = (
    base_model.predict(X_test).reshape(-1, 1)
    for base_model in (model1, model2, model3, model4, model5)
)

x_test = np.hstack((rf_pred, lgb_pred, cb_pred, ada_pred, tree_pred))

In [None]:
# Wrap the test meta-features for XGBoost and predict with the trained meta-model.
# Note: unlike the hold-out scoring cell, negative predictions are not clamped here.
dtest = xgb.DMatrix(x_test)
final_preds = META_MODEL.predict(dtest)

In [None]:
# The meta-model's predict() returns a NumPy array, which has no .rename(); wrap the predictions
# in a named Series indexed like the test rows (the same ids the LightGBM submission uses)
# before writing the CSV.
pd.Series(final_preds, index=data_test.index, name="price_prediction").to_csv("submissions/stack.csv", index_label="id")

In [66]:
# LGB meta model
# Alternative second-level model: a small, slow-learning LightGBM regressor. It reuses the
# META_MODEL name, so the XGBoost booster trained earlier is no longer reachable after this cell.
# NOTE(review): subsample=0.9 is set without subsample_freq (the fitted repr shows
# subsample_freq=0); presumably the row subsampling is therefore inactive — verify.
META_MODEL = lgb.LGBMRegressor(
    num_leaves=5,
    max_depth=7, 
    random_state=SEED, 
    silent=True, 
    metric='mse',
    n_jobs=4, 
    n_estimators=1500,
    colsample_bytree=1,
    subsample=0.9,
    learning_rate=0.003
)

# Fit on the base models' out-of-fold predictions with y_train as the target.
META_MODEL.fit(x_train, y_train)

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1,
              importance_type='split', learning_rate=0.003, max_depth=7,
              metric='mse', min_child_samples=20, min_child_weight=0.001,
              min_split_gain=0.0, n_estimators=1500, n_jobs=4, num_leaves=5,
              objective=None, random_state=42, reg_alpha=0.0, reg_lambda=0.0,
              silent=True, subsample=0.9, subsample_for_bin=200000,
              subsample_freq=0)

In [68]:
# Score the LightGBM meta-model on the hold-out meta-features.
# No clamping is applied here; rmlse asserts that every prediction is non-negative.
preds = META_MODEL.predict(x_test)
rmlse(y_test, preds)

0.21122655198933668

In [71]:
# Feature matrix of the real test set (replaces the hold-out X_test from the split above).
X_test = data_test[FEATURES]

# Column-vector predictions of every base learner on the test set. The models are used in
# whatever fitted state the earlier get_oof calls left them (the fit from their last fold).
rf_pred = model1.predict(X_test).reshape(-1, 1)
lgb_pred = model2.predict(X_test).reshape(-1, 1)
cb_pred = model3.predict(X_test).reshape(-1, 1)
ada_pred = model4.predict(X_test).reshape(-1, 1)
tree_pred = model5.predict(X_test).reshape(-1, 1)

# Same column order as the meta-features the meta-model was trained on.
x_test = np.concatenate((rf_pred, lgb_pred, cb_pred, ada_pred, tree_pred), axis=1)

In [73]:
# Predict with the LightGBM meta-model and write the submission file: one row per test id.
final_preds = META_MODEL.predict(x_test)
submission = pd.DataFrame({
    'id': data_test.index,
    'price_prediction': final_preds,
})
submission.to_csv('submissions/lgb_stack.csv', index=False)