In [8]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
import lightgbm as lgb
from catboost import CatBoostRegressor
import xgboost as xgb
import sklearn.model_selection as model_selection
import contextily as cx
import geopandas as gpd
from scipy.stats import skew
from scipy.stats.stats import pearsonr
from scipy import stats
import optuna
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [4]:
imp_all_data = pd.read_csv("jim_imputed")

In [6]:
all_data = pd.read_csv('resources/vegard_preprocessed.csv')

In [7]:
SEED = 42
NFOLDS = 5

def rmlse(y_true, y_pred):
    # Alternatively: sklearn.metrics.mean_squared_log_error(y_true, y_pred) ** 0.5
    assert (y_true >= 0).all() 
    assert (y_pred >= 0).all()
    log_error = np.log1p(y_pred) - np.log1p(y_true)  # Note: log1p(x) = log(1 + x)
    return np.mean(log_error ** 2) ** 0.5

def get_oof(clf, kf, x_train, y_train, x_test):
    ntrain = x_train.shape[0]
    ntest = x_test.shape[0]
    oof_train = np.zeros((ntrain,))
    oof_test = np.zeros((ntest,))
    oof_test_skf = np.empty((NFOLDS, ntest))

    for i, (train_index, test_index) in enumerate(kf.split(x_train, y_train)):
        x_tr = x_train.iloc[train_index]
        y_tr = y_train.iloc[train_index]
        x_te = x_train.iloc[test_index]

        clf.fit(x_tr, y_tr)

        oof_train[test_index] = clf.predict(x_te)
        oof_test_skf[i, :] = clf.predict(x_test)

    oof_test[:] = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

In [89]:
NUMERIC = ["latitude", "longitude", "constructed", "area_total", "area_living", "area_kitchen", "rooms", 
            "center_distance", "park_distance", "metro_distance", "square_distance","ballog", "bathrooms_total", "distr_avg"]
CAT = [ "district", "material", "condition", "heating", "new2", "parking", "garbage_chute"]
FEATURES = NUMERIC + CAT

In [90]:
data_train = imp_all_data.loc[imp_all_data.split == 'train']
data_test = imp_all_data.loc[imp_all_data.split == 'test']
X_train = data_train[FEATURES].copy()
X_test = data_test[FEATURES].copy()
y = data_train.price.copy()

In [43]:
X_tr, X_te, y_train, y_test = train_test_split(X_train, y, random_state = SEED, test_size = 0.33, stratify=round(np.log(y)))

In [96]:
# Run this for submissions
X_tr = X_train.copy()
X_te = X_test.copy()
y_train = y.copy()

In [97]:
train_idx = X_tr.index
test_idx = X_te.index

In [98]:
# transforming skewed features
from scipy.stats import skew

y_train = np.log(y_train / (data_train.iloc[train_idx]).area_total)

#log transform skewed numeric features:

skewed_feats = X_tr[NUMERIC].apply(lambda x: skew(x.dropna())) #compute skewness
skewed_feats = skewed_feats[skewed_feats > 0.75]
skewed_feats = skewed_feats.index
print(skewed_feats)

X_tr[skewed_feats] = np.log1p(X_tr[skewed_feats])
X_te[skewed_feats] = np.log1p(X_te[skewed_feats])

Index(['area_total', 'area_living', 'area_kitchen', 'rooms', 'center_distance',
       'park_distance', 'metro_distance', 'square_distance', 'ballog',
       'bathrooms_total', 'distr_avg'],
      dtype='object')


In [99]:
X_tr = pd.get_dummies(X_tr, columns=CAT)
X_te = pd.get_dummies(X_te, columns=CAT)

## Training simpler models

In [100]:
rf_params = {
    'n_estimators' : 100,
    'criterion': 'mse',
    'max_depth': None,
    'min_weight_fraction_leaf': 0.0,
    'max_features': 'auto',
    'max_leaf_nodes': None,
    'min_impurity_decrease': 0.0,
    'bootstrap':True,
    'oob_score': False,
    'n_jobs': None,
    'random_state': SEED,
    'verbose': 0,
    'warm_start': False,
    'ccp_alpha': 0.0,
    'max_samples': None
}

ada_params = {
    'n_estimators': 1000,
    'learning_rate':0.05,
    'loss': 'square',
    'random_state': SEED
}

tree_params = {
    'criterion': 'mse',
    'max_depth': 5,
    'min_samples_split': 4,
    'min_samples_leaf': 2,
    'random_state': SEED
}

gb_params = {
    'learning_rate': 0.01,
    'n_estimators':2000,
    'subsample':1.0, 
    'criterion':'mse', 
    'min_samples_split':4, 
    'min_samples_leaf':2, 
    'min_weight_fraction_leaf':0.0, 
    'max_depth':9, 
    'min_impurity_decrease':0.0, 
    'init':None, 
    'random_state':42, 
    'max_features':None,
    'alpha':0.9,
    'verbose':0,
    'max_leaf_nodes':None,
    'warm_start':False,
    'validation_fraction':0.1,
    'n_iter_no_change':None,
    'tol':0.0001,
    'ccp_alpha': 0.0
}

In [101]:
model1 = RandomForestRegressor(**rf_params)
model2 = AdaBoostRegressor(**ada_params)
model3 = DecisionTreeRegressor(**tree_params)
model4 = LinearRegression()
model5 = GradientBoostingRegressor(**gb_params)

In [102]:
kf = KFold(n_splits=NFOLDS, shuffle=True,random_state=SEED) 

rf_oof_train, rf_oof_test = get_oof(model1, kf, X_tr, y_train, X_te)
ada_oof_train, ada_oof_test = get_oof(model2, kf, X_tr, y_train, X_te)
tree_oof_train, tree_oof_test = get_oof(model3, kf, X_tr, y_train, X_te)
linreg_oof_train, linreg_oof_test = get_oof(model4, kf, X_tr, y_train, X_te)
gb_oof_train, gb_oof_test = get_oof(model5, kf, X_tr, y_train, X_te)

## Training advanced models

In [107]:
STORAGE = "sqlite:///vegard_optuna.sqlite"

FEATURES = ["latitude", "longitude", "district", "constructed", "area_total",
            "rooms", "ballog", "metro_distance", "park_distance",
            "square_distance", "material", "condition", "heating", "stories",
            "floor", "ceiling", "bathrooms_total", "new"]
        
CATEGORICAL_FEATURES = ["district", "material", "condition", "heating", "new"]

In [104]:
data_train2 = all_data.loc[all_data.split == 'train']
data_test2 = all_data.loc[all_data.split == 'test']

In [105]:
X_tr2 = (data_train2[FEATURES]).iloc[train_idx]
X_te2 = (data_test2[FEATURES]).iloc[test_idx]

IndexError: positional indexers are out-of-bounds

In [117]:
X_tr2 = (data_train2[FEATURES])
X_te2 = (data_test2[FEATURES])

In [118]:
cat_study = optuna.create_study(
    storage=STORAGE,
    sampler=optuna.samplers.TPESampler(seed=42),
    study_name="CatBoost-T2",
    direction="minimize",
    load_if_exists=True
)

cat_model = CatBoostRegressor(objective="RMSE", random_seed=SEED, silent=True, thread_count=1, **cat_study.best_params)

cat_oof_train, cat_oof_test = get_oof(cat_model, kf, X_tr2, y_train, X_te2)


[32m[I 2021-11-10 15:06:39,236][0m Using an existing study with name 'CatBoost-T2' instead of creating a new one.[0m


In [119]:
lgb_study = optuna.create_study(
    storage=STORAGE,
    sampler=optuna.samplers.TPESampler(seed=42),
    study_name="LightGBM-F4",
    direction="minimize",
    load_if_exists=True
)

lgb_model = lgb.LGBMRegressor(metric="rmse", n_estimators=5000, n_jobs=3, **lgb_study.best_params)

lgb_oof_train, lgb_oof_test = get_oof(lgb_model, kf, X_tr2, y_train, X_te2)

[32m[I 2021-11-10 15:07:35,544][0m Using an existing study with name 'LightGBM-F4' instead of creating a new one.[0m


In [120]:
x_train = np.concatenate((
        rf_oof_train,
        ada_oof_train,
        tree_oof_train,
        linreg_oof_train,
        gb_oof_train,
        lgb_oof_train,
        cat_oof_train
    ), axis=1)

x_test = np.concatenate((
    rf_oof_test,
    ada_oof_test,
    tree_oof_test,
    linreg_oof_test,
    gb_oof_test,
    lgb_oof_test,
    cat_oof_test
), axis=1)

In [121]:
META_MODEL = lgb.LGBMRegressor(
    num_leaves=5,
    max_depth=7, 
    random_state=SEED, 
    silent=True, 
    metric='mse',
    n_jobs=4, 
    n_estimators=200,
    colsample_bytree=1,
    subsample=0.9,
    learning_rate=0.05
)

META_MODEL.fit(x_train, y_train)

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1,
              importance_type='split', learning_rate=0.05, max_depth=7,
              metric='mse', min_child_samples=20, min_child_weight=0.001,
              min_split_gain=0.0, n_estimators=200, n_jobs=4, num_leaves=5,
              objective=None, random_state=42, reg_alpha=0.0, reg_lambda=0.0,
              silent=True, subsample=0.9, subsample_for_bin=200000,
              subsample_freq=0)

In [68]:
# XGB meta model.
param = {
        'max_depth': 3, 
        'reg_alpha': 0.0012, 'reg_lambda': 0.003, 
        'min_child_weight': 0, 'gamma': 2, 
        'learning_rate': 0.0132, 'colsample_bytree': 0.45
        }

dtrain = xgb.DMatrix(x_train, label = y_train)
META_MODEL = xgb.train(param, dtrain, num_boost_round=200)

In [70]:
dtest = xgb.DMatrix(x_test)
preds = np.exp(META_MODEL.predict(dtest)) * (data_train.iloc[test_idx]).area_total
rmlse(y_test, preds)

0.8386848920591313

In [122]:
preds = np.exp(META_MODEL.predict(x_test)) * data_test.area_total
submission = pd.DataFrame()
submission['id'] = data_test.index
submission['price_prediction'] = preds.values
submission.to_csv('submissions/stack.csv', index=False)

In [123]:
submission

Unnamed: 0,id,price_prediction
0,23285,3.021545e+07
1,23286,9.785289e+06
2,23287,6.175348e+06
3,23288,8.358377e+06
4,23289,5.332597e+06
...,...,...
9932,33217,2.835395e+07
9933,33218,1.921649e+07
9934,33219,9.002818e+06
9935,33220,8.941344e+06
