# Bagging and stacking

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
import xgboost
# Specific tf libraries
def root_mean_squared_log_error(y_true, y_pred):
    """Return the root mean squared logarithmic error (RMSLE) of a prediction.

    Computes ``sqrt(mean((log1p(y_pred) - log1p(y_true)) ** 2))``, pairing the
    two inputs element by element (by position).

    Parameters
    ----------
    y_true : array-like
        Ground-truth values; every entry must be non-negative.
    y_pred : array-like
        Predicted values; every entry must be non-negative.

    Returns
    -------
    numpy.floating
        The RMSLE over all entries.

    Raises
    ------
    AssertionError
        If either input contains a value that is not ``>= 0``.
    """
    # Alternatively: sklearn.metrics.mean_squared_log_error(y_true, y_pred) ** 0.5
    # Coerce up front so plain lists/tuples work with the vectorised checks
    # below and so two pandas Series are compared row by row, not by index label.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    assert (y_true >= 0).all()
    assert (y_pred >= 0).all()
    log_error = np.log1p(y_pred) - np.log1p(y_true)  # Note: log1p(x) = log(1 + x)
    return np.mean(log_error ** 2) ** 0.5


In [2]:
pd.options.mode.chained_assignment = None
%run ../common_utils.py

In [3]:
# Kaggle RMSLE score of every model, keyed by model name.
d = {
    'LaureRF': 0.20015,
    'RF2': 0.34266,
    'Deep': 0.23278,
    'GB': 0.19968,
    'CB1': 0.23450,
    'XGB1': 0.23787,
    'KNN1': 0.35042,
}

# One row per model, single column 'RMSLE'.
acc = pd.Series(d, name='RMSLE').to_frame()
acc

Unnamed: 0,RMSLE
LaureRF,0.20015
RF2,0.34266
Deep,0.23278
GB,0.19968
CB1,0.2345
XGB1,0.23787
KNN1,0.35042


In [4]:
# Directory holding one saved prediction CSV per model.
PREDICTIONS_DIR = "../henrik-testsite/ensemble_predictions/csvs"


def load_predictions(model_name):
    """Read ``<PREDICTIONS_DIR>/<model_name>.csv`` and return it as a DataFrame."""
    return pd.read_csv(f"{PREDICTIONS_DIR}/{model_name}.csv")


LaureRF = load_predictions("LaureRF")
RF2 = load_predictions("RF2")
Deep = load_predictions("Deep")
GB = load_predictions("GB")
CB1 = load_predictions("CB1")
XGB1 = load_predictions("XGB1")
KNN1 = load_predictions("KNN1")

In [5]:
# Order every prediction frame by its "id" column.
# Presumably this is what lets the per-model arrays be combined row by row later on.
LaureRF, RF2, Deep, GB, CB1, XGB1, KNN1 = (
    frame.sort_values(by="id")
    for frame in (LaureRF, RF2, Deep, GB, CB1, XGB1, KNN1)
)

In [6]:
# Pull the "price_prediction" column of every model out as a NumPy array.
# (`.T` leaves a one-dimensional array unchanged.)
(
    LaureRF_prediction,
    RF2_prediction,
    Deep_prediction,
    GB_prediction,
    CB1_prediction,
    XGB1_prediction,
    KNN1_prediction,
) = (
    frame["price_prediction"].to_numpy().T
    for frame in (LaureRF, RF2, Deep, GB, CB1, XGB1, KNN1)
)

In [7]:
# Weighted average of all models' predictions, where a model's weight is
# 1 / RMSLE ** WEIGHT_POWER (a lower error gives a larger weight).
# Predictions are keyed by model name and the weights are looked up by that
# same name, so the pairing cannot drift if either ordering changes; a name
# missing from `acc` raises a KeyError instead of silently mis-weighting.
WEIGHT_POWER = 4

predictions_by_model = {
    'LaureRF': LaureRF_prediction,
    'RF2': RF2_prediction,
    'Deep': Deep_prediction,
    'GB': GB_prediction,
    'CB1': CB1_prediction,
    'XGB1': XGB1_prediction,
    'KNN1': KNN1_prediction,
}

model_weights = (
    1 / acc.loc[list(predictions_by_model), 'RMSLE'] ** WEIGHT_POWER
).to_numpy()

avg_prediction = np.average(
    list(predictions_by_model.values()),
    weights=model_weights,
    axis=0
)
print(avg_prediction)

[28507345.00363846  9414770.61022208  6555240.28133837 ...
 10478451.89613688  9934283.66581429  7785634.31308344]


In [8]:
# result = avg_prediction
# submission = pd.DataFrame()
# submission['id'] = LaureRF['id']
# submission['price_prediction'] = result
# if len(submission['id']) != 9937:
#     raise Exception("Not enough rows submitted!")
# submission.to_csv('bagging_2', index=False)

# # kaggle_scores = [0.23450, 0.20502, 0.23278, 0.19968, 0.35042, 0.20015, 0.20159, 0.23787]
# # csv_paths = ["ensemble_predictions/csvs/CB1.csv", "ensemble_predictions/csvs/deep_king_5_5.csv", "ensemble_predictions/csvs/deep.csv", "ensemble_predictions/csvs/GB.csv",
# #              "ensemble_predictions/csvs/KNN1.csv", "ensemble_predictions/csvs/LaureRF.csv", "ensemble_predictions/csvs/xgb_king_2.csv", "ensemble_predictions/csvs/XGB1.csv"]
# # submission_path = "ensemble_predictions/bagging_2"

# # csv_bagging(kaggle_scores, csv_paths, submission_path)

## Stacking

In [9]:
# Load the three objects every modelling cell below starts from.
# NOTE(review): load_all_data is not defined in this notebook; presumably it comes from
# ../common_utils.py (executed with %run near the top) and returns the training frame,
# the test frame and some metadata - confirm.
train, test, metaData = load_all_data()

Deep learning

In [13]:
# Base model 1: neural network. Produces ann_oof_train / ann_oof_test for the stacking cells.
# Mostly the ones correlated to price.
features = ["building_id",
            "area_total", "area_kitchen", "area_living", "floor", "stories", "rooms",
            "bathrooms_shared", "balconies", "latitude", "longitude", "constructed",  # Numerical
            "district", "material", "parking"]  # Categorical

numerical_features = ["area_total", "area_kitchen", "area_living",
                      "floor", "stories", "rooms", "bathrooms_shared", "balconies",
                      "latitude", "longitude", "constructed"]

cat_features = ["district", "material", "parking"]

droptable = ['longitude', 'latitude', 'area_kitchen', 'area_living', 'floor', 'stories'] # Not dropping theta!

# add_R / add_rel_height are passed as real booleans, like add_spacious here and like
# every other call to pre_process_numerical in this notebook (they used to be the
# strings "True", which only behave the same under a plain truthiness check).
train_labels, train_targets, test_labels = pre_process_numerical(
    features, numerical_features, train, test, outliers_value=7, val_data=False, val_split=0.1, random_state=42, scaler="minMax",
    add_R=True, add_rel_height=True, add_spacious=True, droptable=droptable,
    one_hot_encode=True, cat_features=cat_features, drop_old=True)

# Positional settings consumed by get_oof_ann (the meaning of each slot is defined by
# that helper, not visible here).
# NOTE(review): tf.keras.metrics.Accuracy counts exact matches between prediction and
# label, which is unlikely to be informative for a continuous price target - confirm
# whether it is wanted.
model_params = ([18, 12, 6], tf.nn.relu,
                [False, False, False], 0.2, tf.keras.optimizers.Adam,
                rmsle_custom, ['mse', 'msle', tf.keras.metrics.Accuracy()], True)

# NB! We cannot just compile ann model beforehand, will just keep training on same model.
ann_oof_train, ann_oof_test, hists = get_oof_ann(model_params, train_labels, train_targets, test_labels)

# ADVISOR PERFORMANCE
# Last recorded validation loss of every returned training history.
for histories in hists:
    hist = pd.DataFrame(histories.history)
    hist['epoch'] = histories.epoch
    print(hist["val_loss"].tail(1))
# Share of non-negative predictions (the stacking cells take np.log of these values).
print(np.sum(ann_oof_train >= 0) / len(ann_oof_train))
print(np.sum(ann_oof_test >= 0) / len(ann_oof_test))

Hot encoding
minMax

..................................

xgboost

In [None]:
# Base model 2: XGBoost on log(price). Produces xgb_oof_train / xgb_oof_test.
numerical_features = [
    "area_total", "area_kitchen", "area_living", "floor", "ceiling", "stories", "rooms",
    "bathrooms_private", "bathrooms_shared", "balconies", "loggias", "phones",
    "latitude", "longitude", "constructed",
]

cat_features = ["layout", "condition", "district", "material", "parking", "heating", "seller"]

bool_features = [
    "windows_court", "windows_street", "new",
    "elevator_without", "elevator_passenger", "elevator_service", "garbage_chute",
]

# ALL, and building ID! (building_id is for grouping; the string columns
# "street" and "address" are left out.)
features = ["building_id"] + numerical_features + cat_features + bool_features

droptable = ['longitude', 'latitude']

train_labels, train_targets, test_labels = pre_process_numerical(
    features, numerical_features, train, test,
    outliers_value=7, val_data=False, val_split=0.1, random_state=42, scaler="minMax",
    add_R=True, add_rel_height=True, add_spacious=True, droptable=droptable,
    one_hot_encode=False,  # FALSE!
    cat_features=cat_features, drop_old=True)

xgb_params = dict(
    max_depth=5,
    min_child_weight=12,
    gamma=0.15,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=1.1,
    reg_lambda=0.3,
    learning_rate=0.01,
    n_estimators=10000,
)
xgb_model = xgboost.XGBRegressor(**xgb_params)

# The model is fitted against log(price), so the returned predictions are on the log scale.
log_targets = np.log(train_targets)
xgb_oof_train, xgb_oof_test, scores = get_oof_xgboost(xgb_model, train_labels, log_targets, test_labels)
print(scores)

minMax


NameError: name 'xgboost' is not defined

lgbm

In [10]:
# Base model 3: LightGBM on log(price). Produces lgbm_oof_train / lgbm_oof_test.
features = [
    "building_id",  # For grouping
    # Numerical
    "area_total", "area_kitchen", "area_living", "floor", "ceiling", "stories", "rooms",
    "bathrooms_private", "bathrooms_shared", "balconies", "loggias", "phones",
    "latitude", "longitude", "constructed",
    # Categorical
    "layout", "condition", "district", "material", "parking", "heating", "seller",
    # Bool
    "windows_court", "windows_street", "new",
    "elevator_without", "elevator_passenger", "elevator_service", "garbage_chute",
    # Strings "street" and "address" are left out.
]

float_numerical_features = ["area_total", "area_kitchen", "area_living", "ceiling",
                            "latitude", "longitude", "constructed"]
# Ordinal categories
int_numerical_features = ["floor", "stories", "rooms", "bathrooms_private",
                          "bathrooms_shared", "balconies", "loggias", "phones"]

cat_features = ["layout", "condition", "district", "material", "parking", "heating", "seller"]

# NOTE(review): this list is cat_features without "layout" - confirm that is intended.
one_hot_columns = ["condition", "district", "material", "parking", "heating", "seller"]

droptable = []

# Load -> clean -> engineer features -> normalise -> one-hot encode -> drop.
train, test, metaData = load_all_data()
train_labels, train_targets, test_labels = clean_data(
    train, test, features, float_numerical_features, int_numerical_features, cat_features,
    log_targets=False, log_area=True, fillNan=True)
train_labels, test_labels, added_features = feature_engineering(
    train_labels, test_labels, float_numerical_features, int_numerical_features, cat_features)
train_labels, test_labels = normalize(
    train_labels, test_labels, float_numerical_features, scaler="minMax")
train_labels, test_labels = one_hot_encoder(
    train_labels, test_labels, one_hot_columns, drop_old=True)
train_labels = train_labels.drop(columns=droptable)
test_labels = test_labels.drop(columns=droptable)

# NOTE(review): LightGBM documents n_estimators as an alias of num_iterations, so the two
# values below compete; which one takes effect may depend on the LightGBM version - confirm.
lgbm_params = dict(
    num_iterations=10000,
    n_estimators=152,
    learning_rate=0.05,
    num_leaves=40,
    max_depth=10,
    min_data_in_leaf=20,
    bagging_fraction=0.9,
    bagging_freq=5,
    feature_fraction=0.8,
    random_state=1,
    early_stopping_round=100,
    silent=True,
    metric='regression',
    num_threads=4,
)
lgbm_model = lightgbm.LGBMRegressor(**lgbm_params)

# Targets were cleaned with log_targets=False, so the log is taken here.
log_targets = np.log(train_targets)
lgbm_oof_train, lgbm_oof_test, scores = get_oof_lgbm(lgbm_model, train_labels, log_targets, test_labels)
# Advisor performance
print(scores)

KeyboardInterrupt: 

RF

In [None]:
# Base model 4: random forest. Produces rf_oof_train / rf_oof_test.
features = ["building_id",
            "area_total", "latitude", "longitude", "floor", "stories", # Numerical
            "district", 'condition' # Categorical
           ]

numerical = [] # No need to scale for RF! https://stackoverflow.com/questions/8961586/do-i-need-to-normalize-or-scale-data-for-randomforest-r-package
cat_features = []
droptable = []

# Pass this cell's own (empty) `numerical` list. `numerical_features` is not defined in
# this cell, so using it would pick up whatever an earlier cell left in the kernel
# (including columns that are not part of `features` here).
train_labels, train_targets, test_labels = pre_process_numerical(features, numerical, train, test,
                    outliers_value=7, val_data=False, val_split=0.1, random_state=42, scaler="none",
                    add_R=False, add_rel_height=False, add_spacious=False, droptable=droptable,
                    one_hot_encode=False, cat_features=cat_features, drop_old=False)

rf_oof_train, rf_oof_test, scores = get_oof_rf(train_labels, train_targets, test_labels) # Use log() for RF?

print(scores)
print("==>\t",np.average(scores))

KNN

In [None]:
# Base model 5: k-nearest neighbours. Produces knn_oof_train / knn_oof_test.
features = ["building_id", # For Grouping
            "area_total", "ceiling", "floor", "bathrooms_shared", "balconies", "stories", "rooms","latitude","longitude", # Numerical
            "windows_street", "elevator_without", # Bool
            "parking", "heating", "district", "condition"  # Categorical
           ]

numerical_features = ["area_total", "ceiling", "floor", "bathrooms_shared", "balconies", "stories", "rooms"]
categorical_features = ["parking", "heating", "district", "condition"]
droptable=["latitude","longitude"]

# Hand over the categorical columns declared in this cell. `cat_features` is not defined
# here, so using it would pick up a list left behind by an earlier cell and the one-hot
# encoding would not cover the columns listed above.
train_labels, train_targets, test_labels = pre_process_numerical(features, numerical_features, train, test,
                    outliers_value=7, val_data=False, val_split=0.1, random_state=42, scaler="minMax",
                    add_R=True, add_rel_height=True, add_spacious=True, droptable=droptable,
                    one_hot_encode=True, cat_features=categorical_features, drop_old=False)

knn_oof_train, knn_oof_test, scores = get_oof_knn(train_labels, train_targets, test_labels)

print(scores)
print("==>\t", np.average(scores))

xgb meta-model (stacked on the base-model predictions)

In [None]:
# Meta-model (XGBoost): trained on the base models' out-of-fold predictions.
# ALL, and building ID!
features =           ["building_id", # For grouping
                      "area_total", "area_kitchen", "area_living", "floor", "ceiling", "stories", "rooms", "bathrooms_private", "bathrooms_shared", "balconies","loggias", "phones", "latitude", "longitude", "constructed", # Numerical
                     "layout", "condition", "district", "material", "parking", "heating", "seller", #Categorical
                      "windows_court", "windows_street", "new", "elevator_without", "elevator_passenger", "elevator_service", "garbage_chute"] # Bool
                     #"street", "address"] #String

numerical_features = ["area_total", "area_kitchen", "area_living", "floor",
                      "ceiling", "stories", "rooms", "bathrooms_private", "bathrooms_shared", "balconies","loggias", "phones", "latitude", "longitude", "constructed"]

cat_features = ["layout", "condition", "district", "material", "parking", "heating", "seller"]

droptable = ['longitude', 'latitude']

train_labels, train_targets, test_labels = pre_process_numerical(features, numerical_features, train, test,
                    outliers_value=7, val_data=False, val_split=0.1, random_state=42, scaler="minMax",
                    add_R=True, add_rel_height=True, add_spacious=True, droptable=droptable,
                    one_hot_encode=True, cat_features=cat_features, drop_old=True)

xgb_king = xgboost.XGBRegressor(max_depth=5, min_child_weight=12, gamma=0.15, subsample=0.8, colsample_bytree=0.8, reg_alpha=1.1, reg_lambda=0.3, learning_rate=0.01, n_estimators=10000)

# Meta-features, all on the log(price) scale:
#  - xgb and lgbm were fitted on np.log(train_targets), so their predictions are used as is;
#  - ann, knn and rf were fitted on train_targets, so their predictions are logged here.
# NB! ann predicts some negative values! Maybe RELU on the output?
# (np.log of a negative value is NaN.)
train_labels["xgb_preds"] = xgb_oof_train
train_labels["lgbm_preds"] = lgbm_oof_train
train_labels["ann_preds"] = np.log(ann_oof_train)
train_labels["knn_preds"] = np.log(knn_oof_train)
train_labels["rf_preds"] = np.log(rf_oof_train)

# Each test column must hold the same model's predictions as the train column of the
# same name (xgb -> xgb_preds, lgbm -> lgbm_preds); otherwise the meta-model is scored
# on features it was not trained on.
test_labels["xgb_preds"] = xgb_oof_test
test_labels["lgbm_preds"] = lgbm_oof_test
test_labels["ann_preds"] = np.log(ann_oof_test)
test_labels["knn_preds"] = np.log(knn_oof_test)
test_labels["rf_preds"] = np.log(rf_oof_test)

# Drop all except the good stuff?
# Keeps the first column and the five prediction columns appended above.
train_labels.drop(train_labels.columns.to_list()[1:-5], inplace=True, axis=1)
test_labels.drop(test_labels.columns.to_list()[1:-5], inplace=True, axis=1)

scores, avg, best_model, best_index = XGB_groupKFold(5, xgb_king, train_labels, np.log(train_targets),
    eval_metric='rmse')

# Kings performance
print(scores)
print("==>", np.average(scores))

ANN meta-model (stacked on the base-model predictions)

In [None]:
# Meta-model (neural network): trained on the base models' out-of-fold predictions.
# Mostly the ones correlated to price.
features =           ["building_id",
                      "area_total", "area_kitchen", "area_living", "floor", "stories", "rooms", "bathrooms_shared", "balconies", "latitude", "longitude", "constructed", # Numerical
                    "district", "material", "parking"]

numerical_features = ["area_total", "area_kitchen", "area_living",
                      "floor", "stories", "rooms", "bathrooms_shared", "balconies", "latitude", "longitude", "constructed"]

cat_features = ["district", "material", "parking"]

droptable = ['longitude', 'latitude', 'area_kitchen', 'area_living', 'floor', 'stories'] # Not dropping theta!

# add_R / add_rel_height are real booleans, matching add_spacious and the other
# pre_process_numerical calls in this notebook (the strings "True" only behave the
# same under a plain truthiness check).
train_labels, train_targets, test_labels = pre_process_numerical(
    features, numerical_features, train, test, outliers_value=7, val_data=False, val_split=0.1, random_state=42, scaler="minMax",
    add_R=True, add_rel_height=True,  add_spacious=True, droptable=droptable,
    one_hot_encode=True, cat_features=cat_features, drop_old=True)

# Meta-features, all on the price scale:
#  - xgb and lgbm were fitted on np.log(train_targets), so their predictions are exponentiated;
#  - ann, knn and rf were fitted on train_targets and are used as is.
train_labels["xgb_preds"] = np.exp(xgb_oof_train)
train_labels["lgbm_preds"] = np.exp(lgbm_oof_train)
train_labels["ann_preds"] = ann_oof_train
train_labels["knn_preds"] = knn_oof_train
train_labels["rf_preds"] = rf_oof_train

# Each test column must hold the same model's predictions as the train column of the
# same name (xgb -> xgb_preds, lgbm -> lgbm_preds).
test_labels["xgb_preds"] = np.exp(xgb_oof_test)
test_labels["lgbm_preds"] = np.exp(lgbm_oof_test)
test_labels["ann_preds"] = ann_oof_test
test_labels["knn_preds"] = knn_oof_test
test_labels["rf_preds"] = rf_oof_test

# Drop all except the good stuff?
# Keeps the first column and the five prediction columns appended above.
train_labels.drop(train_labels.columns.to_list()[1:-5], inplace=True, axis=1)
test_labels.drop(test_labels.columns.to_list()[1:-5], inplace=True, axis=1)

## TODO! Use a deeper net, with regularization maybe :D
# Positional settings consumed by ANN_groupKFold (the meaning of each slot is defined
# by that helper, not visible here).
model_params = ([18, 18, 18], tf.nn.leaky_relu,
                     [False, False, False], 0.2, 'adam',
                      rmsle_custom, ['mse', 'msle', tf.keras.metrics.Accuracy()], True)

ann_scores, models, best_model, hists = ANN_groupKFold(5, model_params, train_labels, train_targets)

print(ann_scores)
print("==>\t", np.average(ann_scores))

gradientboost

In [None]:
# Gradient boosting: prepares the data and builds (but does not fit) the regressor.
all_numerical_features = [
    "area_total", "area_kitchen", "area_living", "floor", "ceiling", "stories", "rooms",
    "bathrooms_private", "bathrooms_shared", "balconies", "loggias", "phones",
    "latitude", "longitude", "constructed",
]

float_numerical_features = ["area_total", "area_kitchen", "area_living", "ceiling",
                            "latitude", "longitude", "constructed"]
# Ordinal categories
int_numerical_features = ["floor", "stories", "rooms", "bathrooms_private",
                          "bathrooms_shared", "balconies", "loggias", "phones"]

# All are non-ordinal
cat_features = ["layout", "condition", "district", "material", "parking", "heating", "seller"]

bool_features = [
    "windows_court", "windows_street", "new",
    "elevator_without", "elevator_passenger", "elevator_service", "garbage_chute",
]

# Define the features (this is all): the grouping key followed by every numerical,
# categorical and boolean column. The string columns "street" and "address" are left out.
features = ["building_id"] + all_numerical_features + cat_features + bool_features

one_hot_columns = ["condition", "district", "material", "parking", "heating", "seller"]

droptable = []

# Load -> clean -> engineer features -> normalise -> one-hot encode -> drop.
train, test, metaData = load_all_data()
train_labels, train_targets, test_labels = clean_data(
    train, test, features, float_numerical_features, int_numerical_features, cat_features,
    log_targets=False, log_area=True, fillNan=True)
train_labels, test_labels, added_features = feature_engineering(
    train_labels, test_labels, float_numerical_features, int_numerical_features, cat_features)
train_labels, test_labels = normalize(
    train_labels, test_labels, float_numerical_features, scaler="minMax")
train_labels, test_labels = one_hot_encoder(
    train_labels, test_labels, one_hot_columns, drop_old=True)
train_labels = train_labels.drop(columns=droptable)
test_labels = test_labels.drop(columns=droptable)

# Ten times the tuned number of trees at a tenth of the original learning rate.
# NOTE(review): GradientBoostingRegressor, the optimal_* values, original_learning_rate
# and random_state are not defined anywhere in the visible notebook - confirm where
# they are expected to come from.
gb_params = dict(
    n_estimators=optimal_n_estimators*10,
    max_depth=optimal_max_depth,
    min_samples_split=optimal_min_samples_split,
    min_samples_leaf=optimal_min_samples_leaf,
    max_features=optimal_max_features,
    subsample=optimal_subsample,
    learning_rate=original_learning_rate / 10,
    loss='squared_error',
    criterion='squared_error',
    verbose=0,
    warm_start=False,
    random_state=random_state,
)
model_no_cv = GradientBoostingRegressor(**gb_params)

stacking

In [None]:
# Deliberate stop: raising here halts "Run All" so the cells below are not executed.
raise RuntimeError("Intentional stop: the cells below are not meant to run automatically.")

In [None]:
# Final stacking step: join every base model's predictions column-wise and fit a
# LightGBM meta-model on them.
# NOTE(review): the dt_/ada_/gb_/cat_ arrays, `lgb`, `SEED` and `y_train` are not
# defined anywhere in the visible notebook - confirm where they come from.
base_train_preds = [
    dt_oof_train,
    rf_oof_train,
    ada_oof_train,
    gb_oof_train,
    lgbm_oof_train,
    cat_oof_train,
]
base_test_preds = [
    dt_oof_test,
    rf_oof_test,
    ada_oof_test,
    gb_oof_test,
    lgbm_oof_test,
    cat_oof_test,
]

# One column block per base model; axis=1 requires the inputs to be 2-D.
x_train = np.concatenate(base_train_preds, axis=1)
x_test = np.concatenate(base_test_preds, axis=1)

meta_params = dict(
    num_leaves=5,
    max_depth=7,
    random_state=SEED,
    silent=True,
    metric='mse',
    n_jobs=4,
    n_estimators=200,
    colsample_bytree=1,
    subsample=0.9,
    learning_rate=0.05,
)
META_MODEL = lgb.LGBMRegressor(**meta_params)

META_MODEL.fit(x_train, y_train)
final_predictions = META_MODEL.predict(x_test)