# Bagging and stacking

In [16]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
# Specific tf libraries
def root_mean_squared_log_error(y_true, y_pred):
    # Alternatively: sklearn.metrics.mean_squared_log_error(y_true, y_pred) ** 0.5
    assert (y_true >= 0).all() 
    assert (y_pred >= 0).all()
    log_error = np.log1p(y_pred) - np.log1p(y_true)  # Note: log1p(x) = log(1 + x)
    return np.mean(log_error ** 2) ** 0.5


In [17]:
pd.options.mode.chained_assignment = None
%run ../common_utils.py

In [18]:
# All their kaggle scores
d = {'LaureRF': 0.20015,
     'RF2': 0.34266,
     'Deep': 0.23278,
     'GB': 0.19968,
     'CB1': 0.23450,
     'XGB1': 0.23787,
     'KNN1' : 0.35042}

acc = pd.DataFrame(
    d,
    index=[0]
)
acc = acc.T
acc.columns = ['RMSLE']
acc

Unnamed: 0,RMSLE
LaureRF,0.20015
RF2,0.34266
Deep,0.23278
GB,0.19968
CB1,0.2345
XGB1,0.23787
KNN1,0.35042


In [19]:
LaureRF = pd.read_csv("../henrik-testsite/ensemble_predictions/csvs/LaureRF.csv")
RF2 = pd.read_csv("../henrik-testsite/ensemble_predictions/csvs/RF2.csv")
Deep = pd.read_csv("../henrik-testsite/ensemble_predictions/csvs/Deep.csv")
GB = pd.read_csv("../henrik-testsite/ensemble_predictions/csvs/GB.csv")
CB1 = pd.read_csv("../henrik-testsite/ensemble_predictions/csvs/CB1.csv")
XGB1 = pd.read_csv("../henrik-testsite/ensemble_predictions/csvs/XGB1.csv")
KNN1 = pd.read_csv("../henrik-testsite/ensemble_predictions/csvs/KNN1.csv")

In [20]:
LaureRF = LaureRF.sort_values(by="id")
RF2 = RF2.sort_values(by="id")
Deep = Deep.sort_values(by="id")
GB = GB.sort_values(by="id")
CB1 = CB1.sort_values(by="id")
XGB1 = XGB1.sort_values(by="id")
KNN1 = KNN1.sort_values(by="id")

In [21]:
LaureRF_prediction = LaureRF["price_prediction"].to_numpy().T
RF2_prediction = RF2["price_prediction"].to_numpy().T
Deep_prediction = Deep["price_prediction"].to_numpy().T
GB_prediction = GB["price_prediction"].to_numpy().T
CB1_prediction = CB1["price_prediction"].to_numpy().T
XGB1_prediction = XGB1["price_prediction"].to_numpy().T
KNN1_prediction = KNN1["price_prediction"].to_numpy().T

In [22]:
avg_prediction = np.average(
    [LaureRF_prediction,
     RF2_prediction,
     Deep_prediction,
     GB_prediction,
     CB1_prediction,
     XGB1_prediction,
     KNN1_prediction
    ],
    weights = 1 / acc['RMSLE'] ** 4,
    axis=0
)
print(avg_prediction)

ValueError: Length of weights not compatible with specified axis.

In [15]:
result = avg_prediction
submission = pd.DataFrame()
submission['id'] = LaureRF['id']
submission['price_prediction'] = result
if len(submission['id']) != 9937:
    raise Exception("Not enough rows submitted!")
submission.to_csv('BESTSUBMISSIONEVER', index=False)

# kaggle_scores = [0.23450, 0.20502, 0.23278, 0.19968, 0.35042, 0.20015, 0.20159, 0.23787]
# csv_paths = ["ensemble_predictions/csvs/CB1.csv", "ensemble_predictions/csvs/deep_king_5_5.csv", "ensemble_predictions/csvs/deep.csv", "ensemble_predictions/csvs/GB.csv",
#              "ensemble_predictions/csvs/KNN1.csv", "ensemble_predictions/csvs/LaureRF.csv", "ensemble_predictions/csvs/xgb_king_2.csv", "ensemble_predictions/csvs/XGB1.csv"]
# submission_path = "ensemble_predictions/bagging_2"

# csv_bagging(kaggle_scores, csv_paths, submission_path)

## Stacking

In [None]:
train, test, metaData = load_all_data()

In [None]:
# Mostly the ones correlated to price.
features =           ["building_id",
                      "area_total", "area_kitchen", "area_living", "floor", "stories", "rooms", "bathrooms_shared", "balconies", "latitude", "longitude", "constructed", # Numerical
                    "district", "material", "parking"] 

numerical_features = ["area_total", "area_kitchen", "area_living",
                      "floor", "stories", "rooms", "bathrooms_shared", "balconies", "latitude", "longitude", "constructed"]

cat_features = ["district", "material", "parking"]

droptable = ['longitude', 'latitude', 'area_kitchen', 'area_living', 'floor', 'stories'] # Not dropping theta!

train_labels, train_targets, test_labels = pre_process_numerical(
    features, numerical_features, train, test, outliers_value=7, val_data=False, val_split=0.1, random_state=42, scaler="minMax",
    add_R="True", add_rel_height="True",  add_spacious=True, droptable=droptable,
    one_hot_encode=True, cat_features=cat_features, drop_old=True)