## 1 - K-fold validation
#### Sources
- https://www.machinecurve.com/index.php/2020/02/18/how-to-use-k-fold-cross-validation-with-keras/
- https://medium.com/the-owl/k-fold-cross-validation-in-keras-3ec4a3a00538

A method to validate model more accurately than just using one set of validation data.
Also: stacking the k-fold models could lead to better accuracy, as the result is a model that has seen and trained on all the data.

## 2 - Simple average stacking
Use the average of the predictions over the models, instead of using just one.
NB!
- sklearn.model_selection.StratifiedGroupKFold! Apartments from different buildings should not be in different datasets (val, train)
- Also - stratified: makes y data is equally represented in different datasets (train, val): can pass log(y) or something into the StratifiedGroupKFold, and then extract price from X or something.
- Just do bagging! Average/weighted average.
- Stacking! Wohoo. Probably smart to have a deep learning model in the stack.

In [2]:
import pandas as pd
import time
import random
import math
import matplotlib.pyplot as plt
import tensorflow as tf
import seaborn as sns
import numpy as np
from scipy import stats
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
# Specific tf libraries
import tensorflow.keras as keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Dropout

In [3]:
pd.options.mode.chained_assignment = None

In [4]:
%run ../common_utils.py

In [44]:
tls = []
tts = []
vls = []
vts = []
tstl = []
train, test, metaData = load_all_data()


# Features 1 "All features"
features =           ["area_total", "area_kitchen", "area_living", "floor", "ceiling", "stories", "rooms", "bathrooms_private", "bathrooms_shared", "balconies","loggias", "phones", "latitude", "longitude", "constructed", # Numerical
                     "layout", "condition", "district", "material", "parking", "heating", #Categorical
                      "windows_court", "windows_street", "new", "elevator_without", "elevator_passenger", "elevator_service", "garbage_chute"] # Bool
                     #"street", "address"] #String

numerical_features = ["area_total", "area_kitchen", "area_living",
                      "floor", "stories", "rooms", "bathrooms_shared", "balconies", "latitude", "longitude", "constructed", "r", "rel_height"]

cat_features = ["layout", "condition", "district", "material", "parking", "heating"]

droptable = ['longitude', 'latitude', 'floor', 'stories'] # Not dropping theta!

train_labels, train_targets, val_labels, val_targets, test_labels = pre_process_numerical(
        features, numerical_features, train, test, outliers_value=7, val_split=0.1, random_state=42, scaler="std",
        add_R="True", add_rel_height="True", droptable=droptable,
        one_hot_encode=True, cat_features=cat_features, drop_old=True)

tls.append(train_labels)
tts.append(train_targets)
vls.append(val_labels)
vts.append(val_targets)
tstl.append(test_labels)

# Features 2 "best features"
features =           ["area_total", "area_kitchen", "area_living", "floor", "stories", "rooms", "bathrooms_shared", "balconies", "latitude", "longitude", "constructed", # Numerical
                    "district", "material", "parking"] 

numerical_features = ["area_total", "area_kitchen", "area_living",
                      "floor", "stories", "rooms", "bathrooms_shared", "balconies", "latitude", "longitude", "constructed", "r", "rel_height"]

cat_features = ["district", "material", "parking"]

droptable = ['longitude', 'latitude', 'area_kitchen', 'area_living', 'floor', 'stories'] # Not dropping theta!

# Data pre-processing
train_labels, train_targets, val_labels, val_targets, test_labels = pre_process_numerical(
        features, numerical_features, train, test, outliers_value=7, val_split=0.1, random_state=42, scaler="minMax",
        add_R="True", add_rel_height="True", droptable=droptable,
        one_hot_encode=False, cat_features=cat_features, drop_old=False)

tls.append(train_labels)
tts.append(train_targets)
vls.append(val_labels)
vts.append(val_targets)
tstl.append(test_labels)



In [45]:
# Creater 5 different models to stack
# https://www.tensorflow.org/guide/keras/save_and_serialize#registering_the_custom_object

model1 = create_ANN_model()
model2 = create_ANN_model()
#model3 = create_ANN_model()
#model4 = create_ANN_model()
#model5 = create_ANN_model()
models = [model1, model2] #, model3, model4, model5]

losses = []
epochs = [10000, 10000, 1000, 1000, 2000]
for i, model in enumerate(models):
    train_labels = tls[i]
    train_targets = tts[i]
    val_labels = vls[i]
    val_targets = vts[i]
    test_labels = tstl[i]
    e = epochs[i]
    early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', verbose=1, mode='min', patience=40)
    history = model.fit(x=train_labels, y=train_targets.values,
              validation_data=(val_labels,val_targets.values),
              verbose=0, epochs=e, callbacks=[early_stop, PrintDot()]
              )
    
    hist = pd.DataFrame(history.history)
    hist['epoch'] = history.epoch
    losses.append(hist['val_loss'].tail(1))
    model.save('models/some_model_'+str(i))
    predict_and_store(model, test_labels, test, path='advanced_tests/stacking_'+str(i))


....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
..........................................................................................

In [46]:
print(losses)

[1642    0.200561
Name: val_loss, dtype: float64, 2372    0.239887
Name: val_loss, dtype: float64]


### BAGGING

In [29]:
acc = pd.DataFrame(
    {'LaureRF': 0.20015,
     'Deep': 0.23278,
     'GB': 0.19968,
     'CB1': 0.23450,
     'XGB1': 0.23787,
     'KNN1' : 0.35042},
    index=[0]
)
acc = acc.T
acc.columns = ['RMSLE']
acc

Unnamed: 0,RMSLE
LaureRF,0.20015
Deep,0.23278
GB,0.19968
CB1,0.2345
XGB1,0.23787
KNN1,0.35042


In [30]:
LaureRF = pd.read_csv("ensemble_predictions/LaureRF.csv")
Deep = pd.read_csv("ensemble_predictions/Deep.csv")
GB = pd.read_csv("ensemble_predictions/GB.csv")
CB1 = pd.read_csv("ensemble_predictions/CB1.csv")
XGB1 = pd.read_csv("ensemble_predictions/XGB1.csv")
KNN1 = pd.read_csv("ensemble_predictions/KNN1.csv")

In [31]:
LaureRF = LaureRF.sort_values(by="id")
Deep = Deep.sort_values(by="id")
GB = GB.sort_values(by="id")
CB1 = CB1.sort_values(by="id")
XGB1 = XGB1.sort_values(by="id")
KNN1 = KNN1.sort_values(by="id")

In [32]:
LaureRF_prediction = LaureRF["price_prediction"].to_numpy().T
Deep_prediction = Deep["price_prediction"].to_numpy().T
GB_prediction = GB["price_prediction"].to_numpy().T
CB1_prediction = CB1["price_prediction"].to_numpy().T
XGB1_prediction = XGB1["price_prediction"].to_numpy().T
KNN1_prediction = KNN1["price_prediction"].to_numpy().T

In [33]:
avg_prediction = np.average(
    [LaureRF_prediction,
     Deep_prediction,
     GB_prediction,
     CB1_prediction,
     XGB1_prediction,
     KNN1_prediction
    ],
    weights = 1 / acc['RMSLE'] ** 4,
    axis=0
)

In [35]:
result = avg_prediction
submission = pd.DataFrame()
submission['id'] = LaureRF['id']
submission['price_prediction'] = result
if len(submission['id']) != 9937:
    raise Exception("Not enough rows submitted!")
submission.to_csv('BESTSUBMISSIONEVER', index=False)