In [2]:
import pandas as pd
import time
import random
import math
import matplotlib.pyplot as plt
import tensorflow as tf
import seaborn as sns
import numpy as np
from scipy import stats
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
# Specific tf libraries
import tensorflow.keras as keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Dropout

In [3]:
# Turn off pandas' chained-assignment check (the source of SettingWithCopyWarning)
# for the rest of the notebook session.
pd.set_option("mode.chained_assignment", None)

In [4]:
# Run the shared helper script and make the names it defines available in this notebook.
# Later cells call load_all_data, pre_process_numerical, rmsle_custom, PrintDot and GroupKFold,
# none of which are defined or imported in the visible cells -- presumably they come from
# this script (TODO confirm).
%run ../common_utils.py

## 1 - BAGGING
Use the average of the predictions of several models instead of relying on a single one (the cell below weights each model by its Kaggle score).
- Stacking! Woohoo. It is probably smart to have a deep learning model in the stack.

In [5]:
# Kaggle score (RMSLE) of every model that takes part in the ensemble.
d = {
    'LaureRF': 0.20015,
    'Deep': 0.23278,
    'GB': 0.19968,
    'CB1': 0.23450,
    'XGB1': 0.23787,
    'KNN1': 0.35042,
}

# One row per model, with a single 'RMSLE' column.
acc = pd.Series(d).to_frame(name='RMSLE')
acc

Unnamed: 0,RMSLE
LaureRF,0.20015
Deep,0.23278
GB,0.19968
CB1,0.2345
XGB1,0.23787
KNN1,0.35042


In [6]:
# Load each model's saved prediction file, in the same order as the score table.
LaureRF, Deep, GB, CB1, XGB1, KNN1 = (
    pd.read_csv(csv_path)
    for csv_path in (
        "ensemble_predictions/LaureRF.csv",
        "ensemble_predictions/Deep.csv",
        "ensemble_predictions/GB.csv",
        "ensemble_predictions/CB1.csv",
        "ensemble_predictions/XGB1.csv",
        "ensemble_predictions/KNN1.csv",
    )
)

In [7]:
# Order every prediction frame by "id" so that rows can later be combined by position.
LaureRF, Deep, GB, CB1, XGB1, KNN1 = (
    frame.sort_values(by="id")
    for frame in (LaureRF, Deep, GB, CB1, XGB1, KNN1)
)

In [8]:
# Pull the "price_prediction" column of every model out as a NumPy array.
(
    LaureRF_prediction,
    Deep_prediction,
    GB_prediction,
    CB1_prediction,
    XGB1_prediction,
    KNN1_prediction,
) = (
    frame["price_prediction"].to_numpy().T
    for frame in (LaureRF, Deep, GB, CB1, XGB1, KNN1)
)

In [9]:
# Map each model name (the index labels of `acc`) to its prediction array, so a
# weight can never be paired with the wrong model if either ordering changes.
model_predictions = {
    'LaureRF': LaureRF_prediction,
    'Deep': Deep_prediction,
    'GB': GB_prediction,
    'CB1': CB1_prediction,
    'XGB1': XGB1_prediction,
    'KNN1': KNN1_prediction,
}

# Weight = 1 / RMSLE**4, looked up by model name; a smaller RMSLE gives a larger weight.
model_weights = (1 / acc.loc[list(model_predictions), 'RMSLE'] ** 4).to_numpy()

# Weighted average across models (axis 0), giving one blended value per row.
avg_prediction = np.average(
    list(model_predictions.values()),
    weights=model_weights,
    axis=0
)

In [35]:
# Row count the submission file must have; any other count (too few OR too many) is rejected.
EXPECTED_SUBMISSION_ROWS = 9937

result = avg_prediction
submission = pd.DataFrame()
# Ids come from the LaureRF frame; `result` is assigned by position next to them.
submission['id'] = LaureRF['id']
submission['price_prediction'] = result
if len(submission) != EXPECTED_SUBMISSION_ROWS:
    # ValueError is a subclass of Exception, so existing `except Exception` handling still works.
    raise ValueError(
        f"Submission has {len(submission)} rows, expected {EXPECTED_SUBMISSION_ROWS}."
    )
# NOTE(review): the output file name has no ".csv" extension; kept unchanged.
submission.to_csv('BESTSUBMISSIONEVER', index=False)

## 2 - Stacking
A more sophisticated way of ensembling the models.
Analogy: a king listening to his advisors and using their advice to make decisions.

In [None]:
def get_oof_xgboost(clf, x_train, y_train, x_test):
    """
    Build out-of-fold (OOF) predictions for one base model of the stack.

    The training data is split into NFOLDS group-wise folds (grouped by
    "building_id"). For every fold the model is fitted on the remaining folds
    and predicts the held-out fold, so each training row gets a prediction from
    a fit that did not include it. That column of predictions is later used as
    a feature by the meta-model. The test set is predicted once per fold and the
    predictions are averaged across folds.

    Relies on NFOLDS and GroupKFold being available in the notebook namespace.

    Keyword arguments:
    clf -- model exposing fit(X, y) and predict(X)
    x_train -- all training features, including a "building_id" column
    y_train -- corresponding targets, indexable with .iloc
    x_test -- all test features

    Returns:
    (oof_train, oof_test) -- arrays of shape (n_train, 1) and (n_test, 1)
    """
    ntrain = x_train.shape[0]
    ntest = x_test.shape[0]
    groups = x_train["building_id"]

    # "building_id" is only the grouping key; the model is never fitted on it.
    train_features = x_train.drop(["building_id"], axis=1)
    # The test features must match what the model was fitted on, so drop the
    # key there too if it is present (no-op otherwise).
    if isinstance(x_test, pd.DataFrame):
        x_test = x_test.drop(columns=["building_id"], errors="ignore")

    oof_train = np.zeros((ntrain,))
    oof_test_skf = np.empty((NFOLDS, ntest))

    gkf = GroupKFold(n_splits=NFOLDS)

    # enumerate() supplies the fold number used to fill one row of oof_test_skf
    # per fold (the original indexed with an undefined name `i`).
    for fold, (train_index, test_index) in enumerate(gkf.split(x_train, y_train, groups)):
        x_tr, x_te = train_features.iloc[train_index], train_features.iloc[test_index]
        y_tr = y_train.iloc[train_index]

        clf.fit(x_tr, y_tr)

        # ravel() tolerates models that return predictions shaped (n, 1).
        oof_train[test_index] = np.ravel(clf.predict(x_te))
        oof_test_skf[fold, :] = np.ravel(clf.predict(x_test))

    oof_test = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

In [46]:
def get_oof_ann(clf, x_train, y_train, x_test):
    """
    Build out-of-fold (OOF) predictions for a Keras-style neural network.

    The training data is split into NFOLDS group-wise folds (grouped by
    "building_id"). For every fold the network is fitted on the remaining folds,
    with the held-out fold as validation data for early stopping, and then
    predicts the held-out fold. The resulting column of predictions is later
    used as a feature by the meta-model. The test set is predicted once per
    fold and the predictions are averaged across folds.

    Relies on NFOLDS, GroupKFold and PrintDot being available in the notebook
    namespace.

    NOTE(review): the same `clf` instance is fitted in every fold and its
    weights are not re-initialised in between, so later folds appear to start
    from weights already trained on rows they are then asked to predict --
    confirm whether this is intended.

    Keyword arguments:
    clf -- compiled model exposing fit(...) and predict(...)
    x_train -- all training features, including a "building_id" column
    y_train -- corresponding targets, indexable with .iloc
    x_test -- all test features

    Returns:
    (oof_train, oof_test) -- arrays of shape (n_train, 1) and (n_test, 1)
    """
    ntrain = x_train.shape[0]
    ntest = x_test.shape[0]
    groups = x_train["building_id"]

    # "building_id" is only the grouping key; the network is never fitted on it.
    train_features = x_train.drop(["building_id"], axis=1)
    # The test features must match what the network was fitted on, so drop the
    # key there too if it is present (no-op otherwise).
    if isinstance(x_test, pd.DataFrame):
        x_test = x_test.drop(columns=["building_id"], errors="ignore")

    oof_train = np.zeros((ntrain,))
    oof_test_skf = np.empty((NFOLDS, ntest))

    gkf = GroupKFold(n_splits=NFOLDS)
    early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', verbose=1, mode='min', patience=40)

    # enumerate() supplies the fold number used to fill one row of oof_test_skf
    # per fold (the original indexed with an undefined name `i`).
    for fold, (train_index, test_index) in enumerate(gkf.split(x_train, y_train, groups)):
        x_tr, x_te = train_features.iloc[train_index], train_features.iloc[test_index]
        y_tr, y_te = y_train.iloc[train_index], y_train.iloc[test_index]

        clf.fit(x=x_tr, y=y_tr,
                validation_data=(x_te, y_te),
                verbose=0, epochs=1000, callbacks=[early_stop, PrintDot()]
                )

        # A network with a single output unit predicts shape (n, 1); flatten it
        # so it can be written into the 1-D buffers.
        oof_train[test_index] = np.ravel(clf.predict(x_te))
        oof_test_skf[fold, :] = np.ravel(clf.predict(x_test))

    oof_test = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

In [38]:
# Load the data sets through the shared helper.
train, test, metaData = load_all_data()

# Seed for reproducibility, and the number of folds for out-of-fold prediction.
SEED, NFOLDS = 42, 5

### Deep learning advisor

In [40]:
# Mostly the ones correlated to price.
numerical_features = ["area_total", "area_kitchen", "area_living",
                      "floor", "stories", "rooms", "bathrooms_shared", "balconies", "latitude", "longitude", "constructed"]

cat_features = ["district", "material", "parking"]

# Full input column list, derived from the two lists above so they cannot drift apart.
# "building_id" comes first: get_oof_ann uses it as the fold grouping key and drops it before fitting.
features = ["building_id"] + numerical_features + cat_features

droptable = ['longitude', 'latitude', 'area_kitchen', 'area_living', 'floor', 'stories'] # Not dropping theta!

# NOTE(review): add_R and add_rel_height are passed as the strings "True", not booleans.
# Left unchanged because pre_process_numerical is defined outside this notebook -- confirm what it expects.
train_labels, train_targets, test_labels = pre_process_numerical(
    features, numerical_features, train, test, outliers_value=7, val_data=False, val_split=0.1, random_state=42, scaler="minMax",
    add_R="True", add_rel_height="True", droptable=droptable,
    one_hot_encode=True, cat_features=cat_features, drop_old=True)

# Model: three ReLU hidden layers (18 -> 12 -> 6 units) and one linear output unit.
# Dropout(0.2) layers after the first two hidden layers are currently disabled.
ann_model = tf.keras.Sequential()
ann_model.add(Dense(18, activation=tf.nn.relu))
ann_model.add(Dense(12, activation=tf.nn.relu))
ann_model.add(Dense(6, activation=tf.nn.relu))
ann_model.add(Dense(1)) #Output

# Trained on the custom RMSLE loss; mse and msle are tracked as metrics only.
# The Accuracy metric was removed: it counts exact matches between prediction and
# target, which says nothing useful about a continuous regression target.
ann_model.compile(optimizer='adam',
              loss=rmsle_custom, #'msle', 'rmse', RMSLETF, rmsle_custom
              metrics=['mse', 'msle'])

Hot encoding
minMax


In [47]:
# Out-of-fold predictions of the neural network for the training and test sets.
ann_oof_train, ann_oof_test = get_oof_ann(
    clf=ann_model,
    x_train=train_labels,
    y_train=train_targets,
    x_test=test_labels,
)


....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
.......................................................................................

KeyboardInterrupt: 

### XGBoost advisor