In [1]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.callbacks import TensorBoard
import tensorflow as tf
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.linear_model import Ridge, HuberRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings("ignore")

## 1.0 Functions
Here we define the functions used in this notebook.

In [12]:
## Build and compile a fully-connected autoencoder sized to the given data
def build_autoencoder(data, num_hidden_layers = 2, num_nodes = 16, act = 'relu',
                      base_nodes = None, output_act = 'linear'):
    """Build and compile a dense autoencoder whose input and output width match `data`.

    Parameters
    ----------
    data : object with a 2-D ``shape``
        Only ``data.shape[1]`` (the number of features) is read; the values
        themselves are not used here.
    num_hidden_layers : int
        Number of hidden Dense layers stacked between input and output.
    num_nodes : int
        Number of units in every hidden layer.
    act : str
        Activation used by the hidden layers.
    base_nodes : int, optional
        Alias for ``num_nodes``. Several cells in this notebook call the
        function with ``base_nodes=...``; when given, it takes precedence
        over ``num_nodes``.
    output_act : str
        Activation of the reconstruction (output) layer. Defaults to
        ``'linear'`` so the output can take any real value: a ReLU output
        cannot be negative, while the RobustScaler-scaled inputs used in this
        notebook are median-centred and therefore contain negative values.

    Returns
    -------
    A compiled ``Sequential`` model (Adam optimizer, MSE loss, MSE metric).
    """
    if base_nodes is not None:
        num_nodes = base_nodes

    n_features = data.shape[1]

    model = Sequential()
    model.add(Input(shape=(n_features,)))

    for _ in range(num_hidden_layers):
        model.add(Dense(units=num_nodes, activation=act))

    # Reconstruction layer: same width as the input.
    model.add(Dense(units=n_features, activation=output_act))

    model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mse'])

    return model

## 2.0 Data Loading and Processing

In [3]:
# Read the processed dataset; the first CSV column becomes the DataFrame index.
data = pd.read_csv(
    'input/processed_data_nyc.csv',
    index_col=0,
)
# numerical_data = pd.read_csv('input/processed_data_nyc_numerical.csv',  index_col = 0)
# categorical_data = pd.read_csv('input/processed_data_nyc_categorical.csv',  index_col = 0)
data.head()

Unnamed: 0,latitude,longitude,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,all_year_avail,low_avail,...,neighbourhood_Williamsburg,neighbourhood_Willowbrook,neighbourhood_Windsor Terrace,neighbourhood_Woodhaven,neighbourhood_Woodlawn,neighbourhood_Woodrow,neighbourhood_Woodside,room_type_Entire home/apt,room_type_Private room,room_type_Shared room
0,40.64749,-73.97237,5.010635,1,9,2762,0.21,6,True,False,...,0,0,0,0,0,0,0,0,1,0
1,40.75362,-73.98377,5.420535,1,45,2976,0.38,2,True,False,...,0,0,0,0,0,0,0,1,0,0
2,40.80902,-73.9419,5.01728,3,0,0,0.0,1,True,False,...,0,0,0,0,0,0,0,0,1,0
3,40.68514,-73.95976,4.49981,1,270,3021,4.64,1,False,False,...,0,0,0,0,0,0,0,1,0,0
4,40.79851,-73.94399,4.394449,10,9,2793,0.1,1,False,True,...,0,0,0,0,0,0,0,1,0,0


We want to divide the data into categorical and numerical features. We will attempt to get a new representation of the numerical data using the autoencoder, and then compare the new representation to the original one to see which produces better performance with the same regression models.

In [6]:
# Separate the regression target (`price`) from the features and convert both
# to numpy arrays. `data` itself is left untouched so that this cell can be
# re-run without failing on a missing `price` column.
y = np.asarray(data['price']).ravel()
X = np.asarray(data.drop(columns=['price'])).astype(np.float32)

Splitting to train and test sets:

In [7]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
)
print(
    "Training Dataset: {}".format(X_train.shape),
    "Testing Dataset: {}".format(X_test.shape),
    sep="\n",
)

Training Dataset: (39014, 239)
Testing Dataset: (9754, 239)


Scaling the data

In [8]:
# Fit the scaler on the training split only, then apply those same fitted
# statistics to the test split. Re-fitting on the test split would scale the
# two splits differently and leak test-set statistics into the evaluation.
scaler = preprocessing.RobustScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

## 3.0 Testing Autoencoders
In this section, I will test various autoencoder architectures and judge them by their reconstruction Mean-Squared Error (MSE), as well as by the performance of the three regressors (called "classifiers" in the code) with and without the autoencoder.

In [9]:
def compareClasifiers (ae_model, X_train, y_train, X_test, y_test):
    """Compare three regressors on the given features vs. the autoencoder output.

    Each regressor (random forest, ridge, Huber) is fitted and scored twice:
    once on ``X_train``/``X_test`` as passed in, and once on
    ``ae_model.predict(...)`` of those same arrays. The test-set mean squared
    error of both runs is printed and collected.

    Parameters
    ----------
    ae_model : object with a ``predict`` method
        Used to produce the alternative representation of both splits.
    X_train, y_train : training features and targets.
    X_test, y_test : test features and targets.

    Returns
    -------
    pandas.DataFrame
        Columns ``'RF'``, ``'Ridge'``, ``'Huber'``; rows ``'Before'`` (MSE on
        the given features), ``'After'`` (MSE on the autoencoder output) and
        ``'Difference'`` (Before minus After, so a positive value means the
        autoencoder representation lowered the error).
    """
    # Regressors
    randomForest_final = RandomForestRegressor(n_estimators=50)
    ridge_final = Ridge(alpha=5)
    huber_final = HuberRegressor(alpha=10, epsilon=3)

    # New representations:
    X_train2 = ae_model.predict(X_train)
    X_test2 = ae_model.predict(X_test)

    def _before_after(estimator, before_template, after_template):
        # Fit/score on the given features, then refit/score on the autoencoder output.
        estimator.fit(X_train, y_train)
        before = mean_squared_error(y_test, estimator.predict(X_test))

        estimator.fit(X_train2, y_train)
        after = mean_squared_error(y_test, estimator.predict(X_test2))

        print(before_template.format(before))
        print(after_template.format(after))
        # Each model's difference uses its own "after" score.
        return [before, after, before - after]

    df = pd.DataFrame({
        'RF': _before_after(randomForest_final,
                            "Random Forest: {}",
                            "Random Forest with Autoencoder: {}"),
        'Ridge': _before_after(ridge_final,
                               "Ridge : {}",
                               "Ridge with Autoencoder: {}"),
        'Huber': _before_after(huber_final,
                               "Huber: {}",
                               "Huber with Autoencoder: {}"),
    },
    index = ['Before','After','Difference']
    )

    return df

### 3.1 Test 1: Layers and Nodes
In the first test, I will cross-validate for different numbers of layers and nodes. I will use 30 epochs with a batch size of 256. Once an ideal architecture is chosen, I will test the other parameters.

In [13]:
# Test 1: grid over the number of hidden layers and the nodes per layer.
EPOCHS = 30
BATCH_SIZE = 256

# writer = pd.ExcelWriter('output/results_AE_test1.xlsx', engine='xlsxwriter')

NUM_LAYERS = [1,2,3,4]
NUM_BASE_NODES = [16,64,128,512,1024]

for numLayers in NUM_LAYERS:
    for numNodes in NUM_BASE_NODES:
        # Release the previous model's Keras state so memory does not grow
        # across the 20 models built by this grid.
        tf.keras.backend.clear_session()

        LOGNAME = "{}-{}-Epochs={}-TIME={}".format(numLayers, numNodes , EPOCHS, int(time.time()) )
        print(LOGNAME)
        # Log under logs/AE/test1 so that `tensorboard --logdir logs/AE/test1`
        # (the command given for this test) actually finds these runs.
        tensorboard = TensorBoard(log_dir='logs/AE/test1/{}'.format(LOGNAME))
        # `X` is only used for its feature count (shape[1]).
        ae_model = build_autoencoder(X, num_hidden_layers = numLayers, num_nodes = numNodes)
        ae_model.fit(X_train, X_train, epochs = EPOCHS, batch_size = BATCH_SIZE, callbacks = [tensorboard],
                    validation_split=0.2)

#         result = compareClasifiers (ae_model, X_train, y_train, X_test, y_test)
#         sheetName = '{}-{}'.format(numLayers, numNodes)
#         result.to_excel(writer, sheet_name = sheetName)
                    
# writer.save()

1-16-Epochs=30-TIME=1586823117
Train on 31211 samples, validate on 7803 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
1-64-Epochs=30-TIME=1586823183
Train on 31211 samples, validate on 7803 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30


Epoch 29/30
Epoch 30/30
1-128-Epochs=30-TIME=1586823261
Train on 31211 samples, validate on 7803 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
1-512-Epochs=30-TIME=1586823335
Train on 31211 samples, validate on 7803 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30


Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
1-1024-Epochs=30-TIME=1586823459
Train on 31211 samples, validate on 7803 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
2-16-Epochs=30-TIME=1586823560
Train on 31211 samples, validate on 7803 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30


Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
2-64-Epochs=30-TIME=1586823600
Train on 31211 samples, validate on 7803 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
2-128-Epochs=30-TIME=1586823643
Train on 31211 samples, validate on 7803 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30


Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
2-512-Epochs=30-TIME=1586823697
Train on 31211 samples, validate on 7803 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
2-1024-Epochs=30-TIME=1586823804
Train on 31211 samples, validate on 7803 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30


Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
3-16-Epochs=30-TIME=1586824157
Train on 31211 samples, validate on 7803 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
3-64-Epochs=30-TIME=1586824224
Train on 31211 samples, validate on 7803 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30


Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
3-128-Epochs=30-TIME=1586824315
Train on 31211 samples, validate on 7803 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
3-512-Epochs=30-TIME=1586824416
Train on 31211 samples, validate on 7803 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30


Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
3-1024-Epochs=30-TIME=1586824629
Train on 31211 samples, validate on 7803 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
4-16-Epochs=30-TIME=1586825177
Train on 31211 samples, validate on 7803 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30


Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
4-64-Epochs=30-TIME=1586825236
Train on 31211 samples, validate on 7803 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
4-128-Epochs=30-TIME=1586825321
Train on 31211 samples, validate on 7803 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30


Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
4-512-Epochs=30-TIME=1586825413
Train on 31211 samples, validate on 7803 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
4-1024-Epochs=30-TIME=1586825684
Train on 31211 samples, validate on 7803 samples
Epoch 1/30


Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


The logs of each architecture are recorded using the tensorboard callback and can be accessed by running the below shell command and accessing the provided link.

In [None]:
# !tensorboard --logdir logs/AE/test1

The results of test 1 reveal that too many layers and nodes decreased the performance of the autoencoder. Most of the models tested had similar performance when looking at the loss and mse curves on tensorboard. As such, the model was chosen based on computational efficiency and smoothness of the curves.  

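The model chosen has 2 hidden layers with a base number of 16 nodes. The number of layers here refers to the number of layers in the encoder, which matches the number of layers in the decoder due to symmetry. This means that the actual model has 4 hidden layers of 16-32-32-16 nodes. (Note: `build_autoencoder` as defined in Section 1.0 stacks `num_hidden_layers` Dense layers of `num_nodes` units each, so this symmetric description should be checked against the version of the function that was actually used for this test.)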

### 3.2 Test 2: Larger Networks
The first test of autoencoder models reveals that larger networks are necessary and that they must be trained for more epochs.

In [None]:
# Test 2: larger networks, more epochs, and two hidden-layer activations.
EPOCHS = 500
BATCH_SIZE = 512

NUM_LAYERS = [2,3,4,5,6]
NUM_BASE_NODES = [64,128,256,512,1024]
ACTS = ['tanh','relu']


for numLayers in NUM_LAYERS:
    for numNodes in NUM_BASE_NODES:
        for ac in ACTS:
            # Release the previous model's Keras state before building the next one.
            tf.keras.backend.clear_session()

            LOGNAME = "{}-{}-Epochs={}-TIME={}".format(numLayers, numNodes , EPOCHS, int(time.time()) )
            print(LOGNAME)
            tensorboard = TensorBoard(log_dir='logs/AE/{}/{}'.format(ac,LOGNAME))
            # `build_autoencoder` names this parameter `num_nodes`; `X` is only
            # used for its feature count (shape[1]).
            ae_model = build_autoencoder(X, num_hidden_layers = numLayers, num_nodes = numNodes, act = ac)
            # Train on the scaled training split, like the other tests, instead of
            # the raw unscaled `X`, which also contains the held-out test rows.
            ae_model.fit(X_train, X_train, epochs = EPOCHS, batch_size = BATCH_SIZE, callbacks = [tensorboard])

### 3.3 Test 3: Batch Sizes and Epochs
Now we will test for different batch sizes, and we will use a larger number of epochs to see if performance can improve. To save time, we can use an early stop callback which stops the training when a specific metric reaches a specific goal. In this case, we can stop the training if the MSE does not drop for 20 epochs.

In [None]:
# Stop training once the training MSE has failed to improve by at least
# `min_delta` for `patience` consecutive epochs.
earlystop_callback = tf.keras.callbacks.EarlyStopping(
    monitor='mse',
    patience=20,
    min_delta=1e-4,
)

In [None]:
## The model chosen from test 1:
NUM_LAYERS = 2
NUM_NODES = 16

BATCH_SIZES = [8,16,32,64,128,256,512,1024,2048]

EPOCHS = 500

for batchSize in BATCH_SIZES:
    # Release the previous model's Keras state before building the next one.
    tf.keras.backend.clear_session()

    LOGNAME = "Batch={}--TIME={}".format(batchSize , int(time.time()) )
    print(LOGNAME)
    tensorboard = TensorBoard(log_dir='logs/AE/test2/{}'.format(LOGNAME))
    # `build_autoencoder` names this parameter `num_nodes`; `X` is only used
    # for its feature count (shape[1]).
    ae_model = build_autoencoder(X, num_hidden_layers = NUM_LAYERS, num_nodes = NUM_NODES)
    ae_model.fit(X_train, X_train,
                 epochs = EPOCHS,
                 batch_size = batchSize,
                 callbacks = [tensorboard, earlystop_callback])

We can observe the test results by running the below shell command. These results show that a batch size of 1024 had the best performance, measured by the MSE, smoothness of the loss curve, and the training time.

In [None]:
# !tensorboard --logdir logs/AE/test2

### 3.4 Test 4: Retesting different architectures with classifiers

## 4.0 Combining Autoencoder with Classifiers
Now that we have an autoencoder with tuned parameters, we can test the effectiveness of this autoencoder by observing the MSE of different regressors with and without the autoencoder. The top three models, determined and tuned in the previous notebook, are defined below.

In [None]:
# The three regressors compared with and without the autoencoder representation.
randomForest_final, ridge_final, huber_final = (
    RandomForestRegressor(n_estimators=50),
    Ridge(alpha=5),
    HuberRegressor(alpha=10, epsilon=3),
)

Now we re-define the final autoencoder model, train it, and get the new representation of the data.

In [None]:
# The format string has a single placeholder, so pass only the timestamp.
# Passing a leftover loop variable first would put that value in the run name
# and make this cell depend on an earlier cell having been executed.
LOGNAME = "FinalAE-TIME={}".format(int(time.time()))
tensorboard = TensorBoard(log_dir='logs/final/{}'.format(LOGNAME))

NUM_LAYERS = 2
NUM_NODES = 16
BATCH_SIZE = 1024
EPOCHS = 500

# `build_autoencoder` names this parameter `num_nodes`; `X` is only used for
# its feature count (shape[1]).
ae_model = build_autoencoder(X, num_hidden_layers = NUM_LAYERS, num_nodes = NUM_NODES)
ae_model.fit(X_train, X_train,
             epochs = EPOCHS,
             batch_size = BATCH_SIZE,
             callbacks = [tensorboard, earlystop_callback])

In [None]:
#New representations:
X_train2 = ae_model.predict(X_train)
X_test2 = ae_model.predict(X_test)

In [None]:
# Inspect the autoencoder output for the test split.
print(X_test2)

In [None]:
# For each regressor: fit and score on the scaled features, then refit and
# score on the autoencoder output, and print both test-set MSE values.

# Random Forest
rf_y = randomForest_final.fit(X_train, y_train).predict(X_test)
rf_result = mean_squared_error(y_test, rf_y)

rf_y2 = randomForest_final.fit(X_train2, y_train).predict(X_test2)
rf_result2 = mean_squared_error(y_test, rf_y2)

print(
    "Random Forest: {}".format(rf_result),
    "Random Forest with Autoencoder: {}".format(rf_result2),
    sep="\n",
)


# Ridge
ridge_y = ridge_final.fit(X_train, y_train).predict(X_test)
ridge_result = mean_squared_error(y_test, ridge_y)

ridge_y2 = ridge_final.fit(X_train2, y_train).predict(X_test2)
ridge_result2 = mean_squared_error(y_test, ridge_y2)

print(
    "Ridge : {}".format(ridge_result),
    "Ridge with Autoencoder: {}".format(ridge_result2),
    sep="\n",
)

# Huber
huber_y = huber_final.fit(X_train, y_train).predict(X_test)
huber_result = mean_squared_error(y_test, huber_y)

huber_y2 = huber_final.fit(X_train2, y_train).predict(X_test2)
huber_result2 = mean_squared_error(y_test, huber_y2)

print(
    "Huber: {}".format(huber_result),
    "Huber with Autoencoder: {}".format(huber_result2),
    sep="\n",
)