In [1]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Input, Dropout
from tensorflow.keras.callbacks import TensorBoard
import tensorflow as tf
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.linear_model import Ridge, HuberRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

import warnings
warnings.filterwarnings("ignore")

In [2]:
from lib import models

## Data

In [3]:
# Load the preprocessed NYC listings table; the first CSV column is used as the row index.
data = pd.read_csv("input/processed_data_nyc.csv", index_col=0)
data.head()

Unnamed: 0,latitude,longitude,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,all_year_avail,low_avail,...,neighbourhood_Williamsburg,neighbourhood_Willowbrook,neighbourhood_Windsor Terrace,neighbourhood_Woodhaven,neighbourhood_Woodlawn,neighbourhood_Woodrow,neighbourhood_Woodside,room_type_Entire home/apt,room_type_Private room,room_type_Shared room
0,40.64749,-73.97237,5.010635,1,9,2762,0.21,6,True,False,...,0,0,0,0,0,0,0,0,1,0
1,40.75362,-73.98377,5.420535,1,45,2976,0.38,2,True,False,...,0,0,0,0,0,0,0,1,0,0
2,40.80902,-73.9419,5.01728,3,0,0,0.0,1,True,False,...,0,0,0,0,0,0,0,0,1,0
3,40.68514,-73.95976,4.49981,1,270,3021,4.64,1,False,False,...,0,0,0,0,0,0,0,1,0,0
4,40.79851,-73.94399,4.394449,10,9,2793,0.1,1,False,True,...,0,0,0,0,0,0,0,1,0,0


In [4]:
# Full matrix with every column of `data`, price included ("wp" = with price).
X_wp = np.asarray(data).astype(np.float32)

# Regression target: the price column flattened to a 1-D array.
y = np.asarray(data.price).ravel()

# Feature matrix: the same table with the price column removed.
data = data.drop(columns=['price'])
X = np.asarray(data).astype(np.float32)

In [5]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the shape of each split as a quick sanity check.
print("Training Dataset: {}".format(X_train.shape))
print("Testing Dataset: {}".format(X_test.shape))

Training Dataset: (39014, 239)
Testing Dataset: (9754, 239)


In [6]:
# Fit the RobustScaler on the training split only, then apply that same
# train-derived centering/scaling to the test split. Re-fitting on the test
# split (fit_transform) would scale the two splits with different statistics
# and leak information about the test data into preprocessing.
scaler = preprocessing.RobustScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [21]:
# Inspect a single row of X_wp, the float32 matrix built before the price column was dropped.
X_wp[1]

array([ 4.075362e+01, -7.398377e+01,  5.420535e+00,  1.000000e+00,
        4.500000e+01,  2.976000e+03,  3.800000e-01,  2.000000e+00,
        1.000000e+00,  0.000000e+00,  0.000000e+00,  0.000000e+00,
        0.000000e+00,  1.000000e+00,  0.000000e+00,  0.000000e+00,
        0.000000e+00,  0.000000e+00,  0.000000e+00,  0.000000e+00,
        0.000000e+00,  0.000000e+00,  0.000000e+00,  0.000000e+00,
        0.000000e+00,  0.000000e+00,  0.000000e+00,  0.000000e+00,
        0.000000e+00,  0.000000e+00,  0.000000e+00,  0.000000e+00,
        0.000000e+00,  0.000000e+00,  0.000000e+00,  0.000000e+00,
        0.000000e+00,  0.000000e+00,  0.000000e+00,  0.000000e+00,
        0.000000e+00,  0.000000e+00,  0.000000e+00,  0.000000e+00,
        0.000000e+00,  0.000000e+00,  0.000000e+00,  0.000000e+00,
        0.000000e+00,  0.000000e+00,  0.000000e+00,  0.000000e+00,
        0.000000e+00,  0.000000e+00,  0.000000e+00,  0.000000e+00,
        0.000000e+00,  0.000000e+00,  0.000000e+00,  0.000000e

## Generating Data with GAN

We build the Generative Adversarial Network (GAN) using the GAN class in 'models.py'.

In [7]:
# Instantiate the GAN class from the project-local `lib.models` module with its default arguments.
gan = models.GAN()

The GAN model consists of a generator and a discriminator, which work together to generate and check the generated data respectively.

In [8]:
# Print the layer-by-layer summary (output shapes and parameter counts) of the generator.
gan.generator.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 256)               61696     
_________________________________________________________________
dense_1 (Dense)              (None, 240)               61680     
Total params: 123,376
Trainable params: 123,376
Non-trainable params: 0
_________________________________________________________________


In [9]:
# Print the layer-by-layer summary (output shapes and parameter counts) of the discriminator.
gan.discriminator.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_2 (Dense)              (None, 512)               123392    
_________________________________________________________________
dense_3 (Dense)              (None, 512)               262656    
_________________________________________________________________
dense_4 (Dense)              (None, 256)               131328    
_________________________________________________________________
dense_5 (Dense)              (None, 128)               32896     
_________________________________________________________________
dense_6 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 65        
Total params: 558,593
Trainable params: 558,593
Non-trainable params: 0
________________________________________________

We generate data using the train function. The first input is the real data, the second is the number of training epochs, and the third is the number of samples to generate.

In [20]:
# Positional arguments: the real samples (first 50*256 = 12,800 rows of X_wp),
# the number of training epochs, and the number of samples to generate.
generated_data = gan.train(X_wp[:50*256], 200, 20)
# NOTE: the number of real samples passed in presumably has to be divisible by
# the GAN's batch size -- confirm against gan.train in lib/models.

Training on 12736 samples divided into 199 batches
Validating on 20 samples
Time for epoch 1 is 12.620837926864624 sec
train mse = 24469.501953125     val mse = 25044.880859375
Time for epoch 2 is 10.527913570404053 sec
train mse = 25019.94921875     val mse = 25595.42578125
Time for epoch 3 is 11.256608009338379 sec
train mse = 25759.826171875     val mse = 26364.1015625
Time for epoch 4 is 11.480194807052612 sec
train mse = 26223.978515625     val mse = 26843.962890625
Time for epoch 5 is 9.153471946716309 sec
train mse = 26772.287109375     val mse = 27443.05078125
Time for epoch 6 is 7.003467798233032 sec
train mse = 26998.8125     val mse = 27696.908203125
Time for epoch 7 is 6.75046443939209 sec
train mse = 26494.01953125     val mse = 27161.0390625
Time for epoch 8 is 6.613415956497192 sec
train mse = 27004.7265625     val mse = 27665.740234375
Time for epoch 9 is 10.578461408615112 sec
train mse = 27655.583984375     val mse = 28303.021484375
Time for epoch 10 is 13.44317412376

KeyboardInterrupt: 

In [11]:
# Print the samples returned by gan.train.
print(generated_data)

[[8.825003  0.        0.        ... 0.        0.        0.       ]
 [4.6397886 0.        0.        ... 0.        0.        0.       ]
 [2.5917861 0.        0.        ... 0.        0.        0.       ]
 ...
 [2.7451532 0.        0.        ... 0.        0.        0.       ]
 [4.427025  0.        0.        ... 0.        0.        0.       ]
 [2.1830513 0.        0.        ... 0.        0.        0.       ]]


In [12]:
# Print the dimensions of the generated samples.
print(generated_data.shape)

(20, 240)


In [13]:
# Mean squared error between rows 60-79 of X_wp (real data, price included)
# and the generated samples; both arrays must have the same shape.
mse = mean_squared_error(y_true=X_wp[60:80, :], y_pred=generated_data)
print(mse)

30296.693


## Ensemble Architecture

The ensemble architecture will include all three regressor models (Huber, random forest, and ridge regressors) and a neural network, each trained on the original processed data as well as on the new representation of the data retrieved from the autoencoder. The outputs of all 8 models (4 with and 4 without the autoencoder) will be combined in a fusion layer, which will be used as training data for the final neural network.

In [14]:
# EPOCHS = 500
# BATCH_SIZE = 128

# # Models
# sae = models.build_autoencoder(X_train, num_hidden_layers = 4, num_nodes = 32, act = 'relu')
# rf = RandomForestRegressor(n_estimators=50)
# ridge = Ridge(alpha=5)
# huber = HuberRegressor(alpha=10, epsilon=3)
# ann_inner = models.buildNN (data, num_hidden_layers = 3, hidden_nodes = 128, act = 'relu', do = 0, regularizer = True,
#                 loss_function = 'mean_squared_error')



# earlystop_callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
#                                                       min_delta=0.0001, patience=10,
#                                                       verbose=0, mode='auto')

# # Get new representations
# sae.fit(X_train, X_train, epochs = EPOCHS,  batch_size = BATCH_SIZE, validation_split = 0.2,
#         callbacks = [earlystop_callback])
# X_train_new = sae.predict(X_train)
# X_test_new = sae.predict(X_test)

# # Models on original representation
# rf.fit(X_train, y_train)
# ridge.fit(X_train, y_train)
# huber.fit(X_train, y_train)
# ann_inner.fit(X_train, y_train, epochs = EPOCHS, batch_size = BATCH_SIZE,
#             validation_split = 0.2)

# # Get original model outputs
# rf_out = rf.predict(X_train)
# ridge_out = ridge.predict(X_train)
# huber_out = huber.predict(X_train)
# ann_inner_out = ann_inner.predict(X_train)

# rf_out_test = rf.predict(X_test)
# ridge_out_test = ridge.predict(X_test)
# huber_out_test = huber.predict(X_test)
# ann_inner_out_test = ann_inner.predict(X_test)

# # Models on new representation
# rf.fit(X_train_new, y_train)
# ridge.fit(X_train_new, y_train)
# huber.fit(X_train_new, y_train)
# ann_inner.fit(X_train_new, y_train, epochs = EPOCHS, batch_size = BATCH_SIZE,
#             validation_split = 0.2, callbacks = [earlystop_callback])

# # Get new representation model outputs
# rf_out2 = rf.predict(X_train_new)
# ridge_out2 = ridge.predict(X_train_new)
# huber_out2 = huber.predict(X_train_new)
# ann_inner_out2 = ann_inner.predict(X_train_new)

# rf_out_test2 = rf.predict(X_test_new)
# ridge_out_test2 = ridge.predict(X_test_new)
# huber_out_test2 = huber.predict(X_test_new)
# ann_inner_out_test2 = ann_inner.predict(X_test_new)

# # Reshape
# rf_out = rf_out.reshape(X_train.shape[0], 1)
# ridge_out = ridge_out.reshape(X_train.shape[0], 1)
# huber_out = huber_out.reshape(X_train.shape[0], 1)
# rf_out2 = rf_out2.reshape(X_train.shape[0], 1)
# ridge_out2 = ridge_out2.reshape(X_train.shape[0], 1)
# huber_out2 = huber_out2.reshape(X_train.shape[0], 1)

# rf_out_test = rf_out_test.reshape(X_test.shape[0], 1)
# ridge_out_test = ridge_out_test.reshape(X_test.shape[0], 1)
# huber_out_test = huber_out_test.reshape(X_test.shape[0], 1)
# rf_out_test2 = rf_out_test2.reshape(X_test.shape[0], 1)
# ridge_out_test2 = ridge_out_test2.reshape(X_test.shape[0], 1)
# huber_out_test2 = huber_out_test2.reshape(X_test.shape[0], 1)


# # Creating fusion vectors
# fused_train = (rf_out, ridge_out, huber_out, ann_inner_out, rf_out2, ridge_out2, huber_out2, ann_inner_out2)
# fused_train = np.concatenate(fused_train, axis=1)
# fused_test = (rf_out_test, ridge_out_test, huber_out_test, ann_inner_out_test,
#               rf_out_test2, ridge_out_test2, huber_out_test2, ann_inner_out_test2)
# fused_test = np.concatenate(fused_test, axis=1)

# # Outer Neural Net
# # ann_outer = buildNN (fused_train, num_hidden_layers = 3, hidden_nodes = 256, act = 'relu', do = 0, regularizer = True,
# #                 loss_function = 'mean_absolute_error')

# # ann_outer.fit(fused_train, y_train, epochs = EPOCHS, batch_size = BATCH_SIZE,
# #             validation_split = 0.2, callbacks = [earlystop_callback])
# model_outer = RandomForestRegressor(n_estimators=50)
# model_outer.fit(fused_train, y_train)

# print("Training Complete")

In [15]:
# model_outer = RandomForestRegressor(n_estimators=50)
# model_outer.fit(fused_train, y_train)

In [16]:
# mse, r2, mae = evaluateModel(model_outer, fused_test, y_test)
# print('MSE = {}'.format(mse))
# print('R2 = {}'.format(r2))
# print("MAE = {}".format(mae))

In [17]:
# a1 = np.arange(10).reshape(10,1)
# a2 = np.arange(10).reshape(10,1)

In [18]:
# a3 = np.concatenate((a1, a2), axis = 1)

In [19]:
# X_train.shape[0]