In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from tensorflow.keras.utils import set_random_seed
set_random_seed(1)
plt.rcParams["figure.figsize"] = (12,7)
sns.set_style("darkgrid")

In [2]:
df = pd.read_csv("data/space_embedding_data.csv", index_col=[0])
df.head()

Unnamed: 0,Grid,PlaceCount,Price 1,Price 2,Price 3,Accommodation,Bar,Cafe,Cultural,Education,...,Health_rating,Other_rating,Outdoors_rating,Restaurant_rating,Retail_rating,Service_rating,Wholesale_rating,PositiveReviews,NegativeReviews,ReviewLength
0,L0,8,0,1,0,0.0,0.0,0.0,0.0,0.0,...,0.0,4.02,0.0,3.942857,0.0,0.0,0.0,0.696,0.304,0.536595
1,L1,10,0,1,0,0.0,0.0,1.0,0.0,0.0,...,5.0,0.0,0.0,4.777778,0.0,4.0,0.0,0.851852,0.148148,0.802238
2,L10,3,0,0,0,0.0,0.0,0.0,0.0,1.0,...,0.0,4.2,0.0,0.0,0.0,0.0,0.0,0.857143,0.142857,0.999095
3,L100,74,2,9,6,12.0,6.0,6.0,0.0,1.0,...,3.6,4.02446,0.0,3.711172,4.5,3.028846,0.0,0.732975,0.267025,0.895548
4,L101,163,7,31,5,21.0,19.0,20.0,1.0,1.0,...,4.5,4.223132,0.0,3.748855,3.549446,4.311111,3.722222,0.801535,0.198465,0.843739


In [3]:
y = df[[i for i in df.columns if "_rating" not in i]].drop(["Grid"], axis=1)
X = pd.get_dummies(df.Grid).values

In [4]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
y_tf = sc.fit_transform(y)

# Space Embeddings model

## Continuous features

In [5]:
from keras.models import Model
from keras.layers import Dense, Embedding, Flatten
from keras import Input
from tensorflow.keras.optimizers import Adam
import keras_tuner as kt
# https://www.tensorflow.org/tutorials/keras/keras_tuner
def model_builder_continuous(hp):
    input_neurons = X.shape[0]

    # Model architecture
    inp = Input(shape=(input_neurons,))
    hp_embedding = hp.Int("units", min_value=5, max_value=15, step=1)
    emb = Embedding(input_dim=input_neurons, output_dim=hp_embedding, input_length=input_neurons)(inp)
    flat = Flatten()(emb)

    # Output layer
    dict_losses = {}
    output_acts = []
    for idx in range(len(y.columns)):
        output_act = Dense(1, activation="linear", use_bias=True, name="output_layer_cont" + str(idx))(flat)
        output_acts.append(output_act)
        dict_losses['output_layer_cont' + str(idx)] = 'mean_squared_error'

    # Model compiling
    model = Model(inputs=inp, outputs=output_acts)
    hp_learning_rate = hp.Choice('learning_rate', values=[3e-4, 0.001, 0.01, 0.1])
    opt = Adam(learning_rate=hp_learning_rate)
    model.compile(loss=dict_losses, optimizer=opt)
    return model


In [6]:
from keras.callbacks import EarlyStopping
objective = kt.Objective("loss", direction="min")
tuner = kt.Hyperband(model_builder_continuous,
                    objective=objective,
                    max_epochs=5,
                    directory="hyperparameter_tuning",
                    project_name="continuous_trials",
                    overwrite=True
)

stop_early = EarlyStopping(monitor="loss", patience=5)

tuner.search(X, y_tf, epochs=5, callbacks=[stop_early])
best_hps = tuner.get_best_hyperparameters()[0]

Trial 10 Complete [00h 00m 09s]
loss: 21.022119522094727

Best loss So Far: 19.898082733154297
Total elapsed time: 00h 01m 29s
INFO:tensorflow:Oracle triggered exit


In [56]:
# Rebuild model with best hyperparameters
model = tuner.hypermodel.build(best_hps)

# train model for 50 epochs
history = model.fit(X, y_tf, epochs=50, verbose=0)
train_loss_per_epoch = history.history['loss']

In [61]:
# Find best epoch in terms of train loss
best_epoch = train_loss_per_epoch.index(min(train_loss_per_epoch)) + 1

# Retrain the model with optimal epochs from above
hypermodel = tuner.hypermodel.build(best_hps)
hypermodel.fit(X, y_tf, epochs=best_epoch, verbose=0)

# Save model
hypermodel.save("models/continuous_model")

<keras.callbacks.History at 0x1e85cf349a0>

## Discrete/categorical features

In [107]:
# Defining a function for uniform discretizing of the features
# Can only handle features with less than 20% zero values. 
# Minimum number of bins are set to 3 and the maximum number of bins are 10

def uniform_Discretization(array):
    minspan = []
    
    # max number og bins is 10 (7+3)
    max_bins = 7
    #if input contains zeros, adjust max number of bins
    if np.any(array==0):
        max_bins = np.int32(np.floor(1/(len(np.where(array==0)[0])/len(array))))-2

    # find optimal number of bins with least variation in binsize
    # min number og bins = 3
    for i in range(max_bins-3):
        Value_count = pd.qcut(array,i+3).value_counts()
        minspan.append(max(Value_count)-min(Value_count))

    bins = np.argmin(minspan)+3
    intervals = np.unique(pd.qcut(array,bins))
    
    # print number of bins and bin intervals
    print(array.name,'- Bins: ', bins,'--',' '.join(str(x) for x in intervals))
    
    # uniform discretized bin 
    uniform_vals = pd.qcut(array,bins, labels=False)
    
    return uniform_vals


In [108]:
# Transforming the dataset to uniform discretized values, for features with less than 20% zeros
y_discrete = pd.DataFrame()
for i in range(len(y.columns)-1):
    columns = y.columns
    
    if len(np.where(y[columns[i+1]]==0)[0])/len(y[columns[i+1]])<0.2:
        y_discrete[columns[i+1]] = uniform_Discretization(y[columns[i+1]])
    else:
        0
    #data[columns[i+]] = uniform_Discretization(df[columns[i+1]])

Bar - Bins:  3 -- (-0.001, 2.0] (2.0, 6.0] (6.0, 133.0]
Other - Bins:  3 -- (-0.001, 2.0] (2.0, 6.0] (6.0, 91.0]
Restaurant - Bins:  3 -- (-0.001, 3.0] (3.0, 13.0] (13.0, 263.0]
ReviewCount - Bins:  5 -- (3.999, 40.8] (40.8, 99.2] (99.2, 225.4] (225.4, 638.2] (638.2, 8699.0]
Rating - Bins:  3 -- (1.5550000000000002, 3.941] (3.941, 4.13] (4.13, 5.0]
PositiveReviews - Bins:  4 -- (0.311, 0.792] (0.792, 0.833] (0.833, 0.864] (0.864, 1.0]
NegativeReviews - Bins:  4 -- (-0.001, 0.136] (0.136, 0.167] (0.167, 0.208] (0.208, 0.688]
ReviewLength - Bins:  3 -- (0.0665, 0.899] (0.899, 1.099] (1.099, 2.218]


In [None]:
def model_builder_continuous(hp):
    input_neurons = X.shape[0]

    # Model architecture
    inp = Input(shape=(input_neurons,))
    hp_embedding = hp.Int("units", min_value=5, max_value=15, step=1)
    emb = Embedding(input_dim=input_neurons, output_dim=hp_embedding, input_length=input_neurons)(inp)
    flat = Flatten()(emb)

    # Output layer
    dict_losses = {}
    output_acts = []
    for idx in range(len(y.columns)):
        output_act = Dense(1, activation="softmax", use_bias=True, name="output_layer_cont" + str(idx))(flat)
        output_acts.append(output_act)
        dict_losses['output_layer_cont' + str(idx)] = 'categorical_crossentropy'

    # Model compiling
    model = Model(inputs=inp, outputs=output_acts)
    hp_learning_rate = hp.Choice('learning_rate', values=[3e-4, 0.001, 0.01, 0.1])
    opt = Adam(learning_rate=hp_learning_rate)
    model.compile(loss=dict_losses, optimizer=opt)
    return model

In [None]:
objective = kt.Objective("loss", direction="min")
tuner = kt.Hyperband(model_builder_continuous,
                    objective=objective,
                    max_epochs=5,
                    directory="hyperparameter_tuning",
                    project_name="discrete_trials",
                    overwrite=True
)

stop_early = EarlyStopping(monitor="loss", patience=5)

tuner.search(X, y_tf, epochs=5, callbacks=[stop_early])
best_hps = tuner.get_best_hyperparameters()[0]

Trial 10 Complete [00h 00m 09s]
loss: 21.022119522094727

Best loss So Far: 19.898082733154297
Total elapsed time: 00h 01m 29s
INFO:tensorflow:Oracle triggered exit


In [None]:
# Rebuild model with best hyperparameters
model = tuner.hypermodel.build(best_hps)

# train model for 50 epochs
history = model.fit(X, y_tf, epochs=50, verbose=0)
train_loss_per_epoch = history.history['loss']

In [None]:
# Find best epoch in terms of train loss
best_epoch = train_loss_per_epoch.index(min(train_loss_per_epoch)) + 1

# Retrain the model with optimal epochs from above
hypermodel = tuner.hypermodel.build(best_hps)
hypermodel.fit(X, y_tf, epochs=best_epoch, verbose=0)

# Save model
hypermodel.save("models/discrete_model")

<keras.callbacks.History at 0x1e85cf349a0>

## Model comparison (Continuous vs Discrete/Categorical)

In [None]:
from keras.utils.vis_utils import plot_model
#plot_model(hypermodel, to_file="models/model_plot.png", show_shapes=True, show_layer_names=True)

You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model/model_to_dot to work.


In [None]:
for idx, (key, value) in enumerate(history.history.items()):
    if key != "loss":
        plt.plot(history.epoch, value, label=y.columns[idx-1])
plt.ylabel("Loss")
plt.xlabel("Epochs");
plt.legend();

In [None]:
from sklearn.metrics import mean_squared_error
continuous_model = []
y_pred = model.predict(X)
y_pred = np.array(y_pred).T.squeeze()
for i in range(y_pred.shape[1]):
    mse_feature = mean_squared_error(y_tf[:, i], y_pred[:, i])
    print(y.columns[i], "MSE:", mse_feature)
    continuous_model.append(mse_feature)

In [11]:
from sklearn.manifold import TSNE
embeddings = model.layers[1].get_weights()[0]
colors = (df.Grid.str.startswith("L")*1).values
colors = ["red" if i==1 else "blue" for i in colors]
X_emb = TSNE(n_components=2, learning_rate="auto", init="random").fit_transform(embeddings)

plt.figure(figsize=(12,7))
plt.scatter(X_emb[:, 0], X_emb[:, 1], color=colors);

UFuncTypeError: ufunc 'multiply' did not contain a loop with signature matching types (dtype('<U32'), dtype('<U32')) -> dtype('<U32')