<a href="https://colab.research.google.com/github/hiydavid/tfdev-learning/blob/main/Other/entity_embedding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# import libs
import tensorflow as tf
import numpy as np

In [2]:
# create 3 numerical variables
num_data = np.random.random(size=(10, 3))
num_data

array([[0.52175692, 0.66040456, 0.20652166],
       [0.19407358, 0.34044237, 0.80107064],
       [0.53507999, 0.83578053, 0.13006219],
       [0.1060389 , 0.23798112, 0.13598606],
       [0.25751907, 0.98282413, 0.00569923],
       [0.87745857, 0.99920906, 0.40869756],
       [0.66939084, 0.01650516, 0.47842664],
       [0.81126536, 0.74801617, 0.73264185],
       [0.13926239, 0.99080192, 0.48260154],
       [0.14378796, 0.96214013, 0.53919037]])

In [3]:
# create 1 categorical variables with 4 levels
cat_data = np.random.randint(0, 4, 10)
cat_data

array([0, 3, 2, 1, 0, 1, 3, 1, 2, 2])

In [4]:
# create one-hot encoded matrix since expected input_1 to have shape (4,)
one_hot_encoded_cat_data = np.eye(cat_data.max()+1)[cat_data]
one_hot_encoded_cat_data

array([[1., 0., 0., 0.],
       [0., 0., 0., 1.],
       [0., 0., 1., 0.],
       [0., 1., 0., 0.],
       [1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 0., 1.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.]])

In [5]:
# create target
target =  np.random.random(size=(10,1))
target

array([[0.18981942],
       [0.53325407],
       [0.02641736],
       [0.57103604],
       [0.61617935],
       [0.46318886],
       [0.50532025],
       [0.55758583],
       [0.60096403],
       [0.23037655]])

In [6]:
# set number of unique categories
no_of_unique_cat  = len(np.unique(cat_data))
no_of_unique_cat

4

In [7]:
# Jeremy Howard provides the following rule of thumb:
# embedding size = min(50, number of categories/2).
embedding_size = min(np.ceil((no_of_unique_cat)/2), 50)
embedding_size = int(embedding_size)
embedding_size

2

In [8]:
# use Input layers, specify input shape (dimensions except first)
inp_cat_data = tf.keras.layers.Input(shape=(no_of_unique_cat,))
inp_num_data = tf.keras.layers.Input(shape=(num_data.shape[1],))

inp_cat_data, inp_num_data

(<KerasTensor: shape=(None, 4) dtype=float32 (created by layer 'input_1')>,
 <KerasTensor: shape=(None, 3) dtype=float32 (created by layer 'input_2')>)

In [9]:
# Bind nulti_hot to embedding layer
emb = tf.keras.layers.Embedding(
    input_dim=no_of_unique_cat, 
    output_dim=embedding_size)(inp_cat_data)  

In [10]:
# flatten
flatten = tf.keras.layers.Flatten()(emb)

In [11]:
# concatenate two layers
conc = tf.keras.layers.Concatenate()([flatten, inp_num_data])
dense1 = tf.keras.layers.Dense(3, activation=tf.nn.relu, )(conc)

In [12]:
# creating output layer
out = tf.keras.layers.Dense(1, activation=None)(dense1)

In [13]:
# make and compile
model = tf.keras.Model(
    inputs=[inp_cat_data, inp_num_data], 
    outputs=out)

model.compile(
    optimizer=tf.keras.optimizers.Adam(0.01),
    loss=tf.keras.losses.mean_squared_error,
    metrics=[tf.keras.metrics.mean_squared_error])

In [14]:
# check summary
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 4)]          0           []                               
                                                                                                  
 embedding (Embedding)          (None, 4, 2)         8           ['input_1[0][0]']                
                                                                                                  
 flatten (Flatten)              (None, 8)            0           ['embedding[0][0]']              
                                                                                                  
 input_2 (InputLayer)           [(None, 3)]          0           []                               
                                                                                              

In [15]:
# fit model
model.fit([one_hot_encoded_cat_data, num_data], target)



<keras.callbacks.History at 0x7f5dac614810>

In [16]:
# get weights
model.layers[1].get_weights()[0]

array([[ 0.05787079,  0.02935103],
       [ 0.02626067,  0.01357988],
       [ 0.02448512, -0.04573865],
       [ 0.02123172,  0.00887326]], dtype=float32)