In [1]:
import numpy as np
import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers

import tensorflow_datasets as tfds

In [2]:
# Embedding(input_dim, output_dim) is a trainable lookup table with one row per
# integer id: here 1000 rows (ids 0..999), each a 5-dimensional float vector.
embedding_layer = layers.Embedding(input_dim=1000, output_dim=5)

In [3]:
# Look up three ids at once; each id selects one row of the embedding table,
# so a shape-(3,) input produces a (3, 5) output.
single_sequence_ids = tf.constant([1, 2, 3])
result = embedding_layer(single_sequence_ids)
result.numpy()

array([[ 0.0221931 , -0.00781777,  0.0019397 , -0.03057138,  0.01572508],
       [ 0.00952963, -0.0321022 , -0.01398076, -0.02946041,  0.0023345 ],
       [ 0.00171735,  0.0263508 , -0.03899754, -0.04970043,  0.03538061]],
      dtype=float32)

In [4]:
# A (2, 3) batch of ids: the layer appends the embedding axis, giving (2, 3, 5).
batched_ids = tf.constant([[0, 1, 2], [3, 4, 5]])
result = embedding_layer(batched_ids)
print(result.shape)
print(result.numpy())

(2, 3, 5)
[[[-0.01578252  0.0023705  -0.03343515  0.03090265  0.03187299]
  [ 0.0221931  -0.00781777  0.0019397  -0.03057138  0.01572508]
  [ 0.00952963 -0.0321022  -0.01398076 -0.02946041  0.0023345 ]]

 [[ 0.00171735  0.0263508  -0.03899754 -0.04970043  0.03538061]
  [-0.00239437 -0.02851334 -0.01150944 -0.01987631  0.04672113]
  [-0.02900649  0.03507973  0.00384105  0.00384574  0.03763301]]]


In [5]:
# Keep the notebook output free of download/preparation progress bars.
tfds.disable_progress_bar()

# Load the 'imdb_reviews' dataset in its 'subwords8k' configuration.
# - split: request the train and test splits together, returned as a pair.
# - as_supervised: each element is a (text, label) tuple.
# - with_info: also return the dataset metadata, which is read later to get
#   the text encoder.
# - data_dir: where the prepared dataset is stored on disk.
(train_data, test_data), info = tfds.load(
    'imdb_reviews/subwords8k',
    split=(tfds.Split.TRAIN, tfds.Split.TEST),
    as_supervised=True,
    with_info=True,
    data_dir='~/work/temp/tfds',
)



In [6]:
# The dataset metadata carries the encoder attached to the 'text' feature.
text_feature = info.features['text']
encoder = text_feature.encoder

# Peek at the first 20 entries of the encoder's subword vocabulary.
encoder.subwords[:20]

['the_',
 ', ',
 '. ',
 'a_',
 'and_',
 'of_',
 'to_',
 's_',
 'is_',
 'br',
 'in_',
 'I_',
 'that_',
 'this_',
 'it_',
 ' /><',
 ' />',
 'was_',
 'The_',
 'as_']

In [7]:
# Reviews have different lengths, so each batch of 10 is padded to a common
# length (padding value 0). Both splits are shuffled with a 1000-element
# buffer before batching.
train_batches = (
    train_data
    .shuffle(1000)
    .padded_batch(10)
)
test_batches = (
    test_data
    .shuffle(1000)
    .padded_batch(10)
)

In [8]:
# Pull a single (tokens, labels) batch to inspect the zero-padded token ids.
batch_iterator = iter(train_batches)
train_batch, train_labels = next(batch_iterator)
train_batch.numpy()

array([[  69,   57,  116, ...,    0,    0,    0],
       [7448, 7961, 7228, ...,    0,    0,    0],
       [  62,    9,    4, ...,    0,    0,    0],
       ...,
       [6998, 3149, 7961, ...,    0,    0,    0],
       [ 274,    4, 3073, ...,    0,    0,    0],
       [7514,   60, 1364, ...,    0,    0,    0]])

In [10]:
embedding_dim = 16

# Bag-of-subwords classifier, assembled layer by layer:
#   1. Embedding: token ids -> embedding_dim-wide vectors, one per position.
#   2. GlobalAveragePooling1D: average over the sequence axis, so reviews of
#      any length collapse to a single fixed-size vector.
#   3. Dense(16, relu): small hidden layer.
#   4. Dense(1): single output unit with no activation (a raw logit).
model = keras.Sequential()
model.add(layers.Embedding(encoder.vocab_size, embedding_dim))
model.add(layers.GlobalAveragePooling1D())
model.add(layers.Dense(16, activation='relu'))
model.add(layers.Dense(1))

model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, None, 16)          130960    
_________________________________________________________________
global_average_pooling1d_1 ( (None, 16)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 16)                272       
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 17        
Total params: 131,249
Trainable params: 131,249
Non-trainable params: 0
_________________________________________________________________


In [12]:
# The final Dense(1) layer has no activation, so the model outputs raw logits.
# The loss is therefore built with from_logits=True, and the accuracy metric
# must threshold at 0.0 (logit 0 == probability 0.5). The plain 'accuracy'
# string would resolve to binary accuracy with a 0.5 threshold, which assumes
# probabilities and misreports accuracy for logit outputs. The metric keeps the
# name 'accuracy' so the keys in `history.history` are unchanged.
model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy',
                                                       threshold=0.0)])

# Train for 10 epochs; after each epoch, validate on 20 batches drawn from
# the test batches.
history = model.fit(
    train_batches,
    epochs=10,
    validation_data=test_batches, validation_steps=20)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [13]:
# The first layer of the model is the Embedding layer; the first entry of its
# weight list is the learned embedding matrix, one row per vocabulary id.
e = model.layers[0]
embedding_layer_weights = e.get_weights()
weights = embedding_layer_weights[0]
print(weights.shape)

(8185, 16)
