In [2]:
# List the files in the sample_data/ directory.
!ls sample_data/

anscombe.json		      mnist_test.csv
california_housing_test.csv   mnist_train_small.csv
california_housing_train.csv  README.md


## Evolution of Embedding Layer

### TODO:
- [ ] Add callback 
- [ ] Store training steps
- [ ] Extract embedding weights
- [ ] Calculate SVD
- [ ] Project it onto some human-perceivable space
- [ ] Present the evolution (rotation) of vectors

In [3]:
import tensorflow as tf
from tensorflow import keras
from keras import layers

import numpy as np

### Read Data
Each text is represented as a list of integers, where every integer is a word's rank by frequency in the corpus. The texts have different lengths and are stored as variable-length lists, so we use `pad_sequences` to bring them all to the same length.

In [5]:
# Vocabulary size passed to the loader as `num_words`.
num_tokens = 10_000

# The loader returns a (train, test) pair of (features, labels) tuples.
train_split, test_split = keras.datasets.imdb.load_data(num_words=num_tokens)
x_train_raw, y_train = train_split
x_test_raw, y_test = test_split

In [6]:
# Word -> integer index lookup for the IMDB corpus; the cell displays how many
# distinct words it contains.
words_dict = keras.datasets.imdb.get_word_index()
len(words_dict)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json


88584

In [7]:
# Shapes of the raw train / test arrays (one entry per review).
x_train_raw.shape, x_test_raw.shape

((25000,), (25000,))

In [8]:
# Reviews have different lengths, e.g. the first two training reviews:
len(x_train_raw[0]), len(x_train_raw[1])

(218, 189)

### Standardize the length of each text

In [10]:
# Common length every review is brought to.
max_len = 250

# Apply the same padding call to both splits.
pad = keras.preprocessing.sequence.pad_sequences
x_train, x_test = (pad(seqs, maxlen=max_len) for seqs in (x_train_raw, x_test_raw))


In [11]:

# Shapes after padding: every review now has max_len entries.
x_train.shape, x_test.shape

((25000, 250), (25000, 250))

### Convert the NumPy arrays to `tf.data.Dataset` objects

In [12]:
batch_size = 32

# Train split: wrap the (features, labels) arrays, shuffle with a buffer as
# large as the whole split, then group into batches.
train_ds = tf.data.Dataset.from_tensor_slices((x_train, y_train))
train_ds = train_ds.shuffle(train_ds.cardinality()).batch(batch_size)

# Test split: same pipeline.
test_ds = tf.data.Dataset.from_tensor_slices((x_test, y_test))
test_ds = test_ds.shuffle(test_ds.cardinality()).batch(batch_size)

In [13]:
# Number of batches in each dataset after batching.
train_ds.cardinality(), test_ds.cardinality()

(<tf.Tensor: shape=(), dtype=int64, numpy=782>,
 <tf.Tensor: shape=(), dtype=int64, numpy=782>)

In [26]:
def create_model(max_len=max_len,
                 num_tokens=num_tokens,
                 embedding_size=16):
  """Build a small binary text classifier with the Keras functional API.

  Architecture: Embedding -> global average pooling over the sequence axis
  -> Dense(32, relu) -> Dropout(0.2) -> Dense(1, sigmoid).

  input:
  ------
  max_len: (int) length every message is padded to. It is not used to build
    the graph (the input layer accepts any sequence length); it is kept so
    existing callers keep working.
  num_tokens: (int) max number of tokens (vocabulary size of the embedding)
  embedding_size: (int) embedding layer dimension

  return:
  -------
  model: (keras model) an uncompiled model with a single sigmoid output
  """
  inputs = layers.Input(shape=(None,), dtype=tf.int64)
  x = layers.Embedding(num_tokens, embedding_size, name='embedding')(inputs)
  x = layers.GlobalAvgPool1D(name='avg-pooling')(x)
  x = layers.Dense(units=32, activation='relu', name='Dense')(x)
  x = layers.Dropout(0.2, name='dropout')(x)
  # Sigmoid rather than ReLU: the model is trained with binary cross-entropy,
  # which expects a probability in [0, 1]. A ReLU output is unbounded above
  # and exactly 0 for negative pre-activations.
  outputs = layers.Dense(units=1, activation='sigmoid', name='prediction')(x)

  model = keras.Model(inputs=inputs, outputs=outputs)

  return model

In [43]:
# Build the model with the default arguments of create_model.
model = create_model()

In [44]:
# Binary classification setup: Adam optimizer, binary cross-entropy loss,
# accuracy reported as the metric.
model.compile(loss="binary_crossentropy", optimizer='adam', metrics=['accuracy'])

In [45]:
# Print the layer-by-layer summary with output shapes and parameter counts.
model.summary()

Model: "model_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_4 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding (Embedding)       (None, None, 16)          160000    
                                                                 
 avg-pooling (GlobalAverageP  (None, 16)               0         
 ooling1D)                                                       
                                                                 
 Dense (Dense)               (None, 32)                544       
                                                                 
 dropout (Dropout)           (None, 32)                0         
                                                                 
 prediction (Dense)          (None, 1)                 33        
                                                           

In [41]:
# !rm -r callbacks/

In [46]:
# Checkpoint callback triggered once per epoch.
# NOTE(review): the filepath has no `{epoch}` placeholder, so every save
# targets the same './callbacks/' location and earlier epochs are presumably
# overwritten. Confirm whether per-epoch snapshots are wanted.
checkpoint_cb = keras.callbacks.ModelCheckpoint(filepath='./callbacks/', save_freq='epoch')
callbacks = [checkpoint_cb]

In [47]:
# Train for a fixed number of epochs, validating on the test dataset and
# running the checkpoint callbacks.
epochs = 5
model.fit(x=train_ds, validation_data=test_ds, epochs=epochs, callbacks=callbacks)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f8a3563e890>

In [50]:
# List the assets/ folder under the checkpoint directory.
!ls callbacks/assets

In [None]:
# Probe words whose embedding vectors are to be inspected.
# 'fantastics' was a typo for 'fantastic'.
list_of_words = ['good', 'bad', 'amazing', 'fantastic', 'boring', 'rewarding']

In [51]:
# Look up the embedding layer by the name it was given in create_model.
embed_l = model.get_layer('embedding')

In [52]:
# Weight matrix of the embedding layer (one row per token id).
embd_mtx = embed_l.embeddings

In [53]:
# (num_tokens, embedding_size)
embd_mtx.shape

TensorShape([10000, 16])

In [65]:
# Index of the word 'man' in the word-index dictionary.
# NOTE(review): load_data was called with its default arguments, which
# presumably offset encoded ids by `index_from` (3 by default). Confirm
# before using this value as a row index into the embedding matrix.
words_dict['man']

129

In [None]:
# E^T @ E: with transpose_a=True the first operand is transposed, so the
# result is a square (embedding_size, embedding_size) matrix.
mtx_embd = tf.matmul(embd_mtx, embd_mtx, transpose_a=True)

In [None]:
from numpy import linalg as lng

In [None]:
# Thin SVD of the embedding matrix. full_matrices=False keeps U at
# (num_tokens, embedding_size) instead of building a square
# (num_tokens, num_tokens) matrix that the rank-limited factorization
# does not need. The result is still the (U, S, Vh) triple.
svd = lng.svd(embd_mtx, full_matrices=False)

In [None]:
# Number of arrays returned by lng.svd (expected: U, S, Vh).
len(svd)

In [None]:
# Shape of U, the first array returned by lng.svd.
svd[0].shape