### Checking tensorflow intermediate output

In [1]:
import tensorflow as tf

W1124 11:20:55.229263 140735637529472 __init__.py:308] Limited tf.compat.v2.summary API due to missing TensorBoard installation.


In [33]:
# Shape of the weight matrix and the floating-point precision used for it.
input_dimensions, hidden_size = 2, 16
dtype = tf.float64

# Weight matrix 'Wr', initialised from a truncated normal (mean 0, stddev 0.01).
x = tf.Variable(
    tf.truncated_normal(
        shape=(input_dimensions, hidden_size),
        mean=0,
        stddev=0.01,
        dtype=dtype,
    ),
    name='Wr',
)

In [18]:
x

<tf.Variable 'Wr_1:0' shape=(5, 2) dtype=float64_ref>

In [34]:
# Variables hold no value until the initializer op has run in the session;
# after that the current value of `x` can be fetched and printed.
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run(x))

[[-0.00023993  0.0154601   0.00696636  0.00011915  0.00746992  0.00567421
  -0.00949571 -0.01017981 -0.01170234  0.00565376  0.0066822   0.00576366
  -0.0082944  -0.00743214 -0.01937575  0.01698793]
 [-0.01174113 -0.0060773   0.00360782 -0.00082508  0.0154051   0.00888502
   0.01021685  0.00832186  0.00145435 -0.0091737   0.00928058 -0.01194321
  -0.00834974 -0.00044248  0.00384688  0.01040147]]


In [13]:
# A (1 x 2) row vector times a (2 x 1) column vector gives a (1 x 1) matrix.
matrix1 = tf.constant([[3.0, 3.0]])
matrix2 = tf.constant([[2.0], [2.0]])
product = tf.matmul(a=matrix1, b=matrix2)

In [14]:
# Evaluate the matrix product inside a short-lived session.
with tf.Session() as sess:
    print(sess.run(product))


[[12.]]


In [15]:
# Evaluate the first operand on its own.
with tf.Session() as sess:
    print(sess.run(matrix1))


[[3. 3.]]


In [16]:
# Evaluate the second operand on its own.
with tf.Session() as sess:
    print(sess.run(matrix2))


[[2.]
 [2.]]


In [45]:
# Build a graph: two scalar constants and a node that multiplies them.
a = tf.constant(5.0)
b = tf.constant(6.0)
c = a * b
# Displaying `c` shows the symbolic tensor; nothing is computed until the
# tensor is run in a session.
c


<tf.Tensor 'mul:0' shape=() dtype=float32>

In [46]:
# Launch the graph in a session. The context manager closes the session
# (and releases its resources) as soon as the block exits, instead of
# leaving it open for the lifetime of the kernel.
with tf.compat.v1.Session() as sess:
    # Evaluate the tensor `c`.
    print(sess.run(c))

30.0


In [49]:
# Redefine `c` as the sum of the two constants.
c = a + b

# feed_dict overrides the values of `a` and `b` for this run only.
# The session is closed by the context manager once the value is fetched.
with tf.compat.v1.Session() as sess:
    c_value = sess.run(c, feed_dict={a: 5, b: 7})

# Display the fetched value as the cell output.
c_value

12.0

### Implementation of softmax

In [414]:
import numpy as np


def softmax(logits):
    """Return the softmax of a 1-D sequence of logits as a NumPy array.

    The largest logit is subtracted before exponentiating. Mathematically
    this leaves the result unchanged, but it keeps np.exp from overflowing
    (and the result from becoming NaN) when the logits are large.
    """
    logits = np.asarray(logits, dtype=float)
    shifted_exp = np.exp(logits - np.max(logits))
    return shifted_exp / np.sum(shifted_exp)


sample = [0.001, 0.002, 0.5, 0.3, 0.234, 0.05, 0.007]
# Convert to a plain list so the printed form stays a list of floats.
result = softmax(sample).tolist()
print(result)

[0.12026164079359143, 0.12038196258525403, 0.19807974640639128, 0.1621739799448004, 0.15181606801133768, 0.12630122257481688, 0.12098537968380825]


### Cross Entropy

In [54]:
def CrossEntropy(y_true, y_hat):
    """Cross entropy, in bits, between a target and a predicted distribution.

    Computes -sum(y_true * log2(y_hat)).

    Args:
        y_true: target probabilities (e.g. a one-hot vector).
        y_hat: predicted probabilities; must broadcast against y_true.

    Returns:
        The cross entropy as a NumPy float.
    """
    y_true, y_hat = np.broadcast_arrays(np.asarray(y_true, dtype=float),
                                        np.asarray(y_hat, dtype=float))
    # Only classes with a non-zero target probability contribute. Skipping the
    # others applies the convention 0 * log(0) = 0, so a predicted probability
    # of exactly 0 for a non-target class no longer turns the result into NaN.
    support = y_true != 0
    return np.sum(-y_true[support] * np.log2(y_hat[support]))

In [55]:
CrossEntropy([1,0,0,0], [0.7, 0.2, 0.05, 0.05])

0.5145731728297583

In [63]:
CrossEntropy([1,0,0,0], [0.2,.1,0.5,0.2])

2.321928094887362

In [64]:
CrossEntropy([1,0,0,0], [0.2,.5,0.1,0.2])

2.321928094887362

In [65]:
np.log2(0.9)

-0.15200309344504995

## Tokenizer

In [66]:
from tensorflow.python.keras.preprocessing.text import Tokenizer

In [98]:
num_words = 20
tokenizer = Tokenizer(num_words=num_words)


In [99]:
data_text = ['if you say yes, I will delightfully agree. If you say no, I will painfully do so.']

In [100]:
%%time
tokenizer.fit_on_texts(data_text)

CPU times: user 67 µs, sys: 1e+03 ns, total: 68 µs
Wall time: 72 µs


In [101]:
tokenizer.word_index

{'if': 1,
 'you': 2,
 'say': 3,
 'i': 4,
 'will': 5,
 'yes': 6,
 'delightfully': 7,
 'agree': 8,
 'no': 9,
 'painfully': 10,
 'do': 11,
 'so': 12}

In [106]:
sample1 = ['if you say so', 'I hope you agree']

In [107]:
sample1

['if you say so', 'I hope you agree']

In [108]:
sample1_token = tokenizer.texts_to_sequences(sample1)
sample1_token

[[1, 2, 3, 12], [4, 2, 8]]

In [105]:
# The token sequences have different lengths, so they cannot form a regular
# 2-D array. dtype=object makes the ragged result explicit; without it,
# recent NumPy versions raise a ValueError.
np.array(sample1_token, dtype=object)

array([list([1, 2, 3, 12, 4, 5, 11]), list([4, 2, 8])], dtype=object)

### GRU implementation in details

In [109]:
import tensorflow as tf
import numpy as np
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Dense, GRU, Embedding
from tensorflow.python.keras.optimizers import Adam
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

<img src="img/gru_layers.png" alt="Flowchart NLP" style="width: 1200px;"/>

Since this RNN simply classifies the message as either good or bad, we only need one output from the last GRU node. The output from the last GRU is a vector of 4 values, which the `Dense` layer converts into a single final value. 

It makes sense because we don't need every output from the last GRU from every word, `this`, `is`, etc. We only need the one output after consuming all the words from the sentence. 

However, if we were to translate words, we would need an output from the last GRU node for every single word fed into it. In that case we need to set `return_sequences=True`. 

In [243]:
# Vocabulary size, embedding width and (padded) sequence length.
num_words, embedding_size, max_tokens = 100, 8, 10

# Embedding -> three stacked GRUs -> one sigmoid unit.
# Every GRU except the last returns its full output sequence, so the next
# GRU receives one vector per time step.
model = Sequential([
    Embedding(input_dim=num_words,
              output_dim=embedding_size,
              input_length=max_tokens,
              name='layer_embedding'),
    GRU(units=16, return_sequences=True),
    GRU(units=8, return_sequences=True),
    GRU(units=4),
    Dense(1, activation='sigmoid'),
])

optimizer = Adam(lr=1e-3)

model.compile(optimizer=optimizer,
              loss='binary_crossentropy',
              metrics=['accuracy'])

In [244]:
model.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
layer_embedding (Embedding)  (None, 10, 8)             800       
_________________________________________________________________
gru_15 (GRU)                 (None, 10, 16)            1200      
_________________________________________________________________
gru_16 (GRU)                 (None, 10, 8)             600       
_________________________________________________________________
gru_17 (GRU)                 (None, 4)                 156       
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 5         
Total params: 2,761
Trainable params: 2,761
Non-trainable params: 0
_________________________________________________________________


#### How to calculate the parameters

https://towardsdatascience.com/counting-no-of-parameters-in-deep-learning-models-by-hand-8f1716241889 

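`g` is the number of weight sets in the cell: 
- GRU = 3 (update gate, reset gate, candidate state) 
- LSTM = 4 

parameters = `g * [h (h1 + i) + h2]`  where 

- h1 = size of the previous state (equal to `h`) 
- h2 = size of the bias vector (equal to `h`) 
- h = number of hidden units 
- i = size of the input vector 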

example for 1st GRU 
- input dimension from embedding layer for each word = `(1 x 8)` 
- hidden units in 1st GRU = `16` 
- GRU, g = 3 

param = `3 * [16 (16 + 8) + 16]` = `3 * [16 * 24 + 16]` = `3 * [384 + 16]` = `3 * 400` = `1200` 

https://www.data-blogger.com/2017/08/27/gru-implementation-tensorflow/

<p align="center">
<img src="img/gru.png" width="300"></p>
<p align="center">Figure 1. Gated Recurrent Unit</p>

<p align="center">
<img src="img/gru_equation.png" width="300"></p>
<p align="center">Figure 2. GRU equations</p>

If you look at the GRU node, there are 3 operations for the input vectors. Those operations have weights: `W(z)`, `W(r)` and `W(h)`. Therefore, we need to multiply by `3` which is denoted as `g` in the above calculation. 

<img src="img/gru_node.png" alt="GRU node in detail" style="width: 1200px;"/>

x_train_pad = np.array([[13,   11,    6,    3,  93,   19,   12,   10,   67,  10,  21, 2,  12,    9,    6,  40,   27,    4,    1,  42],
                       [10,  40,   43,   22,   62,  10,   11,   19,   27,   67,  38, 12,    9,   80,   26,   14,  15,    2, 51, 3]])

y_train = np.array([[1], [0]])


In [232]:
x_train_pad = np.array([[13, 11, 6, 3, 93, 19, 12, 10, 67, 10], 
                        [21, 2, 12, 9, 6, 40, 27, 4, 1, 42],
                       [10, 40, 43, 22, 62, 10, 11, 19, 27, 67],
                       [38, 12, 9, 80, 26, 14, 15, 2, 51, 3]])
y_train = np.array([[1], [0], [1], [1]])


In [213]:
x_train_pad

array([[13, 11,  6,  3, 93, 19, 12, 10, 67, 10],
       [21,  2, 12,  9,  6, 40, 27,  4,  1, 42],
       [10, 40, 43, 22, 62, 10, 11, 19, 27, 67],
       [38, 12,  9, 80, 26, 14, 15,  2, 51,  3]])

In [236]:
%%time
model.fit(x_train_pad, y_train,
          validation_split=0.05, epochs=3, batch_size=64)

Train on 3 samples, validate on 1 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
CPU times: user 3.35 s, sys: 118 ms, total: 3.47 s
Wall time: 3.36 s


<tensorflow.python.keras.callbacks.History at 0x12a7fc940>

test1 = np.array([[90, 16, 3, 78, 74, 94, 49, 2, 56, 99, 3, 39, 22, 31, 40, 28, 41, 20, 24, 88],
                  [10, 40, 43, 22, 62, 10, 11, 19, 27, 67, 38, 12, 9, 80, 26, 14, 15, 2, 51, 3]])
test1

In [237]:
test2 = np.array([[90, 90, 3, 78, 74, 94, 49, 2, 56, 99]])
test2

array([[90, 90,  3, 78, 74, 94, 49,  2, 56, 99]])

In [238]:
model.predict(test2)

array([[0.5088825]], dtype=float32)

In [239]:
model.layers[0].output

<tf.Tensor 'layer_embedding_4/embedding_lookup/Identity_1:0' shape=(?, 10, 8) dtype=float32>

### Checking the intermediate layers in Keras

https://keras.io/getting-started/faq/#how-can-i-obtain-the-output-of-an-intermediate-layer

In [284]:
# with a Sequential model
# `model` was built with tensorflow.python.keras, so the backend is imported
# from the same package rather than from the standalone `keras` package.
# Mixing the two implementations is version-dependent and can leave the
# backend function and the model on different graphs/sessions.
from tensorflow.python.keras import backend as K

# Map the model input to the output of layers[0] (the embedding layer).
get_1st_layer_output = K.function([model.layers[0].input],
                                  [model.layers[0].output])
get_1st_layer_output([test2])

[array([[[-0.00057229,  0.03144029, -0.01208645,  0.01250266,
          -0.0112159 ,  0.01250733,  0.02347985, -0.04540693],
         [-0.00057229,  0.03144029, -0.01208645,  0.01250266,
          -0.0112159 ,  0.01250733,  0.02347985, -0.04540693],
         [-0.02888888, -0.02501615,  0.01376894,  0.02180013,
           0.00112535, -0.03033254, -0.0358508 , -0.00298855],
         [ 0.04798928, -0.03607267, -0.04041312,  0.01160529,
          -0.02674392, -0.03048762,  0.00823299,  0.04192502],
         [-0.01783573,  0.01018466, -0.02773213, -0.04810634,
          -0.0474182 ,  0.01849345,  0.0055408 , -0.01327863],
         [ 0.02917637,  0.01125299,  0.02330646,  0.02039612,
           0.03949693,  0.02127938,  0.03067532, -0.01710946],
         [ 0.04990566,  0.0440275 , -0.00158658, -0.00284252,
          -0.01484345, -0.04878316, -0.00673587,  0.0132655 ],
         [ 0.04800986,  0.01151798,  0.04138615, -0.02686044,
          -0.04933664, -0.02193256,  0.03097831, -0.02450117],


### 1st GRU layer output (units=16)

The embedding layer emits a (10 x 8) matrix for each sample: one 8-dimensional vector per word.  
The 1st GRU receives (10 x 8) and emits (10 x 16).

The 1st word, tokenized as `90`, is embedded as `[ 0.02807844,  0.03798428, -0.0355945 , -0.03939711, -0.01135957, -0.02653174,  0.04787086,  0.00446305]`. Since the 2nd word is also `90`, its embedding-layer output is identical to the 1st one. 

This embedding output is fed into 1st GRU node, which will emit 16 states (or units). Since there are 10 words in 1st sample (labeled test2), the output will be `(10 x 16)`. Now if you look at the result, the first and second output are different. The first one is `[-7.39353709e-03, -2.45588785e-03, ...]`, and 2nd one is `[-1.13133024e-02, -2.58720666e-03, ...]`. But they are for the same word, tokenized as `90`. Shouldn't they be the same? 

They differ because the state of the GRU node changes depending on the input and the previous output: the node has a memory state. After receiving the first word `90`, it emits `[-7.39353709e-03, -2.45588785e-03, ...]`, which becomes the internal (memory) state for the next word. When the next word arrives, even though it is the same `90`, the matrix products now also involve that updated state, so the output is different. 

In [225]:
# Feed token ids into the model input and read the activations of
# layers[1], the first GRU (units=16).
get_2nd_layer_output = K.function(
    inputs=[model.layers[0].input],
    outputs=[model.layers[1].output],
)
get_2nd_layer_output([test2])

[array([[[-7.39353709e-03, -2.45588785e-03,  9.36997775e-03,
           5.98293089e-04,  4.61337762e-03, -1.39055622e-03,
          -5.94481872e-03,  6.91419747e-03,  2.37650634e-03,
           1.02202231e-02,  9.74046253e-03, -1.99844595e-03,
           6.11742772e-03, -3.99018964e-03,  9.21347458e-03,
           4.57908201e-04],
         [-1.13133024e-02, -2.58720666e-03,  1.37651609e-02,
           5.02947369e-05,  6.80340594e-03, -2.08873162e-03,
          -8.64002854e-03,  1.05433632e-02,  3.94365098e-03,
           1.54888202e-02,  1.60077885e-02, -3.52106686e-03,
           8.50645918e-03, -4.25248407e-03,  1.53070539e-02,
           1.21246769e-04],
         [-5.64358057e-03, -8.63628741e-03,  6.53780019e-03,
          -1.63866580e-02,  8.16475973e-03, -2.05401350e-02,
          -1.05933463e-02,  9.65292007e-03,  7.13120215e-03,
           1.83349773e-02, -3.01691052e-03,  2.29730085e-03,
          -3.47107695e-03,  1.30173983e-04,  1.72474934e-03,
           2.92117568e-03],
 

In [226]:
# Feed token ids into the model input and read the activations of
# layers[2], the second GRU (units=8).
get_3rd_layer_output = K.function(
    inputs=[model.layers[0].input],
    outputs=[model.layers[2].output],
)
get_3rd_layer_output([test2])

[array([[[-0.00130251, -0.00374961, -0.0031796 , -0.0049633 ,
           0.00036867,  0.00457511, -0.00612586, -0.00412154],
         [-0.00407208, -0.00681909, -0.00433862, -0.00891543,
          -0.00251949,  0.00876667, -0.01149588, -0.00704769],
         [ 0.00243492, -0.00821556,  0.00203187, -0.01002471,
          -0.00347194,  0.01034054, -0.01529316, -0.01056274],
         [ 0.00994521, -0.00717895,  0.00555692, -0.0062808 ,
          -0.00204963,  0.01307923, -0.01548386, -0.01110561],
         [ 0.01507948, -0.00959584,  0.00468181, -0.00271715,
           0.00095033,  0.00689437, -0.01090126, -0.01092082],
         [ 0.01567085, -0.01425946, -0.00503693, -0.00313515,
           0.0013425 ,  0.00697276, -0.010996  , -0.01877937],
         [ 0.01722615, -0.01498277, -0.0072018 , -0.00277071,
          -0.0004091 ,  0.0097972 , -0.01463094, -0.02147865],
         [ 0.02065217, -0.01545376, -0.00434624, -0.00325283,
          -0.0031377 ,  0.0124184 , -0.0153431 , -0.02298888],


In [227]:
# Output of layers[3], the last GRU (units=4). This is the 4th layer of the
# model, so the function is named accordingly rather than reusing -- and
# silently overwriting -- get_3rd_layer_output, which reads layers[2].
get_4th_layer_output = K.function([model.layers[0].input],
                                  [model.layers[3].output])
get_4th_layer_output([test2])

[array([[ 0.05640356, -0.02059804, -0.00316206,  0.01036271]],
       dtype=float32)]

In [228]:
# Output of layers[4], the final Dense layer, i.e. the model's prediction.
# It is the 5th layer of the model, and the function name reflects that.
get_5th_layer_output = K.function([model.layers[0].input],
                                  [model.layers[4].output])
get_5th_layer_output([test2])

[array([[0.5165275]], dtype=float32)]

In [229]:
model.predict(test2)

array([[0.5165275]], dtype=float32)

### Encoder Decoder 

In [249]:
from tensorflow.python.keras.models import Model
from tensorflow.python.keras.layers import Input, Dense, GRU, Embedding
from tensorflow.python.keras.optimizers import RMSprop
from tensorflow.python.keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

In [274]:
num_words = 50
embedding_size = 10
state_size = 8

In [263]:
encoder_input = Input(shape=(None, ), name='encoder_input')
encoder_embedding = Embedding(input_dim=num_words,
                              output_dim=embedding_size,
                              name='encoder_embedding')
encoder_gru1 = GRU(state_size, name='encoder_gru1',
                   return_sequences=True)
encoder_gru2 = GRU(state_size, name='encoder_gru2',
                   return_sequences=True)
encoder_gru3 = GRU(state_size, name='encoder_gru3',
                   return_sequences=False)

In [264]:
def connect_encoder():
    """Wire the encoder layers together and return the encoder's output tensor.

    Starting from the module-level `encoder_input`, the embedding layer and
    the three GRU layers are applied one after another.
    """
    tensor = encoder_input
    for layer in (encoder_embedding, encoder_gru1, encoder_gru2, encoder_gru3):
        tensor = layer(tensor)

    # The output of the last GRU is the output of the encoder.
    return tensor

In [265]:
encoder_output = connect_encoder()

In [321]:
decoder_initial_state = Input(shape=(state_size,),
                              name='decoder_initial_state')
decoder_input = Input(shape=(None, ), name='decoder_input')
decoder_embedding = Embedding(input_dim=num_words,
                              output_dim=embedding_size,
                              name='decoder_embedding')
decoder_gru1 = GRU(state_size, name='decoder_gru1',
                   return_sequences=True)
decoder_gru2 = GRU(state_size, name='decoder_gru2',
                   return_sequences=True)
decoder_gru3 = GRU(state_size, name='decoder_gru3',
                   return_sequences=True)
decoder_dense = Dense(num_words,
                      activation='linear',
                      name='decoder_output')

In [322]:
def connect_decoder(initial_state):
    """Wire the decoder layers together and return the decoder's output tensor.

    Args:
        initial_state: tensor used as the initial state of each of the
            three decoder GRU layers.

    Returns:
        The output of `decoder_dense` applied to the last GRU's sequence.
    """
    # Embed the decoder input tokens.
    sequence = decoder_embedding(decoder_input)

    # Every GRU layer starts from the same initial state.
    for gru_layer in (decoder_gru1, decoder_gru2, decoder_gru3):
        sequence = gru_layer(sequence, initial_state=initial_state)

    # The dense layer produces `num_words` linear outputs per time step.
    return decoder_dense(sequence)

In [323]:
decoder_output = connect_decoder(initial_state=encoder_output)

model_train = Model(inputs=[encoder_input, decoder_input],
                    outputs=[decoder_output])

In [324]:
model_encoder = Model(inputs=[encoder_input],
                      outputs=[encoder_output])
decoder_output = connect_decoder(initial_state=decoder_initial_state)

model_decoder = Model(inputs=[decoder_input, decoder_initial_state],
                      outputs=[decoder_output])

In [325]:
def sparse_cross_entropy(y_true, y_pred):
    """Mean sparse softmax cross-entropy of `y_pred` against `y_true`.

    `y_true` is passed as `labels` and `y_pred` as `logits` to
    tf.nn.sparse_softmax_cross_entropy_with_logits; the resulting losses
    are then averaged into a single scalar tensor.
    """
    return tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y_true,
                                                       logits=y_pred))

In [326]:
optimizer = RMSprop(lr=1e-3)
decoder_target = tf.placeholder(dtype='int32', shape=(None, None))

In [327]:
model_train.compile(optimizer=optimizer,
                    loss=sparse_cross_entropy,
                    target_tensors=[decoder_target])

In [328]:
path_checkpoint = '21_checkpoint.keras'
callback_checkpoint = ModelCheckpoint(filepath=path_checkpoint,
                                      monitor='val_loss',
                                      verbose=1,
                                      save_weights_only=True,
                                      save_best_only=True)

In [329]:
# Two toy samples of 10 token ids each for the encoder.
encoder_input_data = np.array([[13, 11, 6, 3, 3, 19, 12, 10, 27, 10], 
                               [21, 2, 12, 9, 6, 40, 27, 4, 1, 42]])
# Decoder input sequences; each row starts with token 2 and ends with token 3.
decoder_input_data = np.array([[2, 40, 19, 43, 26, 20, 18, 1, 45, 3],
                               [2, 40, 43, 22, 2, 10, 11, 19, 27, 3]])
# Decoder targets: each row is the decoder input shifted left by one position
# and padded with a trailing 0.
decoder_output_data = np.array([[40, 19, 43, 26, 20, 18, 1, 45, 3, 0],
                               [40, 43, 22, 2, 10, 11, 19, 27, 3, 0]])

In [330]:
x_data = \
{
    'encoder_input': encoder_input_data,
    'decoder_input': decoder_input_data
}
y_data = \
{
    'decoder_output': decoder_output_data
}

In [331]:
model_train.fit(x=x_data,
                y=y_data,
                batch_size=512,
                epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x13887a898>

In [332]:
model_train.summary()

Model: "model_7"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
encoder_input (InputLayer)      [(None, None)]       0                                            
__________________________________________________________________________________________________
encoder_embedding (Embedding)   (None, None, 10)     1000        encoder_input[0][0]              
__________________________________________________________________________________________________
encoder_gru1 (GRU)              (None, None, 8)      456         encoder_embedding[0][0]          
__________________________________________________________________________________________________
decoder_input (InputLayer)      [(None, None)]       0                                            
____________________________________________________________________________________________

### Checking intermediate layers in Encoder-Decoder model

In [333]:
test1_e = np.array([[16, 12, 46, 33, 13], 
                  [23, 23, 2, 49, 6]])
test1_d = np.array([[2, 2, 4, 4, 3], 
                  [2, 35, 24, 9, 3]])

In [289]:
model_train.layers[0].input

<tf.Tensor 'encoder_input_1:0' shape=(?, ?) dtype=float32>

In [376]:
# model_train is a functional Model with two inputs (not a Sequential model).
# The backend is imported from tensorflow.python.keras, the package the model
# was built with, rather than from the standalone `keras` package.
from tensorflow.python.keras import backend as K

# NOTE(review): despite the "1st layer" name, this fetches layers[10] --
# presumably the final decoder layer; confirm against model_train.summary().
get_1st_layer_output = K.function(inputs  = [model_train.layers[0].input, model_train.layers[3].input],
                                  outputs = [model_train.layers[10].output])
get_1st_layer_output([test1_e, test1_d])

[array([[[ 5.83190098e-02,  1.65246651e-02, -3.11908592e-03,
           1.42389992e-02, -2.12707855e-02, -2.98207030e-02,
          -5.57734743e-02,  2.73377523e-02,  8.26421287e-03,
          -3.68149169e-02,  3.41209173e-02,  3.29646356e-02,
          -1.52627639e-02, -1.78109817e-02, -5.65267913e-02,
           1.61449797e-03, -4.28009592e-02,  1.47205507e-02,
          -2.68751103e-03,  3.80825922e-02, -2.41071209e-02,
           2.59800889e-02,  3.77022363e-02, -1.52681768e-02,
           1.92913562e-02, -1.62147917e-02,  5.15831821e-03,
          -8.37580767e-03, -5.91894798e-02,  1.57503858e-02,
          -1.08616799e-02, -4.69888747e-02, -6.07327931e-02,
          -2.57650092e-02, -8.72816052e-03, -3.34480964e-02,
           1.23722367e-02, -5.75172855e-03,  1.21681727e-02,
          -1.78278591e-02,  6.06575832e-02, -1.36935767e-02,
          -3.03729484e-03,  5.86389676e-02, -4.02520709e-02,
          -3.35508119e-03,  3.95788774e-02, -3.69283408e-02,
           2.17085797e-0

In [335]:
def sparse_cross_entropy(y_true, y_pred):
    """Return the mean sparse softmax cross-entropy as a scalar tensor.

    `y_pred` supplies the logits and `y_true` the labels for
    tf.nn.sparse_softmax_cross_entropy_with_logits.
    """
    losses = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=y_pred, labels=y_true)
    return tf.reduce_mean(losses)

In [410]:
# Integer class labels, shape (2, 3). test_pred has 5 classes on its last
# axis, so every label must be a valid class index in [0, 5); a label of 5
# would be out of range for the sparse cross-entropy op.
test_true = np.array([[1, 2, 3], [4, 0, 1]])
# Random scores standing in for logits, shape (2, 3, 5).
test_pred = np.random.random_sample(size=(2, 3, 5))
test_pred


array([[[0.65495786, 0.88441492, 0.07064226, 0.12934237, 0.06340601],
        [0.11318139, 0.24185687, 0.16992654, 0.01078665, 0.61953245],
        [0.48669977, 0.72875375, 0.48223012, 0.02217202, 0.8067884 ]],

       [[0.75088541, 0.5426623 , 0.33015992, 0.12262951, 0.07843957],
        [0.56402264, 0.30867622, 0.07711186, 0.28058797, 0.23841429],
        [0.85780687, 0.1998348 , 0.0728362 , 0.6682692 , 0.35341616]]])

In [405]:
test_true.shape

(2, 3)

In [406]:
test_pred.shape

(2, 3, 5)

In [411]:
# Use the logits built for this example (test_pred, shape (2, 3, 5)), not the
# unrelated variable `x` defined at the top of the notebook, whose rank does
# not match labels of shape (2, 3).
sparse_cross_entropy(test_true, test_pred)

<tf.Tensor 'Mean_3:0' shape=() dtype=float64>

## My explanation on sparse_cross_entropy

Sparse vectors save memory by not storing all the zeros of a vector. For example, let's say there is a corpus of `['apple', 'orange', 'is', 'fruit', 'sweet', 'ssss', 'eeee']` words. There are 7 words in the corpus. If we're predicting the word `orange` in our model and using one-hot encoding, it will be a vector of `[0, 1, 0, 0, 0, 0, 0]`. The `1` at the 2nd position indicates the word `orange`. For a small corpus, that's manageable. But if the corpus becomes very large, like `10,000` or `20,000` words, it's a waste of memory. So instead of using the complete vector, the sparse form helps to save memory. The sparse vector in the above example would be `[1]`: in Python the first index is `0`, so the word `orange` in sparse form is just `[1]`. We save an ample amount of memory. 

The same logic applies in `tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y_true, logits=y_pred)`. The labels `y_true` are in sparse form (class indices). The logits `y_pred`, on the other hand, are dense: they hold the actual predicted values, one per word in the corpus. Let's say we're predicting the description `apple is fruit` in an encoder-decoder model. The input for the decoder is `ssss apple is fruit eeee` and the decoder output is `apple is fruit eeee`. It looks like this. 

```
       encoder input
           |
ssss  [5] ---> apple [0]
apple [0] ---> is    [2]
is    [2] ---> fruit [3]
fruit [3] ---> eeee  [6]
```

This is a very simple illustration of an encoder-decoder model with an arbitrary tokenization of each word. You can see that the first word `ssss[5]` is converted to the word `apple`, whose token is `0`. How do we convert the value from `5` to `0`? Of course, there are many layers (embedding layers, GRUs) that carry out the operation. 

### Loss function 

Imagine our model predicts `is orange fruit` instead of `apple is fruit`; we want to penalise the model with a cost. How do we implement a loss function? Cross entropy is a mathematical operation that involves taking the log of the predicted value. The label is always conceptually a one-hot vector: whether it is stored as a complete one-hot vector or in sparse form only matters for saving memory. So always remember, the label is in one-hot form. The first thing to check is therefore the log function. What does it look like? 

<img src="img/log_graph.png" alt="Log graph" style="width: 300px;"/>

You can see that when the x value or predicted value is `0`, the penalty is very huge in the negative y value. When the x value is `1`, which is correctly predicted, the penalty is zero `0`. Since our predicted value lies between `0` and `1` on x-axis, you can ignore the log graph in the 1st quadrant, which is when the x-value is more than 1. Now that we understand the log function, we can simply imagine in the following scenario. 

Let's say the true value is `1` and our predicted value is `1`; the cost should be `0`. Taking the log of our predicted value `1` (either log10 or log2) gives `0`. When `log2(x=1) = 0` is multiplied by the true value `1`, it produces `0`. The cost is `0`. This is the basic concept of the loss function in cross entropy. Now imagine our predicted value is either `0.1` or `0.7`: which one gets the larger penalty? It's straightforward. The predicted value `0.1` gets the larger penalty because it's much farther away from the true value `1`: `log2(0.1) = -3.3219`, whereas `log2(0.7) = -0.5146`. If the negative sign bothers you, you can convert it to a positive sign by multiplying by `-1`. 

Now let's go back to our example. 

```
       encoder input
           |
ssss  [5] ---> apple [0], predicted -> is     [2]
apple [0] ---> is    [2], predicted -> orange [1]
is    [2] ---> fruit [3], predicted -> fruit  [3]
fruit [3] ---> eeee  [6], predicted -> eeee   [6] 
```

So the true label of the first word is supposed to be `apple`, but our model predicts `is[2]`. Let's dive into the loss function in detail. Suppose our model has an embedding layer with 7 words and 3 features, and a GRU with 5 units. The true label for the entire sentence `apple is fruit` is `np.array([[0, 2, 3]])`. The shape of the true label is `(1, 3)`: `1` because there is only one sample in this batch, and `3` because that is the length of the sequence (the description `apple is fruit` has 3 words). Look at the numbers carefully: `0, 2, 3`. They represent the indices of the true labels. It's in sparse vector form, which is why it's a bit confusing here. If it were in complete vector form, it would look like this. 

```
corpus in embedding layer 
['apple', 'orange', 'is', 'fruit', 'sweet', 'ssss', 'eeee']

apple --> [1, 0, 0, 0, 0, 0, 0] 
is    --> [0, 0, 1, 0, 0, 0, 0]
fruit --> [0, 0, 0, 1, 0, 0, 0]
```

Instead of using all those zeros to represent `apple is fruit`, the sparse format is just `[0, 2, 3]`, which saves space. But it can be confusing sometimes. 

Now let's look at our predicted value `is apple fruit`. Of course we didn't know our predicted words beforehand. All we receive is the floating points value, which we need to convert into integer so that we can then map into our corpus index, to translate into texts. 

```
                last output from the Dense layer            softmax 
1st word = [0.01, 0.002, 0.5, 0.3, 0.234, 0.05, 0.07] = [0.1203, 0.1193, 0.1963, 0.1607, 0.1505, 0.1252, 0.1277]
2nd word = [0.8, 0.01, 0.03, 0.001, 0.3, 0.2, 0.06]   = [0.2501, 0.1135, 0.1158, 0.1125, 0.1517, 0.1372, 0.1193]
3rd word = [0.05, 0.1, 0.07, 0.9, 0.2, 0.08, 0.001]   = [0.1169, 0.1229, 0.1192, 0.2735, 0.1358, 0.1204, 0.1112]
```

From the last output from the Dense layer, we convert them into softmax. We then take the log2 just to show you how it's slowly changing the value. 

Let's just take the 1st word for example. 

```
based on the corpus

['apple', 'orange', 'is' , 'fruit', 'sweet', 'ssss', 'eeee'] 
[0.1203,  0.1193 , 0.1963, 0.1607 , 0.1505 , 0.1252, 0.1277]
```
We can see that the highest value in softmax `0.1963` is the index for the word `is`. But the true value is `apple`, which is the first index. We can see that the cost will be huge because our model is predicting a wrong word. 

Cross Entropy in detail 
https://github.com/datasci-w266/2019-summer-assignment-kckenneth/blob/a1-submit/assignment/a1/information_theory.ipynb

```
def CrossEntropy(y_true, y_hat):
    return np.sum(- np.array(y_true) * np.log2(np.array(y_hat)))
```

`CrossEntropy` generates a penalty of `3.0553` for our wrong prediction of the 1st word. Let's jump to the 3rd word, `fruit`, which we predict correctly. For the 3rd word, the cost is `1.8704`. The cost becomes lower when we predict correctly. 

That's the idea of sparse_cross_entropy. 

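You might wonder why the cost is not zero even though we're predicting correctly. The cost is `-log2(p)`, where `p` is the softmax probability assigned to the true word, so it only reaches zero when `p = 1`. Here the model picks `fruit` with a probability of just `0.2735`, which gives `-log2(0.2735) = 1.8704`. Honestly, I'm just illustrating what the cost looks like with a small sample set rather than deriving cross entropy rigorously. I thought of converting the softmax values into just `0` and `1`, i.e. giving the maximum value `1` and all the others `0`. But this creates a `log2(0) = -inf` problem whenever the prediction is wrong. 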




In [425]:
import numpy as np

# Softmax by hand: exponentiate every value, then divide by the total.
sample = [0.05, 0.1, 0.07, 0.9, 0.2, 0.08, 0.001]
exp_ls = list(map(np.exp, sample))
sum_exp = sum(exp_ls)
result = [value / sum_exp for value in exp_ls]
print(result)

[0.11688265731814443, 0.12287535930618881, 0.11924384362185905, 0.27346414123914103, 0.13579827365329575, 0.12044226417401714, 0.11129346068735398]


In [419]:
np.log2([0.1203, 0.1193, 0.1963, 0.1607, 0.1505, 0.1252, 0.1277])

array([-3.05529145, -3.06733405, -2.34886792, -2.63755817, -2.73216461,
       -2.99769353, -2.96916957])

In [420]:
def CrossEntropy(y_true, y_hat):
    """Cross entropy, in bits, between a target and a predicted distribution.

    Computes -sum(y_true * log2(y_hat)).

    Args:
        y_true: target probabilities (e.g. a one-hot vector).
        y_hat: predicted probabilities; must broadcast against y_true.

    Returns:
        The cross entropy as a NumPy float.
    """
    y_true, y_hat = np.broadcast_arrays(np.asarray(y_true, dtype=float),
                                        np.asarray(y_hat, dtype=float))
    # Only classes with a non-zero target probability contribute. Skipping the
    # others applies the convention 0 * log(0) = 0, so a predicted probability
    # of exactly 0 for a non-target class no longer turns the result into NaN.
    support = y_true != 0
    return np.sum(-y_true[support] * np.log2(y_hat[support]))

In [431]:
# One-hot target and softmax values for the 1st word; defined here but not
# used by the call at the end of this cell.
y_true = [1, 0, 0, 0, 0, 0, 0] 
y_hat = [0.1203,  0.1193 , 0.1963, 0.1607 , 0.1505 , 0.1252, 0.1277]

# One-hot target for the 3rd word (index 3).
word_3rd = [0, 0, 0, 1, 0, 0, 0]
# The values below have already been passed through softmax.
# Strictly speaking, the TensorFlow op expects logits, i.e. the values before softmax.
predicted_3rd = [0.1169, 0.1229, 0.1192, 0.2735, 0.1358, 0.1204, 0.1112]
CrossEntropy(word_3rd, predicted_3rd)

1.8703872618695303