# Word embedding
Word embedding is a family of feature-learning techniques in NLP in which words or phrases from the vocabulary are mapped to vectors of real numbers. It maps each word from a sparse, high-dimensional space (one dimension per vocabulary word, as in one-hot encoding) to a dense vector space of much lower dimension.

An embedding matrix of weights is learned during training.

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Apply seaborn's default theme first, then switch the axes style to 'white'.
sns.set()
sns.set_style('white')

In [2]:
# Vocabulary cap passed to the IMDB loader, and the fixed review length used for padding later on.
vocab_size = 1000
max_len = 50

# load_data returns a (train, test) pair, each of which is a (sequences, labels) pair.
imdb_train, imdb_test = tf.keras.datasets.imdb.load_data(num_words=vocab_size)
train_sequences, train_labels = imdb_train
test_sequences, test_labels = imdb_test

# Number of training reviews; used below to build mini-batches.
n_sample = len(train_labels)

In [3]:
# The raw IMDB index is shifted by 3 so that the low ids are free for the special tokens below.
word_index = tf.keras.datasets.imdb.get_word_index()
word_index = {k:(v+3) for k,v in word_index.items()}
# Id 0 is the fill value pad_sequences uses by default; registering it here lets
# padded sequences be decoded through reverse_word_index without a KeyError.
word_index["<PAD>"] = 0
word_index["<START>"] = 1
word_index["<UNK>"] = 2
word_index["<UNUSED>"] = 3
# Inverse mapping (id -> token) for turning encoded reviews back into text.
reverse_word_index = {value: key for key, value in word_index.items()}

In [4]:
# Show the first training review as token ids, then decoded back into words.
train_ex_sequence = train_sequences[0]
print(train_ex_sequence)
train_ex_text = list(map(reverse_word_index.__getitem__, train_ex_sequence))
print(train_ex_text)

[1, 14, 22, 16, 43, 530, 973, 2, 2, 65, 458, 2, 66, 2, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 2, 2, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2, 19, 14, 22, 4, 2, 2, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 2, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2, 2, 16, 480, 66, 2, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 2, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 2, 15, 256, 4, 2, 7, 2, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 2, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2, 56, 26, 141, 6, 194, 2, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 2, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 2, 88, 12, 16, 283, 5, 16, 2, 113, 103, 32, 15, 16, 2, 19, 178, 32]
['<START>', 'this', 'film', 'was', 'just', 'brilliant', 'casting', '<UNK>', '<UNK>', 'story', 'direction', '<UNK>', 'really', '<U

In [5]:
# Bring every review to exactly max_len tokens; reviews that are too long lose their tail.
pad_kwargs = dict(maxlen=max_len, truncating='post')
train_sequences = pad_sequences(train_sequences, **pad_kwargs)
test_sequences = pad_sequences(test_sequences, **pad_kwargs)

### A naive implementation using TensorFlow

In [6]:
batch_size = 512
# Ceiling division: one extra batch only when n_sample is not an exact multiple of batch_size.
batch_num = int(np.ceil(n_sample / batch_size))
print('batch num = %d' % batch_num)

# Consecutive, non-overlapping index ranges that together cover [0, n_sample) exactly once.
# The final range is clipped to n_sample, so it may be shorter than batch_size.
batch_indices = [
    np.arange(start, min(start + batch_size, n_sample))
    for start in range(0, n_sample, batch_size)
]

batch num = 49


In [7]:
# Expand each token id into a one-hot vector of length vocab_size along a new last axis.
# A single session inside a `with` block is used so its resources are released afterwards;
# the two arrays are evaluated one after the other rather than in a single run call.
with tf.Session() as sess:
    train_sequences_onehot = sess.run(
        tf.one_hot(tf.constant(train_sequences), depth=vocab_size, axis=-1))
    test_sequences_onehot = sess.run(
        tf.one_hot(tf.constant(test_sequences), depth=vocab_size, axis=-1))

In [13]:
embedding_dim = 3

# Clear the default graph so the named variables below can be created again when this cell is re-run.
tf.reset_default_graph()

# Inputs: one-hot encoded reviews and one label per review.
x = tf.placeholder(tf.float32, shape=(None, max_len, vocab_size))
y = tf.placeholder(tf.float32, shape=(None,))

# Parameters: the embedding matrix, followed by a single linear unit over the flattened embeddings.
w1 = tf.get_variable(
    'embedding_weights',
    shape=(vocab_size, embedding_dim),
    dtype=tf.float32,
    initializer=tf.contrib.layers.xavier_initializer(),
)
w2 = tf.get_variable(
    'w_linear',
    shape=(max_len * embedding_dim, 1),
    dtype=tf.float32,
    initializer=tf.contrib.layers.xavier_initializer(),
)
b2 = tf.get_variable('b_linear', shape=(1, 1), dtype=tf.float32, initializer=tf.zeros_initializer())

# Forward pass: multiplying a one-hot row by w1 selects that token's embedding row.
e = tf.matmul(x, w1)
e_f = tf.contrib.layers.flatten(e)
z = tf.reshape(tf.matmul(e_f, w2) + b2, [-1])

# Predicted probability, thresholded at 0.5 to get a class, then compared with the labels.
a = tf.nn.sigmoid(z)
y_p = tf.cast(a > 0.5, tf.int32)
accuracy = tf.reduce_mean(tf.cast(tf.equal(y_p, tf.cast(y, tf.int32)), tf.float32))

# The loss is computed from the logits z rather than from the sigmoid output a.
cost = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=z, labels=y))
optimizer = tf.train.AdamOptimizer(learning_rate=0.01).minimize(cost)

n_epoch = 5
print_cost = True

# Feed dictionaries for evaluating on the complete train / test sets.
train_feed = {x: train_sequences_onehot, y: train_labels}
test_feed = {x: test_sequences_onehot, y: test_labels}

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    cost_val = sess.run(cost, feed_dict=train_feed)
    print("initial cost: %.4f" % (cost_val))
    for epoch in range(1, n_epoch + 1):
        # One optimizer step per mini-batch.
        for batch_index in batch_indices:
            batch_feed = {
                x: train_sequences_onehot[batch_index],
                y: train_labels[batch_index],
            }
            sess.run(optimizer, feed_dict=batch_feed)

        if not print_cost:
            continue
        # Training cost and accuracy are fetched together in a single run call.
        train_cost, train_acc = sess.run([cost, accuracy], feed_dict=train_feed)
        test_acc = sess.run(accuracy, feed_dict=test_feed)
        print("cost after epoch %d: %.4f, train_acc: %.4f, test_acc: %.4f" % (epoch, train_cost, train_acc, test_acc))

initial cost: 0.6933
cost after epoch 1: 0.5669, train_acc: 0.7131, test_acc: 0.6954
cost after epoch 2: 0.4812, train_acc: 0.7661, test_acc: 0.7363
cost after epoch 3: 0.4613, train_acc: 0.7769, test_acc: 0.7374
cost after epoch 4: 0.4507, train_acc: 0.7835, test_acc: 0.7359
cost after epoch 5: 0.4434, train_acc: 0.7894, test_acc: 0.7343


### Keras implementation

In [14]:
# Embedding -> Flatten -> one sigmoid unit, assembled layer by layer.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_len))
model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

# The test split is passed as validation data, so it is evaluated after every epoch.
num_epochs = 5
model.fit(
    train_sequences,
    train_labels,
    epochs=num_epochs,
    validation_data=(test_sequences, test_labels),
)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 50, 3)             3000      
_________________________________________________________________
flatten_1 (Flatten)          (None, 150)               0         
_________________________________________________________________
dense (Dense)                (None, 1)                 151       
Total params: 3,151
Trainable params: 3,151
Non-trainable params: 0
_________________________________________________________________
Train on 25000 samples, validate on 25000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x1a523d6b00>

In [15]:
ex_sentence = 'this movie is excellent'
# Words missing from the IMDB index, and words whose id falls outside the vocab_size cut-off
# used when loading the data, are mapped to <UNK>. This avoids a KeyError on unknown words
# and keeps every id inside the range the Embedding layer was built for.
unk_id = word_index["<UNK>"]
ex_tokens = [word_index.get(word, unk_id) for word in ex_sentence.split()]
ex_tokens = [token_id if token_id < vocab_size else unk_id for token_id in ex_tokens]
# pad_sequences expects a batch of sequences, hence the wrapping list.
ex_sequence = pad_sequences([ex_tokens], maxlen=max_len, truncating='post')
model.predict(ex_sequence)

array([[0.67168754]], dtype=float32)

### Visualize
Run the following code, then go to https://projector.tensorflow.org/ and upload the generated `vecs.tsv` and `meta.tsv` files.

In [11]:
import io

# The first layer of the Keras model is the Embedding layer; its only weight is the embedding matrix.
embedding_layer = model.layers[0]
weights = embedding_layer.get_weights()[0]

# Context managers close both files even if a lookup or write fails part-way through.
with io.open('vecs.tsv', 'w', encoding='utf-8') as out_v, \
        io.open('meta.tsv', 'w', encoding='utf-8') as out_m:
    # Start at 1: id 0 is the fill value produced by pad_sequences, not a word.
    for i in range(1, vocab_size):
        word = reverse_word_index[i]
        embeddings = weights[i]
        # meta.tsv holds one token per line; vecs.tsv holds the matching tab-separated vector.
        out_m.write(word + "\n")
        out_v.write('\t'.join([str(value) for value in embeddings]) + "\n")

## Why word embedding?
