<a href="https://colab.research.google.com/github/harenlin/Simple-Text-Generation/blob/main/Embedding_Layer_Example.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [17]:
import numpy as np
import pandas as pd
import tensorflow as tf
from keras.models import Model
from keras.layers import Dense, Flatten, Input, Embedding

# Toy corpus: ten short review snippets.
docs = [
    'Well done!',
    'Good work',
    'Great effort',
    'nice work',
    'Excellent!',
    'Weak',
    'Poor effort!',
    'not good',
    'poor work',
    'Could have done better.',
]
# Binary labels aligned with `docs`: the first five are 1, the last five are 0.
labels = [1] * 5 + [0] * 5

# Build the tokenizer that turns each document into a list of word indices.
from keras.preprocessing.text import Tokenizer
tokenizer = tf.keras.preprocessing.text.Tokenizer(
    num_words=50,
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
)
# Fitting builds the word <-> index lookup tables (word2idx & idx2word).
tokenizer.fit_on_texts(docs)
# One list of indices per document: shape = (# of docs, length of text).
sequences = tokenizer.texts_to_sequences(docs)
print(sequences)
# Number of distinct words counted while fitting on `docs`.
vocabulary_size = len(tokenizer.word_counts)
print("The size of vocab.txt is", vocabulary_size)

[[6, 2], [3, 1], [7, 4], [8, 1], [9], [10], [5, 4], [11, 3], [5, 1], [12, 13, 2, 14]]
The size of vocab.txt is 14


In [18]:
# Aside: attributes available on a fitted Tokenizer.
# (1) document_count -> number of documents; compared here with len(sequences)
print(tokenizer.document_count)
print(len(sequences))
# (2) word_index -> word2idx (indices start at 1)
# (3) index_word -> idx2word
# Round trip: look up the word stored at index 5, then map it back to its index.
sample_word = tokenizer.index_word[5]
print(sample_word)
print(tokenizer.word_index[sample_word])
# (4) word_counts -> dictionary of each word and its number of occurrences
print(tokenizer.word_counts)

10
10
poor
5
OrderedDict([('well', 1), ('done', 2), ('good', 2), ('work', 3), ('great', 1), ('effort', 2), ('nice', 1), ('excellent', 1), ('weak', 1), ('poor', 2), ('not', 1), ('could', 1), ('have', 1), ('better', 1)])


In [19]:
# Pad every document to one common length so the batch is rectangular.
# The length is derived from the corpus (4 here, set by docs[9]) instead of
# being hard-coded: pad_sequences truncates anything longer than `maxlen`,
# so a stale constant would silently drop words if `docs` ever changed.
from keras.preprocessing.sequence import pad_sequences
max_seq_len = max(len(seq) for seq in sequences)
# padding='post' appends the zeros after the real tokens.
sequences = pad_sequences(sequences, maxlen=max_seq_len, padding='post')
print(sequences)

# Minimal classifier used to illustrate what an Embedding layer does.
def embedding_model(vocabulary_size, hidden_dim, max_seq_len):
    """Build an Embedding -> Flatten -> Dense(sigmoid) binary classifier.

    Args:
        vocabulary_size: Number of rows in the embedding table (passed to
            Embedding as ``input_dim``).
        hidden_dim: Length of each embedding vector (``output_dim``).
        max_seq_len: Number of token indices in each input sample.

    Returns:
        An uncompiled ``Model`` that maps a batch of ``max_seq_len`` token
        indices to a single sigmoid output per sample.
    """
    # `(max_seq_len)` is only a parenthesised int, not a tuple; the trailing
    # comma gives Input the shape tuple it expects. The tensor is also no
    # longer called `input`, which shadowed the Python builtin.
    token_ids = Input(shape=(max_seq_len,))
    x = Embedding(input_dim=vocabulary_size, output_dim=hidden_dim, input_length=max_seq_len)(token_ids)
    # Collapse the (max_seq_len, hidden_dim) embeddings into one feature vector.
    x = Flatten()(x)
    x = Dense(1, activation='sigmoid')(x)
    model = Model(inputs=token_ids, outputs=x)
    return model

# hyper-parameters
hidden_dim = 32  # length of each learned embedding vector
# model construction: the embedding table gets vocabulary_size + 1 rows because
# word indices start at 1 and index 0 is used as the padding value
model = embedding_model(vocabulary_size+1, hidden_dim, max_seq_len)
# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the cell output
model.summary()
# compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
# fit the model
model.fit(sequences, np.array(labels), batch_size=4, epochs=5)

[[ 6  2  0  0]
 [ 3  1  0  0]
 [ 7  4  0  0]
 [ 8  1  0  0]
 [ 9  0  0  0]
 [10  0  0  0]
 [ 5  4  0  0]
 [11  3  0  0]
 [ 5  1  0  0]
 [12 13  2 14]]
Model: "model_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_5 (InputLayer)         [(None, 4)]               0         
_________________________________________________________________
embedding_4 (Embedding)      (None, 4, 32)             480       
_________________________________________________________________
flatten_3 (Flatten)          (None, 128)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 129       
Total params: 609
Trainable params: 609
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f5ac68cd810>

# How about using pre-trained GloVe embeddings?

In [20]:
import numpy as np
import pandas as pd
import tensorflow as tf
from keras.models import Model
from keras.layers import Dense, Flatten, Input, Embedding

# define documents
docs = [
    'Well done!', 'Good work', 'Great effort', 'nice work', 'Excellent!',
    'Weak', 'Poor effort!', 'not good', 'poor work', 'Could have done better.',
]
# define class labels (aligned with `docs`)
labels = [1,1,1,1,1,0,0,0,0,0]

# prepare tokenizer
from keras.preprocessing.text import Tokenizer
tokenizer = tf.keras.preprocessing.text.Tokenizer(
    num_words=50,
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
)  # encode sequences of words with index
tokenizer.fit_on_texts(docs)  # create a hash map of numbers and words, word2idx & idx2word
sequences = tokenizer.texts_to_sequences(docs)  # shape = (# of docs, length of text)
print(sequences)
vocabulary_size = len(tokenizer.word_counts)  # number of distinct words in `docs`
print("The size of vocab.txt is", vocabulary_size)

# Pad every document to one common length (PADDING).
# The length is derived from the corpus (4 here, set by docs[9]) instead of
# being hard-coded: pad_sequences truncates anything longer than `maxlen`,
# so a stale constant would silently drop words if `docs` ever changed.
from keras.preprocessing.sequence import pad_sequences
max_seq_len = max(len(seq) for seq in sequences)
# padding='post' appends the zeros after the real tokens.
sequences = pad_sequences(sequences, maxlen=max_seq_len, padding='post')
print(sequences)

[[6, 2], [3, 1], [7, 4], [8, 1], [9], [10], [5, 4], [11, 3], [5, 1], [12, 13, 2, 14]]
The size of vocab.txt is 14
[[ 6  2  0  0]
 [ 3  1  0  0]
 [ 7  4  0  0]
 [ 8  1  0  0]
 [ 9  0  0  0]
 [10  0  0  0]
 [ 5  4  0  0]
 [11  3  0  0]
 [ 5  1  0  0]
 [12 13  2 14]]


In [21]:
# hyper-parameters
hidden_dim = 50 # glove.6B.50d.txt "50d": must equal the width of the vectors in the file
# load the whole embedding into memory as word -> float32 coefficient vector
embeddings_index = dict()
# `with` releases the file handle even if a line fails to parse
with open('./glove.6B.50d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # skip blank lines instead of failing on values[0]
            continue
        word = values[0]
        coefs = np.array(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Loaded %s word vectors.' % len(embeddings_index))
# create a weight matrix for words in training docs:
# row i receives the GloVe vector of the word whose tokenizer index is i;
# row 0 and rows for words absent from GloVe keep their initial zeros
embedding_matrix = np.zeros((vocabulary_size+1, hidden_dim))
for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

Loaded 400000 word vectors.


In [22]:
# Minimal classifier whose Embedding layer is initialised from pre-trained vectors.
def embedding_model(vocabulary_size, hidden_dim, max_seq_len):
    """Build a frozen-Embedding -> Flatten -> Dense(sigmoid) binary classifier.

    The Embedding layer is initialised from the module-level
    ``embedding_matrix`` (which must already be defined when this function is
    called) and created with ``trainable=False``.

    Args:
        vocabulary_size: Number of rows in the embedding table (passed to
            Embedding as ``input_dim``).
        hidden_dim: Length of each embedding vector (``output_dim``).
        max_seq_len: Number of token indices in each input sample.

    Returns:
        An uncompiled ``Model`` that maps a batch of ``max_seq_len`` token
        indices to a single sigmoid output per sample.
    """
    # `(max_seq_len)` is only a parenthesised int, not a tuple; the trailing
    # comma gives Input the shape tuple it expects. The tensor is also no
    # longer called `input`, which shadowed the Python builtin.
    token_ids = Input(shape=(max_seq_len,))
    x = Embedding(
        input_dim=vocabulary_size,
        output_dim=hidden_dim,
        weights=[embedding_matrix],  # pre-trained vectors, read from the enclosing module
        input_length=max_seq_len,
        trainable=False,
    )(token_ids)
    # Collapse the (max_seq_len, hidden_dim) embeddings into one feature vector.
    x = Flatten()(x)
    x = Dense(1, activation='sigmoid')(x)
    model = Model(inputs=token_ids, outputs=x)
    return model

In [23]:
# model construction: the embedding table gets vocabulary_size + 1 rows because
# word indices start at 1 and index 0 is used as the padding value
model = embedding_model(vocabulary_size+1, hidden_dim, max_seq_len)
# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the cell output
model.summary()
# compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
# fit the model
model.fit(sequences, np.array(labels), batch_size=4, epochs=50)

Model: "model_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_6 (InputLayer)         [(None, 4)]               0         
_________________________________________________________________
embedding_5 (Embedding)      (None, 4, 50)             750       
_________________________________________________________________
flatten_4 (Flatten)          (None, 200)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 201       
Total params: 951
Trainable params: 201
Non-trainable params: 750
_________________________________________________________________
None
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/

<keras.callbacks.History at 0x7f5ac53be3d0>