# Keras Embedding Layer
* Keras offers an Embedding layer that can be used for neural networks on text data. It requires that the input data be integer encoded, so that each word is represented by a unique integer.
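* The Embedding layer is defined as the first hidden layer of a network. It is configured with three arguments (the third, `input_length`, is only strictly required when the embedding output is flattened and passed to a Dense layer, as in the examples below):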
1. **input_dim** : This is the size of the vocabulary in the text data. For example, if your data is integer encoded to values between 0 and 10 (inclusive), then the size of the vocabulary would be 11 words.
2. **output_dim** : This is the size of the vector space in which words will be embedded. It defines the size of the output vectors from this layer for each word. For example, it could be 32 or 100 or even larger. Test different values for your problem.
3. **input_length** : This is the length of input sequences, as you would define for any input layer of a Keras model. For example, if all of your input documents consist of 1000 words, this would be 1000.

## Example of Learning an Embedding


In [4]:
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers.embeddings import Embedding
import numpy as np



# Training corpus: ten short feedback phrases.
docs = [
    "Well done!",
    "Good work",
    "Great effort",
    "nice work",
    "Excellent!",
    "Weak",
    "Poor effort!",
    "not good",
    "poor work",
    "Could have done better.",
]

# One label per document, in the same order:
# 1 for the first five phrases, 0 for the last five.
labels = [1, 1, 1, 1, 1, 0, 0, 0, 0, 0]

# Integer-encode every document. vocab_size is the size of the integer range
# handed to one_hot, not a count of distinct words, so two different words
# can end up sharing the same integer.
vocab_size = 50
encoded_docs = [one_hot(text, vocab_size) for text in docs]
print(encoded_docs)

# Bring every sequence to exactly max_length integers; padding='post'
# appends zeros after the real tokens of shorter documents.
max_length = 4
padded_docs = pad_sequences(
    encoded_docs,
    maxlen=max_length,
    padding='post',
)
print(padded_docs)

# Network: learned 8-dimensional embedding per token -> flatten the
# (max_length, 8) block into one vector -> single sigmoid output unit.
model = Sequential([
    Embedding(vocab_size, 8, input_length=max_length),
    Flatten(),
    Dense(1, activation='sigmoid'),
])

# Binary classification setup: Adam optimizer, cross-entropy loss,
# accuracy reported as the metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['acc'],
)

# Print the layer-by-layer summary.
model.summary()

# Convert inputs and targets to arrays once and reuse them below.
features = np.array(padded_docs)
targets = np.array(labels)

# Train silently for 50 epochs.
model.fit(features, targets, epochs=50, verbose=0)

# Score the model on the same data it was trained on, so the figure
# printed here is training accuracy.
loss, accuracy = model.evaluate(features, targets, verbose=0)
accuracy_percent = accuracy * 100
print('Accuracy: %f' % accuracy_percent)

[[6, 46], [12, 19], [29, 5], [17, 19], [13], [6], [45, 5], [4, 12], [45, 19], [17, 40, 46, 23]]
[[ 6 46  0  0]
 [12 19  0  0]
 [29  5  0  0]
 [17 19  0  0]
 [13  0  0  0]
 [ 6  0  0  0]
 [45  5  0  0]
 [ 4 12  0  0]
 [45 19  0  0]
 [17 40 46 23]]
Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 4, 8)              400       
_________________________________________________________________
flatten_3 (Flatten)          (None, 32)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 33        
Total params: 433
Trainable params: 433
Non-trainable params: 0
_________________________________________________________________
Accuracy: 80.000001


## Example of Using Pre-Trained GloVe Embedding


In [9]:
from numpy import asarray
from numpy import zeros
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding

# Training corpus: ten short feedback phrases.
docs = [
    "Well done!",
    "Good work",
    "Great effort",
    "nice work",
    "Excellent!",
    "Weak",
    "Poor effort!",
    "not good",
    "poor work",
    "Could have done better.",
]

# One label per document, in the same order:
# 1 for the first five phrases, 0 for the last five.
labels = [1, 1, 1, 1, 1, 0, 0, 0, 0, 0]

# Fit a tokenizer on the corpus so every distinct word gets its own index.
t = Tokenizer()
t.fit_on_texts(docs)
# Word indexes start at 1 and 0 is used for padding, hence the extra slot.
vocab_size = 1 + len(t.word_index)

# Replace each document by the list of its word indexes.
encoded_docs = t.texts_to_sequences(docs)
print(encoded_docs)

# Bring every sequence to exactly max_length integers; padding='post'
# appends zeros after the real tokens of shorter documents.
max_length = 4
padded_docs = pad_sequences(
    encoded_docs,
    maxlen=max_length,
    padding='post',
)
print(padded_docs)

# load the whole embedding into memory
# Each non-empty line is parsed as "<word> <float> <float> ...": the first
# whitespace-separated token is the word, the rest are its vector components.
import warnings

glove_path = 'glove.6B.100d.txt'
embeddings_index = dict()
# `with` closes the file even if a line fails to parse.
with open(glove_path, mode='rt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Blank line: there is no word token, so values[0] would raise.
            continue
        word = values[0]
        coefs = asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Loaded %s word vectors.' % len(embeddings_index))
if not embeddings_index:
    # An empty or wrong file would otherwise go unnoticed: every row of the
    # weight matrix below would stay zero and the model could not learn
    # anything from the embedding.
    warnings.warn(
        'No word vectors were loaded from %r; the embedding matrix will be '
        'all zeros.' % glove_path
    )

# create a weight matrix for words in training docs
# Row i holds the pre-trained vector of the word whose tokenizer index is i.
# Row 0 (padding) and rows for words missing from the embedding file stay zero.
embedding_matrix = zeros((vocab_size, 100))
for word, i in t.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
        
# define model
model = Sequential()
# The embedding width is read from the weight matrix and the sequence length
# from max_length, so the layer cannot drift out of sync with the prepared
# data. trainable=False keeps the pre-trained vectors fixed during training.
e = Embedding(
    vocab_size,
    embedding_matrix.shape[1],
    weights=[embedding_matrix],
    input_length=max_length,
    trainable=False,
)
model.add(e)
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))

# compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

# summarize the model
model.summary()

# Convert once with asarray, which this cell imports; `np` is not imported
# here, so np.array only worked when an earlier cell had already defined it.
features = asarray(padded_docs)
targets = asarray(labels)

# fit the model
model.fit(features, targets, epochs=50, verbose=0)

# evaluate the model (on the training data itself)
loss, accuracy = model.evaluate(features, targets, verbose=0)
print('Accuracy: %f' % (accuracy * 100))

[[6, 2], [3, 1], [7, 4], [8, 1], [9], [10], [5, 4], [11, 3], [5, 1], [12, 13, 2, 14]]
[[ 6  2  0  0]
 [ 3  1  0  0]
 [ 7  4  0  0]
 [ 8  1  0  0]
 [ 9  0  0  0]
 [10  0  0  0]
 [ 5  4  0  0]
 [11  3  0  0]
 [ 5  1  0  0]
 [12 13  2 14]]
Loaded 0 word vectors.
Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 4, 100)            1500      
_________________________________________________________________
flatten_6 (Flatten)          (None, 400)               0         
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 401       
Total params: 1,901
Trainable params: 401
Non-trainable params: 1,500
_________________________________________________________________
Accuracy: 50.000000
