# Embeddings

An example of how to create atextembedding in keras.

Please complete the code in the where you find the "# complete here" commnet.

In [2]:
from numpy import array
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers.embeddings import Embedding
import pandas as pd

# define the corpus
corpus = ['This is good pizza',
        'I love Italian pizza',
        'The best pizza',
        'nice pizza',
        'Excellent pizza',
        'I love pizza',
        'The pizza was alright',
        'disgusting pineapple pizza',
        'not good pizza',
        'bad pizza',
        'very bad pizza',
        'I had better pizza']


# creating class labels for our 
labels = array([1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0])

output_dim = 8
pd.DataFrame({'text': corpus, 'sentiment':labels})

Using TensorFlow backend.


Unnamed: 0,text,sentiment
0,This is good pizza,1
1,I love Italian pizza,1
2,The best pizza,1
3,nice pizza,1
4,Excellent pizza,1
5,I love pizza,1
6,The pizza was alright,0
7,disgusting pineapple pizza,0
8,not good pizza,0
9,bad pizza,0


In [4]:
# we extract the vocabulary from our corpus
sentences = [voc.split() for voc in corpus]
vocabulary = set([word for sentence in sentences for word in sentence])

vocab_size = len(vocabulary)
encoded_corpus = [one_hot(d, vocab_size) for d in corpus]
encoded_corpus


[[1, 2, 5, 15],
 [1, 16, 10, 15],
 [17, 12, 15],
 [16, 15],
 [14, 15],
 [1, 16, 15],
 [17, 15, 15, 2],
 [18, 7, 15],
 [8, 5, 15],
 [11, 15],
 [3, 11, 15],
 [1, 14, 17, 15]]

In [13]:
vocabulary

{'Excellent',
 'I',
 'Italian',
 'The',
 'This',
 'alright',
 'bad',
 'best',
 'better',
 'disgusting',
 'good',
 'had',
 'is',
 'love',
 'nice',
 'not',
 'pineapple',
 'pizza',
 'very',
 'was'}

In [5]:
# we now pad the documents to  
# the max length of the longest sentences
# to have an uniform length
max_length = 5
padded_docs = pad_sequences(encoded_corpus, maxlen=max_length, padding='post')
print(padded_docs)


[[ 1  2  5 15  0]
 [ 1 16 10 15  0]
 [17 12 15  0  0]
 [16 15  0  0  0]
 [14 15  0  0  0]
 [ 1 16 15  0  0]
 [17 15 15  2  0]
 [18  7 15  0  0]
 [ 8  5 15  0  0]
 [11 15  0  0  0]
 [ 3 11 15  0  0]
 [ 1 14 17 15  0]]


In [6]:

# model definition
model = Sequential()
model.add(Embedding(vocab_size, output_dim, input_length=max_length, name='embedding'))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
# compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
# summarize the model
print(model.summary())
# fit the model
model.fit(padded_docs, labels, epochs=50, verbose=0)
# evaluate the model

loss, accuracy = model.evaluate(padded_docs, labels, verbose=0)
print('Accuracy: %f' % (accuracy * 100))


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 5, 8)              160       
_________________________________________________________________
flatten_1 (Flatten)          (None, 40)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 41        
Total params: 201
Trainable params: 201
Non-trainable params: 0
_________________________________________________________________
None
Accuracy: 91.666669


In [4]:
help(Embedding)

Help on class Embedding in module keras.layers.embeddings:

class Embedding(keras.engine.base_layer.Layer)
 |  Turns positive integers (indexes) into dense vectors of fixed size.
 |  eg. [[4], [20]] -> [[0.25, 0.1], [0.6, -0.2]]
 |  
 |  This layer can only be used as the first layer in a model.
 |  
 |  # Example
 |  
 |  ```python
 |    model = Sequential()
 |    model.add(Embedding(1000, 64, input_length=10))
 |    # the model will take as input an integer matrix of size (batch, input_length).
 |    # the largest integer (i.e. word index) in the input should be
 |    # no larger than 999 (vocabulary size).
 |    # now model.output_shape == (None, 10, 64), where None is the batch dimension.
 |  
 |    input_array = np.random.randint(1000, size=(32, 10))
 |  
 |    model.compile('rmsprop', 'mse')
 |    output_array = model.predict(input_array)
 |    assert output_array.shape == (32, 10, 64)
 |  ```
 |  
 |  # Arguments
 |      input_dim: int > 0. Size of the vocabulary,
 |          i.