# Tutorial
* https://stackabuse.com/python-for-nlp-word-embeddings-for-deep-learning-in-keras/

* https://machinelearningmastery.com/use-word-embedding-layers-deep-learning-keras/

References:
* https://keras.io/guides/sequential_model/

In [10]:
# linear algebra
import numpy as np
from numpy import array

# tokenize data: can use one_hot or Tokenizer
from keras.preprocessing.text import one_hot, Tokenizer
from nltk.tokenize import word_tokenize

# padding the arrays with zeros
from keras.preprocessing.sequence import pad_sequences

# # modeling
# from keras.models import Sequential # beginners API
# from keras.models import Model # advanced

# from keras.layers import Dense
# from keras.layers import Flatten
# from keras.layers.embeddings import Embedding

# from keras.models import Input

# Text and label data

In [11]:
# import list of text
corpus = [
    # Positive Reviews

    'This is an excellent movie',
    'The move was fantastic I like it',
    'You should watch it is brilliant',
    'Exceptionally good',
    'Wonderfully directed and executed I like it',
    'Its a fantastic series',
    'Never watched such a brillent movie',
    'It is a Wonderful movie',

    # Negtive Reviews

    "horrible acting",
    'waste of money',
    'pathetic picture',
    'It was very boring',
    'I did not like the movie',
    'The movie was horrible',
    'I will not recommend',
    'The acting is pathetic'
]

# create an array of labels: 1 for positive and 0 for negative
sentiments = np.array(np.ones(len(corpus)//2).tolist() + 
                      np.zeros(len(corpus)//2).tolist())

## Embedding vectors

### Encode my vectos

In [12]:
# preprocessing: create encoded arrays

from word_embedding_preprocessing import padding_encoded_vector_func

padded_sentences = padding_encoded_vector_func(corpus, extra_size=5)
padded_sentences

array([[24,  1, 35,  5, 20,  0,  0],
       [12, 31, 21, 10, 29, 41, 44],
       [25, 47, 27, 44,  1, 47,  0],
       [22,  4,  0,  0,  0,  0,  0],
       [35, 16, 45, 19, 29, 41, 44],
       [38, 34, 10,  7,  0,  0,  0],
       [42, 33, 25, 34,  1, 20,  0],
       [44,  1, 34, 40, 20,  0,  0],
       [23,  8,  0,  0,  0,  0,  0],
       [35, 10,  9,  0,  0,  0,  0],
       [30, 36,  0,  0,  0,  0,  0],
       [44, 21, 34, 28,  0,  0,  0],
       [29, 46, 22, 41, 12, 20,  0],
       [12, 20, 21, 23,  0,  0,  0],
       [29, 10, 22,  4,  0,  0,  0],
       [12,  8,  1, 30,  0,  0,  0]], dtype=int32)

In [13]:
vocab_size = 50

### Load pre-trained vectors: GloVe

In [14]:
from load_glove import load_embedding_matrix

path_glove = '/Users/liviaclarete/Downloads/glove.6B/glove.6B.100d.txt'

embedding_matrix = load_embedding_matrix(corpus, vocab_size, path_glove)

In [15]:
embedding_matrix.shape

(50, 100)

## Modeling

__Interpreting the model.summary()__

Params #: number of trainable parameters from each layer

* embedding_14: product of the vocabulary size (50) and the number of dimentional vector (20): 50 * 20 = 1000

In [16]:
from word_embedding_model import create_sequencial_model
from word_embedding_model import create_functional_model

# instantiate the model sequencial model
learn_rate = 0.0001
model = create_sequencial_model(corpus, 50, learn_rate)
model.summary()

model2 = create_functional_model(corpus, 50, embedding_matrix)

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 7, 20)             1000      
_________________________________________________________________
flatten_2 (Flatten)          (None, 140)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 141       
Total params: 1,141
Trainable params: 1,141
Non-trainable params: 0
_________________________________________________________________


In [17]:
# SEQUENCIAL MODEL

# fit the models

epochs = 2
model.fit(
    # array of encoded padded array (completed with zeros)
    padded_sentences, 
    # labels
    sentiments, 
    # number of epochs
    epochs=epochs, 
    verbose=1)


# evaluate the models
# in this case, the evaluation is being evaluated with the
# same data. In real world it should be different

loss, accuracy = model.evaluate(
    # padded array for the test set
    padded_sentences, 
    # labels for the test set
    sentiments, 
    verbose=1)

print('Loss:', round(loss))

# In the output, that model accuracy is 1.00 i.e. 100 percent.
print('Accuracy: ', accuracy)

Epoch 1/2
Epoch 2/2
Loss: 1
Accuracy:  0.5625


In [18]:
# FUNCTIONAL MODELS

# fit the models
model2.fit(
    # array of encoded padded array (completed with zeros)
    padded_sentences, 
    # labels
    sentiments, 
    # number of epochs
    epochs=epochs, 
    verbose=1)


# evaluate the models
# in this case, the evaluation is being evaluated with the
# same data. In real world it should be different

loss, accuracy = model.evaluate(
    # padded array for the test set
    padded_sentences, 
    # labels for the test set
    sentiments, 
    verbose=1)

print('Loss:', round(loss))

# In the output, that model accuracy is 1.00 i.e. 100 percent.
print('Accuracy: ', accuracy)

Epoch 1/2
Epoch 2/2
Loss: 1
Accuracy:  0.5625
