In [1]:
import numpy as np
import pandas as pd
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
# fix random seed for reproducibility
np.random.seed(42)

ModuleNotFoundError: No module named 'keras'

In [2]:
# load the dataset but only keep the top n words, zero the rest
N_WORDS = 5000
INDEX_FROM = 0
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=N_WORDS, index_from=INDEX_FROM)

Downloading data from https://s3.amazonaws.com/text-datasets/imdb.npz


In [3]:
print("Train data shapes:" , X_train.shape, y_train.shape)

Train data shapes: (25000,) (25000,)


In [4]:
print("Test data shapes:" , X_test.shape, y_test.shape)

Test data shapes: (25000,) (25000,)


In [5]:
pd.DataFrame(y_train)[0].value_counts()

1    12500
0    12500
Name: 0, dtype: int64

### Print the text

In [8]:
word_to_id = imdb.get_word_index()
word_to_id = {k:(v+INDEX_FROM) for k,v in word_to_id.items()}
id_to_word = {value:key for key,value in word_to_id.items()}

Downloading data from https://s3.amazonaws.com/text-datasets/imdb_word_index.json


In [10]:
ROW = 0
print(' '.join(id_to_word[id] for id in X_train[ROW] ))

the this film was just brilliant casting location scenery story direction everyone's really suited the part they played and you could just imagine being there robert and is an amazing actor and now the same being director and father came from the same scottish island as myself so i loved the fact there was a real connection with this film the witty remarks throughout the film were great it was just brilliant so much that i bought the film as soon as it was released for and and would recommend it to everyone to watch and the fly and was amazing really cried at the end it was so sad and you know what they say if you cry at a film it must have been good and this definitely was also and to the two little and that played the and of norman and paul they were just brilliant children are often left out of the and list i think because the stars that play them all grown up are such a big and for the whole film but these children are amazing and should be and for what they have done don't you thi

### Pad the sequences

In [0]:
max_review_length = 250
X_train = sequence.pad_sequences(X_train, maxlen=max_review_length)
X_test = sequence.pad_sequences(X_test, maxlen=max_review_length)

### Build the model

In [23]:
# create the model
embedding_vector_length = 32
model = Sequential()
model.add(Embedding(N_WORDS, embedding_vector_length, input_length=max_review_length))
model.add(Dropout(0.5))
model.add(LSTM(100))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=3, batch_size=64)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 250, 32)           160000    
_________________________________________________________________
dropout_1 (Dropout)          (None, 250, 32)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dropout_2 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 101       
Total params: 213,301
Trainable params: 213,301
Non-trainable params: 0
_________________________________________________________________
None
Train on 25000 samples, validate on 25000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f913ac635c0>

In [0]:
model.predict_proba(X_test)

array([[0.18707702],
       [0.9915855 ],
       [0.88425034],
       ...,
       [0.05354523],
       [0.12668486],
       [0.86854273]], dtype=float32)

### Let's Play

In [0]:
review = "i liked this movie a lot the characters are deep and well developed"
raw_review_list = review.split()
print(raw_review_list)

['i', 'liked', 'this', 'movie', 'a', 'lot', 'the', 'characters', 'are', 'deep', 'and', 'well', 'developed']


In [0]:
# clean list
review_list = []
for x in raw_review_list:
    if x in list(word_to_id.keys()):
        review_list.append(x)

In [0]:
ids_from_words = [word_to_id[x] for x in review_list]
ids_from_words

[10, 420, 11, 17, 3, 173, 1, 102, 23, 930, 2, 70, 1388]

In [0]:
# clean ids_from_words
review_ids_from_words = []
for x in ids_from_words:
    if np.sum(np.isin(X_train, x)) > 0:
        review_ids_from_words.append(x)

In [0]:
# simulate the np.array to be like the train set
review_array = np.array(review_ids_from_words).reshape((1,len(review_ids_from_words)))
review_array

array([[  10,  420,   11,   17,    3,  173,    1,  102,   23,  930,    2,
          70, 1388]])

In [0]:
processed_review = sequence.pad_sequences(review_array, maxlen=max_review_length)
processed_review

array([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0, 

In [0]:
model.predict_proba(processed_review)

array([[0.94799393]], dtype=float32)

### Create a function

In [0]:
def sentimentator(review):
    raw_review_list = review.split()
    # clean list
    review_list = []
    for x in raw_review_list:
        if x in list(word_to_id.keys()):
            review_list.append(x)

    ids_from_words = [word_to_id[x] for x in review_list]

    # clean ids_from_words
    review_ids_from_words = []
    for x in ids_from_words:
        if np.sum(np.isin(X_train, x)) > 0:
            review_ids_from_words.append(x)
    
    print(review_ids_from_words)
    review_array = np.array(review_ids_from_words).reshape((1,len(review_ids_from_words)))
    processed_review = sequence.pad_sequences(review_array, maxlen=max_review_length)
    return model.predict_proba(processed_review)

In [0]:
#new_review = "hate this movie it is not good and the acting is really bad"
new_review = "That movie is fucking good"

In [0]:
sentimentator(new_review)

[17, 6, 49]


array([[0.6352975]], dtype=float32)

In [0]:
new_review2 = "i did not like this shitty movie acting is bad as fuck"
sentimentator(new_review2)

[10, 119, 21, 37, 11, 17, 113, 6, 75, 14]


array([[0.13691352]], dtype=float32)

### Usando convoluções
- imagine uma janela "rolando" por cima do texto, o que acontece com o modelo?
- O numero de parametros aumenta ou diminui?
- Quão bom ficou o modelo?
- Qual foi o tempo de treinamento? Qual modelo é mais rápido de treinar?

In [0]:
## Importe:
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D

In [0]:
# create the model
embedding_vector_length = 32
model = Sequential()
model.add(Embedding(N_WORDS, embedding_vector_length, input_length=max_review_length))
model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.5))
model.add(LSTM(100))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=3, batch_size=64)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 250, 32)           160000    
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 250, 32)           3104      
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 125, 32)           0         
_________________________________________________________________
dropout_3 (Dropout)          (None, 125, 32)           0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dropout_4 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 101       
Total para

<keras.callbacks.History at 0x1e5e3506390>