In [None]:
from keras.datasets import imdb
from keras.preprocessing.sequence import pad_sequences
import numpy as np
import gensim

# Dataset: IMDB Movie reviews sentiment classification

In [None]:
# Vocabulary cap handed to Keras (as `num_words + 2` below); the loader maps
# any token id at or above the cap to the OOV id.
num_words = 30000
# Offset the loader adds to every raw word index so the low ids stay reserved:
# idx 0 => PAD, idx 1 => START, idx 2 => OOV (out of vocab.)
INDEX_FROM = 3

# Pass the offset explicitly so the loader and the later decoding / embedding
# cells (which add or subtract INDEX_FROM) can never drift apart.
(training_data, training_targets), (testing_data, testing_targets) = imdb.load_data(
    num_words=num_words + 2,
    index_from=INDEX_FROM,
)

# Merge the predefined train/test splits into one pool of reviews and labels.
data = np.concatenate((training_data, testing_data), axis=0)
targets = np.concatenate((training_targets, testing_targets), axis=0)

In [None]:
# Collect every distinct token id that occurs anywhere in the corpus.
# NOTE: this rebinds `num_words` (previously the requested cap) to the observed count.
vocabulary_ids = np.unique(np.hstack(data))
num_words = len(vocabulary_ids)

label_values = np.unique(targets)
print("Categories:", label_values)
print("Number of unique words:", num_words)

In [None]:
# Token count of every review, then summary statistics over those counts.
length = list(map(len, data))
mean_length = np.mean(length)
std_length = np.std(length)
print("Average Review length:", mean_length)
print("Standard Deviation:", round(std_length))

In [None]:
# Peek at the first sample: its label and its raw token-id sequence.
first_label, first_review = targets[0], data[0]
print("Label:", first_label)
print(first_review)

# Traemos el vocabulario y armamos el índice reverso

In [None]:
# word -> integer index mapping that ships with the dataset.
index = imdb.get_word_index()
# Invert it so an integer index can be mapped back to its word.
reverse_index = {word_id: word for word, word_id in index.items()}

# Token ids in `data` are shifted by INDEX_FROM relative to the word index, so
# undo the shift before the lookup; ids with no matching word are shown as "#".
tokens = (reverse_index.get(token_id - INDEX_FROM, "#") for token_id in data[1])
decoded = " ".join(tokens)
print(decoded)

In [None]:
# Load the pretrained word vectors from a local file in binary word2vec format.
w2v = gensim.models.KeyedVectors.load_word2vec_format(
    "./GoogleNews-vectors-negative300.bin",
    binary=True,
)

In [None]:
# Index the KeyedVectors object directly: `load_word2vec_format` already returns
# KeyedVectors, and its `.wv` alias was deprecated in gensim 3.x and removed in 4.0.
w2v["car"]

In [None]:
embed_dim = 300  # width of each row; must match the width of the word2vec vectors
# Row i holds the vector for token id i. Rows stay all-zero for the reserved ids
# below INDEX_FROM and for words that are missing from word2vec.
embedding_matrix = np.zeros([num_words + 4, embed_dim])
for word, idx in index.items():
    # `idx` is the raw word index; token ids in `data` are shifted by INDEX_FROM.
    # Query the KeyedVectors object directly: its `.wv` alias no longer exists
    # in gensim >= 4.0, and direct access works on older versions as well.
    if idx <= num_words and word in w2v:
        embedding_matrix[idx + INDEX_FROM, :] = w2v[word]

embedding_matrix.shape

# Hacemos que todos los reviews tengan el mismo largo

In [None]:
maxlen = 1000  # target number of tokens per review for the padding step

In [None]:
# Bring every review to exactly `maxlen` token ids, filling with id 0 (PAD).
# The padding/truncating sides are spelled out; "pre" is the Keras default for both.
data = pad_sequences(
    data,
    maxlen=maxlen,
    padding="pre",
    truncating="pre",
    value=0.0,
)

In [None]:
# Length of the first review after the padding step (expected to equal `maxlen`).
len(data[0])

In [None]:
# Length of the second review after the padding step (expected to equal `maxlen`).
len(data[1])

In [None]:
# `pad_sequences` already returns a NumPy array, so `np.asarray` hands back that
# same array instead of duplicating the whole padded corpus as `np.array` would.
data = np.asarray(data)

In [None]:
# Dimensions of the padded dataset array.
data.shape

# Armamos el modelo con una Conv1D

In [None]:
from keras.layers import Embedding, Conv1D, MaxPooling1D, GlobalMaxPooling1D, Dropout, Dense, Flatten
from keras.models import Sequential
from keras import optimizers

In [None]:
# Size of the Embedding lookup table. It was left undefined (a pending TODO),
# which made the model cell fail with a NameError. Tie it to the embedding
# matrix so both always agree on how many token ids exist.
num_distinct_words = embedding_matrix.shape[0]
embedding_output_dims = embed_dim
max_sequence_length = maxlen

# Every token id fed to the Embedding layer must index a valid row.
assert data.max() < num_distinct_words, "token id outside the embedding table"

In [None]:
# Define the Keras model: embedding -> 1D convolution -> max pooling -> sigmoid
# output, with dropout after the embedding, after the convolution and before
# the final dense layer.
model = Sequential([
    # Map each token id to a dense vector of size `embedding_output_dims`.
    Embedding(num_distinct_words, embedding_output_dims, input_length=max_sequence_length),
    Dropout(0.50),
    Conv1D(filters=32, kernel_size=2, padding='same', activation='relu'),
    Dropout(0.50),
    # Halve the sequence length, then flatten for the dense layer.
    MaxPooling1D(pool_size=2),
    Flatten(),
    Dropout(0.50),
    # Single sigmoid unit producing the binary sentiment output.
    Dense(1, activation='sigmoid'),
])