In [10]:
import tensorflow as tf
from tensorflow.keras.datasets import imdb
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense

In [13]:
import requests
requests.packages.urllib3.disable_warnings()
import ssl

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    # Legacy Python that doesn't verify HTTPS certificates by default
    pass
else:
    # Handle target environment that doesn't support HTTPS verification
    ssl._create_default_https_context = _create_unverified_https_context

In [11]:
# number of words to be considered, selects based on frequency
# too big of a vocab size can lead to overfitting
vocab_size = 10000

# max number of words in each sequence
max_length = 250

In [14]:
# load the IMDB dataset
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=vocab_size)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz


In [15]:
# pads the shorter sequences with zeroes, and truncates the sequences that are too long
x_train = tf.keras.preprocessing.sequence.pad_sequences(x_train, maxlen=max_length)
x_test = tf.keras.preprocessing.sequence.pad_sequences(x_test, maxlen=max_length)

In [17]:
# define the RNN
# RNNs are for sequences of data, CNNs are for grids of data 
model = Sequential([
    
    # translates words to number vectors
    # output_dim is number of features
    Embedding(input_dim=vocab_size, output_dim=128, input_length=max_length),

    # captures relationships between neurons
    SimpleRNN(units=64, return_sequences=True),
    
    SimpleRNN(units=32),
    
    Dense(units=1, activation='sigmoid')
])

# compile
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [18]:
# train
model.fit(x_train, y_train, batch_size=32, epochs=5, validation_split=0.2)

# evaluate
loss, accuracy = model.evaluate(x_test, y_test)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [19]:
prediction = model.predict(x_test)



In [20]:
prediction

array([[0.0890338 ],
       [0.9636415 ],
       [0.3486933 ],
       ...,
       [0.13828507],
       [0.25446862],
       [0.90883505]], dtype=float32)

In [39]:
index = imdb.get_word_index()
reverse_index = dict([(value, key) for (key, value) in index.items()]) 
decoded = " ".join( [reverse_index.get(i - 3, "#") for i in x_test[1000]] )
print(decoded) 

# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # this is one of the worst movies i have ever seen the # for the film is better than the film itself my girlfriend and i watched it this past weekend and we only continued to watch it in the hopes that it would get better it didn't br br the picture quality is poor it looks like it was shot on video and transferred to film the lighting is not great which makes it harder to read the actors' facial expressions the acting itself was cheesy but i guess it's acceptable for yet another teenage horror flick the sound was a huge problem sometimes you have to # the video because the sound is unclear and or # br br it holds no real merit of it's own trying to ride on the # of sleepy hollow don't bother with this one


In [38]:
prediction[1000]

array([0.07530642], dtype=float32)