In [None]:
import tensorflow as tf
import tensorflow_hub as hub
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
import numpy as np
from tensorflow import keras
from tensorflow.keras.layers import Dense, Dropout, Bidirectional, GRU, LSTM, Embedding

Definition of a plot function for training result visualization

In [None]:
def plot_results(history):
    """Plot accuracy and loss curves from a Keras training history.

    Draws two stacked panels sharing the epoch axis: accuracy on top and
    loss below, each with the validation and the training curve.

    Parameters
    ----------
    history : object
        Any object exposing a ``history`` attribute that maps metric names
        to per-epoch value lists (e.g. the return value of ``model.fit``).
        The keys ``loss``, ``accuracy``, ``val_loss`` and ``val_accuracy``
        are plotted; the legacy aliases ``acc`` / ``val_acc`` are accepted.
        Curves whose key is absent (for instance when no validation data
        was given) are skipped instead of raising.
    """
    hist_df = pd.DataFrame(history.history)
    # Select metrics by name rather than overwriting the column labels by
    # position: a positional rename silently mislabels curves when the keys
    # arrive in another order, and fails when the number of metrics differs.
    hist_df = hist_df.rename(columns={"acc": "accuracy", "val_acc": "val_accuracy"})
    # Epochs are numbered from 1 on the x axis.
    hist_df.index = np.arange(1, len(hist_df) + 1)

    fig, axs = plt.subplots(nrows=2, sharex=True, figsize=(16, 10))
    panels = (
        (axs[0], "Accuracy", "val_accuracy", "accuracy"),
        (axs[1], "Loss", "val_loss", "loss"),
    )
    for ax, quantity, val_col, train_col in panels:
        # Validation first, then training, to keep the original draw order.
        for col, split in ((val_col, "Validation"), (train_col, "Training")):
            if col in hist_df.columns:
                ax.plot(hist_df[col], lw=3, label=f"{split} {quantity}")
        ax.set_ylabel(quantity)
        ax.set_xlabel('Epoch')
        ax.grid()
        ax.legend(loc=0)

    plt.show()

## Preprocessing of the data

We get the IMDB dataset directly from the tensorflow_datasets API

In [None]:
import tensorflow_datasets as tfds

# Load the IMDB reviews dataset. `as_supervised=True` makes every example a
# (text, label) pair; `with_info=True` additionally returns the dataset metadata.
datasets, info = tfds.load(
    name="imdb_reviews",
    with_info=True,
    as_supervised=True,
)


We build train and test sets, getting both the string input data as well as the labels

In [None]:
train_data, test_data = datasets['train'], datasets['test']


def _collect_examples(dataset):
    """Materialise a dataset of (text, label) pairs into two Python lists.

    Returns
    -------
    (sentences, labels) : tuple of lists
        ``sentences`` holds the review texts as ``str``; ``labels`` holds the
        corresponding label values converted with ``.numpy()``.
    """
    sentences, labels = [], []
    for sentence, label in dataset:
        # `sentence.numpy()` is a bytes object. Calling str() on it would keep
        # the Python repr (leading b, surrounding quotes, backslash escapes),
        # which pollutes the tokenizer vocabulary; decode it to real text.
        sentences.append(sentence.numpy().decode("utf-8"))
        labels.append(label.numpy())
    return sentences, labels


training_sentences, training_labels = _collect_examples(train_data)
testing_sentences, testing_labels = _collect_examples(test_data)

# Label arrays in the form passed to `model.fit`.
training_labels_final = np.array(training_labels)
testing_labels_final = np.array(testing_labels)

We can check some values of the data to have a good understanding of it

In [None]:
# Display the first training review as stored in the list.
training_sentences[0]

In [None]:
# Display the label attached to the first training review.
training_labels[0]

From the text reviews, we create padded sequences of token values

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Preprocessing hyper-parameters.
vocab_size = 10000     # vocabulary cap given to the tokenizer (num_words) and the Embedding layer
embedding_dim = 16     # size of each embedding vector
max_length = 120       # every sequence is padded / truncated to this many tokens
trunc_type = 'post'    # drop tokens from the end of reviews longer than max_length
oov_tok = "<OOV>"      # placeholder token for words outside the vocabulary

# The tokenizer is fitted on the training texts only; the test texts are
# encoded with that same vocabulary.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(training_sentences)
padded = pad_sequences(sequences, maxlen=max_length, truncating=trunc_type)

testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
# Use the same truncation side as for the training data. Without it
# pad_sequences falls back to its default ('pre'), so long test reviews would
# keep their end while long training reviews keep their beginning.
testing_padded = pad_sequences(testing_sequences, maxlen=max_length, truncating=trunc_type)

Have a look at some values in order to understand how we go from text sentences to sequences of tokens and, finally, to zero-padded sequences

In [None]:
# Raw text of test review 20, before tokenization.
testing_sentences[20]

'b"I really tried, but this movie just didn\'t work for me. The action scenes were dull, the acting was surprisingly poor, and some of these characters were TOO stereotypical to even be funny. Pam Grier tries, but when you have nothing to work with, even her considerable talent cannot prevent a disaster. Even by the standards of this weak genre, this film is pretty bad."'

In [None]:
# Token ids the tokenizer produced for test review 20, and how many there are.
print(testing_sequences[20])
print(f"length of the sequence: {len(testing_sequences[20])}")

[59, 11, 68, 812, 19, 12, 18, 43, 330, 157, 16, 73, 2, 204, 140, 72, 767, 2, 116, 14, 1255, 344, 3, 50, 5, 135, 106, 72, 100, 2794, 6, 62, 29, 162, 8844, 1, 506, 19, 55, 23, 28, 163, 6, 157, 17, 62, 40, 4452, 682, 577, 3673, 4, 1712, 62, 33, 2, 1574, 5, 12, 830, 517, 12, 20, 7, 184, 80]
length of the sequence: 66


In [None]:
# The same review after pad_sequences; the printed length is expected to equal max_length.
print(testing_padded[20])
print(f"length of the sequence: {len(testing_padded[20])}")

## Neural network model with LSTM

Build a neural network using at least one LSTM layer

(you may have a look at https://keras.io/api/layers/)

In [None]:
# Question 1: define a neural network model using at least one LSTM layer
# Hint 1: the first Embedding layer is provided; you can feed it the padded sequences directly
# Hint 2: do not restrict yourself to LSTM layers; you can also use Dense, Bidirectional, GRU and Dropout layers

model = tf.keras.Sequential([
    Embedding(vocab_size, embedding_dim, input_length=max_length),
    # ??????
])

# Question 2: define a relevant loss function and optimizer
# NOTE: `loss_function` and `optimizer` must be assigned (uncomment and complete
# the two lines below) before the compile call, otherwise it raises a NameError.

# loss_function = ??????
# optimizer = ??????
model.compile(loss=loss_function, optimizer=optimizer,metrics=['accuracy'])
model.summary()


In [None]:
# Start with a reasonable number of epochs before increasing it
num_epochs = 10
history = model.fit(
    x=padded,
    y=training_labels_final,
    validation_data=(testing_padded, testing_labels_final),
    epochs=num_epochs,
)

In [None]:
# Draw the accuracy and loss curves recorded during training.
plot_results(history)