In [1]:
#@title Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

<a href="https://colab.research.google.com/github/lmoroney/dlaicourse/blob/master/TensorFlow%20In%20Practice/Course%203%20-%20NLP/Course%203%20-%20Week%202%20-%20Lesson%201.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


So let's start looking at it. There are a couple of things that you need to take into account before you start working with this week's code in TensorFlow. The first is the version of TensorFlow you're using; run the cell below to determine it. Also, note that all the code I'm using here is in Python 3, and there are some differences if you use Python 2. So if you're using Colab, you can set the runtime to Python 3. If you're doing this in your own environment, you may need to make some changes. If the version check reports TensorFlow 1.x, you'll need to enable eager execution (`tf.enable_eager_execution()`) before you can go any further. If it reports 2.x, you won't need anything, because eager execution is enabled by default in TensorFlow 2.0.

In [2]:
import tensorflow as tf
print(tf.__version__)

# Google Colab should already have TensorFlow Datasets installed. Should it be
# missing from your environment, it is easily installed by uncommenting the
# line below.
# !pip install -q tensorflow-datasets

2.4.1


In [3]:
# Import TensorFlow Datasets, aliased here as tfds. Calling tfds.load with the
# string "imdb_reviews" returns the IMDB data together with metadata about it.
import tensorflow_datasets as tfds

imdb, info = tfds.load(
    "imdb_reviews",
    as_supervised=True,
    with_info=True,
)


[1mDownloading and preparing dataset imdb_reviews/plain_text/1.0.0 (download: 80.23 MiB, generated: Unknown size, total: 80.23 MiB) to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...[0m


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Dl Completed...', max=1.0, style=Progre…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Dl Size...', max=1.0, style=ProgressSty…







HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteNF072Z/imdb_reviews-train.tfrecord


HBox(children=(FloatProgress(value=0.0, max=25000.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteNF072Z/imdb_reviews-test.tfrecord


HBox(children=(FloatProgress(value=0.0, max=25000.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteNF072Z/imdb_reviews-unsupervised.tfrecord


HBox(children=(FloatProgress(value=0.0, max=50000.0), HTML(value='')))



[1mDataset imdb_reviews downloaded and prepared to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.[0m


In [4]:
# The data is split into 25,000 samples for training and 25,000 samples for
# testing. Each split is an iterable of (sentence, label) pairs held as
# tensors. Up to this point we've been using the Keras tokenizer and padding
# tools on plain lists of sentences, so a little converting is needed first.

import numpy as np

train_data, test_data = imdb['train'], imdb['test']


def _split_sentences_and_labels(dataset):
    """Walk `dataset` once and return (sentences, labels) as two Python lists.

    Every element is unpacked into a text tensor and a label tensor. Calling
    .numpy() extracts the underlying value; the text value is then decoded as
    UTF-8 to obtain a str.
    """
    sentences, labels = [], []
    for text_tensor, label_tensor in dataset:
        sentences.append(text_tensor.numpy().decode('utf8'))
        labels.append(label_tensor.numpy())
    return sentences, labels


# Pull the sentences and labels out of the training split, then the test split.
training_sentences, training_labels = _split_sentences_and_labels(train_data)
testing_sentences, testing_labels = _split_sentences_and_labels(test_data)

# When training, the labels are expected to be NumPy arrays, so convert the
# label lists that were just built.
training_labels_final = np.array(training_labels)
testing_labels_final = np.array(testing_labels)


In [5]:
# The hyperparameters are gathered at the top because that makes them easier
# to change and edit, instead of fishing through the function calls below for
# the literals and then changing those.

vocab_size = 10_000   # passed to the Tokenizer (num_words) and the Embedding layer
embedding_dim = 16    # output dimension of the Embedding layer
max_length = 120      # length every sequence is padded/truncated to
trunc_type = 'post'   # `truncating` argument for pad_sequences
oov_tok = "<OOV>"     # `oov_token` argument for the Tokenizer

# As before, import the Tokenizer and pad_sequences.
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Create the tokenizer with our vocabulary size and our out-of-vocabulary
# token, then fit it on the training sentences.
tokenizer = Tokenizer(oov_token=oov_tok, num_words=vocab_size)
tokenizer.fit_on_texts(training_sentences)

word_index = tokenizer.word_index

# With the word index in hand, replace the words of every training sentence
# with their token values, then pad and/or truncate the sequences so they all
# have the length given by max_length.
sequences = tokenizer.texts_to_sequences(training_sentences)
padded = pad_sequences(
    sequences,
    truncating=trunc_type,
    maxlen=max_length,
)

# Convert the test sentences to token sequences with the tokenizer that was
# fitted on the training sentences.
testing_sequences = tokenizer.texts_to_sequences(testing_sentences)

# As before, the sentences have varying lengths, so pad and/or truncate the
# sequences until they are all max_length long. `truncating` is passed
# explicitly so the test reviews are cut on the same side as the training
# reviews; when it is omitted, pad_sequences uses its default ('pre') instead
# of trunc_type.
testing_padded = pad_sequences(
    testing_sequences,
    maxlen=max_length,
    truncating=trunc_type,
)


In [6]:
# To turn token sequences back into readable text we need the word index
# reversed. As it currently stands, word_index has the word as its key and the
# token for that word as its value; flip every pair so a token looks up its
# word.

reverse_word_index = {token: word for word, token in word_index.items()}

def decode_review(text):
    """Return the space-separated words for the token sequence `text`.

    Each token is looked up in `reverse_word_index`; a token with no entry
    there is rendered as '?'.
    """
    words = (reverse_word_index.get(token, '?') for token in text)
    return ' '.join(words)

# Show training review 3 twice: first decoded back from its padded token
# sequence, then as the original text, so the two can be compared.
print(decode_review(padded[3]), training_sentences[3], sep='\n')

? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? this is the kind of film for a snowy sunday afternoon when the rest of the world can go ahead with its own business as you <OOV> into a big arm chair and <OOV> for a couple of hours wonderful performances from cher and nicolas cage as always gently row the plot along there are no <OOV> to cross no dangerous waters just a warm and witty <OOV> through new york life at its best a family film in every sense and one that deserves the praise it received
This is the kind of film for a snowy Sunday afternoon when the rest of the world can go ahead with its own business as you descend into a big arm-chair and mellow for a couple of hours. Wonderful performances from Cher and Nicolas Cage (as always) gently row the plot along. There are no rapids to cross, no dangerous waters, just a warm and witty paddle through New York life at its best. A family film in every sense and one that deserves the praise it received.


In [7]:
# The result of the embedding is, for every review, a 2D array with the length
# of the sentence and the embedding dimension as its size (120 x 16 here). It
# is flattened in much the same way as we needed to flatten out our images,
# and then fed into a dense neural network that does the classification.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        input_length=max_length,
    ),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(units=6, activation='relu'),
    tf.keras.layers.Dense(units=1, activation='sigmoid'),
])
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# The summary shows the size of the output vector that Flatten feeds into the
# dense layer.
model.summary()


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 120, 16)           160000    
_________________________________________________________________
flatten (Flatten)            (None, 1920)              0         
_________________________________________________________________
dense (Dense)                (None, 6)                 11526     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 7         
Total params: 171,533
Trainable params: 171,533
Non-trainable params: 0
_________________________________________________________________


In [8]:
# Training is now simple: pass padded and training_labels_final as the
# training set, specify the number of epochs, and pass testing_padded and
# testing_labels_final as the validation set.
# In the recorded run the training set reached 1.00 accuracy while the
# validation set was at 0.8259, so there's a good chance that we're
# overfitting. We'll look at some strategies to avoid this later, but you
# should expect results a little bit like this.
num_epochs = 10

model.fit(
    x=padded,
    y=training_labels_final,
    epochs=num_epochs,
    validation_data=(testing_padded, testing_labels_final),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f4a685f52d0>

In [9]:
# Start by getting the embedding layer, which is layer zero of the model, then
# fetch its weights and print their shape. The printed shape is (10000, 16):
# one row per entry of the 10,000-word vocabulary and one column per embedding
# dimension.


e = model.layers[0]
weights = e.get_weights()[0]
print(weights.shape) # shape: (vocab_size, embedding_dim)

(10000, 16)


In [10]:
# Write the learned embeddings to two tab-separated files that the TensorFlow
# Projector reads to plot the vectors in 3D space so we can visualize them:
#   vecs.tsv - one line per word, holding the value of each item in that
#              word's embedding, i.e. the coefficient of each dimension
#   meta.tsv - one line per word, holding the word itself, in the same order

import io

# Token 0 is skipped: it has no entry in reverse_word_index (it decodes to '?').
# The upper bound is capped at the number of known words so that a vocabulary
# smaller than vocab_size cannot raise a KeyError part-way through the export.
# This assumes tokens are numbered consecutively from 1, as the Keras
# Tokenizer does.
last_word_num = min(vocab_size, len(reverse_word_index) + 1)

# The `with` statement closes both files even if a lookup or write fails.
with io.open('vecs.tsv', 'w', encoding='utf-8') as out_v, \
        io.open('meta.tsv', 'w', encoding='utf-8') as out_m:
    for word_num in range(1, last_word_num):
        word = reverse_word_index[word_num]
        embeddings = weights[word_num]
        out_m.write(word + "\n")
        out_v.write('\t'.join([str(x) for x in embeddings]) + "\n")

In [11]:
# If you're working in Colab, this downloads the two exported files. Anywhere
# else the google.colab import fails and the download step is skipped.
try:
    from google.colab import files
except ImportError:
    pass
else:
    for exported_name in ('vecs.tsv', 'meta.tsv'):
        files.download(exported_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [12]:
# Run a new sentence through the tokenizer and print the resulting token
# sequence. texts_to_sequences is given a list of texts, so the single
# sentence is wrapped in a list.
sentence = "I really think this is amazing. honest."
sequence = tokenizer.texts_to_sequences([sentence])
print(sequence)

[[11, 64, 102, 12, 7, 478, 1200]]
