# Natural Language Processing in TensorFlow

# Week 1: Tokenizing and Padding

## Week 1 - Lesson 1

In [0]:
# Load packages
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer

Here's the code to encode the two sentences. Tensorflow and keras give us a number of ways to encode words. We are going to use `tokenizer` here.

In [0]:
sentences = ['I love my dog', 'i love my cat']

In [0]:
tokenizer = Tokenizer(num_words = 20)
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index
print(word_index)

{'i': 1, 'love': 2, 'my': 3, 'dog': 4, 'cat': 5}


## Week 1 - Lesson 2

In [0]:
# Load packages
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [0]:
sentences = ['I love my dog', 'i love my cat', 'this moive is great!','I hope the virus can be controlled soon.', 'how long do you think it will last?']
tokenizer = Tokenizer(num_words= 100, oov_token= "<oov>")
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index

sequences = tokenizer.texts_to_sequences(sentences)
padded = pad_sequences(sequences, maxlen=5)

In [0]:
print("\nWord Index = ", word_index)
print("\nSequences = ", sequences)
print("\nPadded Sequences:")
print(padded)


Word Index =  {'i': 1, 'love': 2, 'my': 3, 'dog': 4, 'cat': 5, 'this': 6, 'moive': 7, 'is': 8, 'great': 9, 'hope': 10, 'the': 11, 'virus': 12, 'can': 13, 'be': 14, 'controlled': 15, 'soon': 16, 'how': 17, 'long': 18, 'do': 19, 'you': 20, 'think': 21, 'it': 22, 'will': 23, 'last': 24}

Sequences =  [[1, 2, 3, 4], [1, 2, 3, 5], [6, 7, 8, 9], [1, 10, 11, 12, 13, 14, 15, 16], [17, 18, 19, 20, 21, 22, 23, 24]]

Padded Sequences:
[[ 0  1  2  3  4]
 [ 0  1  2  3  5]
 [ 0  6  7  8  9]
 [12 13 14 15 16]
 [20 21 22 23 24]]


In [0]:
# try with words that the tokenizer wasn't fit to
test_data = ['what is randomness?', 'Generate a random sequence of events']
test_seq = tokenizer.texts_to_sequences(test_data)
print("\nText Sequence = ", test_seq)


Text Sequence =  [[8], []]


In [0]:

padded = pad_sequences(test_seq, maxlen = 10)
print("\nPadded Test Sequence: ")
print(padded)


Padded Test Sequence: 
[[0 0 0 0 0 0 0 1 9 1]
 [0 0 0 0 1 1 1 1 1 1]]


## Week 1 - Lesson 3

In [0]:
!wget --no-check-certificate \
    https://storage.googleapis.com/laurencemoroney-blog.appspot.com/sarcasm.json \
    -O /tmp/sarcasm.json

--2020-03-16 03:53:05--  https://storage.googleapis.com/laurencemoroney-blog.appspot.com/sarcasm.json
Resolving storage.googleapis.com (storage.googleapis.com)... 173.194.216.128, 2607:f8b0:400c:c02::80
Connecting to storage.googleapis.com (storage.googleapis.com)|173.194.216.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5643545 (5.4M) [application/json]
Saving to: ‘/tmp/sarcasm.json’


2020-03-16 03:53:06 (147 MB/s) - ‘/tmp/sarcasm.json’ saved [5643545/5643545]



In [0]:
import json
with open("/tmp/sarcasm.json","r") as f:
  datastore = json.load(f)

sentences = []
labels = []
urls = []
for item in datastore:
  sentences.append(item['headline'])
  labels.append(item['is_sarcastic'])
  urls.append(item['article_link'])

In [0]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
tokenizer = Tokenizer(oov_token="<oov>")
tokenizer.fit_on_texts(sentences)

word_index = tokenizer.word_index
print(len(word_index))

29657


In [0]:
# print(word_index)
sequences = tokenizer.texts_to_sequences(sentences)
padded = pad_sequences(sequences, padding = "post")
print(sentences[2])
print(padded[2])
print(padded.shape)

this moive is great!
[6 7 8 9 0 0 0 0]
(5, 8)


# Week 2: Embedding

TFTS

50 thousand moive reviews

In [2]:
import tensorflow as tf
print(tf.__version__)
# tf.enable_eager_execution()
import tensorflow_datasets as tfds
imdb, info = tfds.load('imdb_reviews', with_info = True, as_supervised=True)

2.2.0-rc1
[1mDownloading and preparing dataset imdb_reviews/plain_text/1.0.0 (download: 80.23 MiB, generated: Unknown size, total: 80.23 MiB) to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...[0m


HBox(children=(IntProgress(value=1, bar_style='info', description='Dl Completed...', max=1, style=ProgressStyl…

HBox(children=(IntProgress(value=1, bar_style='info', description='Dl Size...', max=1, style=ProgressStyle(des…







HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteFN2OF1/imdb_reviews-train.tfrecord


HBox(children=(IntProgress(value=0, max=25000), HTML(value='')))



HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteFN2OF1/imdb_reviews-test.tfrecord


HBox(children=(IntProgress(value=0, max=25000), HTML(value='')))



HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteFN2OF1/imdb_reviews-unsupervised.tfrecord


HBox(children=(IntProgress(value=0, max=50000), HTML(value='')))

[1mDataset imdb_reviews downloaded and prepared to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.[0m


In [0]:
# ?? what is the structure of imdb
import numpy as np
train_data, test_data = imdb['train'], imdb['test']

training_sentences = []
training_labels = []

testing_sentences = []
testing_labels = []

# str(s.numpy()) is needed in Python3 instead of just s.numpy()
for s, l in train_data:
  training_sentences.append(str(s.numpy()))
  training_labels.append(l.numpy())

for s, l in test_data:
  testing_sentences.append(str(s.numpy()))
  testing_labels.append(l.numpy())

# Turn into numpy array
training_labels_final = np.array(training_labels)
testing_labels_final = np.array(testing_labels)

In [0]:
vocab_size = 10000
embedding_dim = 16
max_length = 120
trunc_type = 'post'
oov_tok = "<OOV>"

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# initialize the tokenizer
tokenizer = Tokenizer(num_words = vocab_size, oov_token = oov_tok)
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index

sequences = tokenizer.texts_to_sequences(training_sentences)
padded = pad_sequences(sequences, maxlen = max_length, truncating = trunc_type)

testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = pad_sequences(testing_sequences, maxlen = max_length)

In [0]:
# len(training_sentences)
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])

In [20]:
def decode_review(text):
  return ' '.join([reverse_word_index.get(i, '?') for i in text])
print(decode_review(padded[1]))

? ? ? ? ? ? ? b'i have been known to fall asleep during films but this is usually due to a combination of things including really tired being warm and comfortable on the <OOV> and having just eaten a lot however on this occasion i fell asleep because the film was rubbish the plot development was constant constantly slow and boring things seemed to happen but with no explanation of what was causing them or why i admit i may have missed part of the film but i watched the majority of it and everything just seemed to happen of its own <OOV> without any real concern for anything else i cant recommend this film at all '


In [14]:
model = tf.keras.Sequential([
                             tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length = max_length),
                             tf.keras.layers.Flatten(),
                             tf.keras.layers.Dense(6, activation = 'relu'),
                             tf.keras.layers.Dense(1, activation= 'sigmoid')
])

model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 120, 16)           160000    
_________________________________________________________________
flatten (Flatten)            (None, 1920)              0         
_________________________________________________________________
dense (Dense)                (None, 6)                 11526     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 7         
Total params: 171,533
Trainable params: 171,533
Non-trainable params: 0
_________________________________________________________________


In [15]:
num_epochs = 10
model.fit(padded, training_labels_final, epochs = num_epochs, validation_data= (testing_padded, testing_labels_final))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f2c2f9343c8>

In [20]:
e = model.layers[0]
weights = e.get_weights()[0]
print(weights.shape) # shape: (vocab_size, embedding_dim)

(10000, 16)


In [0]:
import io
out_v = io.open('vecs.tsv', 'w', encoding = 'utf-8')
out_m = io.open('meta.tsv', 'w', encoding = 'utf-8')
for word_num in range(1, vocab_size):
  word = reverse_word_index[word_num]
  embeddings = weights[word_num]
  out_m.write(word + "\n")
  out_v.write('\t'.join([str(x) for x in embeddings]) + "\n")
out_v.close()
out_m.close()

In [0]:
try:
  from google.colab import files
except ImportError:
  pass
else:
  files.download('vecs.tsv')
  files.download('meta.tsv')

In [23]:
sentence = "I really think this is amazing. honest."
sequence = tokenizer.texts_to_sequences(sentence)
print(sequence)

[[11], [], [1431], [966], [4], [1537], [1537], [4715], [], [790], [2019], [11], [2929], [2184], [], [790], [2019], [11], [579], [], [11], [579], [], [4], [1782], [4], [4517], [11], [2929], [1275], [], [], [2019], [1003], [2929], [966], [579], [790], []]
