In [1]:
import tensorflow as tf
import tensorflow_datasets as tfds

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Flatten, Dense, GlobalAveragePooling1D
from tensorflow.python.keras.utils.vis_utils import plot_model

import numpy as np, matplotlib.pyplot as plt

### IMDB 

In [2]:
# IMDB dataset
imdb, info = tfds.load('imdb_reviews', with_info=True, as_supervised=True)

train_data, test_data = imdb['train'], imdb['test']

training_sentences = []
training_labels = []

testing_sentences = []
testing_labels = []

# str(s.tonumpy()) is needed in Python3 instead of just s.numpy()
for s,l in train_data:
  training_sentences.append(s.numpy().decode('utf8'))
  training_labels.append(l.numpy())
  
for s,l in test_data:
  testing_sentences.append(s.numpy().decode('utf8'))
  testing_labels.append(l.numpy())
  
training_labels_final = np.array(training_labels)
testing_labels_final = np.array(testing_labels)

In [3]:
# Parameters
vocab_size = 10000
embedding_dim = 16
max_length = 120
trunc_type = 'post'
oov_tok = '<OOV>'

tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index

sequences = tokenizer.texts_to_sequences(training_sentences)
padded = pad_sequences(sequences, maxlen=max_length, padding=trunc_type)

testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = pad_sequences(testing_sequences, maxlen=max_length)

In [4]:
print(len(word_index))
print(len(sequences))
print(len(padded))

88583
25000
25000


In [5]:
reverse_word_index = {v:k for k, v in word_index.items()}

def decode_review(padded_sequence):
    return ' '.join([reverse_word_index.get(i, '?') for i in padded_sequence])

print(padded[0])
print(decode_review(padded[0]))
print(training_sentences[0])

[  12   14   33  425  392   18   90   28    1    9   32 1366 3585   40
  486    1  197   24   85  154   19   12  213  329   28   66  247  215
    9  477   58   66   85  114   98   22 5675   12 1322  643  767   12
   18    7   33  400 8170  176 2455  416    2   89 1231  137   69  146
   52    2    1 7577   69  229   66 2933   16    1 2904    1    1 1479
 4940    3   39 3900  117 1584   17 3585   14  162   19    4 1231  917
 7917    9    4   18   13   14 4139    5   99  145 1214   11  242  683
   13   48   24  100   38   12 7181 5515   38 1366    1   50  401   11
   98 1197  867  141   10    0    0    0]
this was an absolutely terrible movie don't be <OOV> in by christopher walken or michael <OOV> both are great actors but this must simply be their worst role in history even their great acting could not redeem this movie's ridiculous storyline this movie is an early nineties us propaganda piece the most pathetic scenes were those when the <OOV> rebels were making their cases for <OOV> ma

In [6]:
model = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=max_length),
    Flatten(),
    Dense(units=6, activation='relu'),
    Dense(units=1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 120, 16)           160000    
_________________________________________________________________
flatten (Flatten)            (None, 1920)              0         
_________________________________________________________________
dense (Dense)                (None, 6)                 11526     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 7         
Total params: 171,533
Trainable params: 171,533
Non-trainable params: 0
_________________________________________________________________


In [7]:
num_epochs = 10
model.fit(padded, training_labels_final,
         epochs=num_epochs,
         validation_data=(testing_padded, testing_labels_final))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f59a04440a0>

In [8]:
def plot_metrics(history, string):
    plt.plot(history.history[string])
    plt.plot(history.history['val_' + string])
    plt.xlabel('Epochs')
    plt.ylabel(string)
    plt.legend([string, 'val_' + string])
    plt.show()

In [9]:
plot_metrics(history, 'accuracy')
plot_metrics(history, 'loss')

NameError: name 'history' is not defined

## Inspect the embedding layer and retrieve embedding vectors

In [None]:
emb_layer = model.layers[0]
weights = emb_layer.get_weights()[0]
weights.shape # shape: (vocab_size, embedding_dim)

In [None]:
import io

out_v = io.open('vecs.tsv', 'w', encoding='utf-8')
out_m = io.open('meta.tsv', 'w', encoding='utf-8')

for word_num in range(1, vocab_size):
    word = reverse_word_index[word_num]
    embeddings = weights[word_num]
    out_m.write(word + '\n')
    out_v.write('\t'.join([str(x) for x in embeddings]) + '\n')

out_v.close()
out_m.close()

In [None]:
sentence = 'I really think this is amazing. honestly.'
sequence = tokenizer.texts_to_sequences([sentence])
print(sequence)

In [None]:
weights[12]

### IMDB: GlobalAveragePooling1D layer instead of Flatten

In [None]:
model = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=max_length),
    GlobalAveragePooling1D(),
    Dense(units=6, activation='relu'),
    Dense(units=1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.summary()

In [None]:
num_epochs = 10
history = model.fit(padded, training_labels_final,
         epochs=num_epochs,
         validation_data=(testing_padded, testing_labels_final))

In [None]:
plot_metrics(history, 'accuracy')
plot_metrics(history, 'loss')

# Subwords tokenizer - imdb

**Sequence** becomes much more important when dealing with subwords.

In [None]:
imdb, info = tfds.load("imdb_reviews/subwords8k", with_info=True, as_supervised=True)

In [None]:
train_data, test_data = imdb['train'], imdb['test']

tokenizer = info.features['text'].encoder

print(tokenizer.subwords[:100])

In [None]:
sample_string = 'TensorFlow, from basics to mastery'

tokenized_string = tokenizer.encode(sample_string)
print ('Tokenized string is {}'.format(tokenized_string))

original_string = tokenizer.decode(tokenized_string)
print ('The original string: {}'.format(original_string))

In [None]:
for ts in tokenized_string:
  print ('{} ----> {}'.format(ts, tokenizer.decode([ts])))

In [None]:
BUFFER_SIZE = 10000
BATCH_SIZE = 64

train_dataset = train_data.shuffle(BUFFER_SIZE)
train_dataset = train_dataset.padded_batch(BATCH_SIZE, tf.compat.v1.data.get_output_shapes(train_dataset))
test_dataset = test_data.padded_batch(BATCH_SIZE, tf.compat.v1.data.get_output_shapes(test_data))

In [None]:
embedding_dim = 64
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(tokenizer.vocab_size, embedding_dim),
    tf.keras.layers.GlobalAveragePooling1D(), #Subwords can't use Flatten()
    tf.keras.layers.Dense(6, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.summary()

In [None]:
num_epochs = 10

model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

history = model.fit(train_dataset, epochs=num_epochs, validation_data=test_dataset)

In [None]:
plot_metrics(history, 'accuracy')
plot_metrics(history, 'loss')

## Sarcasm Dataset

In [None]:
import json

with open("/tmp/sarcasm.json", 'r') as f:
    datastore = json.load(f)


sentences = [] 
labels = []
urls = []
for item in datastore:
    sentences.append(item['headline'])
    labels.append(item['is_sarcastic'])
    urls.append(item['article_link'])

In [None]:
vocab_size = 10000
embedding_dim = 16
max_length = 32
padding_type = 'post'
trunc_type='post'
ovv_tok = '<OOV>'

training_size = 20000

training_data = np.array(sentences[:training_size])
training_labels = np.array(labels[:training_size])

testing_data = np.array(sentences[training_size:])
testing_labels = np.array(labels[training_size:])

In [None]:
plot_metrics(history, 'accuracy')
plot_metrics(history, 'loss')

In [None]:
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(training_data)
word_index = tokenizer.word_index

sequences = tokenizer.texts_to_sequences(training_data)

padded = pad_sequences(sequences, padding=padding_type, truncating=trunc_type, maxlen=max_length)

testing_sequences = tokenizer.texts_to_sequences(testing_data)
testing_padded = pad_sequences(testing_sequences, padding=padding_type, truncating=trunc_type, maxlen=max_length)

In [None]:
model = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=max_length),
    GlobalAveragePooling1D(),
    Dense(units=24, activation='relu'),
    Dense(units=1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.summary()

In [None]:
num_epochs = 10
history = model.fit(padded, training_labels,
         epochs=num_epochs,
         validation_data=(testing_padded, testing_labels))

In [None]:
plot_metrics(history, 'accuracy')
plot_metrics(history, 'loss')

the validation loss increase too much, Not good find better hyperparameters.

## Save sarcasm embeddings and train again with different hyperparameters

In [None]:
reverse_word_index = {v:k for k, v in word_index.items()}

def decode_review(padded_sequence):
    return ' '.join([reverse_word_index.get(i, '?') for i in padded_sequence])

print(padded[0])
print(decode_review(padded[0]))
print(training_sentences[0])
print(training_labels[0])

In [None]:
e = model.layers[0]
weights = e.get_weights()[0]
print(weights.shape) # shape: (vocab_size, embedding_dim)

In [None]:
out_v = io.open('vecs.tsv', 'w', encoding='utf-8')
out_m = io.open('meta.tsv', 'w', encoding='utf-8')
for word_num in range(1, vocab_size):
  word = reverse_word_index[word_num]
  embeddings = weights[word_num]
  out_m.write(word + "\n")
  out_v.write('\t'.join([str(x) for x in embeddings]) + "\n")
out_v.close()
out_m.close()

#### Inference

In [None]:
sentence = ["granny starting to fear spiders in the garden might be real", "game of thrones season finale showing this sunday night"]
sequences = tokenizer.texts_to_sequences(sentence)
padded = pad_sequences(sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)
print(model.predict(padded))

### Hyperparameters

In [None]:
vocab_size = 1000 #10000
embedding_dim = 16
max_length = 16 #32
padding_type = 'post'
trunc_type='post'
ovv_tok = '<OOV>'

training_size = 20000

training_data = np.array(sentences[:training_size])
training_labels = np.array(labels[:training_size])

testing_data = np.array(sentences[training_size:])
testing_labels = np.array(labels[training_size:])

In [None]:
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(training_data)
sequences = tokenizer.texts_to_sequences(training_data)

padded = pad_sequences(sequences, padding=padding_type, truncating=trunc_type, maxlen=max_length)

testing_sequences = tokenizer.texts_to_sequences(testing_data)
testing_padded = pad_sequences(testing_sequences, padding=padding_type, truncating=trunc_type, maxlen=max_length)

In [None]:
model = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=max_length),
    GlobalAveragePooling1D(),
    Dense(units=24, activation='relu'),
    Dense(units=1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.summary()

In [None]:
num_epochs = 30
history = model.fit(padded, training_labels,
         epochs=num_epochs,
         validation_data=(testing_padded, testing_labels))

In [None]:
#Better smooth loss | lower accuracy
plot_metrics(history, 'accuracy')
plot_metrics(history, 'loss')

Better validation loss

TODO: Twerk hypeparamters to find a set with .90 accuracy without effect of the loss function increasing sharply

vocab_size = optimal? #1000 #10000

embedding_dim = 16

max_length = optimal? #16 #32

### Inference

In [None]:
sentence = ["granny starting to fear spiders in the garden might be real", "game of thrones season finale showing this sunday night"]
sequences = tokenizer.texts_to_sequences(sentence)
padded = pad_sequences(sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)
print(model.predict(padded))