<a href="https://colab.research.google.com/github/harnalashok/deeplearning-sequences/blob/main/text_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# https://www.tensorflow.org/tutorials/text/text_classification_rnn
# https://colab.research.google.com/github/tensorflow/docs/blob/master/site/en/guide/data.ipynb#scrollTo=m5bz7R1xhX1f
# Call libraries
import numpy as np
import tensorflow_datasets as tfds
import tensorflow as tf
import matplotlib.pyplot as plt


In [None]:
# Print the docstring of tfds.load (exploratory lookup only).
help(tfds.load)

In [2]:
# Fetch the IMDB reviews dataset. `with_info=True` makes the call return the
# dataset metadata as a second value; `as_supervised=True` yields
# (text, label) pairs instead of feature dictionaries.
dataset, info = tfds.load('imdb_reviews', as_supervised=True, with_info=True)


[1mDownloading and preparing dataset imdb_reviews/plain_text/1.0.0 (download: 80.23 MiB, generated: Unknown size, total: 80.23 MiB) to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...[0m


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Dl Completed...', max=1.0, style=Progre…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Dl Size...', max=1.0, style=ProgressSty…







HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incomplete7FOCML/imdb_reviews-train.tfrecord


HBox(children=(FloatProgress(value=0.0, max=25000.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incomplete7FOCML/imdb_reviews-test.tfrecord


HBox(children=(FloatProgress(value=0.0, max=25000.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incomplete7FOCML/imdb_reviews-unsupervised.tfrecord


HBox(children=(FloatProgress(value=0.0, max=50000.0), HTML(value='')))



[1mDataset imdb_reviews downloaded and prepared to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.[0m


In [3]:
# Only the last expression of a cell is echoed, so just the keys are shown.
type(dataset)    # dict
dataset.keys()   # dict_keys(['test', 'train', 'unsupervised'])

dict_keys(['test', 'train', 'unsupervised'])

In [4]:
# Pick the two labelled splits; the 'unsupervised' split is left untouched.
train_dataset = dataset['train']
test_dataset = dataset['test']


In [5]:
# Inspect the concrete tf.data class backing the training split.
type(train_dataset)

tensorflow.python.data.ops.dataset_ops.PrefetchDataset

In [6]:
# Structure of one element: a (text, label) pair of scalar tensors.
train_dataset.element_spec

(TensorSpec(shape=(), dtype=tf.string, name=None),
 TensorSpec(shape=(), dtype=tf.int64, name=None))

In [7]:
# Peek at the first two reviews (not yet batched) together with their labels.
for example, label in train_dataset.take(2):
  review_text, review_label = example.numpy(), label.numpy()
  print('text: ', review_text)
  print('label: ', review_label)


text:  b"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it."
label:  0
text:  b'I have been known to fall asleep during films, but this is usually due to a combination of things including, really tired, being warm and comfortable on the sette and having just eaten a lot. However on this occasion I fell asleep because the film was rubbish. 

In [8]:
# Input-pipeline constants: BUFFER_SIZE is passed to Dataset.shuffle() and
# BATCH_SIZE to Dataset.batch() in the next cell.
BUFFER_SIZE = 10000
BATCH_SIZE = 64      # Try 2 and see what happens


In [9]:
# Training pipeline: shuffle, then batch, then prefetch.
# NOTE: this cell re-binds its own inputs, so running it a second time would
# batch the already batched datasets again -- run it once per kernel session.
train_dataset = (
    train_dataset
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)
# Evaluation pipeline: no shuffling, only batch and prefetch.
test_dataset = (
    test_dataset
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)


In [10]:
# Dataset.take(count) creates a `Dataset` with at most `count` elements
# from this dataset.
help(train_dataset.take)
# The toy example gets its own name: re-binding `dataset` here would clobber
# the dict of IMDB splits returned by tfds.load(), and re-running the cell
# that indexes dataset['train'] / dataset['test'] would then fail.
demo_dataset = tf.data.Dataset.range(10).take(3)
list(demo_dataset.as_numpy_iterator())

Help on method take in module tensorflow.python.data.ops.dataset_ops:

take(count) method of tensorflow.python.data.ops.dataset_ops.PrefetchDataset instance
    Creates a `Dataset` with at most `count` elements from this dataset.
    
    >>> dataset = tf.data.Dataset.range(10)
    >>> dataset = dataset.take(3)
    >>> list(dataset.as_numpy_iterator())
    [0, 1, 2]
    
    Args:
      count: A `tf.int64` scalar `tf.Tensor`, representing the number of
        elements of this dataset that should be taken to form the new dataset.
        If `count` is -1, or if `count` is greater than the size of this
        dataset, the new dataset will contain all elements of this dataset.
    
    Returns:
      Dataset: A `Dataset`.



[0, 1, 2]

In [11]:
# After batching, every element taken is a whole batch of reviews.
# `example` and `label` stay bound to the last batch once the loop ends;
# later cells reuse `example`.
for example, label in train_dataset.take(3):
  batch_texts = example.numpy()
  batch_labels = label.numpy()
  print('texts: ', batch_texts.shape)
  print('texts: ', batch_texts[:4])
  print()
  print('labels: ', batch_labels[:4])


texts:  (64,)
texts:  [b"Wow probable the worst movie i have ever seen!! This person should never make another movie!!I cant believe anyone would have produce this in good conscience.YOu have have wasted every cent. No concept of real life. I have wasted 2 hours of my life i will never get back. EVER!!! Everyone who worked on this show should be embarrassed!!!!!! I'm embarrassed for them! All of you should be ashamed. If i was gay i would want to tell the director that they have personally set back gay rights progress by 5 years. Please never watch this movie.I have never written a blogg about a film before but The distaste for this film has compelled me to do so."
 b"As I was watching this movie I was thinking,OK it'll get good any moment...I was wrong. The real best part of this movie was when it was over. A complete waste of 92 minutes. All seriousness aside the best part was when the Wendigo finally showed up which was at the end and you couldn't really even see him that good. And 

In [12]:
# https://www.tensorflow.org/api_docs/python/tf/keras/layers/experimental/preprocessing/TextVectorization
# https://www.tensorflow.org/api_docs/python/tf/keras/layers/TextVectorization
# Text vectorization layer: turns raw strings into sequences of integer token ids.
VOCAB_SIZE = 1000   # upper bound on the vocabulary size (passed as max_tokens)

# Prefer the stable location of the layer; fall back to the legacy
# experimental path so the notebook still runs on older TensorFlow releases.
try:
    TextVectorization = tf.keras.layers.TextVectorization
except AttributeError:
    TextVectorization = tf.keras.layers.experimental.preprocessing.TextVectorization

encoder = TextVectorization(max_tokens=VOCAB_SIZE)



In [13]:
# Build the vocabulary from the training texts only; the labels are dropped.
train_texts = train_dataset.map(lambda text, label: text)
encoder.adapt(train_texts)

In [None]:
# First 20 entries of the vocabulary built by encoder.adapt().
encoder.get_vocabulary()[:20]

In [None]:
# NOTE(review): this runs the encoder on `example` and calls help() on the
# returned value, not on the encoder layer itself -- confirm that is intended.
# `example` is the variable left bound by the batch-preview loop.
help(encoder(example))

In [None]:
# Display the batch of raw review strings left bound by the preview loop.
example

In [15]:
# Vectorize the whole batch, then keep the first three rows as a NumPy array.
encoded_batch = encoder(example)
encoded_example = encoded_batch[:3].numpy()
encoded_example

array([[ 2,  1,  1, ...,  0,  0,  0],
       [49, 45,  1, ...,  0,  0,  0],
       [86,  5, 32, ...,  0,  0,  0]])

In [16]:
# Layers are created in the same order in which they are stacked.
embedding_layer = tf.keras.layers.Embedding(
    input_dim=len(encoder.get_vocabulary()),
    output_dim=64,
    # Use masking to handle the variable sequence lengths
    mask_zero=True,
)
recurrent_layer = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64))
hidden_layer = tf.keras.layers.Dense(64, activation='relu')
# No activation on the output layer: the model emits a single raw score.
output_layer = tf.keras.layers.Dense(1)

model = tf.keras.Sequential([
    encoder,
    embedding_layer,
    recurrent_layer,
    hidden_layer,
    output_layer,
])


In [17]:
# Report, layer by layer, whether masking is supported.
masking_flags = [layer.supports_masking for layer in model.layers]
print(masking_flags)


[False, True, True, True, True]


In [18]:
# Score a single hand-written review; the model has not been trained yet.
sample_text = ('The movie was cool. The animation and the graphics '
               'were out of this world. I would recommend this movie.')
sample_batch = np.array([sample_text])
predictions = model.predict(sample_batch)
print(predictions[0])


[0.0159057]


In [19]:
# Predict on the sample text again, this time batched with a much longer
# input so that the sample gets padded; its printed score should not change.
padding = "the " * 2000
padded_batch = np.array([sample_text, padding])
predictions = model.predict(padded_batch)
print(predictions[0])


[0.0159057]


In [20]:
# The loss is configured with from_logits=True, i.e. it expects the model to
# output raw scores rather than probabilities.
# NOTE(review): 'accuracy' is then computed on those raw scores too -- verify
# which decision threshold Keras applies in that case.
bce_from_logits = tf.keras.losses.BinaryCrossentropy(from_logits=True)
adam = tf.keras.optimizers.Adam(1e-4)
model.compile(loss=bce_from_logits, optimizer=adam, metrics=['accuracy'])


In [24]:
# Train for 10 epochs; each validation pass uses 30 batches of the test set.
# Each epoch takes 690 secs on TPU
history = model.fit(
    train_dataset,
    validation_data=test_dataset,
    validation_steps=30,
    epochs=10,
)

Epoch 1/10
  2/391 [..............................] - ETA: 9:32 - loss: 0.5201 - accuracy: 0.7578

KeyboardInterrupt: ignored

In [None]:
def plot_graphs(history, metric):
  """Plot a training metric and its validation counterpart on the current axes.

  Args:
    history: object whose `.history` mapping holds per-epoch value lists,
      such as the value returned by `model.fit`.
    metric: name of the training metric; the validation series is looked up
      under the same name prefixed with 'val_'.
  """
  val_metric = 'val_' + metric
  curves = history.history
  plt.plot(curves[metric])
  plt.plot(curves[val_metric], '')
  plt.xlabel("Epochs")
  plt.ylabel(metric)
  plt.legend([metric, val_metric])
