In [1]:
%matplotlib inline

In [86]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import os
import re
import string
from IPython.display import display

import tensorflow as tf
from tensorflow.keras.utils import text_dataset_from_directory
from tensorflow.keras.layers import TextVectorization, Input
from tensorflow.keras.models import Sequential

# Basic Text Classification (repeat)

## Load the dataset

### Define constants.

In [141]:
SEED = 123
BATCH_SIZE = 32
MAX_TOKENS = 10_000  # the vocabulary would contain max of 10K words+ngrams
OUTPUT_SEQUENCE_LENGTH = 250  # each text(record) shall be limited to 250 words(+ngrams)

### List the contents of the dataset folder.

In [4]:
movies_path = os.path.join(
    os.environ['HOME'],
    'Desktop/datasets/aclImdb',
)

os.listdir(movies_path)

['imdb.vocab', 'imdbEr.txt', 'README', 'test', 'train']

### Print out a sample of the vocabulary.

In [5]:
with open(os.path.join(movies_path, 'imdb.vocab'), encoding='ISO-8859-1') as f:
    vocab = [v.strip() for v in f.readlines()[:10]]
    print(', '.join(vocab))

the, and, a, of, to, is, it, in, i, this


### Print out a sample review.

In [6]:
path_train_positive = os.path.join(movies_path, 'train', 'pos')
a_file = os.listdir(path_train_positive)[123]
with open(os.path.join(path_train_positive, a_file)) as f:
    print(f.read())

Any story comprises a premise, characters and conflict. Characters plotting their own play promises triumph, and a militant character readily lends oneself to this. Ardh Satya's premise is summarized by the poem of the same name scripted by Dilip Chitre. The line goes - "ek palde mein napunsaktha, doosre palde mein paurush, aur teek tarazu ke kaante par, ardh satya ?". A rough translation - "The delicate balance of right & wrong ( commonly seen on the busts of blind justice in the courts ) has powerlessness on one plate and prowess on another. Is the needle on the center a half-truth ? "<br /><br />The poem is recited midway in the film by Smita Patil to Om Puri at a resturant. It makes a deep impact on the protagonist & lays the foundation for much of the later events that follow. At the end of the film, Om Puri ends up in exactly the same situation described so aptly in the poem.<br /><br />The film tries mighty hard to do a one-up on the poem. However, Chitre's words are too powerfu

### Define train and test directories.

In [7]:
path_train = os.path.join(movies_path, 'train')
path_test = os.path.join(movies_path, 'test')

### Read raw training and validation datasets.

In [8]:
raw_training_ds, raw_val_ds = text_dataset_from_directory(
    directory=path_train,
    labels='inferred',
    label_mode='int',
    class_names=['pos', 'neg'],
    batch_size=BATCH_SIZE,
    max_length=None,
    shuffle=True,
    seed=SEED,
    validation_split=0.2,
    subset='both',
)

raw_training_ds, raw_val_ds

Found 25000 files belonging to 2 classes.
Using 20000 files for training.
Using 5000 files for validation.


(<BatchDataset element_spec=(TensorSpec(shape=(None,), dtype=tf.string, name=None), TensorSpec(shape=(None,), dtype=tf.int32, name=None))>,
 <BatchDataset element_spec=(TensorSpec(shape=(None,), dtype=tf.string, name=None), TensorSpec(shape=(None,), dtype=tf.int32, name=None))>)

### Verify the number of records.

In [9]:
n_training = raw_training_ds.cardinality().numpy() * BATCH_SIZE
n_validation = raw_val_ds.cardinality().numpy() * BATCH_SIZE

n_training, n_validation

(20000, 5024)

### Print out a sample review and its label.

In [10]:
for texts_batch, labels_batch in raw_training_ds.take(1):
    sample_review = texts_batch[0].numpy()
    sample_label = labels_batch[0].numpy()
    print(sample_review, sample_label, sep='\n')

b'After, I watched the films... I thought, "Why the heck was this film such a high success in the Korean Box Office?" Even thought the movie had a clever/unusal scenario, the acting wasn\'t that good and the characters weren\'t very interesting. For a Korean movie... I liked the fighting scenes. If you want to watch a film without thinking, this is the film for you. But I got to admit... the film was kind of childish... 6/10'
0


## Prepare the dataset for training

### Implement preprocessing function.

In [11]:
# Implement a function to clean each review.
def clean_review(review):
    cleaned = tf.strings.lower(review)
    cleaned = tf.strings.regex_replace(cleaned, '<[^>]+>', '')
    cleaned = tf.strings.regex_replace(cleaned, f'[{re.escape(string.punctuation)}]', '')
    
    return cleaned

print(sample_review, clean_review(sample_review), sep='\n\n')

b'After, I watched the films... I thought, "Why the heck was this film such a high success in the Korean Box Office?" Even thought the movie had a clever/unusal scenario, the acting wasn\'t that good and the characters weren\'t very interesting. For a Korean movie... I liked the fighting scenes. If you want to watch a film without thinking, this is the film for you. But I got to admit... the film was kind of childish... 6/10'

tf.Tensor(b'after i watched the films i thought why the heck was this film such a high success in the korean box office even thought the movie had a cleverunusal scenario the acting wasnt that good and the characters werent very interesting for a korean movie i liked the fighting scenes if you want to watch a film without thinking this is the film for you but i got to admit the film was kind of childish 610', shape=(), dtype=string)


### Define layer.

In [17]:
vectorize_layer = TextVectorization(
    max_tokens=MAX_TOKENS,
    standardize=clean_review,
    split='whitespace',
    ngrams=None,
    output_mode='int',
    output_sequence_length=OUTPUT_SEQUENCE_LENGTH,
    encoding='utf-8',
)

### Extract just the reviews (drop labels).

In [18]:
training_ds = raw_training_ds.map(lambda review, label: review)

### Compute the vocabulary.

In [142]:
# Vocab is ordered (most frequent words first).
vectorize_layer.adapt(training_ds)
vectorize_layer.get_vocabulary()[:10]

['', '[UNK]', 'the', 'and', 'a', 'of', 'to', 'is', 'in', 'it']

### Vectorize a sample review.

In [143]:
# The review printed below is before preprocessing 
# that is before lowercasing, removing punctuation and html.
sample_review = list(training_ds.as_numpy_iterator())[0][0]
display(sample_review)
display(vectorize_layer(sample_review))

b'This is a wonderful movie in a lot of ways. Everyone in my family enjoyed it. The animation is excellent and easily demonstrates that there are plenty of producers who create films that are as visually brilliant as anything that comes from the Disney Studio.<br /><br />One difference from the normal Disney fare is that this Dreamworks movie does not feature some wise-cracking side kick for comedy relief. And, there are no sudden moments where the characters break into song. I am sure that a scene at the beginning of the film would not appear in a Disney picture: the birth of Spirit. But it is done tastefully and is not offensive at all. "Spirit" was a great breath of fresh air. Don\'t get me wrong. I have loved Disney for years and will continue to do so. <br /><br />"Spirit" is another example of great animated fare. As soon as it was over, my kids wanted to watch it again. I had the same feeling. I thoroughly recommend it.'

<tf.Tensor: shape=(250,), dtype=int64, numpy=
array([  10,    7,    4,  368,   17,    8,    4,  168,    5,  745,  302,
          8,   58,  224,  493,    9,    2,  730,    7,  311,    3,  678,
       5277,   12,   48,   23,  939,    5, 1156,   35,  966,   96,   12,
         23,   14, 1965,  522,   14,  226,   12,  250,   36,    2,  900,
          1, 1383,   36,    2, 1196,  900, 2470,    7,   12,   10,    1,
         17,  120,   21,  788,   46, 9125,  505, 2164,   16,  220, 2137,
          3,   48,   23,   56, 1974,  388,  112,    2,  101,  987,   78,
        609,   11,  236,  240,   12,    4,  131,   30,    2,  443,    5,
          2,   19,   57,   21,  925,    8,    4,  900,  430,    2, 2435,
          5, 1071,   18,    9,    7,  219,    1,    3,    7,   21, 2320,
         30,   31, 1071,   13,    4,   82, 2939,    5, 1488, 1003,   89,
         76,   69,  355,   11,   25,  425,  900,   16,  147,    3,   74,
       1719,    6,   81,   38, 1071,    7,  154,  441,    5,   82, 1084,
     

## DEMO `TextVectorization`

### Define a Vectorizer layer.

In [154]:
vectorizer = TextVectorization(name='dummy_vectorizer')
texts = [
	"She she doesn’t study German on Monday.",
	"Does she live in Paris?",
	"He doesn’t teach math.",
	"Cats hate water.",
	# "Every child likes an ice cream.",
	# "My brother takes out the trash.",
	# "The course starts next Sunday.",
	# "She swims every morning.",
	# "I don’t wash the dishes.",
	# "We see them every week.",
	# "I don’t like tea.",
	# "When does the train usually leave?",
	# "She always forgets her purse.",
	# "You don’t have children.",
	# "I and my sister don’t see each other anymore.",
	# "They don’t go to school tomorrow.",
]

### Teach vocab.

In [155]:
vectorizer.adapt(texts)

### Display some attributes.

In [156]:
# `[UNK]` =  unknown word
display(vectorizer.vocabulary_size())
display(vectorizer.dtype)
display(vectorizer.name)
display(vectorizer.get_vocabulary()[:8])

18

'string'

'dummy_vectorizer'

['', '[UNK]', 'she', 'doesn’t', 'water', 'teach', 'study', 'paris']

### Default integer encoding

In [157]:
# Default integer encoding: integer indices, one integer index per split string token.
# Each sentence need to be a separate record -> `tf.expand_dims()`.
display(texts)
display(vectorizer(tf.expand_dims(texts, -1)))

['She she doesn’t study German on Monday.',
 'Does she live in Paris?',
 'He doesn’t teach math.',
 'Cats hate water.']

<tf.Tensor: shape=(4, 7), dtype=int64, numpy=
array([[ 2,  2,  3,  6, 15,  8,  9],
       [16,  2, 11, 12,  7,  0,  0],
       [13,  3,  5, 10,  0,  0,  0],
       [17, 14,  4,  0,  0,  0,  0]], dtype=int64)>

### Cap the number of tokens

In [158]:
# Cap the number of tokens -> a lot of `[UNK]` (unknown words).
vectorizer = TextVectorization(max_tokens=8)
vectorizer.adapt(texts)
display(vectorizer.get_vocabulary())

display(texts)
display(vectorizer(tf.expand_dims(texts, -1)))

['', '[UNK]', 'she', 'doesn’t', 'water', 'teach', 'study', 'paris']

['She she doesn’t study German on Monday.',
 'Does she live in Paris?',
 'He doesn’t teach math.',
 'Cats hate water.']

<tf.Tensor: shape=(4, 7), dtype=int64, numpy=
array([[2, 2, 3, 6, 1, 1, 1],
       [1, 2, 1, 1, 7, 0, 0],
       [1, 3, 5, 1, 0, 0, 0],
       [1, 1, 4, 0, 0, 0, 0]], dtype=int64)>

### Increase the len of the output encoding.

In [159]:
# Increase the len of the output encoding -> trailing zeros.
vectorizer = TextVectorization(max_tokens=8, output_sequence_length=20)
vectorizer.adapt(texts)
display(vectorizer.get_vocabulary())

display(texts)
display(vectorizer(tf.expand_dims(texts, -1)))

['', '[UNK]', 'she', 'doesn’t', 'water', 'teach', 'study', 'paris']

['She she doesn’t study German on Monday.',
 'Does she live in Paris?',
 'He doesn’t teach math.',
 'Cats hate water.']

<tf.Tensor: shape=(4, 20), dtype=int64, numpy=
array([[2, 2, 3, 6, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 2, 1, 1, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 3, 5, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
      dtype=int64)>

### Add ngrams

In [160]:
# Add ngrams -> vocabulary size increases.
vectorizer = TextVectorization(ngrams=2)
vectorizer.adapt(texts)
display(vectorizer.get_vocabulary())

display(texts)
display(vectorizer(tf.expand_dims(texts, -1)))

['',
 '[UNK]',
 'she',
 'doesn’t',
 'water',
 'teach math',
 'teach',
 'study german',
 'study',
 'she she',
 'she live',
 'she doesn’t',
 'paris',
 'on monday',
 'on',
 'monday',
 'math',
 'live in',
 'live',
 'in paris',
 'in',
 'he doesn’t',
 'he',
 'hate water',
 'hate',
 'german on',
 'german',
 'doesn’t teach',
 'doesn’t study',
 'does she',
 'does',
 'cats hate',
 'cats']

['She she doesn’t study German on Monday.',
 'Does she live in Paris?',
 'He doesn’t teach math.',
 'Cats hate water.']

<tf.Tensor: shape=(4, 13), dtype=int64, numpy=
array([[ 2,  2,  3,  8, 26, 14, 15,  9, 11, 28,  7, 25, 13],
       [30,  2, 18, 20, 12, 29, 10, 17, 19,  0,  0,  0,  0],
       [22,  3,  6, 16, 21, 27,  5,  0,  0,  0,  0,  0,  0],
       [32, 24,  4, 31, 23,  0,  0,  0,  0,  0,  0,  0,  0]], dtype=int64)>

### One-hot encode

In [161]:
# One-hot encode each text (sentence).
vectorizer = TextVectorization(output_mode='multi_hot')
vectorizer.adapt(texts)
display(vectorizer.get_vocabulary())

display(texts)
display(vectorizer(tf.expand_dims(texts, -1)))

['[UNK]',
 'she',
 'doesn’t',
 'water',
 'teach',
 'study',
 'paris',
 'on',
 'monday',
 'math',
 'live',
 'in',
 'he',
 'hate',
 'german',
 'does',
 'cats']

['She she doesn’t study German on Monday.',
 'Does she live in Paris?',
 'He doesn’t teach math.',
 'Cats hate water.']

<tf.Tensor: shape=(4, 17), dtype=float32, numpy=
array([[0., 1., 1., 0., 0., 1., 0., 1., 1., 0., 0., 0., 0., 0., 1., 0.,
        0.],
       [0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 1., 1., 0., 0., 0., 1.,
        0.],
       [0., 0., 1., 0., 1., 0., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0.,
        0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
        1.]], dtype=float32)>

### `output_mode='count'`

In [162]:
vectorizer = TextVectorization(output_mode='count')
vectorizer.adapt(texts)
display(vectorizer.get_vocabulary())

display(texts)
display(vectorizer(tf.expand_dims(texts, -1)))

['[UNK]',
 'she',
 'doesn’t',
 'water',
 'teach',
 'study',
 'paris',
 'on',
 'monday',
 'math',
 'live',
 'in',
 'he',
 'hate',
 'german',
 'does',
 'cats']

['She she doesn’t study German on Monday.',
 'Does she live in Paris?',
 'He doesn’t teach math.',
 'Cats hate water.']

<tf.Tensor: shape=(4, 17), dtype=float32, numpy=
array([[0., 2., 1., 0., 0., 1., 0., 1., 1., 0., 0., 0., 0., 0., 1., 0.,
        0.],
       [0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 1., 1., 0., 0., 0., 1.,
        0.],
       [0., 0., 1., 0., 1., 0., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0.,
        0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
        1.]], dtype=float32)>

### One-hot encoding with TF-IDF instead of ints. 

In [163]:
vectorizer = TextVectorization(output_mode='tf-idf')
vectorizer.adapt(texts)
display(vectorizer.get_vocabulary())

display(texts)
display(vectorizer(tf.expand_dims(texts, -1)))

['[UNK]',
 'she',
 'doesn’t',
 'water',
 'teach',
 'study',
 'paris',
 'on',
 'monday',
 'math',
 'live',
 'in',
 'he',
 'hate',
 'german',
 'does',
 'cats']

['She she doesn’t study German on Monday.',
 'Does she live in Paris?',
 'He doesn’t teach math.',
 'Cats hate water.']

<tf.Tensor: shape=(4, 17), dtype=float32, numpy=
array([[0.        , 1.6945957 , 0.84729785, 0.        , 0.        ,
        1.0986123 , 0.        , 1.0986123 , 1.0986123 , 0.        ,
        0.        , 0.        , 0.        , 0.        , 1.0986123 ,
        0.        , 0.        ],
       [0.        , 0.84729785, 0.        , 0.        , 0.        ,
        0.        , 1.0986123 , 0.        , 0.        , 0.        ,
        1.0986123 , 1.0986123 , 0.        , 0.        , 0.        ,
        1.0986123 , 0.        ],
       [0.        , 0.        , 0.84729785, 0.        , 1.0986123 ,
        0.        , 0.        , 0.        , 0.        , 1.0986123 ,
        0.        , 0.        , 1.0986123 , 0.        , 0.        ,
        0.        , 0.        ],
       [0.        , 0.        , 0.        , 1.0986123 , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 1.0986123 , 0.        ,
        0.        , 1.0986123 ]], dt

## DEMO `tf.expand_dims()`

In [106]:
display(tf.expand_dims([1, 2], -1))
display(tf.expand_dims([1, 2], 0))
display(tf.expand_dims(tf.expand_dims([1, 2], 0), 0))
display(tf.expand_dims(tf.expand_dims([1, 2], -1), -1))

<tf.Tensor: shape=(2, 1), dtype=int32, numpy=
array([[1],
       [2]])>

<tf.Tensor: shape=(1, 2), dtype=int32, numpy=array([[1, 2]])>

<tf.Tensor: shape=(1, 1, 2), dtype=int32, numpy=array([[[1, 2]]])>

<tf.Tensor: shape=(2, 1, 1), dtype=int32, numpy=
array([[[1]],

       [[2]]])>

In [97]:
tf.expand_dims('asd', -1)

<tf.Tensor: shape=(1,), dtype=string, numpy=array([b'asd'], dtype=object)>