In [1]:
%matplotlib inline

In [86]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import os
import re
import string
from IPython.display import display

import tensorflow as tf
from tensorflow.keras.utils import text_dataset_from_directory
from tensorflow.keras.layers import TextVectorization, Input
from tensorflow.keras.models import Sequential

# Basic Text Classification (repeat)

## Constants

In [219]:
SEED = 123
BATCH_SIZE = 32
MAX_TOKENS = 10_000  # the vocabulary would contain max of 10K words+ngrams
OUTPUT_SEQUENCE_LENGTH = 250  # each text(record) shall be limited to 250 words(+ngrams)
AUTOTUNE = tf.data.AUTOTUNE
display(AUTOTUNE)

-1

## Load the dataset

### Folder

In [169]:
movies_path = os.path.join(
    os.environ['HOME'],
    'Desktop/datasets/aclImdb',
)

os.listdir(movies_path)

['imdb.vocab', 'imdbEr.txt', 'README', 'test', 'train']

### Integrated vocabulary.

In [170]:
with open(os.path.join(movies_path, 'imdb.vocab'), encoding='ISO-8859-1') as f:
    vocab = [v.strip() for v in f.readlines()[:10]]
    print(', '.join(vocab))

the, and, a, of, to, is, it, in, i, this


### Sample review.

In [171]:
path_train_positive = os.path.join(movies_path, 'train', 'pos')
a_file = os.listdir(path_train_positive)[123]
with open(os.path.join(path_train_positive, a_file)) as f:
    print(f.read())

Any story comprises a premise, characters and conflict. Characters plotting their own play promises triumph, and a militant character readily lends oneself to this. Ardh Satya's premise is summarized by the poem of the same name scripted by Dilip Chitre. The line goes - "ek palde mein napunsaktha, doosre palde mein paurush, aur teek tarazu ke kaante par, ardh satya ?". A rough translation - "The delicate balance of right & wrong ( commonly seen on the busts of blind justice in the courts ) has powerlessness on one plate and prowess on another. Is the needle on the center a half-truth ? "<br /><br />The poem is recited midway in the film by Smita Patil to Om Puri at a resturant. It makes a deep impact on the protagonist & lays the foundation for much of the later events that follow. At the end of the film, Om Puri ends up in exactly the same situation described so aptly in the poem.<br /><br />The film tries mighty hard to do a one-up on the poem. However, Chitre's words are too powerfu

### Directories, train and test.

In [172]:
path_train = os.path.join(movies_path, 'train')
path_test = os.path.join(movies_path, 'test')

### Datasets, training and val.

In [173]:
raw_training_ds, raw_val_ds = text_dataset_from_directory(
    directory=path_train,
    labels='inferred',
    label_mode='int',
    batch_size=BATCH_SIZE,
    max_length=None,
    shuffle=True,
    seed=SEED,
    validation_split=0.2,
    subset='both',
)

raw_training_ds, raw_val_ds

Found 25000 files belonging to 2 classes.
Using 20000 files for training.
Using 5000 files for validation.


(<BatchDataset element_spec=(TensorSpec(shape=(None,), dtype=tf.string, name=None), TensorSpec(shape=(None,), dtype=tf.int32, name=None))>,
 <BatchDataset element_spec=(TensorSpec(shape=(None,), dtype=tf.string, name=None), TensorSpec(shape=(None,), dtype=tf.int32, name=None))>)

### Test: number of records.

In [174]:
n_training = raw_training_ds.cardinality().numpy() * BATCH_SIZE
n_validation = raw_val_ds.cardinality().numpy() * BATCH_SIZE

n_training, n_validation

(20000, 5024)

### Test: sample review.

In [175]:
for texts_batch, labels_batch in raw_training_ds.take(1):
    sample_review = texts_batch[0].numpy()
    sample_label = labels_batch[0].numpy()
    print(sample_review, sample_label, sep='\n')

b'After, I watched the films... I thought, "Why the heck was this film such a high success in the Korean Box Office?" Even thought the movie had a clever/unusal scenario, the acting wasn\'t that good and the characters weren\'t very interesting. For a Korean movie... I liked the fighting scenes. If you want to watch a film without thinking, this is the film for you. But I got to admit... the film was kind of childish... 6/10'
1


### Dataset, test.

In [176]:
display(os.path.exists(path_test))

raw_test_ds = text_dataset_from_directory(
    directory=path_test,
    labels='inferred',
    label_mode='int',
    class_names=None,
    batch_size=BATCH_SIZE,
    max_length=None,
    shuffle=True,
    seed=SEED,
)

True

Found 25000 files belonging to 2 classes.


In [177]:
raw_test_ds.class_names

['neg', 'pos']

## Prepare the dataset for training

### Preprocessing
Implement a function to clean each review.

In [178]:
def clean_review(review):
    cleaned = tf.strings.lower(review)
    cleaned = tf.strings.regex_replace(cleaned, '<[^>]+>', '')
    cleaned = tf.strings.regex_replace(cleaned, f'[{re.escape(string.punctuation)}]', '')
    
    return cleaned

print(sample_review, clean_review(sample_review), sep='\n\n')

b'After, I watched the films... I thought, "Why the heck was this film such a high success in the Korean Box Office?" Even thought the movie had a clever/unusal scenario, the acting wasn\'t that good and the characters weren\'t very interesting. For a Korean movie... I liked the fighting scenes. If you want to watch a film without thinking, this is the film for you. But I got to admit... the film was kind of childish... 6/10'

tf.Tensor(b'after i watched the films i thought why the heck was this film such a high success in the korean box office even thought the movie had a cleverunusal scenario the acting wasnt that good and the characters werent very interesting for a korean movie i liked the fighting scenes if you want to watch a film without thinking this is the film for you but i got to admit the film was kind of childish 610', shape=(), dtype=string)


### `TextVectorization` layer

In [179]:
vectorize_layer = TextVectorization(
    max_tokens=MAX_TOKENS,
    standardize=clean_review,
    split='whitespace',
    ngrams=None,
    output_mode='int',
    output_sequence_length=OUTPUT_SEQUENCE_LENGTH,
    encoding='utf-8',
)

### Datasets, only input

In [189]:
raw_training_ds

<BatchDataset element_spec=(TensorSpec(shape=(None,), dtype=tf.string, name=None), TensorSpec(shape=(None,), dtype=tf.int32, name=None))>

In [186]:
train_data = raw_training_ds.map(lambda review, label: review)
# train_labels = raw_training_ds.map(lambda review, label: review)

In [187]:
train_data

<MapDataset element_spec=TensorSpec(shape=(None,), dtype=tf.string, name=None)>

In [188]:
raw_training_ds

<BatchDataset element_spec=(TensorSpec(shape=(None,), dtype=tf.string, name=None), TensorSpec(shape=(None,), dtype=tf.int32, name=None))>

### Vocabulary

In [181]:
# Compute vocab.
# Vocab is ordered (most frequent words first).
vectorize_layer.adapt(training_ds)
vectorize_layer.get_vocabulary()[:10]

['', '[UNK]', 'the', 'and', 'a', 'of', 'to', 'is', 'in', 'it']

### Test: sample review

In [182]:
# The review printed below is before preprocessing 
# that is before lowercasing, removing punctuation and html.
sample_review = list(training_ds.as_numpy_iterator())[0][0]
display(sample_review)
display(vectorize_layer(sample_review))

b"This Documentary (Now available free on Video.Google.Com) is a fantastic demonstration of the power of ordinary people to overcome injustice. Everyone must see this.<br /><br />Chavez was elected in a landslide vote in 1998. His platform was to divert the fantastic oil wealth from the 20% middle class to the 80% poor. He banned foreign drift net fishing in Venezuelan waters. He sent 10,000 Cuban doctors to the slums to treat the sick for free. He wiped out illiteracy and set up new free Universities. <br /><br />But it was his 30% tax on oil company profits that got him in trouble with the Bush administration. In 2002, while Irish film makers Kim Bartley and Donnacha O'Briain were interviewing Chavez inside the Presidential Palace about his social programs, a CIA backed coup was launched. With the cameras rolling, Chavez was captured and flown out of the country. It was announced on national TV that he had 'resigned'.<br /><br />But the poor of Venezuela didn't believe the media. The

<tf.Tensor: shape=(250,), dtype=int64, numpy=
array([  10,  676,  158, 1412,  962,   20,    1,    7,    4,  768,    1,
          5,    2,  664,    5, 1809,   77,    6, 3042, 8554,  302,  217,
         65,    1,   13, 8402,    8,    4,    1, 2187,    8, 6825,   24,
       7397,   13,    6,    1,    2,  768, 3340, 3671,   36,    2,  955,
        759,  797,    6,    2, 3461,  339,   27, 3921, 2060, 8011, 5767,
       5627,    8,    1, 3956,   27, 1345,    1, 6336, 3835,    6,    2,
          1,    6, 1656,    2, 1187,   16,  962,   27, 6585,   44,    1,
          3,  272,   54,  153,  962,    1,   18,    9,   13,   24, 1183,
       6840,   20, 3340, 1145, 9978,   12,  183,   88,    8, 1137,   15,
          2, 3458, 9612,    8, 3847,  136, 2524,   19, 1346, 2560,    1,
          3,    1,    1,   66,    1, 6344,  979,    2, 8951, 5388,   42,
         24, 1007, 5528,    4, 3600, 7342, 5801,   13,    1,   15,    2,
       3948, 2698, 6344,   13, 1798,    3,    1,   44,    5,    2,  694,
     

### Final datasets

Contain both input (vectorized amazon review) and the label.

In [211]:
def vectorize_ds(review, label):
    review = vectorize_layer(review)
    return review, label

train_ds = raw_training_ds.map(vectorize_ds)
val_ds = raw_val_ds.map(vectorize_ds)
test_ds = raw_test_ds.map(vectorize_ds)

display(train_ds)
display(val_ds)
display(test_ds)

<MapDataset element_spec=(TensorSpec(shape=(None, 250), dtype=tf.int64, name=None), TensorSpec(shape=(None,), dtype=tf.int32, name=None))>

<MapDataset element_spec=(TensorSpec(shape=(None, 250), dtype=tf.int64, name=None), TensorSpec(shape=(None,), dtype=tf.int32, name=None))>

<MapDataset element_spec=(TensorSpec(shape=(None, 250), dtype=tf.int64, name=None), TensorSpec(shape=(None,), dtype=tf.int32, name=None))>

### Test: train batch

In [196]:
first_train_batch = next(iter(train_ds))
first_train_batch

(<tf.Tensor: shape=(32, 250), dtype=int64, numpy=
 array([[1872,  704,    6, ...,    0,    0,    0],
        [  11,   25,  200, ...,    0,    0,    0],
        [   5,   31,    2, ...,    0,    0,    0],
        ...,
        [  48,   23,  105, ...,  184,   12,   24],
        [   1,   10, 4302, ...,    0,    0,    0],
        [  10,   17,    7, ...,    0,    0,    0]], dtype=int64)>,
 <tf.Tensor: shape=(32,), dtype=int32, numpy=
 array([0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
        1, 0, 1, 0, 1, 1, 0, 0, 0, 0])>)

## Configure for performance

In [259]:
train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)
test_ds = test_ds.cache().prefetch(buffer_size=AUTOTUNE)
val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)

## DEMO `TextVectorization`

### Define a Vectorizer layer.

In [154]:
vectorizer = TextVectorization(name='dummy_vectorizer')
texts = [
	"She she doesn’t study German on Monday.",
	"Does she live in Paris?",
	"He doesn’t teach math.",
	"Cats hate water.",
	# "Every child likes an ice cream.",
	# "My brother takes out the trash.",
	# "The course starts next Sunday.",
	# "She swims every morning.",
	# "I don’t wash the dishes.",
	# "We see them every week.",
	# "I don’t like tea.",
	# "When does the train usually leave?",
	# "She always forgets her purse.",
	# "You don’t have children.",
	# "I and my sister don’t see each other anymore.",
	# "They don’t go to school tomorrow.",
]

### Teach vocab.

In [155]:
vectorizer.adapt(texts)

### Display some attributes.

In [156]:
# `[UNK]` =  unknown word
display(vectorizer.vocabulary_size())
display(vectorizer.dtype)
display(vectorizer.name)
display(vectorizer.get_vocabulary()[:8])

18

'string'

'dummy_vectorizer'

['', '[UNK]', 'she', 'doesn’t', 'water', 'teach', 'study', 'paris']

### Default integer encoding

In [157]:
# Default integer encoding: integer indices, one integer index per split string token.
# Each sentence need to be a separate record -> `tf.expand_dims()`.
display(texts)
display(vectorizer(tf.expand_dims(texts, -1)))

['She she doesn’t study German on Monday.',
 'Does she live in Paris?',
 'He doesn’t teach math.',
 'Cats hate water.']

<tf.Tensor: shape=(4, 7), dtype=int64, numpy=
array([[ 2,  2,  3,  6, 15,  8,  9],
       [16,  2, 11, 12,  7,  0,  0],
       [13,  3,  5, 10,  0,  0,  0],
       [17, 14,  4,  0,  0,  0,  0]], dtype=int64)>

### Cap the number of tokens

In [158]:
# Cap the number of tokens -> a lot of `[UNK]` (unknown words).
vectorizer = TextVectorization(max_tokens=8)
vectorizer.adapt(texts)
display(vectorizer.get_vocabulary())

display(texts)
display(vectorizer(tf.expand_dims(texts, -1)))

['', '[UNK]', 'she', 'doesn’t', 'water', 'teach', 'study', 'paris']

['She she doesn’t study German on Monday.',
 'Does she live in Paris?',
 'He doesn’t teach math.',
 'Cats hate water.']

<tf.Tensor: shape=(4, 7), dtype=int64, numpy=
array([[2, 2, 3, 6, 1, 1, 1],
       [1, 2, 1, 1, 7, 0, 0],
       [1, 3, 5, 1, 0, 0, 0],
       [1, 1, 4, 0, 0, 0, 0]], dtype=int64)>

### Increase the len of the output encoding.

In [159]:
# Increase the len of the output encoding -> trailing zeros.
vectorizer = TextVectorization(max_tokens=8, output_sequence_length=20)
vectorizer.adapt(texts)
display(vectorizer.get_vocabulary())

display(texts)
display(vectorizer(tf.expand_dims(texts, -1)))

['', '[UNK]', 'she', 'doesn’t', 'water', 'teach', 'study', 'paris']

['She she doesn’t study German on Monday.',
 'Does she live in Paris?',
 'He doesn’t teach math.',
 'Cats hate water.']

<tf.Tensor: shape=(4, 20), dtype=int64, numpy=
array([[2, 2, 3, 6, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 2, 1, 1, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 3, 5, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
      dtype=int64)>

### Add ngrams

In [160]:
# Add ngrams -> vocabulary size increases.
vectorizer = TextVectorization(ngrams=2)
vectorizer.adapt(texts)
display(vectorizer.get_vocabulary())

display(texts)
display(vectorizer(tf.expand_dims(texts, -1)))

['',
 '[UNK]',
 'she',
 'doesn’t',
 'water',
 'teach math',
 'teach',
 'study german',
 'study',
 'she she',
 'she live',
 'she doesn’t',
 'paris',
 'on monday',
 'on',
 'monday',
 'math',
 'live in',
 'live',
 'in paris',
 'in',
 'he doesn’t',
 'he',
 'hate water',
 'hate',
 'german on',
 'german',
 'doesn’t teach',
 'doesn’t study',
 'does she',
 'does',
 'cats hate',
 'cats']

['She she doesn’t study German on Monday.',
 'Does she live in Paris?',
 'He doesn’t teach math.',
 'Cats hate water.']

<tf.Tensor: shape=(4, 13), dtype=int64, numpy=
array([[ 2,  2,  3,  8, 26, 14, 15,  9, 11, 28,  7, 25, 13],
       [30,  2, 18, 20, 12, 29, 10, 17, 19,  0,  0,  0,  0],
       [22,  3,  6, 16, 21, 27,  5,  0,  0,  0,  0,  0,  0],
       [32, 24,  4, 31, 23,  0,  0,  0,  0,  0,  0,  0,  0]], dtype=int64)>

### One-hot encode

In [161]:
# One-hot encode each text (sentence).
vectorizer = TextVectorization(output_mode='multi_hot')
vectorizer.adapt(texts)
display(vectorizer.get_vocabulary())

display(texts)
display(vectorizer(tf.expand_dims(texts, -1)))

['[UNK]',
 'she',
 'doesn’t',
 'water',
 'teach',
 'study',
 'paris',
 'on',
 'monday',
 'math',
 'live',
 'in',
 'he',
 'hate',
 'german',
 'does',
 'cats']

['She she doesn’t study German on Monday.',
 'Does she live in Paris?',
 'He doesn’t teach math.',
 'Cats hate water.']

<tf.Tensor: shape=(4, 17), dtype=float32, numpy=
array([[0., 1., 1., 0., 0., 1., 0., 1., 1., 0., 0., 0., 0., 0., 1., 0.,
        0.],
       [0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 1., 1., 0., 0., 0., 1.,
        0.],
       [0., 0., 1., 0., 1., 0., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0.,
        0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
        1.]], dtype=float32)>

### `output_mode='count'`

In [162]:
vectorizer = TextVectorization(output_mode='count')
vectorizer.adapt(texts)
display(vectorizer.get_vocabulary())

display(texts)
display(vectorizer(tf.expand_dims(texts, -1)))

['[UNK]',
 'she',
 'doesn’t',
 'water',
 'teach',
 'study',
 'paris',
 'on',
 'monday',
 'math',
 'live',
 'in',
 'he',
 'hate',
 'german',
 'does',
 'cats']

['She she doesn’t study German on Monday.',
 'Does she live in Paris?',
 'He doesn’t teach math.',
 'Cats hate water.']

<tf.Tensor: shape=(4, 17), dtype=float32, numpy=
array([[0., 2., 1., 0., 0., 1., 0., 1., 1., 0., 0., 0., 0., 0., 1., 0.,
        0.],
       [0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 1., 1., 0., 0., 0., 1.,
        0.],
       [0., 0., 1., 0., 1., 0., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0.,
        0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
        1.]], dtype=float32)>

### One-hot encoding with TF-IDF instead of ints. 

In [163]:
vectorizer = TextVectorization(output_mode='tf-idf')
vectorizer.adapt(texts)
display(vectorizer.get_vocabulary())

display(texts)
display(vectorizer(tf.expand_dims(texts, -1)))

['[UNK]',
 'she',
 'doesn’t',
 'water',
 'teach',
 'study',
 'paris',
 'on',
 'monday',
 'math',
 'live',
 'in',
 'he',
 'hate',
 'german',
 'does',
 'cats']

['She she doesn’t study German on Monday.',
 'Does she live in Paris?',
 'He doesn’t teach math.',
 'Cats hate water.']

<tf.Tensor: shape=(4, 17), dtype=float32, numpy=
array([[0.        , 1.6945957 , 0.84729785, 0.        , 0.        ,
        1.0986123 , 0.        , 1.0986123 , 1.0986123 , 0.        ,
        0.        , 0.        , 0.        , 0.        , 1.0986123 ,
        0.        , 0.        ],
       [0.        , 0.84729785, 0.        , 0.        , 0.        ,
        0.        , 1.0986123 , 0.        , 0.        , 0.        ,
        1.0986123 , 1.0986123 , 0.        , 0.        , 0.        ,
        1.0986123 , 0.        ],
       [0.        , 0.        , 0.84729785, 0.        , 1.0986123 ,
        0.        , 0.        , 0.        , 0.        , 1.0986123 ,
        0.        , 0.        , 1.0986123 , 0.        , 0.        ,
        0.        , 0.        ],
       [0.        , 0.        , 0.        , 1.0986123 , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 1.0986123 , 0.        ,
        0.        , 1.0986123 ]], dt

## DEMO `tf.expand_dims()`

In [106]:
display(tf.expand_dims([1, 2], -1))
display(tf.expand_dims([1, 2], 0))
display(tf.expand_dims(tf.expand_dims([1, 2], 0), 0))
display(tf.expand_dims(tf.expand_dims([1, 2], -1), -1))

<tf.Tensor: shape=(2, 1), dtype=int32, numpy=
array([[1],
       [2]])>

<tf.Tensor: shape=(1, 2), dtype=int32, numpy=array([[1, 2]])>

<tf.Tensor: shape=(1, 1, 2), dtype=int32, numpy=array([[[1, 2]]])>

<tf.Tensor: shape=(2, 1, 1), dtype=int32, numpy=
array([[[1]],

       [[2]]])>

In [97]:
tf.expand_dims('asd', -1)

<tf.Tensor: shape=(1,), dtype=string, numpy=array([b'asd'], dtype=object)>

## DEMO Caching

In [218]:
# Make folder to store cache files.
cache_dir = '/tmp_cache'
if not os.path.exists(cache_dir):
    os.mkdir(cache_dir)

    
train_ds = train_ds.cache(cache_dir)
iterator = train_ds.as_numpy_iterator()

# Iterating happens here!
list(iterator)

UnknownError: {{function_node __wrapped__IteratorGetNext_output_types_2_device_/job:localhost/replica:0/task:0/device:CPU:0}} Failed to create a NewWriteableFile: /tmp_cache_0.lockfile : Access is denied.
; Input/output error [Op:IteratorGetNext]

## DEMO `tf.data`

In [220]:
tf.data.INFINITE_CARDINALITY

-1

In [221]:
tf.data.UNKNOWN_CARDINALITY

-2

In [222]:
tf.data.Dataset

tensorflow.python.data.ops.dataset_ops.DatasetV2

In [223]:
tf.data.experimental.AUTOTUNE

-1

In [None]:
tf.data.FixedLengthRecordDataset()

## DEMO Binary files

In [256]:
filename = 'dummy_text.txt'
some_text = 'ю'

with open(filename, 'w', encoding='utf-8') as f:
    f.write(some_text)
    
with open(filename, 'br') as f:
    print(f'Bytes stored in the file: \t{f.read()}')
    
print(f'Unicode value for {some_text}: \t\t{ord(some_text)}')
print(hex(ord(some_text)))

Bytes stored in the file: 	b'\xd1\x8e'
Unicode value for ю: 		1102
0x44e
