Deep learing by Python: 
Chapter 11: Deep learning for text
Page no: 334

In [1]:
# !curl -O https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
# !tar -xf aclImdb_v1.tar.gz

In [2]:
# !rm -r aclImdb/train/unsup

In [3]:
# import os, pathlib, shutil, random

# base_dir = pathlib.Path("aclImdb")
# val_dir = base_dir / "val"
# train_dir = base_dir / "train"
# for category in ("neg", "pos"):
#     os.makedirs(val_dir / category)
#     files = os.listdir(train_dir / category)
#     random.Random(1337).shuffle(files)
#     num_val_samples = int(0.2 * len(files))
#     val_files = files[-num_val_samples:]
#     for fname in val_files:
#         shutil.move(train_dir / category / fname,
#                     val_dir / category / fname)

In [4]:
from tensorflow import keras

batch_size = 32

train_ds = keras.utils.text_dataset_from_directory("aclImdb/train",batch_size=batch_size)
val_ds = keras.utils.text_dataset_from_directory("aclImdb/val", batch_size=batch_size)
test_ds = keras.utils.text_dataset_from_directory("aclImdb/test", batch_size=batch_size)

2024-12-20 11:29:18.258249: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-12-20 11:29:19.618512: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-12-20 11:29:24.303589: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Found 20000 files belonging to 2 classes.
Found 5000 files belonging to 2 classes.
Found 25000 files belonging to 2 classes.


In [5]:
type(train_ds)

tensorflow.python.data.ops.prefetch_op._PrefetchDataset

In [6]:
for inputs, targets in train_ds:
    print("inputs.shape:", inputs.shape)
    print("inputs.dtype:", inputs.dtype)
    print("targets.shape:", targets.shape)
    print(f'targets.dtype: {targets.dtype}')
    print("inputs[0]:", inputs[0])
    print("targets[0]:", targets[0])
    break

inputs.shape: (32,)
inputs.dtype: <dtype: 'string'>
targets.shape: (32,)
targets.dtype: <dtype: 'int32'>
inputs[0]: tf.Tensor(b'This Raggedy Ann and Andy Movie is so adorable. We love watching Ann and Andy sing and dance, along with the camel with the wrinkled knees. This movie is what made the Camel with the Wrinkled Knees so popular, singing his song, "I\'m nobodies I Love You". If you love Raggedy Ann and Andy Watch the movie and you will see why it\'s a movie the kids love, and adults!', shape=(), dtype=string)
targets[0]: tf.Tensor(1, shape=(), dtype=int32)


In [7]:
from tensorflow.keras.layers import TextVectorization

text_vectorization = TextVectorization(max_tokens=20000, output_mode="multi_hot")

txt_only_train_ds = train_ds.map(lambda x, y: x)
text_vectorization.adapt(txt_only_train_ds)

bin_1gram_train_ds = train_ds.map(lambda x, y: (text_vectorization(x), y), num_parallel_calls=4)
bin_1gram_val_ds = val_ds.map(lambda x, y: (text_vectorization(x), y), num_parallel_calls=4)
bin_1gram_test_ds = test_ds.map(lambda x, y: (text_vectorization(x), y), num_parallel_calls=4)

for inputs, targets in bin_1gram_train_ds:
    print("inputs.shape:", inputs.shape)
    print("inputs.dtype:", inputs.dtype)
    print("targets.shape:", targets.shape)
    print(f'targets.dtype: {targets.dtype}')
    print("inputs[0]:", inputs[0])
    print("targets[0]:", targets[0])
    break

2024-12-20 11:30:54.096864: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


inputs.shape: (32, 20000)
inputs.dtype: <dtype: 'int64'>
targets.shape: (32,)
targets.dtype: <dtype: 'int32'>
inputs[0]: tf.Tensor([1 1 1 ... 0 0 0], shape=(20000,), dtype=int64)
targets[0]: tf.Tensor(1, shape=(), dtype=int32)


In [8]:
from tensorflow import keras
from tensorflow.keras import layers

def get_model(max_token=2000, hidden_units=16):
    inputs = keras.Input(shape=(max_token,), dtype="int64")
    x = layers.Dense(hidden_units, activation="relu")(inputs)
    x = layers.Dropout(0.5)(x)
    outputs = layers.Dense(1, activation="sigmoid")(x)
    model = keras.Model(inputs, outputs)
    model.compile(optimizer="rmsprop",
                  loss="binary_crossentropy",
                  metrics=["accuracy"])
    return model

model = get_model()
model.summary()

In [9]:
callbacks = [ keras.callbacks.ModelCheckpoint("binary_1gram.keras", save_best_only=True)]
model.fit(bin_1gram_train_ds.cache(),
           validation_data=bin_1gram_val_ds.cache(),
             epochs=10, callbacks=callbacks) 
model = keras.models.load_model("binary_1gram.keras")
print(f"Test acc: {model.evaluate(bin_1gram_test_ds)[1]:.3f}")

Epoch 1/10


ValueError: Input 0 of layer "functional_1" is incompatible with the layer: expected shape=(None, 2000), found shape=(None, 20000)