In [27]:
# Preprocessing text data for machine learning applications
# Bag-of-words approaches and sequence-modeling approaches for text processing
# The Transformer architecture
# Sequence-to-sequence learning

In [28]:
# preparing text data
# Standardize the text (convert to lowercase and remove punctuation), then vectorize it.
# tokenization
# convert each token to numeric vector, indexing all tokens present in the data.
# one hot encoding or embedding

In [29]:
# Text standardization
# tokenization

In [30]:
# Vocabulary indexing
# padding

In [31]:
# Preparing the IMDB movie reviews data

In [32]:
# Download the IMDB sentiment archive and unpack it; the cells below expect the
# extracted data under ./aclImdb.
# --fail: exit non-zero on an HTTP error instead of saving the server's error
#         page as aclImdb_v1.tar.gz (which would then make tar fail confusingly).
# --location: follow redirects in case the file is served from a new URL.
!curl --fail --location -O https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar -xf aclImdb_v1.tar.gz


 37 80.2M   37 29.9M    0     0   648k      0  0:02:06  0:00:47  0:01:19  733k

In [17]:
# Remove the train/unsup folder; only the neg/ and pos/ folders are used below.
# -f makes the cell safe to re-run once the directory has already been deleted.
!rm -rf aclImdb/train/unsup

In [None]:
# Print one review from the positive training folder to see what a raw sample looks like.
!cat aclImdb/train/pos/4077_10.txt

In [19]:
import os, pathlib, shutil, random

In [20]:
base_dir = pathlib.Path("aclImdb")
val_dir = base_dir / "val"
train_dir = base_dir / "train"

# Hold out 20% of the training reviews of each class as a validation set by
# moving the files from aclImdb/train/<category> to aclImdb/val/<category>.
for category in ("neg", "pos"):
    category_val_dir = val_dir / category
    if category_val_dir.exists() and any(category_val_dir.iterdir()):
        # The split was already made for this class. Running it again would
        # move a further 20% out of the training set, so leave it untouched.
        continue
    os.makedirs(category_val_dir, exist_ok=True)

    # os.listdir returns entries in arbitrary order; sort first so the seeded
    # shuffle selects the same validation files on every machine and run.
    files = sorted(os.listdir(train_dir / category))
    random.Random(1337).shuffle(files)
    num_val_samples = int(0.2 * len(files))
    # Slice from an explicit start index: files[-0:] would select *every*
    # file when num_val_samples is 0 (fewer than 5 files in the folder).
    val_files = files[len(files) - num_val_samples:]

    for fname in val_files:
        shutil.move(train_dir / category / fname, category_val_dir / fname)

In [None]:
from tensorflow import keras

batch_size = 32

# Build one dataset per split directory (train, val, test, in that order),
# all batched with the same batch size.
train_ds, val_ds, test_ds = (
    keras.utils.text_dataset_from_directory(split_dir, batch_size=batch_size)
    for split_dir in ("aclImdb/train", "aclImdb/val", "aclImdb/test")
)

In [None]:
# Look at the first batch of the raw dataset only: shapes and dtypes of the
# inputs and targets, followed by the first example of each.
for inputs, targets in train_ds:
    batch_report = (
        ("inputs.shape:", inputs.shape),
        ("inputs.dtype:", inputs.dtype),
        ("targets.shape:", targets.shape),
        ("targets.dtype:", targets.dtype),
        ("inputs[0]:", inputs[0]),
        ("targets[0]:", targets[0]),
    )
    for caption, value in batch_report:
        print(caption, value)
    break  # one batch is enough

In [None]:
# Configure a TextVectorization layer so that the datasets yield multi-hot encoded binary word vectors

In [23]:
from tensorflow.keras.layers import TextVectorization

# Multi-hot output with max_tokens=20000.
text_vectorization = TextVectorization(max_tokens=20000, output_mode="multi_hot")

# Strip the labels so that adapt() sees only the training texts.
text_only_train_ds = train_ds.map(lambda texts, _labels: texts)
text_vectorization.adapt(text_only_train_ds)


def _vectorize_batch(texts, labels):
    """Apply text_vectorization to a batch of texts; labels pass through unchanged."""
    return text_vectorization(texts), labels


# Apply the same vectorization to every split.
binary_1gram_train_ds = train_ds.map(_vectorize_batch, num_parallel_calls=4)
binary_1gram_val_ds = val_ds.map(_vectorize_batch, num_parallel_calls=4)
binary_1gram_test_ds = test_ds.map(_vectorize_batch, num_parallel_calls=4)

In [None]:
# Same inspection as for the raw data, now on the vectorized training set:
# print shapes, dtypes and the first example of the first batch.
for inputs, targets in binary_1gram_train_ds:
    batch_report = (
        ("inputs.shape:", inputs.shape),
        ("inputs.dtype:", inputs.dtype),
        ("targets.shape:", targets.shape),
        ("targets.dtype:", targets.dtype),
        ("inputs[0]:", inputs[0]),
        ("targets[0]:", targets[0]),
    )
    for caption, value in batch_report:
        print(caption, value)
    break  # one batch is enough

In [25]:
from tensorflow import keras
from tensorflow.keras import layers

def get_model(max_tokens=20000, hidden_dim=16):
    """Build and compile a small dense binary classifier.

    Args:
        max_tokens: Length of each input vector.
        hidden_dim: Number of units in the single hidden Dense layer.

    Returns:
        A compiled keras.Model: Dense(hidden_dim, relu) -> Dropout(0.5) ->
        Dense(1, sigmoid), using the rmsprop optimizer, binary cross-entropy
        loss and an accuracy metric.
    """
    vector_input = keras.Input(shape=(max_tokens,))

    hidden = layers.Dense(hidden_dim, activation="relu")(vector_input)
    hidden = layers.Dropout(0.5)(hidden)
    probability = layers.Dense(1, activation="sigmoid")(hidden)

    classifier = keras.Model(vector_input, probability)
    classifier.compile(
        optimizer="rmsprop",
        loss="binary_crossentropy",
        metrics=["accuracy"],
    )
    return classifier

In [None]:
model = get_model()
model.summary()

# Save a checkpoint during training with save_best_only=True.
checkpoint_path = "binary_1gram.keras"
callbacks = [keras.callbacks.ModelCheckpoint(checkpoint_path, save_best_only=True)]

model.fit(
    binary_1gram_train_ds.cache(),
    validation_data=binary_1gram_val_ds.cache(),
    epochs=10,
    callbacks=callbacks,
)

# Reload the saved checkpoint and report its accuracy on the test set
# (index 1 of the evaluate() result).
model = keras.models.load_model(checkpoint_path)
test_metrics = model.evaluate(binary_1gram_test_ds)
print(f"Test acc: {test_metrics[1]:.3f}")