In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [None]:
batch_size = 32

# The training and validation subsets are both carved out of "aclImdb/train".
# The two calls are identical except for `subset`; sharing validation_split
# and seed keeps them on the same 80/20 partition of the files.
raw_train_ds, raw_val_ds = (
    keras.utils.text_dataset_from_directory(
        "aclImdb/train",
        batch_size=batch_size,
        validation_split=0.2,
        subset=split_name,
        seed=1337,
    )
    for split_name in ("training", "validation")
)

# The test set lives in its own directory and is loaded without any split.
raw_test_ds = keras.utils.text_dataset_from_directory(
    "aclImdb/test",
    batch_size=batch_size,
)

In [None]:
# Vocabulary cap for the vectorizer; get_model() uses the same value as its
# input width, so the two stay in sync through this one constant.
max_features = 20000

text_vectorization = layers.TextVectorization(
    output_mode="tf_idf",     # TF-IDF weighted vector per text
    ngrams=2,                 # unigrams + bigrams
    max_tokens=max_features,
)

In [None]:
# adapt() is fed text only, so drop the label from every (text, label) batch.
# Only the training split is used to fit the vectorizer's state.
text_only_train_ds = raw_train_ds.map(lambda text, label: text)
text_vectorization.adapt(text_only_train_ds)

In [None]:
def _vectorize_batch(text, label):
    """Vectorize a batch of raw strings and pass the labels through unchanged."""
    return text_vectorization(text), label


# NOTE(review): despite the `binary_` prefix, these datasets carry whatever
# `text_vectorization` emits (configured with output_mode="tf_idf").
# The names are kept because later cells reference them.
binary_tfidf_train_ds = raw_train_ds.map(_vectorize_batch)
binary_tfidf_val_ds = raw_val_ds.map(_vectorize_batch)
binary_tfidf_test_ds = raw_test_ds.map(_vectorize_batch)

In [None]:
def get_model(max_tokens=None, hidden_dim=16):
    """Build an uncompiled dense binary classifier over fixed-width feature vectors.

    Args:
        max_tokens: Width of the input vector. When omitted, the notebook-level
            `max_features` is read at call time, so a bare `get_model()` keeps
            matching the vectorizer's vocabulary cap.
        hidden_dim: Number of units in the hidden ReLU layer.

    Returns:
        A `keras.Model` mapping `(max_tokens,)` inputs to a single sigmoid
        output. The caller is responsible for compiling it.
    """
    if max_tokens is None:
        # Resolve lazily rather than as a default argument so that changing
        # `max_features` and re-running only that cell is still picked up.
        max_tokens = max_features
    inputs = keras.Input(shape=(max_tokens,))
    x = layers.Dense(hidden_dim, activation="relu")(inputs)
    x = layers.Dropout(0.5)(x)
    outputs = layers.Dense(1, activation="sigmoid")(x)
    return keras.Model(inputs, outputs)

# Build a fresh network and configure it for binary classification: the
# sigmoid output from get_model() is paired with binary cross-entropy,
# and accuracy is tracked as the reported metric.
model = get_model()
model.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)
model.summary()

In [None]:
# NOTE(review): POSIX-style temp location; adjust on platforms without /tmp.
checkpoint_filepath = "/tmp/checkpoint_tfidf.weights.h5"

# Persist weights only, and only when validation accuracy improves, so the
# file at `checkpoint_filepath` always holds the best epoch seen so far.
model_checkpoint_callback = keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    monitor="val_accuracy",
    mode="max",
    save_best_only=True,
    save_weights_only=True,
)

# cache() is applied to both datasets so the vectorized batches can be reused
# across the three epochs instead of being recomputed each time.
model.fit(
    binary_tfidf_train_ds.cache(),
    epochs=3,
    validation_data=binary_tfidf_val_ds.cache(),
    callbacks=[model_checkpoint_callback],
)

In [None]:
# Restore the weights written by the checkpoint callback before scoring, so
# the test number reflects the best-validation epoch rather than the last one.
model.load_weights(checkpoint_filepath)

# evaluate() returns the loss followed by the compiled metrics; with the
# single "accuracy" metric configured at compile time, index 1 is accuracy.
test_metrics = model.evaluate(binary_tfidf_test_ds)
print(f"Test acc: {test_metrics[1]:.3f}")