# PySpark TensorFlow Inference

## Text classification
Based on: https://www.tensorflow.org/tutorials/keras/text_classification

In [None]:
import os
import re
import shutil
import string

import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras import layers, losses

In [None]:
print(tf.__version__)

In [None]:
# Fetch the aclImdb_v1 archive from `url` and untar it; cache_dir="." with an
# empty cache_subdir places the download in the current working directory.
url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"

dataset = tf.keras.utils.get_file(
    "aclImdb_v1", url, untar=True, cache_dir=".", cache_subdir=""
)

# The extracted "aclImdb" folder is looked up next to the path returned by get_file.
dataset_dir = os.path.join(os.path.dirname(dataset), "aclImdb")

In [None]:
os.listdir(dataset_dir)

In [None]:
train_dir = os.path.join(dataset_dir, "train")
os.listdir(train_dir)

In [None]:
sample_file = os.path.join(train_dir, "pos/1181_9.txt")
with open(sample_file) as f:
    print(f.read())

In [None]:
# Remove the "unsup" subfolder of the training directory.
# ignore_errors=True keeps this cell re-runnable: on a second run the folder
# is already gone and a plain rmtree would raise FileNotFoundError.
remove_dir = os.path.join(train_dir, "unsup")
shutil.rmtree(remove_dir, ignore_errors=True)

In [None]:
batch_size = 32
seed = 42

# Read from train_dir (the folder that was inspected and pruned above) rather
# than a separate hard-coded relative path, so both always refer to the same
# location. 80% of the files form the training subset; the seed fixes the split.
raw_train_ds = tf.keras.utils.text_dataset_from_directory(
    train_dir,
    batch_size=batch_size,
    validation_split=0.2,
    subset="training",
    seed=seed,
)

In [None]:
for text_batch, label_batch in raw_train_ds.take(1):
    for i in range(3):
        print("Review", text_batch.numpy()[i])
        print("Label", label_batch.numpy()[i])

In [None]:
print("Label 0 corresponds to", raw_train_ds.class_names[0])
print("Label 1 corresponds to", raw_train_ds.class_names[1])

In [None]:
# Validation subset of the same directory: validation_split and seed match the
# values used for raw_train_ds. train_dir is used instead of a hard-coded
# relative path so the location stays tied to the download directory.
raw_val_ds = tf.keras.utils.text_dataset_from_directory(
    train_dir,
    batch_size=batch_size,
    validation_split=0.2,
    subset="validation",
    seed=seed,
)

In [None]:
# Test split lives in the "test" folder of the extracted dataset; derive it
# from dataset_dir instead of hard-coding a path relative to the working directory.
raw_test_ds = tf.keras.utils.text_dataset_from_directory(
    os.path.join(dataset_dir, "test"), batch_size=batch_size
)

In [None]:
def custom_standardization(input_data):
    """Normalize raw review text before tokenization.

    Applies, in order: lowercasing, replacement of every ``<br />`` with a
    single space, and removal of all characters in ``string.punctuation``.

    Args:
        input_data: string tensor passed in by the TextVectorization layer.

    Returns:
        The cleaned string tensor.
    """
    # Regex character class matching any single ASCII punctuation character.
    punctuation_class = "[%s]" % re.escape(string.punctuation)

    text = tf.strings.lower(input_data)
    text = tf.strings.regex_replace(text, "<br />", " ")
    return tf.strings.regex_replace(text, punctuation_class, "")

In [None]:
# Values passed below as the vocabulary cap (max_tokens) and the fixed
# per-review output length (output_sequence_length).
max_features = 10000
sequence_length = 250

# Text -> integer ids: standardizes with custom_standardization and emits
# "int" output with a fixed output_sequence_length.
vectorize_layer = layers.TextVectorization(
    standardize=custom_standardization,
    max_tokens=max_features,
    output_mode="int",
    output_sequence_length=sequence_length,
)

In [None]:
# Make a text-only dataset (without labels), then call adapt
train_text = raw_train_ds.map(lambda x, y: x)
vectorize_layer.adapt(train_text)

In [None]:
def vectorize_text(text, label):
    """Run ``vectorize_layer`` on ``text`` and pass ``label`` through unchanged.

    Args:
        text: string tensor holding one review or a batch of reviews.
        label: label(s) belonging to ``text``.

    Returns:
        A ``(vectorized_text, label)`` tuple.
    """
    # Append a trailing axis before handing the strings to the layer.
    text_column = tf.expand_dims(text, -1)
    token_ids = vectorize_layer(text_column)
    return token_ids, label

In [None]:
# retrieve a batch (of 32 reviews and labels) from the dataset
text_batch, label_batch = next(iter(raw_train_ds))
first_review, first_label = text_batch[0], label_batch[0]
print("Review", first_review)
print("Label", raw_train_ds.class_names[first_label])
print("Vectorized review", vectorize_text(first_review, first_label))

In [None]:
print("1287 ---> ",vectorize_layer.get_vocabulary()[1287])
print(" 313 ---> ",vectorize_layer.get_vocabulary()[313])
print('Vocabulary size: {}'.format(len(vectorize_layer.get_vocabulary())))

In [None]:
train_ds = raw_train_ds.map(vectorize_text)
val_ds = raw_val_ds.map(vectorize_text)
test_ds = raw_test_ds.map(vectorize_text)

In [None]:
AUTOTUNE = tf.data.AUTOTUNE

train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)
val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)
test_ds = test_ds.cache().prefetch(buffer_size=AUTOTUNE)

In [None]:
embedding_dim = 16

In [None]:
model = tf.keras.Sequential([
  layers.Embedding(max_features + 1, embedding_dim),
  layers.Dropout(0.2),
  layers.GlobalAveragePooling1D(),
  layers.Dropout(0.2),
  layers.Dense(1)])

model.summary()

In [None]:
# The final layer is Dense(1) with no activation, so the model outputs logits:
# the loss uses from_logits=True and accuracy thresholds at 0.0.
# `metrics` is given as a list, the form accepted by every Keras version
# (newer releases reject a bare metric object).
model.compile(loss=losses.BinaryCrossentropy(from_logits=True),
              optimizer='adam',
              metrics=[tf.metrics.BinaryAccuracy(threshold=0.0)])

In [None]:
epochs = 10
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=epochs)

In [None]:
loss, accuracy = model.evaluate(test_ds)

print("Loss: ", loss)
print("Accuracy: ", accuracy)

In [None]:
history_dict = history.history
history_dict.keys()

In [None]:
acc = history_dict['binary_accuracy']
val_acc = history_dict['val_binary_accuracy']
loss = history_dict['loss']
val_loss = history_dict['val_loss']

epochs = range(1, len(acc) + 1)

# "bo" is for "blue dot"
plt.plot(epochs, loss, 'bo', label='Training loss')
# b is for "solid blue line"
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.show()

In [None]:
plt.plot(epochs, acc, 'bo', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')

plt.show()

In [None]:
# Chain the text vectorizer, the trained classifier and a sigmoid into one
# model, so it accepts raw strings and outputs values squashed by the sigmoid.
export_model = tf.keras.Sequential([
  vectorize_layer,
  model,
  layers.Activation('sigmoid')
])

# from_logits=False because the sigmoid appended above is now the last layer.
export_model.compile(
    loss=losses.BinaryCrossentropy(from_logits=False), optimizer="adam", metrics=['accuracy']
)

# Test it with `raw_test_ds`, which yields raw strings
loss, accuracy = export_model.evaluate(raw_test_ds)
print(accuracy)

In [None]:
examples = [
  "The movie was great!",
  "The movie was okay.",
  "The movie was terrible..."
]

export_model.predict(examples)

### Save Model

In [None]:
!rm -rf text_model

In [None]:
export_model.save('text_model')

### Inspect saved model

In [None]:
!tree text_model

In [None]:
!saved_model_cli show --dir text_model --tag_set serve --signature_def serving_default

### Load model

In [None]:
# register callables as custom objects before loading
custom_objects = {"vectorize_layer": vectorize_layer, "custom_standardization": custom_standardization}
with tf.keras.utils.custom_object_scope(custom_objects):
    new_model = tf.keras.models.load_model('text_model')

new_model.summary()

### Predict

In [None]:
new_model.predict(examples)

## PySpark

## Inference using Spark ML Model
Note: you can restart the kernel and run from this point to simulate running in a different node or environment.

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import re
import sparkext
import string
import tensorflow as tf

from pathlib import Path
from tensorflow.keras import layers

In [None]:
# note: using huggingface IMDB parquet for now, since conversion above isn't working
dataset = Path("../huggingface/imdb_test").absolute().as_posix()
df = spark.read.parquet(dataset).repartition(10)

In [None]:
df.show(truncate=120)

In [None]:
def model_loader(path: str):
    """Load the saved Keras text model at ``path`` with its custom objects registered.

    All imports are local so the function is self-contained when it is handed
    to ``sparkext.tensorflow.Model`` and invoked away from the notebook kernel.

    Args:
        path: location of the saved model directory.

    Returns:
        The model returned by ``tf.keras.models.load_model``.
    """
    import re
    import string
    import tensorflow as tf
    from tensorflow.keras import layers

    def custom_standardization(input_data):
        """Lowercase, replace ``<br />`` with a space, and strip punctuation."""
        punctuation_class = "[%s]" % re.escape(string.punctuation)
        text = tf.strings.lower(input_data)
        text = tf.strings.regex_replace(text, "<br />", " ")
        return tf.strings.regex_replace(text, punctuation_class, "")

    # Layer built with the same arguments as the one used when the model was
    # created; it is only registered as a custom object for loading.
    vectorize_layer = layers.TextVectorization(
        standardize=custom_standardization,
        max_tokens=10000,
        output_mode="int",
        output_sequence_length=250,
    )

    with tf.keras.utils.custom_object_scope(
        {
            "vectorize_layer": vectorize_layer,
            "custom_standardization": custom_standardization,
        }
    ):
        return tf.keras.models.load_model(path)

In [None]:
# Note: must use absolute path to model, since executors have their own current working directories
model_path = Path("text_model").absolute().as_posix()
print(f"model: {model_path}")

model = sparkext.tensorflow.Model(model_path, model_loader) \
                .setInputCol("lines") \
                .setOutputCol("preds")

In [None]:
predictions = model.transform(df)

In [None]:
%%time
preds = predictions.collect()

In [None]:
predictions.show(truncate=80)

## Inference using Spark DL UDF
Note: you can restart the kernel and run from this point to simulate running in a different node or environment.

In [None]:
from pyspark.sql.functions import col
from sparkext.tensorflow import model_udf
from pathlib import Path

In [None]:
# note: using huggingface IMDB parquet for now, since conversion above isn't working
dataset = Path("../huggingface/imdb_test").absolute().as_posix()
df = spark.read.parquet(dataset).repartition(10)

In [None]:
df.show(truncate=120)

In [None]:
def model_loader(path: str):
    """Return the Keras model stored at ``path``, loaded inside a custom-object scope.

    Args:
        path: location of the saved model directory.

    Returns:
        The model returned by ``tf.keras.models.load_model``.
    """
    # since this function runs on the executor, any required imports should be added inside the function.
    import re
    import string
    import tensorflow as tf
    from tensorflow.keras import layers

    vocab_limit = 10000
    tokens_per_review = 250

    def custom_standardization(input_data):
        """Lowercase the text, swap ``<br />`` for a space, drop punctuation."""
        without_breaks = tf.strings.regex_replace(
            tf.strings.lower(input_data), "<br />", " "
        )
        strip_pattern = "[%s]" % re.escape(string.punctuation)
        return tf.strings.regex_replace(without_breaks, strip_pattern, "")

    vectorize_layer = layers.TextVectorization(
        standardize=custom_standardization,
        max_tokens=vocab_limit,
        output_mode="int",
        output_sequence_length=tokens_per_review,
    )

    # Names the saved model may refer to, made resolvable while loading.
    registered = {}
    registered["vectorize_layer"] = vectorize_layer
    registered["custom_standardization"] = custom_standardization

    scope = tf.keras.utils.custom_object_scope(registered)
    with scope:
        loaded = tf.keras.models.load_model(path)
    return loaded

In [None]:
# get absolute path to model, since executors have their own current working directories
model_path = Path("text_model").absolute().as_posix()

In [None]:
classify = model_udf(model_path, model_loader)

In [None]:
predictions = df.withColumn("preds", classify(col("lines")))

In [None]:
%%time
results = predictions.collect()

In [None]:
predictions.show(truncate=80)

## Inference using MLflow UDF
Note: you can restart the kernel and run from this point to simulate running in a different node or environment.

In [None]:
import mlflow
from pyspark.sql.functions import col
from pathlib import Path

In [None]:
# note: using huggingface IMDB parquet for now, since conversion above isn't working
dataset = Path("../huggingface/imdb_test").absolute().as_posix()
df = spark.read.parquet(dataset).repartition(10)

In [None]:
df.show(truncate=120)

In [None]:
# get absolute path to model, since executors have their own current working directories
model_path = Path("text_model").absolute().as_posix()

In [None]:
import re
import string
import tensorflow as tf
from tensorflow.keras import layers

def custom_standardization(input_data):
    """Clean review text: lowercase it, turn ``<br />`` into a space, strip punctuation.

    Args:
        input_data: string tensor of raw text.

    Returns:
        String tensor with the three transformations applied in that order.
    """
    cleaned = tf.strings.lower(input_data)
    cleaned = tf.strings.regex_replace(cleaned, "<br />", " ")
    # Character class covering every character in string.punctuation.
    pattern = "[%s]" % re.escape(string.punctuation)
    return tf.strings.regex_replace(cleaned, pattern, "")

max_features = 10000
sequence_length = 250

vectorize_layer = layers.TextVectorization(
    standardize=custom_standardization,
    max_tokens=max_features,
    output_mode="int",
    output_sequence_length=sequence_length,
)

custom_objects = {"vectorize_layer": vectorize_layer, "custom_standardization": custom_standardization}
with tf.keras.utils.custom_object_scope(custom_objects):
    model = tf.keras.models.load_model(model_path)

In [None]:
import numpy as np
examples = np.array([
  "The movie was great!",
  "The movie was okay.",
  "The movie was terrible."
])

model.predict(examples)

In [None]:
from mlflow.models.signature import infer_signature, ModelSignature
from mlflow.types.schema import Schema, TensorSpec

In [None]:
signature = infer_signature(examples, model.predict(examples))
signature

In [None]:
# Explicit tensor signature: a 1-D batch of strings in, a 1-D batch of float32 out.
# np.str was only an alias of the builtin str and has been removed from NumPy;
# np.dtype(str) produces the same dtype and works on every NumPy version.
input_schema = Schema([TensorSpec(np.dtype(str), (-1,), "input")])
output_schema = Schema([TensorSpec(np.dtype(np.float32), (-1,), "output")])
signature = ModelSignature(inputs=input_schema, outputs=output_schema)
signature

In [None]:
import shutil

In [None]:
# Clear any previous export so save_model can write to a fresh directory.
# ignore_errors=True makes this a no-op on the first run, when the directory
# does not exist yet (same semantics as the `rm -rf` used for text_model).
shutil.rmtree("text_model_mlflow", ignore_errors=True)

In [None]:
mlflow.tensorflow.save_model(tf_saved_model_dir=model_path,
                             tf_meta_graph_tags=["serve"],
                             tf_signature_def_key="serving_default",
                             signature=signature,
                             path="text_model_mlflow")

In [None]:
model_infer = mlflow.pyfunc.load_model("text_model_mlflow")

In [None]:
print(model_infer.metadata)

In [None]:
model_infer.predict({"input": examples})

In [None]:
# `classify` is not defined anywhere in this section, so running it from a
# fresh kernel (as the section note suggests) raised NameError. Build the UDF
# from the MLflow model saved above.
# NOTE(review): confirm that result_type="float" matches the model's output
# layout for this tensor-based signature.
classify = mlflow.pyfunc.spark_udf(spark, model_uri="text_model_mlflow", result_type="float")
predictions = df.withColumn("preds", classify(col("lines")))

In [None]:
%%time
results = predictions.collect()

In [None]:
predictions.show(truncate=80)

## Inference using Spark DL API
Note: you can restart the kernel and run from this point to simulate running in a different node or environment.

In [1]:
from pyspark.ml.functions import predict_batch_udf
from pyspark.sql.functions import struct, col
from pyspark.sql.types import ArrayType, FloatType

In [2]:
# note: using huggingface IMDB parquet for now, since conversion above isn't working
# Resolve the dataset relative to the notebook directory (as the other
# sections do) instead of hard-coding one user's home directory.
from pathlib import Path

dataset = Path("../huggingface/imdb_test").absolute().as_posix()
df = spark.read.parquet(dataset).repartition(1)

                                                                                

In [3]:
df.show(truncate=100)

[Stage 1:>                                                         (0 + 8) / 10]

+----------------------------------------------------------------------------------------------------+
|                                                                                               lines|
+----------------------------------------------------------------------------------------------------+
|...But not this one! I always wanted to know "what happened" next. We will never know for sure wh...|
|I found myself getting increasingly angry as this movie progressed.<br /><br />Basically, Dr. Cra...|
|The comparisons between the 1995 version and this are inevitable. Sadly, this version falls far s...|
|Doesn't anyone bother to check where this kind of sludge comes from before blathering on about it...|
|Don't get me wrong, I love the TV series of League Of Gentlemen. It was funny, twisted and comple...|
|Made it through the first half an hour and deserved a medal for getting that far. Lots of excuses...|
|This movie seems a little clunky around the edges, like not quite enough

                                                                                

In [4]:
from pathlib import Path

# Resolve the model location on the driver. Executors have their own working
# directories, so the function below must capture an absolute path; deriving
# it here replaces a hard-coded, user-specific home directory.
model_path = Path("text_model").absolute().as_posix()


def predict_batch_fn():
    """Load the saved text model and return a batch prediction function.

    Takes no arguments and is passed to ``predict_batch_udf``.

    Returns:
        A ``predict(inputs)`` callable that forwards ``inputs`` to
        ``model.predict`` and returns its result.
    """
    # since this function runs on the executor, any required imports should be added inside the function.
    import re
    import string
    import tensorflow as tf
    from tensorflow.keras import layers

    def custom_standardization(input_data):
        """Lowercase, replace ``<br />`` with a space, and strip punctuation."""
        lowercase = tf.strings.lower(input_data)
        stripped_html = tf.strings.regex_replace(lowercase, "<br />", " ")
        return tf.strings.regex_replace(
            stripped_html, "[%s]" % re.escape(string.punctuation), ""
        )

    max_features = 10000
    sequence_length = 250

    vectorize_layer = layers.TextVectorization(
        standardize=custom_standardization,
        max_tokens=max_features,
        output_mode="int",
        output_sequence_length=sequence_length,
    )

    # Register the custom callables so load_model can resolve them by name.
    custom_objects = {"vectorize_layer": vectorize_layer,
                      "custom_standardization": custom_standardization}
    with tf.keras.utils.custom_object_scope(custom_objects):
        model = tf.keras.models.load_model(model_path)

    def predict(inputs):
        return model.predict(inputs)

    return predict

In [5]:
classify = predict_batch_udf(predict_batch_fn,
                             return_type=FloatType(),
                             batch_size=256)

In [6]:
%%time
predictions = df.withColumn("preds", classify(struct("lines")))

CPU times: user 10.3 ms, sys: 4.07 ms, total: 14.3 ms
Wall time: 80.2 ms


In [7]:
%%time
predictions = df.withColumn("preds", classify("lines"))

CPU times: user 2.56 ms, sys: 1.02 ms, total: 3.58 ms
Wall time: 8.76 ms


In [8]:
%%time
predictions = df.withColumn("preds", classify(col("lines")))

CPU times: user 2.87 ms, sys: 478 µs, total: 3.35 ms
Wall time: 14.1 ms


In [9]:
%%time
results = predictions.collect()

                                                                                

CPU times: user 165 ms, sys: 31.6 ms, total: 197 ms
Wall time: 14.5 s


In [10]:
predictions.show(truncate=80)

[Stage 9:>                                                          (0 + 1) / 1]

+--------------------------------------------------------------------------------+------------+
|                                                                           lines|       preds|
+--------------------------------------------------------------------------------+------------+
|...But not this one! I always wanted to know "what happened" next. We will ne...|   0.5685876|
|I found myself getting increasingly angry as this movie progressed.<br /><br ...|  0.19131866|
|The comparisons between the 1995 version and this are inevitable. Sadly, this...|  0.08580024|
|Doesn't anyone bother to check where this kind of sludge comes from before bl...|0.0034280755|
|Don't get me wrong, I love the TV series of League Of Gentlemen. It was funny...|3.9646143E-4|
|Made it through the first half an hour and deserved a medal for getting that ...|  0.09948175|
|This movie seems a little clunky around the edges, like not quite enough zani...|  0.19058977|
|Oh but this is woeful. One good actor a

                                                                                

In [11]:
spark.stop()