# PySpark Hugging Face Inferencing
### Text Classification using Pipelines

Based on: https://huggingface.co/docs/transformers/quicktour#pipeline-usage

In [None]:
import pandas as pd
import sparkext

from inspect import signature
from pyspark.sql.functions import col, pandas_udf
from sparkext.huggingface import pipeline_udf
from transformers import pipeline

In [None]:
pipe = pipeline("text-classification")

In [None]:
pipe("What can I say that hasn't been said already. I think this place is totally worth the hype.")

In [None]:
pipe("I will not say much about this film, because there is not much to say, because there is not much there to talk about.")

## Inference using Spark ML Model

In [None]:
# Reduce every IMDB review to the text that precedes its first period.
@pandas_udf("string")
def first_sentence(text: pd.Series) -> pd.Series:
    """Return, for each review in ``text``, the part before the first "."."""
    leading = [review.partition(".")[0] for review in text]
    return pd.Series(leading)


# Read the "imdb_test" parquet data, derive "sentence" from "lines",
# and keep only that column for the first 100 rows.
df = (
    spark.read.parquet("imdb_test")
    .withColumn("sentence", first_sentence(col("lines")))
    .select("sentence")
    .limit(100)
)
df.show(truncate=120)

In [None]:
# Wrap the pipeline in a sparkext PipelineModel that reads the "sentence"
# column and writes its (label, score) result to "preds".
my_model = (
    sparkext.huggingface.PipelineModel(pipe, return_type="label string, score float")
    .setInputCol("sentence")
    .setOutputCol("preds")
)

In [None]:
predictions = my_model.transform(df).select("sentence", "preds.*")

In [None]:
predictions.show(truncate=80)

In [None]:
%%time
preds = predictions.collect()

## Inference using Spark DL UDF

In [None]:
# Reduce every IMDB review to the text that precedes its first period.
@pandas_udf("string")
def first_sentence(text: pd.Series) -> pd.Series:
    """Return, for each review in ``text``, the part before the first "."."""
    leading = [review.partition(".")[0] for review in text]
    return pd.Series(leading)


# Read the "imdb_test" parquet data, derive "sentence" from "lines",
# and keep only that column for the first 100 rows.
df = (
    spark.read.parquet("imdb_test")
    .withColumn("sentence", first_sentence(col("lines")))
    .select("sentence")
    .limit(100)
)
df.show(truncate=80)

In [None]:
# note: need to manually specify return_type per pipe output above
classify = pipeline_udf(pipe, return_type="label string, score float")

In [None]:
# note: expanding the "struct" return_type to top-level columns
predictions = df.withColumn("preds", classify(col("sentence"))).select("sentence", "preds.*")

In [None]:
%%time
preds = predictions.collect()

In [None]:
predictions.show(truncate=80)

### Using model loader

In [None]:
import pandas as pd

from pyspark.sql.functions import col, pandas_udf
from sparkext.huggingface import pipeline_udf

In [None]:
# Reduce every IMDB review to the text that precedes its first period.
@pandas_udf("string")
def first_sentence(text: pd.Series) -> pd.Series:
    """Return, for each review in ``text``, the part before the first "."."""
    leading = [review.partition(".")[0] for review in text]
    return pd.Series(leading)


# Read the "imdb_test" parquet data, derive "sentence" from "lines",
# and keep only that column for the first 100 rows.
df = (
    spark.read.parquet("imdb_test")
    .withColumn("sentence", first_sentence(col("lines")))
    .select("sentence")
    .limit(100)
)
df.show(truncate=80)

In [None]:
def model_loader(task: str):
    """Create a transformers pipeline for ``task``.

    The pipeline is placed on the current CUDA device when CUDA is
    available; otherwise ``device=-1`` is passed.

    Args:
        task: Pipeline task name forwarded to ``transformers.pipeline``.

    Returns:
        The pipeline object returned by ``transformers.pipeline``.
    """
    # Imported inside the function, as in the other loader functions here.
    import torch
    from transformers import pipeline

    if torch.cuda.is_available():
        device_id = torch.cuda.current_device()
    else:
        device_id = -1

    return pipeline(task, device=device_id)

In [None]:
# note: need to manually specify return_type per pipe output above
classify = pipeline_udf("text-classification", model_loader=model_loader, return_type="label string, score float")

In [None]:
predictions = df.withColumn("preds", classify(col("sentence"))).select("sentence", "preds.*")

In [None]:
%%time
preds = predictions.collect()

In [None]:
predictions.show(truncate=80)

## Inference using Spark DL API

In [1]:
import pandas as pd
from pyspark.sql.functions import col, struct, pandas_udf
from pyspark.ml.functions import predict_batch_udf
from pyspark.sql.types import FloatType, StringType, StructField, StructType

In [2]:
# Reduce every IMDB review to the text that precedes its first period.
@pandas_udf("string")
def first_sentence(text: pd.Series) -> pd.Series:
    """Return, for each review in ``text``, the part before the first "."."""
    leading = [review.partition(".")[0] for review in text]
    return pd.Series(leading)


# Read the "imdb_test" parquet data, derive "sentence" from "lines",
# and keep only that column for the first 100 rows.
df = (
    spark.read.parquet("imdb_test")
    .withColumn("sentence", first_sentence(col("lines")))
    .select("sentence")
    .limit(100)
)
df.show(truncate=80)

[Stage 1:>                                                          (0 + 1) / 1]

+--------------------------------------------------------------------------------+
|                                                                        sentence|
+--------------------------------------------------------------------------------+
|                                                                                |
|              I found myself getting increasingly angry as this movie progressed|
|                The comparisons between the 1995 version and this are inevitable|
|Doesn't anyone bother to check where this kind of sludge comes from before bl...|
|                 Don't get me wrong, I love the TV series of League Of Gentlemen|
|Made it through the first half an hour and deserved a medal for getting that far|
|This movie seems a little clunky around the edges, like not quite enough zani...|
|                                                           Oh but this is woeful|
|                           Terry Cunningham directs this Sci-Fi Network original|
|Fir

                                                                                

In [3]:
def predict_batch_fn():
    """Build the batch prediction function passed to ``predict_batch_udf``.

    A "text-classification" pipeline is created once per call of this
    factory and captured by the returned closure.

    Returns:
        A ``predict(inputs)`` callable that flattens ``inputs`` into a list
        of sentences and returns the pipeline's output for that list.
    """
    import numpy as np
    from transformers import pipeline

    pipe = pipeline("text-classification")

    def predict(inputs):
        # Flatten instead of squeeze: squeezing a one-row batch produces a
        # 0-d array whose tolist() is a bare str rather than a list of
        # sentences, so the pipeline would not receive a batch.
        sentences = np.reshape(inputs, -1).tolist()
        return pipe(sentences)

    return predict

In [4]:
# Batch UDF built from predict_batch_fn; each row's result is declared as a
# struct with a string "label" and a float "score", with batch_size=10.
classify = predict_batch_udf(
    predict_batch_fn,
    return_type=StructType(
        [
            StructField("label", StringType(), True),
            StructField("score", FloatType(), True),
        ]
    ),
    batch_size=10,
)

In [5]:
# note: expanding the "struct" return_type to top-level columns
predictions = df.withColumn("preds", classify(struct("sentence"))).select("sentence", "preds.*")

In [6]:
%%time
preds = predictions.collect()

[Stage 4:>                                                          (0 + 1) / 1]

CPU times: user 15.6 ms, sys: 5.15 ms, total: 20.8 ms
Wall time: 13 s


                                                                                

In [7]:
predictions.show(truncate=80)

[Stage 7:>                                                          (0 + 1) / 1]

+--------------------------------------------------------------------------------+--------+----------+
|                                                                        sentence|   label|     score|
+--------------------------------------------------------------------------------+--------+----------+
|                                                                                |POSITIVE|  0.748121|
|              I found myself getting increasingly angry as this movie progressed|NEGATIVE|0.99845886|
|                The comparisons between the 1995 version and this are inevitable|NEGATIVE| 0.9997198|
|Doesn't anyone bother to check where this kind of sludge comes from before bl...|NEGATIVE| 0.9984042|
|                 Don't get me wrong, I love the TV series of League Of Gentlemen|POSITIVE| 0.9998311|
|Made it through the first half an hour and deserved a medal for getting that far|POSITIVE|0.99915516|
|This movie seems a little clunky around the edges, like not quite enough

                                                                                

### Using Triton server

#### Start Triton Server on each executor

In [8]:
num_executors = 1

nodeRDD = sc.parallelize(list(range(num_executors)), num_executors)

def start_triton(it):
    """Start a Triton server container on this executor unless one exists.

    Written for ``RDD.mapPartitions``; the partition iterator ``it`` is not
    read. If a container whose name matches "spark-triton" is already
    listed, it is reported and nothing is started. Otherwise a
    ``tritonserver`` container is launched with host networking, GPU device
    "0", the model repository mounted read-only at /models and the Hugging
    Face cache mounted at /cache, and the server's gRPC endpoint on
    localhost:8001 is polled until it reports ready or the timeout expires.

    Returns:
        ``[True]`` in every case. A server that never became ready is
        reported with a printed message, not an exception.
    """
    import docker
    import time
    import tritonclient.grpc as grpcclient

    docker_client = docker.from_env()
    containers = docker_client.containers.list(filters={"name": "spark-triton"})
    if containers:
        print(">>>> containers: {}".format([c.short_id for c in containers]))
        return [True]

    container = docker_client.containers.run(
        "nvcr.io/nvidia/tritonserver:22.07-py3", "tritonserver --model-repository=/models",
        detach=True,
        device_requests=[docker.types.DeviceRequest(device_ids=["0"], capabilities=[['gpu']])],
        environment=[
            "TRANSFORMERS_CACHE=/cache"
        ],
        name="spark-triton",
        network_mode="host",
        remove=True,
        shm_size="256M",
        volumes={
            "/home/leey/devpub/leewyang/sparkext/examples/models_hf": {"bind": "/models", "mode": "ro"},
            "/home/leey/huggingface/cache": {"bind": "/cache", "mode": "rw"}
        }
    )
    print(">>>> starting triton: {}".format(container.short_id))
    # wait for triton to be running
    time.sleep(15)

    triton_client = grpcclient.InferenceServerClient("localhost:8001")

    elapsed = 0
    timeout = 120
    ready = False
    last_error = None
    while not ready and elapsed < timeout:
        time.sleep(5)
        elapsed += 5
        try:
            ready = triton_client.is_server_ready()
        except Exception as e:
            # The endpoint may not accept connections yet; keep polling but
            # remember the failure so a timeout can be explained.
            last_error = e

    if not ready:
        print(">>>> triton not ready after {}s of polling (last error: {})".format(elapsed, last_error))

    return [True]

nodeRDD.mapPartitions(start_triton).collect()

                                                                                

[True]

#### Run inference

In [9]:
import pandas as pd
from pyspark.sql.functions import col, struct, pandas_udf
from pyspark.ml.functions import predict_batch_udf
from pyspark.sql.types import FloatType, StringType, StructField, StructType

In [10]:
# Reduce every IMDB review to the text that precedes its first period.
@pandas_udf("string")
def first_sentence(text: pd.Series) -> pd.Series:
    """Return, for each review in ``text``, the part before the first "."."""
    leading = [review.partition(".")[0] for review in text]
    return pd.Series(leading)


# Read the "imdb_test" parquet data, derive "sentence" from "lines",
# and keep only that column for the first 1000 rows.
df = (
    spark.read.parquet("imdb_test")
    .withColumn("sentence", first_sentence(col("lines")))
    .select("sentence")
    .limit(1000)
)
df.show(truncate=80)

+--------------------------------------------------------------------------------+
|                                                                        sentence|
+--------------------------------------------------------------------------------+
|                                                                                |
|              I found myself getting increasingly angry as this movie progressed|
|                The comparisons between the 1995 version and this are inevitable|
|Doesn't anyone bother to check where this kind of sludge comes from before bl...|
|                 Don't get me wrong, I love the TV series of League Of Gentlemen|
|Made it through the first half an hour and deserved a medal for getting that far|
|This movie seems a little clunky around the edges, like not quite enough zani...|
|                                                           Oh but this is woeful|
|                           Terry Cunningham directs this Sci-Fi Network original|
|Fir

In [11]:
def triton_fn(triton_uri, model_name):
    """Create a predict function that forwards batches to a Triton server.

    The model's metadata is fetched once, when this factory runs, and is
    used to name and type the request tensors and to read the outputs.

    Args:
        triton_uri: Address passed to ``grpcclient.InferenceServerClient``.
        model_name: Name of the served model to query and to run.

    Returns:
        A ``predict(inputs)`` callable. ``inputs`` is either a single
        ``np.ndarray`` (sent as the model's first declared input) or a
        mapping from input name to ``np.ndarray``. For a model with one
        output the result is that output as a numpy array; for a model with
        several outputs it is a list of dicts, one per row, keyed by output
        name.
    """
    import numpy as np
    import tritonclient.grpc as grpcclient

    # Triton datatype name -> numpy dtype used to cast request tensors.
    # np.bool_ replaces the np.bool8 alias, which NumPy 2.0 removed.
    np_types = {
        "BOOL": np.dtype(np.bool_),
        "INT8": np.dtype(np.int8),
        "INT16": np.dtype(np.int16),
        "INT32": np.dtype(np.int32),
        "INT64": np.dtype(np.int64),
        "UINT8": np.dtype(np.uint8),
        "UINT16": np.dtype(np.uint16),
        "UINT32": np.dtype(np.uint32),
        "UINT64": np.dtype(np.uint64),
        "FP16": np.dtype(np.float16),
        "FP32": np.dtype(np.float32),
        "FP64": np.dtype(np.float64),
        "BYTES": np.dtype(object),
    }

    client = grpcclient.InferenceServerClient(triton_uri)
    model_meta = client.get_model_metadata(model_name)

    def predict(inputs):
        if isinstance(inputs, np.ndarray):
            # single ndarray input
            meta = model_meta.inputs[0]
            request = [grpcclient.InferInput(meta.name, inputs.shape, meta.datatype)]
            request[0].set_data_from_numpy(inputs.astype(np_types[meta.datatype]))
        else:
            # dict of multiple ndarray inputs; name and datatype are read from
            # the metadata entries because InferInput exposes them as methods,
            # not attributes.
            request = []
            for meta in model_meta.inputs:
                tensor = inputs[meta.name]
                infer_input = grpcclient.InferInput(meta.name, tensor.shape, meta.datatype)
                infer_input.set_data_from_numpy(tensor.astype(np_types[meta.datatype]))
                request.append(infer_input)

        response = client.infer(model_name, inputs=request)

        if len(model_meta.outputs) > 1:
            # convert to rows of dictionaries form
            output_names = [o.name for o in model_meta.outputs]
            columns = [response.as_numpy(name) for name in output_names]
            return [dict(zip(output_names, row)) for row in zip(*columns)]

        # return single numpy array
        return response.as_numpy(model_meta.outputs[0].name)

    return predict

In [12]:
# Batch UDF built from triton_fn, pointed at the "hf_pipeline" model on
# localhost:8001; each row's result is declared as a struct with a string
# "label" and a float "score", with batch_size=100.
classify = predict_batch_udf(
    triton_fn,
    triton_uri="localhost:8001",
    model_name="hf_pipeline",
    return_type=StructType(
        [
            StructField("label", StringType(), True),
            StructField("score", FloatType(), True),
        ]
    ),
    batch_size=100,
)

In [13]:
%%time
# note: collects rows with the nested "preds" struct column; nothing is expanded to top-level columns here
predictions = df.withColumn("preds", classify(struct("sentence"))).collect()

[Stage 13:>                                                         (0 + 1) / 1]

CPU times: user 35.1 ms, sys: 4.09 ms, total: 39.2 ms
Wall time: 5.86 s


                                                                                

In [14]:
df.withColumn("preds", classify(struct("sentence"))).select("sentence", "preds.*").show(n=50, truncate=120)

[Stage 16:>                                                         (0 + 1) / 1]

+------------------------------------------------------------------------------------------------------------------------+--------+----------+
|                                                                                                                sentence|   label|     score|
+------------------------------------------------------------------------------------------------------------------------+--------+----------+
|                                                                                                                        |POSITIVE| 0.7481212|
|                                                      I found myself getting increasingly angry as this movie progressed|NEGATIVE|0.99845886|
|                                                        The comparisons between the 1995 version and this are inevitable|NEGATIVE| 0.9997198|
|Doesn't anyone bother to check where this kind of sludge comes from before blathering on about its supposed revelatio...|NEGATIVE| 0.9984042|

                                                                                

#### Stop Triton Server on each executor

In [15]:
def stop_triton(it):
    """Stop every container on this executor whose name matches "spark-triton".

    Written for ``RDD.mapPartitions``; the partition iterator ``it`` is not
    read. All containers returned by the name filter are stopped, which
    matches the list printed in the log line.

    Returns:
        ``[True]``, whether or not any container was found.
    """
    import docker

    client = docker.from_env()
    containers = client.containers.list(filters={"name": "spark-triton"})
    print(">>>> stopping containers: {}".format([c.short_id for c in containers]))
    for container in containers:
        container.stop(timeout=120)

    return [True]

nodeRDD.mapPartitions(stop_triton).collect()

                                                                                

[True]

In [16]:
spark.stop()