# PySpark Huggingface Inferencing
### Text Classification using Pipelines

Based on: https://huggingface.co/docs/transformers/quicktour#pipeline-usage

In [None]:
import pandas as pd
import sparkext

from inspect import signature
from pyspark.sql.functions import col, pandas_udf
from sparkext.huggingface import pipeline_udf
from transformers import pipeline

In [None]:
# Build a Hugging Face text-classification pipeline. No model is named here,
# so the checkpoint is whatever the library selects by default for this task.
pipe = pipeline("text-classification")

In [None]:
# Local sanity check: run the pipeline directly on one review-style text
# before it is used from Spark.
pipe("What can I say that hasn't been said already. I think this place is totally worth the hype.")

In [None]:
# Second local check of the pipeline, this time on a different review text.
pipe("I will not say much about this film, because there is not much to say, because there is not much there to talk about.")

## Inference using Spark ML Model

In [None]:
# only use first sentence of IMDB reviews
@pandas_udf("string")
def first_sentence(text: pd.Series) -> pd.Series:
    """Return the text before the first "." of every value in ``text``.

    Values that contain no "." are returned unchanged. Null entries
    (``None``) are passed through as ``None`` instead of raising
    ``AttributeError`` on ``.split``.
    """
    # maxsplit=1: only the leading piece is kept, so there is no need to
    # split the remainder of the review.
    return pd.Series([None if s is None else s.split(".", 1)[0] for s in text])

# Load the "imdb_test" parquet data, derive "sentence" from the "lines"
# column, and keep only that column for 100 rows.
df = (
    spark.read.parquet("imdb_test")
    .withColumn("sentence", first_sentence(col("lines")))
    .select("sentence")
    .limit(100)
)
df.show(truncate=120)

In [None]:
# Wrap the pipeline as a model that reads the "sentence" column and writes
# its result to the "preds" column.
my_model = (
    sparkext.huggingface.PipelineModel(pipe, return_type="label string, score float")
    .setInputCol("sentence")
    .setOutputCol("preds")
)

In [None]:
# Apply the model to df; selecting "preds.*" expands the fields of the
# "preds" struct into top-level columns next to "sentence".
predictions = my_model.transform(df).select("sentence", "preds.*")

In [None]:
# Print the predictions, truncating long cell values to 80 characters.
predictions.show(truncate=80)

In [None]:
%%time
# collect() brings every prediction row back to the driver; the %%time cell
# magic reports how long the cell took.
preds = predictions.collect()

## Inference using Spark DL UDF

In [None]:
# only use first sentence of IMDB reviews
@pandas_udf("string")
def first_sentence(text: pd.Series) -> pd.Series:
    """Return the text before the first "." of every value in ``text``.

    Values that contain no "." are returned unchanged. Null entries
    (``None``) are passed through as ``None`` instead of raising
    ``AttributeError`` on ``.split``.
    """
    # maxsplit=1: only the leading piece is kept, so there is no need to
    # split the remainder of the review.
    return pd.Series([None if s is None else s.split(".", 1)[0] for s in text])

# Load the "imdb_test" parquet data, derive "sentence" from the "lines"
# column, and keep only that column for 100 rows.
df = (
    spark.read.parquet("imdb_test")
    .withColumn("sentence", first_sentence(col("lines")))
    .select("sentence")
    .limit(100)
)
df.show(truncate=80)

In [None]:
# note: return_type has to be specified manually; it must match the output
# of the pipeline (see the direct pipe(...) calls above)
classify = pipeline_udf(pipe, return_type="label string, score float")

In [None]:
# note: classify produces a struct column ("preds"); selecting "preds.*"
# expands the fields of that struct into top-level columns
predictions = df.withColumn("preds", classify(col("sentence"))).select("sentence", "preds.*")

In [None]:
%%time
# collect() brings every prediction row back to the driver; the %%time cell
# magic reports how long the cell took.
preds = predictions.collect()

In [None]:
# Print the predictions, truncating long cell values to 80 characters.
predictions.show(truncate=80)

### Using model loader

In [None]:
import pandas as pd

from pyspark.sql.functions import col, pandas_udf
from sparkext.huggingface import pipeline_udf

In [None]:
# only use first sentence of IMDB reviews
@pandas_udf("string")
def first_sentence(text: pd.Series) -> pd.Series:
    """Return the text before the first "." of every value in ``text``.

    Values that contain no "." are returned unchanged. Null entries
    (``None``) are passed through as ``None`` instead of raising
    ``AttributeError`` on ``.split``.
    """
    # maxsplit=1: only the leading piece is kept, so there is no need to
    # split the remainder of the review.
    return pd.Series([None if s is None else s.split(".", 1)[0] for s in text])

# Load the "imdb_test" parquet data, derive "sentence" from the "lines"
# column, and keep only that column for 100 rows.
df = (
    spark.read.parquet("imdb_test")
    .withColumn("sentence", first_sentence(col("lines")))
    .select("sentence")
    .limit(100)
)
df.show(truncate=80)

In [None]:
def model_loader(task: str):
    """Create a Hugging Face pipeline for ``task``.

    The pipeline is placed on the current CUDA device when CUDA is
    available; otherwise ``device=-1`` is passed (CPU, per the transformers
    pipeline API).

    torch and transformers are imported inside the function rather than at
    module level -- presumably so the loader is self-contained when it is
    invoked away from the notebook process; confirm against sparkext.
    """
    import torch
    from transformers import pipeline

    if torch.cuda.is_available():
        device_id = torch.cuda.current_device()
    else:
        device_id = -1
    return pipeline(task, device=device_id)

In [None]:
# note: return_type has to be specified manually; it must match the output
# of the pipeline.
# Here the task name is passed instead of a pipeline object, together with
# model_loader -- presumably the loader is called with that task name to
# build the pipeline; confirm against sparkext.
classify = pipeline_udf("text-classification", model_loader=model_loader, return_type="label string, score float")

In [None]:
# Add the classifier output as the struct column "preds", then expand its
# fields into top-level columns with "preds.*".
predictions = df.withColumn("preds", classify(col("sentence"))).select("sentence", "preds.*")

In [None]:
%%time
# collect() brings every prediction row back to the driver; the %%time cell
# magic reports how long the cell took.
preds = predictions.collect()

In [None]:
# Print the predictions, truncating long cell values to 80 characters.
predictions.show(truncate=80)

## Inference using Spark DL API

In [1]:
import pandas as pd
from pyspark.sql.functions import col, struct
from pyspark.ml.udf import model_udf, pandas_udf
from pyspark.sql.types import FloatType, StringType, StructField, StructType

In [2]:
# only use first sentence of IMDB reviews
@pandas_udf("string")
def first_sentence(text: pd.Series) -> pd.Series:
    """Return the text before the first "." of every value in ``text``.

    Values that contain no "." are returned unchanged. Null entries
    (``None``) are passed through as ``None`` instead of raising
    ``AttributeError`` on ``.split``.
    """
    # maxsplit=1: only the leading piece is kept, so there is no need to
    # split the remainder of the review.
    return pd.Series([None if s is None else s.split(".", 1)[0] for s in text])

# Load the "imdb_test" parquet data, derive "sentence" from the "lines"
# column, and keep only that column for 100 rows.
df = (
    spark.read.parquet("imdb_test")
    .withColumn("sentence", first_sentence(col("lines")))
    .select("sentence")
    .limit(100)
)
df.show(truncate=80)

[Stage 1:>                                                          (0 + 1) / 1]

+--------------------------------------------------------------------------------+
|                                                                        sentence|
+--------------------------------------------------------------------------------+
|                                                                                |
|              I found myself getting increasingly angry as this movie progressed|
|                The comparisons between the 1995 version and this are inevitable|
|Doesn't anyone bother to check where this kind of sludge comes from before bl...|
|                 Don't get me wrong, I love the TV series of League Of Gentlemen|
|Made it through the first half an hour and deserved a medal for getting that far|
|This movie seems a little clunky around the edges, like not quite enough zani...|
|                                                           Oh but this is woeful|
|                           Terry Cunningham directs this Sci-Fi Network original|
|Fir

                                                                                

In [3]:
def model_fn():
    """Build a text-classification pipeline and return a batch predict function.

    The imports and the pipeline construction happen inside this function,
    so everything the returned closure needs is created when ``model_fn``
    is called.

    Returns:
        predict: a callable that takes one batch of inputs and returns the
        pipeline output for the flattened list of texts.
    """
    import numpy as np
    from transformers import pipeline

    pipe = pipeline("text-classification")

    def predict(inputs):
        # Flatten the batch into a 1-D list of texts.
        # NOTE(review): assumes inputs is array-like of shape (batch, 1), as
        # declared via input_shapes where this function is registered -- confirm.
        # reshape(-1) is used instead of np.squeeze because squeeze collapses
        # a single-row batch of shape (1, 1) into a 0-d array, whose
        # .tolist() is a bare string rather than a one-element list.
        texts = np.asarray(inputs).reshape(-1).tolist()
        return pipe(texts)

    return predict

In [4]:
# Register model_fn as a UDF whose result is a (label, score) struct.
classify = model_udf(
    model_fn,
    input_shapes=[[-1, 1]],
    return_type=StructType(
        [
            StructField("label", StringType(), True),
            StructField("score", FloatType(), True),
        ]
    ),
    batch_size=10,
)

In [5]:
# note: the input column is wrapped in struct(...) before being passed to
# classify; selecting "preds.*" then expands the fields of the returned
# struct into top-level columns
predictions = df.withColumn("preds", classify(struct("sentence"))).select("sentence", "preds.*")

In [6]:
%%time
# collect() brings every prediction row back to the driver; the %%time cell
# magic reports how long the cell took.
preds = predictions.collect()

[Stage 4:>                                                          (0 + 1) / 1]

CPU times: user 12 ms, sys: 6.26 ms, total: 18.3 ms
Wall time: 15.9 s


                                                                                

In [7]:
# Print the predictions, truncating long cell values to 80 characters.
predictions.show(truncate=80)

[Stage 7:>                                                          (0 + 1) / 1]

+--------------------------------------------------------------------------------+--------+----------+
|                                                                        sentence|   label|     score|
+--------------------------------------------------------------------------------+--------+----------+
|Hard up, No proper jobs going down at the pit, why not rent your kids! DIY pi...|NEGATIVE|0.99967253|
|                                     To be fair, I didn't see a lot of this show|NEGATIVE| 0.9983752|
|(This might have a spoiler)<br /><br />When I first started watching this mov...|POSITIVE|0.99846494|
|1st watched 3/17/2002 - 2 out of 10(Dir-Mario Pinzauti): Silly, sex-filled ma...|NEGATIVE| 0.9949014|
|I really didn't like this film~!!!! it was boring and didn't interest me that...|NEGATIVE|0.99949896|
|A wildly uneven film where the major problem is the uneasy mix of comedy and ...|NEGATIVE| 0.9996619|
|                   I was greatly disappointed by the quality of this doc

                                                                                

In [8]:
# Shut down the Spark session at the end of the notebook.
spark.stop()