# PySpark Hugging Face Inferencing
### Text Classification using Pipelines

Based on: https://huggingface.co/docs/transformers/quicktour#pipeline-usage

In [None]:
import pandas as pd

from inspect import signature
from pyspark.sql.functions import col, pandas_udf
from sparkext.huggingface import pipeline_udf
from transformers import pipeline

In [None]:
# Build the classifier once on the driver. No model is named here, so the
# library falls back to its default for this task.
pipe = pipeline(task="text-classification")

In [None]:
# Sanity-check the pipeline locally on a single hand-written review.
sample_text = "What can I say that hasn't been said already. I think this place is totally worth the hype."
pipe(sample_text)

In [None]:
# Second local sanity check, this time on a different hand-written review.
sample_text = "I will not say much about this film, because there is not much to say, because there is not much there to talk about."
pipe(sample_text)

## Inference using Spark DL UDF

In [None]:
# only use first sentence of IMDB reviews
@pandas_udf("string")
def first_sentence(text: pd.Series) -> pd.Series:
    """Return the part of each review that precedes its first period.

    Args:
        text: One batch of review strings.

    Returns:
        A new Series, positionally aligned with ``text``, holding the text
        before the first ``"."`` of each entry (the whole entry when it has
        no period). Entries that are not strings (e.g. nulls, which arrive
        as ``None``/NaN) are passed through as ``None`` instead of raising.
    """
    # partition() stops at the first period, so the rest of the review is
    # never split into pieces that would be thrown away.
    return pd.Series(
        [s.partition(".")[0] if isinstance(s, str) else None for s in text]
    )

# Read the reviews, derive a "sentence" column from the raw "lines" column,
# and keep only that derived column.
df = (
    spark.read.parquet("imdb_test")
    .withColumn("sentence", first_sentence(col("lines")))
    .select("sentence")
)
df.take(10)

In [None]:
# The return schema has to be written out by hand; it mirrors the fields of
# the pipeline output shown in the cells above.
classify = pipeline_udf(
    pipe,
    return_type="label string, score float",
)

In [None]:
# Run the UDF over every sentence, then expand the "preds" result into its
# individual fields alongside the sentence.
predictions = (
    df.withColumn("preds", classify(col("sentence")))
    .select("sentence", "preds.*")
)

In [None]:
%%time
# Fetch 20 rows of predictions; the %%time cell magic above (which must stay
# on the first line of the cell) reports how long this takes.
preds = predictions.take(20)

In [None]:
# Display the rows fetched in the previous cell.
preds

### Using a model loader

In [None]:
import pandas as pd

from pyspark.sql.functions import col, pandas_udf
from sparkext.huggingface import pipeline_udf

In [None]:
# only use first sentence of IMDB reviews
@pandas_udf("string")
def first_sentence(text: pd.Series) -> pd.Series:
    """Return the part of each review that precedes its first period.

    Args:
        text: One batch of review strings.

    Returns:
        A new Series, positionally aligned with ``text``, holding the text
        before the first ``"."`` of each entry (the whole entry when it has
        no period). Entries that are not strings (e.g. nulls, which arrive
        as ``None``/NaN) are passed through as ``None`` instead of raising.
    """
    # partition() stops at the first period, so the rest of the review is
    # never split into pieces that would be thrown away.
    return pd.Series(
        [s.partition(".")[0] if isinstance(s, str) else None for s in text]
    )

# Read the reviews, derive a "sentence" column from the raw "lines" column,
# and keep only that derived column.
df = (
    spark.read.parquet("imdb_test")
    .withColumn("sentence", first_sentence(col("lines")))
    .select("sentence")
)
df.take(10)

In [None]:
def model_loader(task: str):
    """Create a transformers pipeline for ``task``, using CUDA when available.

    Args:
        task: Pipeline task name, forwarded unchanged to ``pipeline``.

    Returns:
        The pipeline object built by ``transformers.pipeline``.
    """
    # Imports are kept local to the function -- presumably so the loader is
    # self-contained wherever it ends up being invoked (TODO confirm).
    import torch
    from transformers import pipeline

    if torch.cuda.is_available():
        device_id = torch.cuda.current_device()
    else:
        # No CUDA device: pass -1, the pipeline's CPU device id per the
        # transformers docs.
        device_id = -1
    return pipeline(task, device=device_id)

In [None]:
# The return schema has to be written out by hand; it mirrors the fields of
# the pipeline output shown earlier in the notebook.
classify = pipeline_udf(
    "text-classification",
    model_loader=model_loader,
    return_type="label string, score float",
)

In [None]:
# Run the UDF over every sentence, then expand the "preds" result into its
# individual fields alongside the sentence.
predictions = (
    df.withColumn("preds", classify(col("sentence")))
    .select("sentence", "preds.*")
)

In [None]:
%%time
# Fetch 20 rows of predictions; the %%time cell magic above (which must stay
# on the first line of the cell) reports how long this takes.
preds = predictions.take(20)

In [None]:
# Display the rows fetched in the previous cell.
preds