# PySpark Huggingface Inferencing
## Conditional generation

From: https://huggingface.co/docs/transformers/model_doc/t5

### Using PyTorch

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load the pretrained t5-small tokenizer and the PyTorch model.
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")

# Token-length limits for source and target sequences.
max_source_length = 512
max_target_length = 128

# The task is selected through a text prefix prepended to every input.
task_prefix = "translate English to German: "

lines = [
    "The house is wonderful",
    "Welcome to NYC",
    "HuggingFace is a company",
]

input_sequences = [f"{task_prefix}{sentence}" for sentence in lines]

In [None]:
# Tokenize the prefixed sentences as one padded batch.
# `truncation=True` is needed for `max_length` to actually be enforced.
encoding = tokenizer(
    input_sequences,
    padding="longest",
    max_length=max_source_length,
    truncation=True,
    return_tensors="pt",
)
input_ids = encoding.input_ids

# Pass the attention mask so that padding tokens are ignored during generation,
# and cap the generated length with max_target_length (otherwise unused).
outputs = model.generate(
    input_ids=input_ids,
    attention_mask=encoding.attention_mask,
    max_length=max_target_length,
)

In [None]:
[tokenizer.decode(o, skip_special_tokens=True) for o in outputs]

In [None]:
model.framework

### Using TensorFlow

In [None]:
from transformers import T5Tokenizer, TFT5ForConditionalGeneration

In [None]:
# Load the pretrained t5-small tokenizer and the TensorFlow model.
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = TFT5ForConditionalGeneration.from_pretrained("t5-small")

# Token-length limits for source and target sequences.
max_source_length = 512
max_target_length = 128

# The task is selected through a text prefix prepended to every input.
task_prefix = "translate English to German: "

lines = [
    "The house is wonderful",
    "Welcome to NYC",
    "HuggingFace is a company",
]

input_sequences = [f"{task_prefix}{sentence}" for sentence in lines]

In [None]:
# Tokenize the prefixed sentences as one padded batch.
# `truncation=True` is needed for `max_length` to actually be enforced.
encoding = tokenizer(
    input_sequences,
    padding="longest",
    max_length=max_source_length,
    truncation=True,
    return_tensors="tf",
)
input_ids = encoding.input_ids

# Pass the attention mask so that padding tokens are ignored during generation,
# and cap the generated length with max_target_length (otherwise unused).
outputs = model.generate(
    input_ids=input_ids,
    attention_mask=encoding.attention_mask,
    max_length=max_target_length,
)

In [None]:
[tokenizer.decode(o, skip_special_tokens=True) for o in outputs]

In [None]:
model.framework

## PySpark

In [None]:
import os
from pathlib import Path
from torchtext.datasets import IMDB

In [None]:
# load IMDB reviews (test) dataset
data = IMDB(split='test')
len(data)

In [None]:
# convert to a nested list of strings for pyspark: one single-element row per
# review. The label is discarded and the full review text is kept here.
lines = [[text] for _, text in data]

### Create PySpark DataFrame

In [None]:
from pyspark.sql.types import *

In [None]:
df = spark.createDataFrame(lines, ['lines']).repartition(10)
df.schema

In [None]:
df.take(1)

### Save the test dataset as parquet files

In [None]:
df.write.mode("overwrite").parquet("imdb_test")

### Check arrow memory configuration

In [None]:
spark.conf.set("spark.sql.execution.arrow.maxRecordsPerBatch", "512")
# This line will fail if the vectorized reader runs out of memory
# `df.head(1)` returns a list holding at most one Row, so its length tells us
# whether any row was fetched. `len(df.head())` would instead count the fields
# of a single Row, or raise TypeError when head() returns None for an empty df.
assert len(df.head(1)) > 0, "`df` should not be empty"

## Inference using Spark ML Model
Note: you can restart the kernel and run from this point to simulate running in a different node or environment.

In [None]:
import pandas as pd
import sparkext
from pyspark.sql.functions import col, pandas_udf

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

In [None]:
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")

In [None]:
# only use first N examples, since this is slow
df = spark.read.parquet("imdb_test").limit(100)
df.show(truncate=120)

In [None]:
# only use first sentence and add prefix for conditional generation
def preprocess(text: "Column", prefix: str = "") -> "Column":
    """Return a column expression holding `prefix` + the first sentence of `text`.

    Args:
        text: Spark column (`pyspark.sql.Column`) of strings, e.g. `col("lines")`.
        prefix: Task prefix prepended to every value.

    Returns:
        The result of applying a string-typed pandas UDF to `text`.

    Note:
        "First sentence" means everything before the first "."; a value that
        starts with "." therefore yields just `prefix`.
    """
    # The UDF is defined on each call so that it closes over `prefix`.
    @pandas_udf("string")
    def _preprocess(text: pd.Series) -> pd.Series:
        return pd.Series([prefix + s.split(".")[0] for s in text])

    return _preprocess(text)

In [None]:
# add prefix, only use first 100 rows, since generation takes a while
df1 = df.withColumn("input", preprocess(col("lines"), "Translate English to German: ")).select("input")
df1.show(truncate=120)

In [None]:
# Keyword options forwarded to the sparkext Hugging Face model wrapper.
hf_options = dict(
    max_length=128,
    padding="longest",
    return_tensors="pt",
    truncation=True,
    skip_special_tokens=True,
)

# Read from the "input" column and write to the "translation" column.
my_model = (
    sparkext.huggingface.Model(model, tokenizer, **hf_options)
    .setInputCol("input")
    .setOutputCol("translation")
)

**Note**: creating the model from a string (which loads it via `AutoModel`), as shown below, doesn't work here: `T5ForConditionalGeneration` adds a 
language modeling head on top of the base T5 model, whereas `AutoModel` only loads the base T5 model.
See: https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5ForConditionalGeneration
```
my_model = sparkext.huggingface.Model("t5-small")
```

In [None]:
predictions = my_model.transform(df1)

In [None]:
%%time
predictions.write.mode("overwrite").parquet("imdb_translations")
results = predictions.collect()

In [None]:
results[:5]

## Inference using Spark DL UDF (PyTorch)
Note: you can restart the kernel and run from this point to simulate running in a different node or environment.

In [None]:
import pandas as pd
from pyspark.sql.functions import col, pandas_udf
from sparkext.huggingface import model_udf

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

In [None]:
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")

In [None]:
# only use first N examples, since this is slow
df = spark.read.parquet("imdb_test").limit(100)
df.show(truncate=120)

In [None]:
# only use first sentence and add prefix for conditional generation
def preprocess(text: "Column", prefix: str = "") -> "Column":
    """Return a column expression holding `prefix` + the first sentence of `text`.

    Args:
        text: Spark column (`pyspark.sql.Column`) of strings, e.g. `col("lines")`.
        prefix: Task prefix prepended to every value.

    Returns:
        The result of applying a string-typed pandas UDF to `text`.

    Note:
        "First sentence" means everything before the first "."; a value that
        starts with "." therefore yields just `prefix`.
    """
    # The UDF is defined on each call so that it closes over `prefix`.
    @pandas_udf("string")
    def _preprocess(text: pd.Series) -> pd.Series:
        return pd.Series([prefix + s.split(".")[0] for s in text])

    return _preprocess(text)

In [None]:
# only use first 100 rows, since generation takes a while
df1 = df.withColumn("input", preprocess(col("lines"), "Translate English to German: ")).select("input").limit(100)

In [None]:
df1.show(truncate=120)

In [None]:
# note: default return_type is 'string'
generate = model_udf(model, tokenizer=tokenizer,
                     max_length=128, padding="longest", return_tensors="pt", truncation=True, skip_special_tokens=True)

In [None]:
predictions = df1.withColumn("preds", generate(col("input")))

In [None]:
predictions.show(truncate=60)

In [None]:
%%time
preds = predictions.collect()

In [None]:
# only use first 100 rows, since generation takes a while
df2 = df.withColumn("input", preprocess(col("lines"), "Translate English to French: ")).select("input").limit(100)

In [None]:
df2.show(truncate=120)

In [None]:
predictions = df2.withColumn("preds", generate(col("input")))

In [None]:
predictions.show(truncate=60)

In [None]:
%%time
preds = predictions.collect()

## Inference using Spark DL UDF (TensorFlow)
Note: you can restart the kernel and run from this point to simulate running in a different node or environment.

In [None]:
import pandas as pd
from pyspark.sql.functions import col, pandas_udf
from sparkext.huggingface import model_udf

In [None]:
from transformers import T5Tokenizer, TFT5ForConditionalGeneration

In [None]:
# only use first N examples, since this is slow
df = spark.read.parquet("imdb_test").limit(100)
df.show(truncate=120)

In [None]:
# only use first sentence and add prefix for conditional generation
def preprocess(text: "Column", prefix: str = "") -> "Column":
    """Return a column expression holding `prefix` + the first sentence of `text`.

    Args:
        text: Spark column (`pyspark.sql.Column`) of strings, e.g. `col("lines")`.
        prefix: Task prefix prepended to every value.

    Returns:
        The result of applying a string-typed pandas UDF to `text`.

    Note:
        "First sentence" means everything before the first "."; a value that
        starts with "." therefore yields just `prefix`.
    """
    # The UDF is defined on each call so that it closes over `prefix`.
    @pandas_udf("string")
    def _preprocess(text: pd.Series) -> pd.Series:
        return pd.Series([prefix + s.split(".")[0] for s in text])

    return _preprocess(text)

In [None]:
# only use first 100 rows, since generation takes a while
df1 = df.withColumn("input", preprocess(col("lines"), "Translate English to German: ")).select("input").limit(100)

In [None]:
df1.show(truncate=120)

In [None]:
# Need to use a model_loader since spark doesn't serialize this model correctly
def model_loader(model_id):
    """Load the TensorFlow T5 model and its tokenizer for `model_id`.

    Args:
        model_id: Identifier passed to `from_pretrained` for both objects.

    Returns:
        A `(model, tokenizer)` tuple.
    """
    # Imported inside the function rather than relying on notebook-level imports.
    import transformers

    return (
        transformers.TFT5ForConditionalGeneration.from_pretrained(model_id),
        transformers.T5Tokenizer.from_pretrained(model_id),
    )

In [None]:
# The tokenizer has to be created in this section: when the kernel is restarted
# and the notebook is run from this section, no earlier cell defines
# `tokenizer`, and the call below would raise NameError.
tokenizer = T5Tokenizer.from_pretrained("t5-small")

# note: default return_type for model_udf is 'string'
generate = model_udf("t5-small", tokenizer=tokenizer, model_loader=model_loader,
                     max_length=128, padding="longest", return_tensors="tf", truncation=True, skip_special_tokens=True)

In [None]:
predictions = df1.withColumn("preds", generate(col("input")))

In [None]:
predictions.show(truncate=60)

In [None]:
%%time
preds = predictions.collect()

In [None]:
# only use first 100 rows, since generation takes a while
df2 = df.withColumn("input", preprocess(col("lines"), "Translate English to French: ")).select("input").limit(100)

In [None]:
df2.show(truncate=120)

In [None]:
predictions = df2.withColumn("preds", generate(col("input")))

In [None]:
predictions.show(truncate=60)

In [None]:
%%time
preds = predictions.collect()

## Inference using Spark DL API
Note: you can restart the kernel and run from this point to simulate running in a different node or environment.

In [1]:
import pandas as pd
from pyspark.ml.udf import model_udf
from pyspark.sql.functions import col, pandas_udf, struct
from pyspark.sql.types import StringType

In [2]:
# only use first N examples, since this is slow
df = spark.read.parquet("imdb_test").limit(100)
df.show(truncate=120)

[Stage 1:>                                                          (0 + 1) / 1]

+------------------------------------------------------------------------------------------------------------------------+
|                                                                                                                   lines|
+------------------------------------------------------------------------------------------------------------------------+
|...But not this one! I always wanted to know "what happened" next. We will never know for sure what happened because ...|
|I found myself getting increasingly angry as this movie progressed.<br /><br />Basically, Dr. Crawford (Dennis Hopper...|
|The comparisons between the 1995 version and this are inevitable. Sadly, this version falls far short.<br /><br />The...|
|Doesn't anyone bother to check where this kind of sludge comes from before blathering on about its supposed revelatio...|
|Don't get me wrong, I love the TV series of League Of Gentlemen. It was funny, twisted and completely inspired. I was...|
|Made it through

                                                                                

In [3]:
# only use first sentence and add prefix for conditional generation
def preprocess(text: "Column", prefix: str = "") -> "Column":
    """Return a column expression holding `prefix` + the first sentence of `text`.

    Args:
        text: Spark column (`pyspark.sql.Column`) of strings, e.g. `col("lines")`.
        prefix: Task prefix prepended to every value.

    Returns:
        The result of applying a string-typed pandas UDF to `text`.

    Note:
        "First sentence" means everything before the first "."; a value that
        starts with "." therefore yields just `prefix`.
    """
    # The UDF is defined on each call so that it closes over `prefix`.
    @pandas_udf("string")
    def _preprocess(text: pd.Series) -> pd.Series:
        return pd.Series([prefix + s.split(".")[0] for s in text])

    return _preprocess(text)

In [4]:
# only use first 100 rows, since generation takes a while
df1 = df.withColumn("input", preprocess(col("lines"), "Translate English to German: ")).select("input").limit(100)

In [5]:
df1.show(truncate=120)

[Stage 4:>                                                          (0 + 1) / 1]

+------------------------------------------------------------------------------------------------------------------------+
|                                                                                                                   input|
+------------------------------------------------------------------------------------------------------------------------+
|                                                                                           Translate English to German: |
|                         Translate English to German: I found myself getting increasingly angry as this movie progressed|
|                           Translate English to German: The comparisons between the 1995 version and this are inevitable|
|Translate English to German: Doesn't anyone bother to check where this kind of sludge comes from before blathering on...|
|                            Translate English to German: Don't get me wrong, I love the TV series of League Of Gentlemen|
|           Tran

                                                                                

In [6]:
def model_fn():
    """Load t5-small (TensorFlow) and return a batch prediction function.

    The model and tokenizer are loaded once, when this function is called; the
    returned `predict` closure reuses them for every batch it is given.

    Returns:
        predict: callable taking a 2D numpy array of strings with a single
        column and returning a list of decoded output strings, one per row.
    """
    import numpy as np
    from transformers import TFT5ForConditionalGeneration, T5Tokenizer
    model = TFT5ForConditionalGeneration.from_pretrained("t5-small")
    tokenizer = T5Tokenizer.from_pretrained("t5-small")

    def predict(inputs):
        # convert 2d numpy array of string into flattened python list
        flattened = np.squeeze(inputs, axis=1).tolist()
        # `truncation=True` is needed for `max_length` to actually be enforced.
        encoding = tokenizer(flattened,
                             padding="longest",
                             max_length=128,
                             truncation=True,
                             return_tensors="tf")
        # Pass the attention mask so padding tokens are ignored during generation.
        output_ids = model.generate(input_ids=encoding.input_ids,
                                    attention_mask=encoding.attention_mask)
        return [tokenizer.decode(o, skip_special_tokens=True) for o in output_ids]

    return predict

In [7]:
generate = model_udf(model_fn, 
                     input_shapes=[[-1,1]], 
                     return_type=StringType(), 
                     batch_size=10)

In [8]:
predictions = df1.withColumn("preds", generate(struct("input")))

In [9]:
predictions.show(truncate=60)

[Stage 7:>                                                          (0 + 1) / 1]

+------------------------------------------------------------+------------------------------------------------------------+
|                                                       input|                                                       preds|
+------------------------------------------------------------+------------------------------------------------------------+
|Translate English to German: Hard up, No proper jobs goin...|                              Warum nicht die Kinder mieten?|
|Translate English to German: To be fair, I didn't see a l...|               Ich habe es nicht gesehen, daß ich es gesehen|
|Translate English to German: (This might have a spoiler)<...|                    (Dies könnte einen spoiler haben)br />br|
|Translate English to German: 1st watched 3/17/2002 - 2 ou...|                     1st watched 3/17/2002 - 2 out of 10(Dir|
|Translate English to German: I really didn't like this fi...|Ich habe diesen Film wirklich nicht gefallen!!!! er war l...|
|Transla

                                                                                

In [10]:
%%time
preds = predictions.collect()

[Stage 10:>                                                         (0 + 1) / 1]

CPU times: user 21.2 ms, sys: 0 ns, total: 21.2 ms
Wall time: 30.9 s


                                                                                

In [11]:
# only use first 100 rows, since generation takes a while
df2 = df.withColumn("input", preprocess(col("lines"), "Translate English to French: ")).select("input").limit(100)

In [12]:
df2.show(truncate=120)

+------------------------------------------------------------------------------------------------------------------------+
|                                                                                                                   input|
+------------------------------------------------------------------------------------------------------------------------+
|Translate English to French: Hard up, No proper jobs going down at the pit, why not rent your kids! DIY pimp story wi...|
|                                                Translate English to French: To be fair, I didn't see a lot of this show|
|Translate English to French: (This might have a spoiler)<br /><br />When I first started watching this movie, I thoug...|
|Translate English to French: 1st watched 3/17/2002 - 2 out of 10(Dir-Mario Pinzauti): Silly, sex-filled master & slav...|
|  Translate English to French: I really didn't like this film~!!!! it was boring and didn't interest me that much at all|
|      Translate

In [13]:
predictions = df2.withColumn("preds", generate(struct("input")))

In [14]:
predictions.show(truncate=60)

[Stage 16:>                                                         (0 + 1) / 1]

+------------------------------------------------------------+------------------------------------------------------------+
|                                                       input|                                                       preds|
+------------------------------------------------------------+------------------------------------------------------------+
|                               Translate English to French: |                                                           :|
|Translate English to French: I found myself getting incre...|  Je me suis rendu de plus en plus en colère à mesure que ce|
|Translate English to French: The comparisons between the ...|Les comparaisons entre la version de 1995 et cette versio...|
|Translate English to French: Doesn't anyone bother to che...|          Ne s'agit-il pas de vérifier où viennent ces boues|
|Translate English to French: Don't get me wrong, I love t...|Ne m'oubliez pas, je m'aime la série de télévision de League|
|Transla

                                                                                

In [15]:
%%time
preds = predictions.collect()

[Stage 19:>                                                         (0 + 1) / 1]

CPU times: user 8.08 ms, sys: 6.8 ms, total: 14.9 ms
Wall time: 30.2 s


                                                                                

In [16]:
spark.stop()