# PySpark Hugging Face Inferencing
Adapted from the Hugging Face T5 documentation: https://huggingface.co/docs/transformers/model_doc/t5

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load the tokenizer and the seq2seq model from the same pretrained checkpoint.
checkpoint = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(checkpoint)
model = T5ForConditionalGeneration.from_pretrained(checkpoint)

# Length limits for source and target sequences.
max_source_length, max_target_length = 512, 128

# Prefix prepended to every input sentence below.
task_prefix = "translate English to German: "

lines = [
    "The house is wonderful",
    "Welcome to NYC",
    "HuggingFace is a company",
]

# One prefixed input string per sentence.
input_sequences = [f"{task_prefix}{sentence}" for sentence in lines]

In [None]:
# Tokenize the whole batch at once, padding every sequence to the longest one.
# `truncation=True` is required for `max_length` to actually be enforced;
# without it the limit is ignored and over-long inputs pass through unchanged.
encoding = tokenizer(
    input_sequences,
    padding="longest",
    max_length=max_source_length,
    truncation=True,
    return_tensors="pt",
)
input_ids = encoding.input_ids

# Pass the attention mask explicitly so padded positions are ignored by the
# encoder instead of relying on `generate` to infer the mask from pad tokens.
outputs = model.generate(input_ids, attention_mask=encoding.attention_mask)

In [None]:
# Decode every generated sequence in the batch, dropping special tokens.
tokenizer.batch_decode(outputs, skip_special_tokens=True)

## PySpark

In [None]:
import os
from pathlib import Path
from torchtext.datasets import IMDB

In [None]:
# Load the test split of the IMDB reviews dataset, then display its length.
data = IMDB(split="test")
len(data)

In [None]:
# Build a nested list of strings for PySpark: one single-element list per review.
# Only the text before the first '.' (roughly the first sentence) is kept;
# the label of each (label, text) pair is discarded.
lines = [[text.partition('.')[0]] for _, text in data]

### Test inference on this data

In [None]:
# Take the first entry of `lines` (a one-element list holding the leading
# sentence of a review) and display it.
text = lines[0]
text

In [None]:
# Tokenize the single sample sentence, reusing the shared `task_prefix`
# instead of repeating the literal so the prefix is defined in one place.
input_ids = tokenizer(task_prefix + text[0], return_tensors="pt").input_ids
input_ids

In [None]:
# Generate output token ids for the single tokenized sample and display them.
output_ids = model.generate(input_ids)
output_ids

In [None]:
# Decode the ids generated for the IMDB sample (`output_ids`), not the stale
# `outputs` left over from the earlier batch translation demo.
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))

### Create PySpark DataFrame

In [None]:
from pyspark.sql.types import *

In [None]:
# Turn the nested list into a DataFrame with a single column named 'lines',
# then display the resulting schema.
df = spark.createDataFrame(lines, schema=['lines'])
df.schema

In [None]:
# Display the first row of the DataFrame as a sanity check.
df.take(1)

### Save the test dataset as parquet files

In [None]:
# Persist the DataFrame as parquet under "imdb_test", replacing any existing output.
df.write.parquet("imdb_test", mode="overwrite")

### Check Arrow memory configuration

In [None]:
# Cap the number of records per Arrow batch.
spark.conf.set("spark.sql.execution.arrow.maxRecordsPerBatch", "512")
# This line will fail if the vectorized reader runs out of memory
# `take(1)` always returns a list of rows, so `len()` really counts rows.
# `head()` with no argument returns a single Row (or None when empty), which
# made the old check measure the column count and raise TypeError on an empty
# DataFrame instead of reporting the assertion message.
assert len(df.take(1)) > 0, "`df` should not be empty"

## Inference using Spark ML Model


In [1]:
import sparkext

In [2]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load the tokenizer and the seq2seq model from the same pretrained checkpoint.
checkpoint = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(checkpoint)
model = T5ForConditionalGeneration.from_pretrained(checkpoint)

In [3]:
# only use first N examples, since this is very slow on CPUs
df = spark.read.parquet("imdb_test").limit(100)

                                                                                

In [4]:
# Wrap the Hugging Face model and tokenizer in sparkext's Model class.
# NOTE(review): assumes `import sparkext` also exposes the `huggingface`
# submodule — confirm against the sparkext package.
my_model = sparkext.huggingface.Model(model, tokenizer)

In [5]:
# Apply the wrapped model to the DataFrame; the result is collected in the next cell.
predictions = my_model.transform(df)



In [6]:
# Bring every prediction row back to the driver and display them.
# The input was limited to 100 rows above, so the result stays small.
predictions.collect()

                                                                                

[Row(prediction='Dieser Film hat in vielen Bereichen ermangelte in vielen'),
 Row(prediction='movie begins with a man who appears to be a sports driver.'),
 Row(prediction='Eigentlich nicht all so viel zu diesem Film'),
 Row(prediction="Ah, another movie with motorcycles, hell's angels posse and Steve"),
 Row(prediction='n'),
 Row(prediction='Recap: A lone swordsman, living in the desert and acting as an'),
 Row(prediction='a project that was audacious, but ended up being a pretentious'),
 Row(prediction="Wong Kar-wai's 1994 movie Ashes of Time"),
 Row(prediction=': During my ventures into foreign cinema, I have taken a liking to'),
 Row(prediction='Die Regie von Wong ist vielleicht die schlimmste in der Filmgeschichte, die ich'),
 Row(prediction='Votre capacité à jouir des Ashes of Time peut dépendre de'),
 Row(prediction='a major Hollywood movie with major stars, stellar budgets, and MPAA t'),
 Row(prediction='is one of those movies where the acting, set location, direction, and effe