# PySpark Hugging Face Inferencing
### Sentence Transformers

From: https://huggingface.co/sentence-transformers

In [None]:
from sentence_transformers import SentenceTransformer
# Load the 'paraphrase-MiniLM-L6-v2' sentence-embedding model by name.
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Sentences we want to encode: a list, here holding a single example sentence.
sentence = ['This framework generates embeddings for each input sentence']


# Sentences are encoded by calling model.encode() on the list of sentences.
embedding = model.encode(sentence)

In [None]:
# Display the value returned by model.encode() for the example sentence.
embedding

## PySpark

## Inference using Spark ML Model
Note: you can restart the kernel and run from this point to simulate running on a different node or in a different environment.

In [None]:
import sparkext

In [None]:
from sentence_transformers import SentenceTransformer

# Re-create the model here so this section can be run on its own after a
# kernel restart.
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

In [None]:
# only use first N examples (N = 100 here), since this is slow
# NOTE(review): `spark` is not defined in this notebook; it is assumed to be a
# SparkSession supplied by the notebook environment -- confirm.
df = spark.read.parquet("imdb_test").limit(100)

In [None]:
# Preview the rows loaded from the "imdb_test" parquet data.
df.show()

In [None]:
# Wrap the sentence-transformers model in sparkext's SentenceTransformerModel,
# configured with "lines" as the input column and "embedding" as the output
# column.
my_model = sparkext.huggingface.SentenceTransformerModel(model) \
                .setInputCol("lines") \
                .setOutputCol("embedding")

In [None]:
# Apply the model to the DataFrame. Spark transformations are lazy, so
# presumably no inference runs until an action (e.g. collect) is called.
embeddings = my_model.transform(df)

In [None]:
%%time
# collect() is the action that executes the job and returns the rows to the
# driver; the %%time cell magic reports how long this cell takes.
results = embeddings.collect()

In [None]:
# Inspect the first collected row.
results[0]

## Inference using Spark DL UDF
Note: you can restart the kernel and run from this point to simulate running on a different node or in a different environment.

### Using model instance on driver

In [None]:
from pyspark.sql.functions import col
from sparkext.huggingface import sentence_transformer_udf

In [None]:
from sentence_transformers import SentenceTransformer

# Create the model instance on the driver; it is passed to
# sentence_transformer_udf in a later cell.
model = SentenceTransformer("paraphrase-MiniLM-L6-v2")

In [None]:
# only use first N examples (N = 200 here), since this is slow
# NOTE(review): `spark` is assumed to be a SparkSession supplied by the
# notebook environment -- confirm.
df = spark.read.parquet("imdb_test").limit(200)

In [None]:
# Inspect the DataFrame's schema.
df.schema

In [None]:
# Build the UDF from the model instance created on the driver.
encode = sentence_transformer_udf(model)

In [None]:
# Add an "encoding" column by applying the UDF to the "lines" column.
embeddings = df.withColumn("encoding", encode(col("lines")))

In [None]:
%%time
# collect() is the action that executes the job and returns the rows to the
# driver; the %%time cell magic reports how long this cell takes.
results = embeddings.collect()

In [None]:
# Inspect the first collected row.
results[0]

### Using model_id string on driver

In [None]:
from pyspark.sql.functions import col
from sparkext.huggingface import sentence_transformer_udf

In [None]:
# only use first N examples (N = 200 here), since this is slow
# NOTE(review): `spark` is assumed to be a SparkSession supplied by the
# notebook environment -- confirm.
df = spark.read.parquet("imdb_test").limit(200)

In [None]:
# Build the UDF from a model identifier string rather than a model instance.
encode = sentence_transformer_udf("paraphrase-MiniLM-L6-v2")

In [None]:
# Add an "encoding" column by applying the UDF to the "lines" column.
embeddings = df.withColumn("encoding", encode(col("lines")))

In [None]:
%%time
# collect() is the action that executes the job and returns the rows to the
# driver; the %%time cell magic reports how long this cell takes.
results = embeddings.collect()

In [None]:
# Inspect the first collected row.
results[0]

### Using model loader

In [None]:
from pyspark.sql.functions import col
from sparkext.huggingface import sentence_transformer_udf

In [None]:
# only use first N examples (N = 200 here), since this is slow
# NOTE(review): `spark` is assumed to be a SparkSession supplied by the
# notebook environment -- confirm.
df = spark.read.parquet("imdb_test").limit(200)

In [None]:
def model_loader(model_name):
    """Build a SentenceTransformer for the given model name.

    The import lives inside the function, so ``sentence_transformers`` is
    only resolved when the loader is actually called.

    Args:
        model_name: Model identifier passed straight to SentenceTransformer.

    Returns:
        The SentenceTransformer instance created for ``model_name``.
    """
    import sentence_transformers

    transformer = sentence_transformers.SentenceTransformer(model_name)
    return transformer

In [None]:
# Build the UDF from the model name plus a custom loader function.
# NOTE(review): presumably the UDF calls model_loader with this model name to
# construct the model -- confirm against sparkext.
encode = sentence_transformer_udf("paraphrase-MiniLM-L6-v2", model_loader=model_loader)

In [None]:
# Add an "encoding" column by applying the UDF to the "lines" column.
embeddings = df.withColumn("encoding", encode(col("lines")))

In [None]:
%%time
# collect() is the action that executes the job and returns the rows to the
# driver; the %%time cell magic reports how long this cell takes.
results = embeddings.collect()

In [None]:
# Inspect the first collected row.
results[0]