In [0]:
pip install sentencepiece 

Python interpreter will be restarted.
Python interpreter will be restarted.


In [0]:
import mlflow
import pandas as pd
import time
from pyspark.sql import functions as F
from pyspark.sql.types import *

# identify the model we'll pull from the model registry
model_name = "m2m100_translation_transformer" 


deltaDF = spark.read.table('kenjohnson_demo.default.english_phrases_and_sayings')
deltaDF = deltaDF.toPandas()
count = deltaDF['explanation'].count()
df_source = pd.DataFrame({'id':[_ for _ in range(count)]})
df_source['content'] = deltaDF['explanation'].astype(str)



In [0]:
#Uncomment this to use only 64 values for debugging purposes.
#df_source = df_source[df_source['id'].isin(df_source['id'].value_counts().head(64).index)]

In [0]:
# convert pandas dataframe to Spark dataframe, and force Spark to partition the dataframe across all available executors
df_source_spark = spark.createDataFrame(df_source).repartition(spark.sparkContext.defaultParallelism).cache()

In [0]:
# inferencing function we'll distribute as a Pandas UDF
def translation_predictions_function(df):
    translation_loaded = mlflow.pyfunc.load_model(f"models:/{model_name}/Staging")
    ##mlflow.pyfunc.PythonModel enforces a one argument predict function so we use a tuple to send in our src and target languages
    param_dict = {'src_lang': 'en', 'target_lang': 'pt', 'batch_size': 5}
    model_input = ([df, param_dict])
    return translation_loaded.predict(model_input)
    
# the Spark Pandas function API requires a return value schema
schema = StructType(
    [
      StructField("id", LongType(), True),
      StructField("content", StringType(), True),
      StructField("translation", StringType(), True)
    ]
)

inferencingStartTime = time.time()
# actual translation inference on the Spark dataframe
df_source_translation = (
    df_source_spark\
    .groupBy(F.spark_partition_id().alias("_pid"))\
    .applyInPandas(translation_predictions_function, schema)
).cache()
#df_source_translation.write.mode("overwrite").format("noop").save()

# viewing the results dataframe in a Databricks notebook
display(df_source_translation)

id,content,translation
0,The proverb 'A bird in the hand is worth two in the bush' means that it's,O proverbio 'um pássaro na mão vale dois no buraco' significa que é
1570,,Nenhuma
1586,,Nenhuma
1608,,Nenhuma
1561,,Nenhuma
1573,,Nenhuma
1593,,Nenhuma
1552,,Nenhuma
1575,,Nenhuma
1602,,Nenhuma


In [0]:
inferencingEndTime = time.time()
totalInferencingTime = inferencingEndTime - inferencingStartTime
dataRowCount = df_source["id"].count()

print (f"{dataRowCount} Source phrases were translated. Inferencing phase took {totalInferencingTime} seconds on {df_source_spark.rdd.getNumPartitions()} total nodes.")

In [0]:
print(df_source_spark.rdd.getNumPartitions())