In [0]:
pip install sentencepiece 

Python interpreter will be restarted.
Python interpreter will be restarted.


In [0]:
import mlflow
import pandas as pd
import time
from pyspark.sql import functions as F
from pyspark.sql.types import *

# Name of the registered model that the inference step loads from the MLflow Model Registry.
model_name = "m2m100_translation_transformer"

# Start timing things.
start = time.time()

# Language pair for this run: phrases in the 'original' column are translated
# from `sourcelang` into `targetlang`.
sourcelang = "ru"
targetlang = "en"

# Load the Russian/English phrase table and bring it to the driver as a pandas DataFrame.
deltaDF = spark.read.table('kenjohnson_demo.default.train_ruen_df_short').toPandas()

# Drop missing phrases BEFORE numbering the rows. Series.count() only counts
# non-null values, so sizing the id column with it while keeping the nulls would
# silently cut off trailing rows (index alignment) and send the literal text
# "None"/"nan" to the translator.
source_phrases = deltaDF['original'].dropna().astype(str).reset_index(drop=True)
count = len(source_phrases)
display( count )

# One row per phrase: a sequential id plus the text to translate.
df_source = pd.DataFrame({'id': range(count)})
df_source['content'] = source_phrases



7000

In [0]:
#Uncomment this to use only 64 values for debugging purposes.
#df_source = df_source[df_source['id'].isin(df_source['id'].value_counts().head(64).index)]

In [0]:
# Hand the pandas frame to Spark, then redistribute it into as many partitions as
# the cluster's default parallelism so the work is spread over all available
# executors. The result is marked for caching.
target_partitions = spark.sparkContext.defaultParallelism
df_source_spark = (
    spark.createDataFrame(df_source)
    .repartition(target_partitions)
    .cache()
)


In [0]:
# inferencing function we'll distribute as a Pandas UDF
def translation_predictions_function(df):
    """Translate one pandas batch with the registered MLflow translation model.

    Called by ``applyInPandas`` once per group, so the signature must stay at
    exactly one positional argument.

    Args:
        df: pandas DataFrame holding the rows of one group.

    Returns:
        Whatever the loaded model's ``predict`` returns; it is expected to match
        the return schema passed to ``applyInPandas`` (verify against the model).
    """
    # Load the "Staging" version of the model named by the notebook-level `model_name`.
    translation_loaded = mlflow.pyfunc.load_model(f"models:/{model_name}/Staging")
    # Use the notebook-level language settings instead of repeating the literals
    # here, so changing `sourcelang`/`targetlang` actually changes the translation.
    param_dict = {'src_lang': sourcelang, 'target_lang': targetlang, 'batch_size': 8}
    # mlflow.pyfunc.PythonModel enforces a one-argument predict function, so the
    # data and its parameters travel together in a single two-element list.
    model_input = [df, param_dict]
    return translation_loaded.predict(model_input)
  
# Return-value schema required by the Spark pandas function API: the two input
# columns plus the translated text, all nullable.
schema = (
    StructType()
    .add("id", LongType(), True)
    .add("content", StringType(), True)
    .add("translation", StringType(), True)
)
# Start the inference timer before the distributed job is defined.
inferencingStartTime = time.time()

# Group rows by the Spark partition they live in, so each partition is handed to
# the translation function as one pandas DataFrame.
rows_by_partition = df_source_spark.groupBy(F.spark_partition_id().alias("_pid"))
df_source_translation = rows_by_partition.applyInPandas(
    translation_predictions_function, schema
).cache()

# Writing to the "noop" format triggers the job here without storing the output
# anywhere, rather than deferring execution to the display call below.
df_source_translation.write.format("noop").mode("overwrite").save()

# Show the translated rows in the Databricks notebook.
display(df_source_translation)

id,content,translation
1300,"–¢–µ–ø–µ—Ä—å —è –ø–æ–¥–Ω–∏–º–∞—é —Å–≤–æ–π –±–æ–∫–∞–ª, —á—Ç–æ–±—ã –ø–æ–∂–µ–ª–∞—Ç—å –≤–∞–º —Å—á–∞—Å—Ç—å—è; –∏ –∂–µ–ª–∞—é –¥–æ–±—Ä—ã–º –ª—é–¥—è–º –±–µ–∑–æ–ø–∞—Å–Ω–æ–π –∂–∏–∑–Ω–∏.",Now I raise my glass to wish you happiness; and I wish good people a safe life.
483,–ë–µ—Å–ø–µ—á–∞–ª—å–Ω–æ–º—É —Å–æ–Ω —Å–ª–∞–¥–æ–∫.,The safe sleep is sweet.
831,"–ø—Ä–∏ –æ–ø—Ä–æ—Å–µ –¥—Ä—É–≥–∏—Ö –º–æ–¥–µ—Ä–∞—Ç–æ—Ä–æ–≤ –±—ã–ª–æ —Ä–µ—à–µ–Ω–æ –Ω–µ –ø–µ—Ä–µ–¥–∞–≤–∞—Ç—å —Å–∞–± –Ω–∏–∫–æ–º—É, –∞ –µ–≥–æ —Å–¥–µ–ª–∞—Ç—å –ø—Ä–∏–≤–∞—Ç–Ω—ã–º , —á—Ç–æ –∏ —Å–¥–µ–ª–∞–Ω–æ.","During the survey of other moderators, it was decided not to transfer the sab to anyone, but to make it private, what has been done."
1532,–ù–∞ –∫–ª–∞–¥–±–∏—â–µ —Ç–æ–∂–µ –Ω–∏–∫—Ç–æ –Ω–µ –≤—ã—Ö–æ–¥–∏—Ç –Ω–∞ –ø—Ä–æ—Ç–µ—Å—Ç–Ω—ã–µ –∞–∫—Ü–∏–∏,No one goes to the cemetery to protest.
617,–ò—Å–∫—É—Å—Å—Ç–≤–æ —Ä–µ–∫–ª–∞–º—ã,Art of Advertising
1152,–ë–µ–π—Å—è –Ω–µ –±–µ–π—Å—è: –±–µ–∑ —Ä–æ–∫—É —Å–º–µ—Ä—Ç–∏ –Ω–µ –±—É–¥–µ—Ç.,Don‚Äôt be afraid: without the year of death there will be no death.
1455,"–ö–∞–∫–æ–≤—ã —Å–∞–º–∏, —Ç–∞–∫–æ–≤—ã –∏ —Å–∞–Ω–∏.","So are they, so are they, so are they."
1062,> –ò –≤–æ—Ç —Å —Ç–µ—Ö —Å–∞–º—ã—Ö –ø–æ—Ä –µ—Å—Ç—å —É –º–µ–Ω—è –ø—É–Ω–∫—Ç–∏–∫:,‚ÄúI have a point since then:
1981,–ü–æ–∑–¥–Ω–µ–º—É –≥–æ—Å—Ç—é ‚Äî –∫–æ—Å—Ç–∏.,The late guest is the bones.
176,–ó–∞—á–µ–º –†–æ—Å—Å–∏—è —Ç—Ä–∞—Ç–∏—Ç –æ–≥—Ä–æ–º–Ω—ã–µ –¥–µ–Ω—å–≥–∏ –Ω–∞ —Å–æ–¥–µ—Ä–∂–∞–Ω–∏–µ –∑–∞–ø–æ–ª—è—Ä–Ω—ã—Ö –∫–æ–ª–æ–Ω–∏–π?,Why does Russia spend a huge amount of money on the maintenance of zapolar colonies?


In [0]:
# Stop the clock and report how long the inference phase took.
# NOTE(review): the end timestamp is taken when this cell runs, i.e. after the
# previous cell's display() call, so it includes any delay between the two cells.
inferencingEndTime = time.time()
totalInferencingTime = inferencingEndTime - inferencingStartTime
dataRowCount = df_source["id"].count()
# getNumPartitions() reports how many partitions the source DataFrame has, which
# is not a count of cluster nodes, so the message labels it as partitions.
partitionCount = df_source_spark.rdd.getNumPartitions()

print (f"{dataRowCount} Source phrases were translated. Inferencing phase took {totalInferencingTime} seconds across {partitionCount} Spark partitions.")

7000 Source phrases were translated. Inferencing phase took 247.0834527015686 seconds on 64 total nodes.


In [0]:
# Show how many partitions the source Spark DataFrame is split into.
source_partition_total = df_source_spark.rdd.getNumPartitions()
print(source_partition_total)

64
