In [0]:
pip install sentencepiece 

Python interpreter will be restarted.
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
Installing collected packages: sentencepiece
Successfully installed sentencepiece-0.1.97
Python interpreter will be restarted.


In [0]:
#import os
#os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:516"

In [0]:
import mlflow
import pandas as pd
import time
from pyspark.sql import functions as F
from pyspark.sql.types import *

# identify the model we'll pull from the model registry
model_name = "m2m100_1B_translation_transformer" 

#start timing things 
start = time.time()

# pashto and english phrases the 'MEANING' column is in pashto, lets load that and translate it 
pashtoDF = spark.read.table('kenjohnson_demo.default.pashto_parallel_corpus')
pashtoDF = pashtoDF.toPandas()
count = pashtoDF['MEANING'].count()
display( count )
sourcelang = "ur" #urdu 
targetlang = "en" #english 
df_source = pd.DataFrame({'id':[_ for _ in range(count)]})
df_source['content'] = pashtoDF['MEANING'].astype(str)



29826

In [0]:
#Uncomment this to use only 64 values for debugging purposes.
#df_source = df_source[df_source['id'].isin(df_source['id'].value_counts().head(64).index)]

In [0]:
# convert pandas dataframe to Spark dataframe, and force Spark to partition the dataframe across all available executors
df_source_spark = spark.createDataFrame(df_source).repartition(spark.sparkContext.defaultParallelism).cache()

In [0]:
# inferencing function we'll distribute as a Pandas UDF
def translation_predictions_function(df):
    translation_loaded = mlflow.pyfunc.load_model(f"models:/{model_name}/Production")
    ##mlflow.pyfunc.PythonModel enforces a one argument predict function so we use a tuple to send in our params
    param_dict = {'src_lang': 'ur', 'target_lang': 'en', 'batch_size': 4}
    model_input = ([df, param_dict])
    return translation_loaded.predict(model_input)
  
# the Spark Pandas function API requires a return value schema
schema = StructType(
    [
      StructField("id", LongType(), True),
      StructField("content", StringType(), True),
      StructField("translation", StringType(), True)
    ]
)
#ArrayType(StringType())
inferencingStartTime = time.time()
# actual translation inference on the Spark dataframe
df_source_translation = (
    df_source_spark\
    .groupBy(F.spark_partition_id().alias("_pid"))\
    .applyInPandas(translation_predictions_function, schema)
).cache()

# viewing the results dataframe in a Databricks notebook
display(df_source_translation)

id,content,translation
7033,,
7147,بیٹھنا اور سروس اتنی اچھی نہیں ہے لیکن پشاوری کرائی واقعی اچھی ہے۔,"Sitting and service are not so good, but the postcard is really good."
7130,,
7392,کریم ہانڈی، پنیر ہانڈی، دال چنا، مکنی، ریشمی چکن بہت لذیذ,"Cream Hundee, Cheese Hundee, Dahl Chana, Mackie, Rice Chicken very delicious"
7250,فش فرائی اچھی تھی، لیکن یہ کافی مہنگی ہے،,"The porn was good, but it is expensive enough,"
7143,کھانے کا معیار اچھا ہے لیکن مقدار کے مطابق ان کی قیمت زیادہ ہے۔,"The quality of the food is good, but the price is higher according to the quantity."
7113,,
3395,یہ میرا کارڈ ہے,This is my card.
3541,کمرہ کا رخ صحن کی طرف ہے۔,The room is in front of the scene.
3396,آپ میرے ہم نام ہیں,You are my name.


In [0]:
count

Out[8]: 29826

In [0]:
%sql 
describe kenjohnson_demo.default.pashto_parallel_corpus

col_name,data_type,comment
SENTENCES,string,
MEANING,string,
M2M1001BTranslation,string,


In [0]:
%sql 
ALTER TABLE kenjohnson_demo.default.pashto_parallel_corpus ADD COLUMNS (M2M1001BTranslation string)

[0;31m---------------------------------------------------------------------------[0m
[0;31mAnalysisException[0m                         Traceback (most recent call last)
[0;32m<command-649139525336546>[0m in [0;36m<cell line: 1>[0;34m()[0m
[1;32m      5[0m     [0mdisplay[0m[0;34m([0m[0mdf[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[1;32m      6[0m     [0;32mreturn[0m [0mdf[0m[0;34m[0m[0;34m[0m[0m
[0;32m----> 7[0;31m   [0m_sqldf[0m [0;34m=[0m [0m____databricks_percent_sql[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[1;32m      8[0m [0;32mfinally[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[1;32m      9[0m   [0;32mdel[0m [0m____databricks_percent_sql[0m[0;34m[0m[0;34m[0m[0m

[0;32m<command-649139525336546>[0m in [0;36m____databricks_percent_sql[0;34m()[0m
[1;32m      2[0m   [0;32mdef[0m [0m____databricks_percent_sql[0m[0;34m([0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[1;32m      3[0m     [0;32mimport[0

In [0]:
%sql 
INSERT INTO kenjohnson_demo.default.pashto_parallel_corpus (M2M1001BTranslation) VALUES
    as select translation from kenjohnson_demo.default.pashto_parallel_corpustrans



In [0]:
Edf_source_translation.write.saveAsTable("kenjohnson_demo.default.pashto_parallel_corpustrans")



In [0]:
print(df_source_spark.rdd.getNumPartitions())

