In [2]:
#### Imports for the MLFlow based Tensorflow model inference

from pyspark.sql import SparkSession
from pyspark.sql.types import StringType, FloatType, StructField, StructType, DoubleType
from pyspark.sql.functions import *
from pyspark.sql.functions import udf, array, col
import mlflow
import mlflow.pyfunc
import pandas as pd
from pyspark.sql.functions import struct
import sklearn

### Create the SparkSession used by every cell below (reuses a running one if present)
spark = (
    SparkSession.builder
    .appName('MLFlow model inference')
    .getOrCreate()
)

Setting spark.hadoop.yarn.resourcemanager.principal to hrongali


In [19]:
#### Generate Sample Input data
# One synthetic record: feature columns named "0".."24", every value 1.0.
N_FEATURES = 25
d = {str(i): 1.0 for i in range(N_FEATURES)}

# 1000 identical rows, spread over 4 partitions.
df = spark.createDataFrame([d for _ in range(1000)]).repartition(4)
df.show()

# Pack all feature columns into a single struct column for the model UDF.
# Select by *name*, not by position: the DataFrame built from dicts has its
# columns in string order ("0", "1", "10", "11", ... as df.show() prints), so
# the positional df[2] would pick column "10" instead of column "2".
df_data = df.select(
    struct([df[str(i)] for i in range(N_FEATURES)]).alias('domain_tokens')
)
df_data.show()
df_data.rdd.getNumPartitions()

+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
|  0|  1| 10| 11| 12| 13| 14| 15| 16| 17| 18| 19|  2| 20| 21| 22| 23| 24|  3|  4|  5|  6|  7|  8|  9|
+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|
|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|
|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|
|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|
|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|
|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|
|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1.0|1

4

In [20]:
### Wrap the MLflow model as a Spark UDF so it can score DataFrame columns
MODEL_URI = "/home/cdsw/cvsmodel"  # location of the saved MLflow model
predict = mlflow.pyfunc.spark_udf(spark, model_uri=MODEL_URI)



In [21]:
## Model Inference
# Apply the model UDF to the struct column and display the first rows.
(
    df_data
    .withColumn("prediction", predict(df_data["domain_tokens"]))
    .show()
)


[Stage 74:>                                                         (0 + 1) / 1]

+--------------------+--------------------+
|       domain_tokens|          prediction|
+--------------------+--------------------+
|{1.0, 1.0, 1.0, 1...|3.881858901877422E-6|
|{1.0, 1.0, 1.0, 1...|3.881858901877422E-6|
|{1.0, 1.0, 1.0, 1...|3.881858901877422E-6|
|{1.0, 1.0, 1.0, 1...|3.881858901877422E-6|
|{1.0, 1.0, 1.0, 1...|3.881858901877422E-6|
|{1.0, 1.0, 1.0, 1...|3.881858901877422E-6|
|{1.0, 1.0, 1.0, 1...|3.881858901877422E-6|
|{1.0, 1.0, 1.0, 1...|3.881858901877422E-6|
|{1.0, 1.0, 1.0, 1...|3.881858901877422E-6|
|{1.0, 1.0, 1.0, 1...|3.881858901877422E-6|
|{1.0, 1.0, 1.0, 1...|3.881858901877422E-6|
|{1.0, 1.0, 1.0, 1...|3.881858901877422E-6|
|{1.0, 1.0, 1.0, 1...|3.881858901877422E-6|
|{1.0, 1.0, 1.0, 1...|3.881858901877422E-6|
|{1.0, 1.0, 1.0, 1...|3.881858901877422E-6|
|{1.0, 1.0, 1.0, 1...|3.881858901877422E-6|
|{1.0, 1.0, 1.0, 1...|3.881858901877422E-6|
|{1.0, 1.0, 1.0, 1...|3.881858901877422E-6|
|{1.0, 1.0, 1.0, 1...|3.881858901877422E-6|
|{1.0, 1.0, 1.0, 1...|3.88185890

                                                                                