# Environment

In [None]:
// Download the JPMML-SparkML 3.0.8 executable JAR (the release line for Apache Spark 3.4.X)
// via the %AddJar magic, so that the org.jpmml.sparkml classes can be imported in later cells.
%AddJar https://github.com/jpmml/jpmml-sparkml/releases/download/3.0.8/pmml-sparkml-example-executable-3.0.8.jar

# Dataset

In [None]:
// Read "Iris.libsvm" through the LibSVM data source, asking for
// four features per row materialized as dense vectors.
val df = spark.read
  .format("libsvm")
  .options(Map("numFeatures" -> "4", "vectorType" -> "dense"))
  .load("Iris.libsvm")

// Show the column layout of the loaded DataFrame.
df.printSchema()

# Workflow

## The modeling part of the split pipeline

In [None]:
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.LogisticRegression

// The only modeling stage: a logistic regression left at its default settings.
val classifier = new LogisticRegression()

// Wrap the classifier in a single-stage pipeline ...
val libsvmPipeline = new Pipeline().setStages(Array(classifier))

// ... and fit that pipeline on the LibSVM-backed DataFrame loaded earlier.
val libsvmPipelineModel = libsvmPipeline.fit(df)

## The data pre-processing part of the split pipeline

In [None]:
import org.apache.spark.ml.feature.{StringIndexerModel, VectorAssembler}
import org.apache.spark.sql.types.{DoubleType, StringType, StructType}

// --- Label side ---------------------------------------------------------

// Schema holding the single, non-nullable string label column.
val irisLabelSchema = new StructType().add("Species", StringType, nullable = false)

// Category levels handed to the indexer, in this fixed order.
val speciesLabels = Array("setosa", "versicolor", "virginica")

// The indexer model is constructed directly from the label array (nothing is
// fitted here); it reads "Species" and writes "label".
val speciesIndexerModel = new StringIndexerModel("speciesIndexerModel", speciesLabels)
  .setInputCol("Species")
  .setOutputCol("label")

// --- Feature side -------------------------------------------------------

// Names of the four measurement columns, shared by the schema and the assembler.
val irisFeatureNames = Array("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width")

// Schema with one non-nullable double column per feature name, in array order.
val irisFeaturesSchema = irisFeatureNames.foldLeft(new StructType()) { (schema, name) =>
  schema.add(name, DoubleType, nullable = false)
}

// Assembler that combines the four measurement columns into the "features" vector column.
val featuresAssembler = new VectorAssembler("featuresAssembler")
  .setInputCols(irisFeatureNames)
  .setOutputCol("features")

## Final assembly

In [None]:
import org.apache.spark.ml.{PipelineModel, Transformer}
import org.apache.spark.sql.types.StructType
import org.jpmml.sparkml.PipelineModelUtil

// Full input schema: the label column first, followed by the feature columns.
val irisSchema = irisFeaturesSchema.fields.foldLeft(irisLabelSchema) { (schema, field) =>
  schema.add(field)
}

// Stage order: hand-built pre-processing transformers, then the stages of the fitted pipeline.
val irisStages: Array[Transformer] =
  Array[Transformer](speciesIndexerModel, featuresAssembler) ++ libsvmPipelineModel.stages

// Combine all stages into one pipeline model with an explicit identifier.
val irisPipelineModel = PipelineModelUtil.create("irisPipelineModel", irisStages)

# Export to PMML

In [None]:
import org.jpmml.sparkml.PMMLBuilder
import org.jpmml.sparkml.model.HasPredictionModelOptions

// Configure the PMML converter for the assembled schema and pipeline model.
// NOTE(review): OPTION_KEEP_PREDICTIONCOL = false presumably drops the prediction
// column from the exported model outputs - confirm against the JPMML-SparkML docs.
val irisPmmlBuilder = new PMMLBuilder(irisSchema, irisPipelineModel)
  .putOption(HasPredictionModelOptions.OPTION_KEEP_PREDICTIONCOL, false)

// Build the document and print it to the cell output.
val irisPmmlXml = irisPmmlBuilder.buildString
println(irisPmmlXml)