Reference documentation: [PySpark API](https://spark.apache.org/docs/latest/api/python/index.html)

In [None]:
%matplotlib inline

In [None]:
from pyspark.sql.functions import col
from pyspark.sql.functions import explode
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import IndexToString
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import OneVsRest
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


## Load Data from PIO

In [None]:
# Load the events stored for the 'IrisApp' app.
# NOTE(review): `p_event_store` is not defined in this notebook — presumably it is
# injected by the PredictionIO (PIO) kernel/environment; confirm before running elsewhere.
event_df = p_event_store.find('IrisApp')

In [None]:
# Preview up to five raw event rows to check what was loaded.
event_df.show(5)

In [None]:
def get_field_type(name):
    """Return the Spark SQL type name that the field called `name` is cast to.

    Fields whose name begins with 'attr' are treated as numeric ('double');
    every other field is kept as 'string'.
    """
    return 'double' if name.startswith('attr') else 'string'

# Collect every distinct key found in the `fields` column across all events,
# in sorted order so the resulting column order is stable.
field_names = sorted(
    event_df.select(explode("fields"))
    .select("key")
    .distinct()
    .rdd.flatMap(lambda row: row)
    .collect()
)

# Turn each key into its own column, cast to the type chosen by get_field_type().
exprs = [
    col("fields").getItem(name).cast(get_field_type(name)).alias(name)
    for name in field_names
]
data_df = event_df.select(*exprs)

In [None]:
# Preview up to five rows of the flattened table (one typed column per field).
data_df.show(5)

## Explore the Data with Pandas

In [None]:
# Convert the Spark DataFrame to a pandas DataFrame for local plotting.
# toPandas() collects all rows to the driver, so this is only suitable for small data.
p_data_df = data_df.toPandas()

In [None]:
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix

# Draw a pairwise scatter-plot grid of the pandas frame, with a kernel density
# estimate on the diagonal; points are black and mostly transparent.
scatter_matrix(
    p_data_df,
    alpha=0.3,
    color='k',
    diagonal='kde',
)
plt.show()

## Train and Test

In [None]:
# Hold out roughly 10% of the rows for testing. The weights are proportions, so
# the resulting split sizes are approximate rather than exact.
# The seed is fixed so that re-running the notebook on the same input produces
# the same train/test membership and therefore a comparable test error.
(train_df, test_df) = data_df.randomSplit([0.9, 0.1], seed=42)


In [None]:
# Map the string class in `target` to a numeric `label` index. The indexer is
# fitted up front (on the training split only) because its learned `labels`
# are needed below to build the IndexToString stage that decodes predictions.
labelIndexer = StringIndexer(inputCol="target", outputCol="label").fit(train_df)

# Pack every `attr*` column into a single `features` vector column, which is
# the column the classifier below reads.
featureAssembler = VectorAssembler(inputCols=[x for x in field_names if x.startswith('attr')],
                                   outputCol="features")

# Active classifier. Hyper-parameters are spelled out so they are easy to tune.
# The seed is fixed so that training is repeatable from run to run; with
# seed=None the forest (and the reported test error) changes on every execution.
clf = RandomForestClassifier(featuresCol="features", labelCol="label", predictionCol="prediction",
                             probabilityCol="probability", rawPredictionCol="rawPrediction",
                             maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0,
                             maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10,
                             impurity="gini", numTrees=20, featureSubsetStrategy="auto",
                             seed=42, subsamplingRate=1.0)

# --- Alternative classifiers (disabled); uncomment exactly one to replace `clf`. ---
# clf = DecisionTreeClassifier(featuresCol="features", labelCol="label", predictionCol="prediction",
#                              probabilityCol="probability", rawPredictionCol="rawPrediction",
#                              maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0,
#                              maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10,
#                              impurity="gini", seed=None)
# TODO: MultilayerPerceptronClassifier currently fails with a NullPointerException (NPE);
#       cause not yet investigated.
# clf = MultilayerPerceptronClassifier(featuresCol="features", labelCol="label",
#                                      predictionCol="prediction", maxIter=100, tol=1e-6, seed=None,
#                                      layers=None, blockSize=128, stepSize=0.03, solver="l-bfgs",
#                                      initialWeights=None)
# TODO: this LogisticRegression configuration also fails with an NPE; cause not yet investigated.
# lr = LogisticRegression(featuresCol="features", labelCol="label", predictionCol="prediction",
#                         maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6, fitIntercept=True,
#                         threshold=0.5, probabilityCol="probability", # thresholds=None,
#                         rawPredictionCol="rawPrediction", standardization=True, weightCol=None,
#                         aggregationDepth=2, family="auto")
# lr = LogisticRegression()
# clf = OneVsRest(classifier=lr)

# Decode the numeric `prediction` back to the original class string, using the
# label order learned by the indexer above.
labelConverter = IndexToString(inputCol="prediction", outputCol="predictedLabel",
                               labels=labelIndexer.labels)

# Stage order matters: features and labels must exist before the classifier
# runs, and predictions must exist before they can be decoded.
pipeline = Pipeline(stages=[featureAssembler, labelIndexer, clf, labelConverter])


In [None]:
# Fit the pipeline on the training split; the result is the fitted model used for prediction.
model = pipeline.fit(train_df)


In [None]:
# Run the fitted pipeline on the held-out split; the output carries the stage
# output columns (e.g. `features`, `label`, `prediction`, `predictedLabel`).
predict_df = model.transform(test_df)


In [None]:
# Spot-check a few test rows: decoded prediction vs. the true `target`, alongside the input features.
predict_df.select("predictedLabel", "target", "features").show(5)


In [None]:
# Score the test-set predictions by comparing the indexed `label` column with
# the numeric `prediction` column, then report the error as 1 - accuracy.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy",
)
accuracy = evaluator.evaluate(predict_df)
print("Test Error = %g" % (1.0 - accuracy))
