In [2]:
from pathlib import Path
from pyspark.sql import SparkSession, DataFrame
from pyspark.ml.feature import RFormula
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


# Directory holding the downloaded datasets, located under the current
# user's home directory.
BASE_DATA_DIR = Path.home() / "Documents/PySparkCurso/download"

# Local-mode Spark session used by the rest of the notebook. getOrCreate()
# returns an already-active session if there is one instead of building anew.
_session_builder = SparkSession.builder.master("local").appName("Ml with spark")
spark: SparkSession = _session_builder.getOrCreate()

In [3]:
# Load the Iris CSV: the first row supplies the column names and Spark
# infers each column's type from the data.
iris_csv_path = str(BASE_DATA_DIR / "iris.csv")
iris: DataFrame = spark.read.csv(iris_csv_path, header=True, inferSchema=True)

# Preview the first five rows.
iris.show(5)

+-----------+----------+-----------+----------+-----------+
|sepallength|sepalwidth|petallength|petalwidth|      class|
+-----------+----------+-----------+----------+-----------+
|        5.1|       3.5|        1.4|       0.2|Iris-setosa|
|        4.9|       3.0|        1.4|       0.2|Iris-setosa|
|        4.7|       3.2|        1.3|       0.2|Iris-setosa|
|        4.6|       3.1|        1.5|       0.2|Iris-setosa|
|        5.0|       3.6|        1.4|       0.2|Iris-setosa|
+-----------+----------+-----------+----------+-----------+
only showing top 5 rows



In [4]:
# R-style formula: `class` is the target and "." stands for every remaining
# column. The transformer writes the assembled feature vector to `features`
# and the indexed target to `label`; rows with invalid values are skipped.
formula = RFormula(
    formula="class ~ .",
    handleInvalid="skip",
    labelCol="label",
    featuresCol="features",
)

# Fit the formula on the raw frame, apply it, and keep only the two columns
# the classifier consumes.
formula_model = formula.fit(iris)
iris_transf = formula_model.transform(iris).select("features", "label")

# Preview the first five transformed rows.
iris_transf.show(5)

+-----------------+-----+
|         features|label|
+-----------------+-----+
|[5.1,3.5,1.4,0.2]|  0.0|
|[4.9,3.0,1.4,0.2]|  0.0|
|[4.7,3.2,1.3,0.2]|  0.0|
|[4.6,3.1,1.5,0.2]|  0.0|
|[5.0,3.6,1.4,0.2]|  0.0|
+-----------------+-----+
only showing top 5 rows



In [5]:
# Fixed seed so the 70/30 split (and therefore the trained model and the
# reported metric) is the same on every Restart & Run All. Without a seed,
# randomSplit draws a different partition on each execution.
SPLIT_SEED = 42
iris_traine, iris_test = iris_transf.randomSplit([0.7, 0.3], seed=SPLIT_SEED)

# NOTE(review): NaiveBayes is used with its default model type. The features
# here are continuous measurements; if the installed Spark version supports
# it, modelType="gaussian" is probably a better fit -- confirm before changing.
nb = NaiveBayes(labelCol="label", featuresCol="features")

# Train on the 70% partition, then score the held-out 30% partition.
model = nb.fit(iris_traine)
prev = model.transform(iris_test)

# Show the scored rows (label next to rawPrediction/probability/prediction).
prev.show()

+-----------------+-----+--------------------+--------------------+----------+
|         features|label|       rawPrediction|         probability|prediction|
+-----------------+-----+--------------------+--------------------+----------+
|[4.3,3.0,1.1,0.1]|  0.0|[-9.9459586310684...|[0.72755589946077...|       0.0|
|[4.5,2.3,1.3,0.3]|  0.0|[-10.438288584419...|[0.54430055052198...|       0.0|
|[4.6,3.4,1.4,0.3]|  0.0|[-11.896293816194...|[0.68699461089321...|       0.0|
|[4.8,3.0,1.4,0.3]|  0.0|[-11.604866045444...|[0.64230264375293...|       0.0|
|[4.8,3.4,1.9,0.2]|  0.0|[-12.639936254414...|[0.64275970286444...|       0.0|
|[4.9,2.5,4.5,1.7]|  2.0|[-22.173459720037...|[0.01852666087314...|       2.0|
|[4.9,3.1,1.5,0.1]|  0.0|[-11.253184110024...|[0.70317421554128...|       0.0|
|[4.9,3.1,1.5,0.1]|  0.0|[-11.253184110024...|[0.70317421554128...|       0.0|
|[5.0,2.0,3.5,1.0]|  1.0|[-17.241407622542...|[0.07406362363633...|       2.0|
|[5.0,3.2,1.2,0.2]|  0.0|[-11.217428355864...|[0.737

In [6]:
# Accuracy evaluator comparing the `prediction` column against `label`.
aval = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="label",
    predictionCol="prediction",
)

# Score the held-out predictions and print the resulting accuracy.
result = aval.evaluate(prev)
print(result)

0.6226415094339622
