**Lesson exercise notebook for Spark-for-Machine-Learning-AI Course**
* Created by Kevin Chao (kevinchao@gmail.com)
* Last updated on Feb 14, 2024

In [6]:
# findspark.init() is kept ahead of every pyspark import: it is what makes the
# pyspark package importable in this environment, so do not reorder it.
import findspark
findspark.init()

from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql import SparkSession

# Reuse the active Spark session if there is one, otherwise start a new one.
spark = (
    SparkSession.builder
    .appName("YourAppName")
    .getOrCreate()
)

In [35]:
# Re-create iviris_df: load the iris data, name its columns, assemble the
# feature vector and index the species label.
#
# Only the names actually used are imported. A wildcard import of
# pyspark.sql.functions would shadow Python builtins such as sum, min, max
# and round for every later cell.
from pyspark.sql.functions import col
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StringIndexer

# The file is read without a header option, so the columns arrive with
# Spark's default names _c0.._c4; inferSchema gives the measurements numeric types.
iris_raw_df = spark.read.csv("Exercise_Files/iris.txt", inferSchema=True)

# Give the positional columns meaningful names. The raw frame is kept under
# its own name so each variable refers to exactly one stage of the data.
iris_df = iris_raw_df.select(
    col("_c0").alias("sepal_length"),
    col("_c1").alias("sepal_width"),
    col("_c2").alias("petal_length"),
    col("_c3").alias("petal_width"),
    col("_c4").alias("species"),
)

# Pack the four measurements into a single vector column named "features".
vectorAssembler = VectorAssembler(
    inputCols=["sepal_length", "sepal_width", "petal_length", "petal_width"],
    outputCol="features",
)
viris_df = vectorAssembler.transform(iris_df)

# Encode the species string as a numeric "label" column
# (for example 'Iris-setosa' becomes 0.0 in the rows displayed below).
indexer = StringIndexer(inputCol="species", outputCol="label")
iviris_df = indexer.fit(viris_df).transform(viris_df)

In [36]:
# Fetch one row as a list of Row objects to confirm the "features" and "label" columns were added.
iviris_df.take(1)

[Row(sepal_length=5.1, sepal_width=3.5, petal_length=1.4, petal_width=0.2, species='Iris-setosa', features=DenseVector([5.1, 3.5, 1.4, 0.2]), label=0.0)]

In [37]:
# Print the first row as a formatted text table.
iviris_df.show(1)

+------------+-----------+------------+-----------+-----------+-----------------+-----+
|sepal_length|sepal_width|petal_length|petal_width|    species|         features|label|
+------------+-----------+------------+-----------+-----------+-----------------+-----+
|         5.1|        3.5|         1.4|        0.2|Iris-setosa|[5.1,3.5,1.4,0.2]|  0.0|
+------------+-----------+------------+-----------+-----------+-----------------+-----+
only showing top 1 row



In [38]:
# Hold out part of the data for evaluation: weights 0.6 / 0.4 give roughly a
# 60% training group and a 40% test group. The fixed seed keeps the split
# the same from run to run.
splits = iviris_df.randomSplit([0.6, 0.4], seed=1)

In [39]:
# The two split groups, in the order of the weights: training first, test second.
train_df, test_df = splits

In [40]:
# Number of rows that landed in the training split.
train_df.count()

98

In [41]:
# Number of rows that landed in the test split.
test_df.count()

52

In [42]:
# Total row count; the training and test counts should add up to this.
iviris_df.count()

150

In [43]:
# Multinomial Naive Bayes classifier. The input columns are spelled out with
# their default names so it is clear which columns of train_df the model reads.
nb = NaiveBayes(
    featuresCol="features",
    labelCol="label",
    modelType="multinomial",
)

In [44]:
# Fit the classifier on the training split only; the test split stays unseen.
nbmodel = nb.fit(train_df)

In [45]:
# Score the held-out test split with the fitted model.
predictions_df = nbmodel.transform(test_df)

In [47]:
# Inspect one scored row: the model output adds rawPrediction, probability and prediction columns.
predictions_df.take(1)

[Row(sepal_length=4.3, sepal_width=3.0, petal_length=1.1, petal_width=0.1, species='Iris-setosa', features=DenseVector([4.3, 3.0, 1.1, 0.1]), label=0.0, rawPrediction=DenseVector([-9.9894, -11.3476, -11.902]), probability=DenseVector([0.7118, 0.183, 0.1051]), prediction=0.0)]

In [48]:
# Evaluator that compares the "prediction" column against the "label" column
# and reports the accuracy metric.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy",
)

In [49]:
# Accuracy of the Naive Bayes predictions on the test split.
nbaccuracy = evaluator.evaluate(predictions_df)

In [50]:
# Display the test-set accuracy as the cell output.
nbaccuracy

0.9807692307692307