# Hello MLlib simple NaiveBayesClassifier  

In [35]:
# Prereqquisites
from pyspark.sql import SparkSession

In [36]:
# Spark Session and Context
spark = SparkSession.builder.master("local") \
        .appName("Simple_MLlib_NaiveBayesClassifier") \
        .getOrCreate()
print("Spark Version: ", spark.version)

Spark Version:  3.4.1


#### Create Data

In [37]:

# Make up some sample data
data = [
    (0, "A", 1.0, 0.1, 0.3),
    (1, "B", 0.5, 0.8, 0.7),
    (2, "A", 0.8, 0.6, 0.4),
    (3, "C", 0.2, 0.3, 0.9),
    (4, "B", 0.9, 0.5, 0.6),
    (5, "C", 0.3, 0.7, 0.2)
]
columns = ["id", "label", "feature1", "feature2", "feature3"]
df = spark.createDataFrame(data, columns)
df.show()

+---+-----+--------+--------+--------+
| id|label|feature1|feature2|feature3|
+---+-----+--------+--------+--------+
|  0|    A|     1.0|     0.1|     0.3|
|  1|    B|     0.5|     0.8|     0.7|
|  2|    A|     0.8|     0.6|     0.4|
|  3|    C|     0.2|     0.3|     0.9|
|  4|    B|     0.9|     0.5|     0.6|
|  5|    C|     0.3|     0.7|     0.2|
+---+-----+--------+--------+--------+



In [47]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.feature import VectorAssembler, StringIndexer

# Index the Labels
# Convert string labels ("A", "B", "C") into numeric labels (0, 1, 2)
string_indexer = StringIndexer(inputCol="label", outputCol="label_indexed")

# Assemble Features
vec_assembler = VectorAssembler(inputCols=["feature1", "feature2", "feature3"], outputCol="feature_vector")

# Split data into Train and Test datasets
df_train, df_test = df.randomSplit([0.8, 0.2], seed=42)

# Define Naive Bayes Classifier
nb_classifier = NaiveBayes(featuresCol="feature_vector", labelCol="label_indexed", modelType="multinomial")

# Build the Pipeline
nb_pipeline = Pipeline(stages=[string_indexer, vec_assembler, nb_classifier])

# Fit the Pipeline
nb_pipeline_model = nb_pipeline.fit(df_train)

# Make Predictions
df_predictions = nb_pipeline_model.transform(df_train)

df_predictions.show(5)

+---+-----+--------+--------+--------+-------------+--------------+--------------------+--------------------+----------+
| id|label|feature1|feature2|feature3|label_indexed|feature_vector|       rawPrediction|         probability|prediction|
+---+-----+--------+--------+--------+-------------+--------------+--------------------+--------------------+----------+
|  0|    A|     1.0|     0.1|     0.3|          2.0| [1.0,0.1,0.3]|[-2.4964710751612...|[0.37697596573430...|       0.0|
|  1|    B|     0.5|     0.8|     0.7|          0.0| [0.5,0.8,0.7]|[-3.1855514980427...|[0.38327437818270...|       1.0|
|  3|    C|     0.2|     0.3|     0.9|          1.0| [0.2,0.3,0.9]|[-2.5305187666962...|[0.37177717748222...|       1.0|
|  4|    B|     0.9|     0.5|     0.6|          0.0| [0.9,0.5,0.6]|[-3.1685276522752...|[0.38358247423038...|       0.0|
|  5|    C|     0.3|     0.7|     0.2|          1.0| [0.3,0.7,0.2]|[-2.3036626000303...|[0.38500113359949...|       1.0|
+---+-----+--------+--------+---

In [48]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Evaluate the Model
# Evaluate accuracy
evaluator = MulticlassClassificationEvaluator(
    labelCol="label_indexed", predictionCol="prediction", metricName="accuracy"
)
accuracy = evaluator.evaluate(df_predictions)
print(f"Accuracy: {accuracy}")

# Evaluate precision, recall, and F1-score
precision_evaluator = MulticlassClassificationEvaluator(
    labelCol="label_indexed", predictionCol="prediction", metricName="weightedPrecision"
)
precision = precision_evaluator.evaluate(df_predictions)
print(f"Precision: {precision}")

recall_evaluator = MulticlassClassificationEvaluator(
    labelCol="label_indexed", predictionCol="prediction", metricName="weightedRecall"
)
recall = recall_evaluator.evaluate(df_predictions)
print(f"Recall: {recall}")

f1_evaluator = MulticlassClassificationEvaluator(
    labelCol="label_indexed", predictionCol="prediction", metricName="f1"
)
f1 = f1_evaluator.evaluate(df_predictions)
print(f"F1 Score: {f1}")

# Inspect Predictions
df_predictions.select("id", "feature_vector", "label", "label_indexed", "prediction", "probability").show(5)

Accuracy: 0.6
Precision: 0.4666666666666667
Recall: 0.6000000000000001
F1 Score: 0.52
+---+--------------+-----+-------------+----------+--------------------+
| id|feature_vector|label|label_indexed|prediction|         probability|
+---+--------------+-----+-------------+----------+--------------------+
|  0| [1.0,0.1,0.3]|    A|          2.0|       0.0|[0.37697596573430...|
|  1| [0.5,0.8,0.7]|    B|          0.0|       1.0|[0.38327437818270...|
|  3| [0.2,0.3,0.9]|    C|          1.0|       1.0|[0.37177717748222...|
|  4| [0.9,0.5,0.6]|    B|          0.0|       0.0|[0.38358247423038...|
|  5| [0.3,0.7,0.2]|    C|          1.0|       1.0|[0.38500113359949...|
+---+--------------+-----+-------------+----------+--------------------+

