# Hello MLLib simple Random Classifier  

In [54]:
# Prereqquisites
from pyspark.sql import SparkSession

In [55]:
# Spark Session and Context
spark = SparkSession.builder.master("local") \
        .appName("Simple_MLlib_Random_Forest_Classifier") \
        .getOrCreate()
print("Spark Version: ", spark.version)

Spark Version:  3.4.1


#### Create Data

In [56]:

# Make up some sample data
data = [
    (0, "A", 1.0, 0.1, 0.3),
    (1, "B", 0.5, 0.8, 0.7),
    (2, "A", 0.8, 0.6, 0.4),
    (3, "C", 0.2, 0.3, 0.9),
    (4, "B", 0.9, 0.5, 0.6),
    (5, "C", 0.3, 0.7, 0.2)
]
columns = ["id", "label", "feature1", "feature2", "feature3"]
df = spark.createDataFrame(data, columns)
df.show()

+---+-----+--------+--------+--------+
| id|label|feature1|feature2|feature3|
+---+-----+--------+--------+--------+
|  0|    A|     1.0|     0.1|     0.3|
|  1|    B|     0.5|     0.8|     0.7|
|  2|    A|     0.8|     0.6|     0.4|
|  3|    C|     0.2|     0.3|     0.9|
|  4|    B|     0.9|     0.5|     0.6|
|  5|    C|     0.3|     0.7|     0.2|
+---+-----+--------+--------+--------+



In [57]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import VectorAssembler, StringIndexer

# Index the Labels
# Convert string labels ("A", "B", "C") into numeric labels (0, 1, 2)
string_indexer = StringIndexer(inputCol="label", outputCol="label_indexed")

# Assemble Features
vec_assembler = VectorAssembler(inputCols=["feature1", "feature2", "feature3"], outputCol="feature_vector")

# Split data into Train and Test datasets
df_train, df_test = df.randomSplit([0.8, 0.2], seed=42)

# Define Naive Bayes Classifier
rf_classifier = RandomForestClassifier(featuresCol="feature_vector", labelCol="label_indexed", numTrees=10, maxDepth=5)


# Build the Pipeline
rf_pipeline = Pipeline(stages=[string_indexer, vec_assembler, rf_classifier])

# Fit the Pipeline
rf_pipeline_model = rf_pipeline.fit(df_train)

# Make Predictions
df_predictions = rf_pipeline_model.transform(df_train)

df_predictions.show(5)

+---+-----+--------+--------+--------+-------------+--------------+--------------+-------------+----------+
| id|label|feature1|feature2|feature3|label_indexed|feature_vector| rawPrediction|  probability|prediction|
+---+-----+--------+--------+--------+-------------+--------------+--------------+-------------+----------+
|  0|    A|     1.0|     0.1|     0.3|          2.0| [1.0,0.1,0.3]| [4.0,1.0,5.0]|[0.4,0.1,0.5]|       2.0|
|  1|    B|     0.5|     0.8|     0.7|          0.0| [0.5,0.8,0.7]|[10.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|  3|    C|     0.2|     0.3|     0.9|          1.0| [0.2,0.3,0.9]| [1.0,9.0,0.0]|[0.1,0.9,0.0]|       1.0|
|  4|    B|     0.9|     0.5|     0.6|          0.0| [0.9,0.5,0.6]| [9.0,1.0,0.0]|[0.9,0.1,0.0]|       0.0|
|  5|    C|     0.3|     0.7|     0.2|          1.0| [0.3,0.7,0.2]| [2.0,7.0,1.0]|[0.2,0.7,0.1]|       1.0|
+---+-----+--------+--------+--------+-------------+--------------+--------------+-------------+----------+



In [58]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Evaluate the Model
# Evaluate accuracy
evaluator = MulticlassClassificationEvaluator(
    labelCol="label_indexed", predictionCol="prediction", metricName="accuracy"
)
accuracy = evaluator.evaluate(df_predictions)
print(f"Accuracy: {accuracy}")

# Evaluate precision, recall, and F1-score
precision_evaluator = MulticlassClassificationEvaluator(
    labelCol="label_indexed", predictionCol="prediction", metricName="weightedPrecision"
)
precision = precision_evaluator.evaluate(df_predictions)
print(f"Precision: {precision}")

recall_evaluator = MulticlassClassificationEvaluator(
    labelCol="label_indexed", predictionCol="prediction", metricName="weightedRecall"
)
recall = recall_evaluator.evaluate(df_predictions)
print(f"Recall: {recall}")

f1_evaluator = MulticlassClassificationEvaluator(
    labelCol="label_indexed", predictionCol="prediction", metricName="f1"
)
f1 = f1_evaluator.evaluate(df_predictions)
print(f"F1 Score: {f1}")

# Inspect Predictions
df_predictions.select("id", "feature_vector", "label", "label_indexed", "prediction", "probability").show(5)

Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 Score: 1.0
+---+--------------+-----+-------------+----------+-------------+
| id|feature_vector|label|label_indexed|prediction|  probability|
+---+--------------+-----+-------------+----------+-------------+
|  0| [1.0,0.1,0.3]|    A|          2.0|       2.0|[0.4,0.1,0.5]|
|  1| [0.5,0.8,0.7]|    B|          0.0|       0.0|[1.0,0.0,0.0]|
|  3| [0.2,0.3,0.9]|    C|          1.0|       1.0|[0.1,0.9,0.0]|
|  4| [0.9,0.5,0.6]|    B|          0.0|       0.0|[0.9,0.1,0.0]|
|  5| [0.3,0.7,0.2]|    C|          1.0|       1.0|[0.2,0.7,0.1]|
+---+--------------+-----+-------------+----------+-------------+

