# Hello MLlib: a simple DecisionTreeClassifier

In [27]:
# Prerequisites
from pyspark.sql import SparkSession

In [29]:
# Spark Session and Context
# Build a session on a local master; getOrCreate() hands back an existing
# session when one is already active instead of starting another.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Simple MLlib DecisionTreeClassifier")
    .getOrCreate()
)
# Report which Spark version the session is running.
print("Spark Version: ", spark.version)

Spark Version:  3.4.1


#### Create Data

In [32]:

# Make up some sample data
# Five rows: id = i, feature1 = i + 1, feature2 = i + 2 (both as floats),
# and a label that alternates 0, 1, 0, 1, 0.
columns = ["id", "feature1", "feature2", "label"]
data = [(i, float(i + 1), float(i + 2), i % 2) for i in range(5)]

# Column names come from `columns`; no explicit schema is passed.
df = spark.createDataFrame(data, columns)
df.show()



+---+--------+--------+-----+
| id|feature1|feature2|label|
+---+--------+--------+-----+
|  0|     1.0|     2.0|    0|
|  1|     2.0|     3.0|    1|
|  2|     3.0|     4.0|    0|
|  3|     4.0|     5.0|    1|
|  4|     5.0|     6.0|    0|
+---+--------+--------+-----+



In [31]:
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import VectorAssembler, StringIndexer

# Assemble Features
# Pack the two numeric feature columns into the single vector column
# ("features") that the classifier is configured to read.
assembler = VectorAssembler(
    inputCols=["feature1", "feature2"],
    outputCol="features",
)
df_transformed = assembler.transform(df)

# Split data into Train and Test datasets (80/20 weights, fixed seed).
# NOTE: with only five rows the test side is tiny -- a single row in the
# run captured below.
train_data, test_data = df_transformed.randomSplit([0.8, 0.2], seed=42)

# Initialize and Train DecisionTreeClassifier (tree depth capped at 3)
dt = DecisionTreeClassifier(
    featuresCol="features",
    labelCol="label",
    maxDepth=3,
)
model = dt.fit(train_data)

# Make Predictions on the held-out rows; the result carries the extra
# rawPrediction, probability and prediction columns shown in the output.
predictions = model.transform(test_data)
predictions.show()

+---+--------+--------+-----+---------+-------------+-----------+----------+
| id|feature1|feature2|label| features|rawPrediction|probability|prediction|
+---+--------+--------+-----+---------+-------------+-----------+----------+
|  2|     3.0|     4.0|    0|[3.0,4.0]|    [0.0,2.0]|  [0.0,1.0]|       1.0|
+---+--------+--------+-----+---------+-------------+-----------+----------+



In [33]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Evaluate the Model
def _make_evaluator(metric_name):
    """Build an evaluator that scores the "prediction" column against "label".

    Args:
        metric_name: value passed through as the evaluator's ``metricName``.

    Returns:
        A configured MulticlassClassificationEvaluator.
    """
    return MulticlassClassificationEvaluator(
        labelCol="label", predictionCol="prediction", metricName=metric_name
    )


# One evaluator per metric. Precision and recall use the "weighted" metric
# names, as requested from the evaluator below.
evaluator = _make_evaluator("accuracy")
precision_evaluator = _make_evaluator("weightedPrecision")
recall_evaluator = _make_evaluator("weightedRecall")
f1_evaluator = _make_evaluator("f1")

# Evaluate accuracy
accuracy = evaluator.evaluate(predictions)
print(f"Accuracy: {accuracy}")

# Evaluate precision, recall, and F1-score
# NOTE: in the captured run `predictions` holds a single test row, so these
# numbers are not statistically meaningful.
precision = precision_evaluator.evaluate(predictions)
print(f"Precision: {precision}")

recall = recall_evaluator.evaluate(predictions)
print(f"Recall: {recall}")

f1 = f1_evaluator.evaluate(predictions)
print(f"F1 Score: {f1}")

# Inspect Predictions: keep only the columns needed to compare label and output
predictions.select(
    "id", "features", "label", "prediction", "probability"
).show()

Accuracy: 0.0
Precision: 0.0
Recall: 0.0
F1 Score: 0.0
+---+---------+-----+----------+-----------+
| id| features|label|prediction|probability|
+---+---------+-----+----------+-----------+
|  2|[3.0,4.0]|    0|       1.0|  [0.0,1.0]|
+---+---------+-----+----------+-----------+

