In [1]:
# Example: learning about decision trees using Spark MLlib.
# Dataset: datasets/iris_bezdekIris.csv
# Author: Humberto Bianchini

In [2]:
# 1) Importing all necessary libraries and Spark session creation.
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.classification import RandomForestClassifier

# Create the Spark session for this notebook, or reuse one that is already
# running (getOrCreate), registered under the application name "Decision Tree".
spark = (
    SparkSession.builder
    .appName("Decision Tree")
    .getOrCreate()
)

In [3]:
# 2) Load the Iris CSV and inspect its schema.
# The file is read without a header row, so the columns arrive as _c0.._c4;
# they are renamed to descriptive names right after loading. Column types are
# inferred from the data (inferSchema=True).
df_iris = (
    spark.read
    .format("csv")
    .options(inferSchema=True, header='false', delimiter=',')
    .load("iris_bezdekIris.csv")
    .selectExpr(
        "_c0 as sep_len",
        "_c1 as sep_wid",
        "_c2 as pet_len",
        "_c3 as pet_wid",
        "_c4 as label",
    )
)
df_iris.printSchema()

root
 |-- sep_len: double (nullable = true)
 |-- sep_wid: double (nullable = true)
 |-- pet_len: double (nullable = true)
 |-- pet_wid: double (nullable = true)
 |-- label: string (nullable = true)



In [4]:
# Preview the first 10 rows of the renamed dataset.
df_iris.show(n=10)

+-------+-------+-------+-------+-----------+
|sep_len|sep_wid|pet_len|pet_wid|      label|
+-------+-------+-------+-------+-----------+
|    5.1|    3.5|    1.4|    0.2|Iris-setosa|
|    4.9|    3.0|    1.4|    0.2|Iris-setosa|
|    4.7|    3.2|    1.3|    0.2|Iris-setosa|
|    4.6|    3.1|    1.5|    0.2|Iris-setosa|
|    5.0|    3.6|    1.4|    0.2|Iris-setosa|
|    5.4|    3.9|    1.7|    0.4|Iris-setosa|
|    4.6|    3.4|    1.4|    0.3|Iris-setosa|
|    5.0|    3.4|    1.5|    0.2|Iris-setosa|
|    4.4|    2.9|    1.4|    0.2|Iris-setosa|
|    4.9|    3.1|    1.5|    0.1|Iris-setosa|
+-------+-------+-------+-------+-----------+
only showing top 10 rows



In [5]:
# Summary statistics (count, mean, stddev, min, max) for every column.
iris_stats = df_iris.describe()
iris_stats.show()

+-------+------------------+-------------------+------------------+------------------+--------------+
|summary|           sep_len|            sep_wid|           pet_len|           pet_wid|         label|
+-------+------------------+-------------------+------------------+------------------+--------------+
|  count|               150|                150|               150|               150|           150|
|   mean| 5.843333333333335|  3.057333333333334|3.7580000000000027| 1.199333333333334|          null|
| stddev|0.8280661279778637|0.43586628493669793|1.7652982332594662|0.7622376689603467|          null|
|    min|               4.3|                2.0|               1.0|               0.1|   Iris-setosa|
|    max|               7.9|                4.4|               6.9|               2.5|Iris-virginica|
+-------+------------------+-------------------+------------------+------------------+--------------+



In [6]:
# 3) Begin building the decision tree: pack the four numeric measurement
# columns into a single vector column named 'features', which is the column
# the classifiers below are told to read (featuresCol="features").
feature_columns = ['sep_len', 'sep_wid', 'pet_len', 'pet_wid']
vector_assembler = VectorAssembler(
    inputCols=feature_columns,
    outputCol='features',
)
df_temp = vector_assembler.transform(df_iris)
df_temp.show(n=10)

+-------+-------+-------+-------+-----------+-----------------+
|sep_len|sep_wid|pet_len|pet_wid|      label|         features|
+-------+-------+-------+-------+-----------+-----------------+
|    5.1|    3.5|    1.4|    0.2|Iris-setosa|[5.1,3.5,1.4,0.2]|
|    4.9|    3.0|    1.4|    0.2|Iris-setosa|[4.9,3.0,1.4,0.2]|
|    4.7|    3.2|    1.3|    0.2|Iris-setosa|[4.7,3.2,1.3,0.2]|
|    4.6|    3.1|    1.5|    0.2|Iris-setosa|[4.6,3.1,1.5,0.2]|
|    5.0|    3.6|    1.4|    0.2|Iris-setosa|[5.0,3.6,1.4,0.2]|
|    5.4|    3.9|    1.7|    0.4|Iris-setosa|[5.4,3.9,1.7,0.4]|
|    4.6|    3.4|    1.4|    0.3|Iris-setosa|[4.6,3.4,1.4,0.3]|
|    5.0|    3.4|    1.5|    0.2|Iris-setosa|[5.0,3.4,1.5,0.2]|
|    4.4|    2.9|    1.4|    0.2|Iris-setosa|[4.4,2.9,1.4,0.2]|
|    4.9|    3.1|    1.5|    0.1|Iris-setosa|[4.9,3.1,1.5,0.1]|
+-------+-------+-------+-------+-----------+-----------------+
only showing top 10 rows



In [7]:
# The raw measurement columns are now duplicated inside 'features', so drop
# them and keep only the label and the assembled feature vector.
raw_measurement_columns = ['sep_len', 'sep_wid', 'pet_len', 'pet_wid']
df_mini = df_temp.drop(*raw_measurement_columns)
df_mini.show(n=10)

+-----------+-----------------+
|      label|         features|
+-----------+-----------------+
|Iris-setosa|[5.1,3.5,1.4,0.2]|
|Iris-setosa|[4.9,3.0,1.4,0.2]|
|Iris-setosa|[4.7,3.2,1.3,0.2]|
|Iris-setosa|[4.6,3.1,1.5,0.2]|
|Iris-setosa|[5.0,3.6,1.4,0.2]|
|Iris-setosa|[5.4,3.9,1.7,0.4]|
|Iris-setosa|[4.6,3.4,1.4,0.3]|
|Iris-setosa|[5.0,3.4,1.5,0.2]|
|Iris-setosa|[4.4,2.9,1.4,0.2]|
|Iris-setosa|[4.9,3.1,1.5,0.1]|
+-----------+-----------------+
only showing top 10 rows



In [8]:
# Encode the string species label as a numeric index in a new 'labelIndex'
# column; the classifiers below use it as their label column.
l_indexer = StringIndexer(inputCol="label", outputCol="labelIndex")
label_indexer_model = l_indexer.fit(df_mini)  # learns the label -> index mapping
df_final = label_indexer_model.transform(df_mini)
df_final.show(n=10)

+-----------+-----------------+----------+
|      label|         features|labelIndex|
+-----------+-----------------+----------+
|Iris-setosa|[5.1,3.5,1.4,0.2]|       0.0|
|Iris-setosa|[4.9,3.0,1.4,0.2]|       0.0|
|Iris-setosa|[4.7,3.2,1.3,0.2]|       0.0|
|Iris-setosa|[4.6,3.1,1.5,0.2]|       0.0|
|Iris-setosa|[5.0,3.6,1.4,0.2]|       0.0|
|Iris-setosa|[5.4,3.9,1.7,0.4]|       0.0|
|Iris-setosa|[4.6,3.4,1.4,0.3]|       0.0|
|Iris-setosa|[5.0,3.4,1.5,0.2]|       0.0|
|Iris-setosa|[4.4,2.9,1.4,0.2]|       0.0|
|Iris-setosa|[4.9,3.1,1.5,0.1]|       0.0|
+-----------+-----------------+----------+
only showing top 10 rows



In [9]:
# Split the indexed data into roughly 70% training / 30% test rows.
# randomSplit assigns rows at random, so the weights are approximate. The
# explicit seed keeps the partition -- and therefore the accuracy values
# computed later in the notebook -- repeatable from one run to the next.
(train, test) = df_final.randomSplit([0.7, 0.3], seed=42)
test.show(5)
train.show(5)

+-----------+-----------------+----------+
|      label|         features|labelIndex|
+-----------+-----------------+----------+
|Iris-setosa|[4.3,3.0,1.1,0.1]|       0.0|
|Iris-setosa|[4.6,3.1,1.5,0.2]|       0.0|
|Iris-setosa|[4.8,3.0,1.4,0.1]|       0.0|
|Iris-setosa|[4.8,3.0,1.4,0.3]|       0.0|
|Iris-setosa|[4.8,3.1,1.6,0.2]|       0.0|
+-----------+-----------------+----------+
only showing top 5 rows

+-----------+-----------------+----------+
|      label|         features|labelIndex|
+-----------+-----------------+----------+
|Iris-setosa|[4.4,2.9,1.4,0.2]|       0.0|
|Iris-setosa|[4.4,3.0,1.3,0.2]|       0.0|
|Iris-setosa|[4.4,3.2,1.3,0.2]|       0.0|
|Iris-setosa|[4.5,2.3,1.3,0.3]|       0.0|
|Iris-setosa|[4.6,3.2,1.4,0.2]|       0.0|
+-----------+-----------------+----------+
only showing top 5 rows



In [10]:
# 4) Train the decision tree on the training split.
# Despite its name, `treeModel` is the unfitted estimator; `model` holds the
# fitted classifier returned by fit().
treeModel = DecisionTreeClassifier(
    featuresCol="features",
    labelCol="labelIndex",
)
model = treeModel.fit(train)

In [11]:
# 5) Score the held-out test split, then show the predicted class index next
# to the true one for the first 5 rows.
prediction = model.transform(test)
predicted_vs_actual = prediction.select("prediction", "labelIndex")
predicted_vs_actual.show(n=5)

+----------+----------+
|prediction|labelIndex|
+----------+----------+
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
+----------+----------+
only showing top 5 rows



In [12]:
# 6) Measure accuracy on the test predictions by comparing the 'prediction'
# column against the true 'labelIndex' column.
evaluation = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="labelIndex",
    predictionCol="prediction",
)
accuracy = evaluation.evaluate(prediction)
print(f"Accuracy of the model: {accuracy}")

Accuracy of the model: 0.875


In [13]:
# 7) Train a random forest (10 trees) on the same training split.
# Random forests are stochastic, so the seed is fixed to make the fitted
# forest and its accuracy reproducible between runs.
# `modelRF` is the unfitted estimator. NOTE: `model` is rebound here and no
# longer refers to the decision tree fitted in step 4.
modelRF = RandomForestClassifier(
    labelCol="labelIndex",
    featuresCol="features",
    numTrees=10,
    seed=42,
)
model = modelRF.fit(train)

In [14]:
# Score the test split with the current `model` and show predicted vs. true
# class index for the first 5 rows. This rebinds `prediction`.
prediction = model.transform(test)
predicted_vs_actual = prediction.select("prediction", "labelIndex")
predicted_vs_actual.show(n=5)

+----------+----------+
|prediction|labelIndex|
+----------+----------+
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
+----------+----------+
only showing top 5 rows



In [15]:
# Compute accuracy for the latest `prediction` DataFrame using the same
# evaluator configuration as in step 6.
evaluation = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="labelIndex",
    predictionCol="prediction",
)
accuracy = evaluation.evaluate(prediction)
print(f"Accuracy of the model: {accuracy}")

Accuracy of the model: 0.9375
