In [1]:
#importing pyspark
import findspark
findspark.init()
import pyspark

In [2]:
#creating sparksession
from pyspark.sql import SparkSession
spark=SparkSession.builder.appName('Decision_Tree').getOrCreate()

In [3]:
#importing required libraries
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler, VectorIndexer
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [4]:
#loading the dataset
iris_df=spark.read.csv('Iris.csv',inferSchema=True,header=False)

In [5]:
iris_df.printSchema()

root
 |-- _c0: double (nullable = true)
 |-- _c1: double (nullable = true)
 |-- _c2: double (nullable = true)
 |-- _c3: double (nullable = true)
 |-- _c4: string (nullable = true)



In [6]:
#total data instances in the dataset
iris_df.count()

150

In [7]:
#statistical summary
iris_df.describe().show(5,False)

+-------+------------------+-------------------+------------------+------------------+--------------+
|summary|_c0               |_c1                |_c2               |_c3               |_c4           |
+-------+------------------+-------------------+------------------+------------------+--------------+
|count  |150               |150                |150               |150               |150           |
|mean   |5.843333333333335 |3.0540000000000007 |3.7586666666666693|1.1986666666666672|null          |
|stddev |0.8280661279778637|0.43359431136217375|1.764420419952262 |0.7631607417008414|null          |
|min    |4.3               |2.0                |1.0               |0.1               |Iris-setosa   |
|max    |7.9               |4.4                |6.9               |2.5               |Iris-virginica|
+-------+------------------+-------------------+------------------+------------------+--------------+



In [8]:
#fitting string indexer for target label
labelIndexer = StringIndexer(inputCol="_c4", outputCol="indexedLabel").fit(iris_df)

In [9]:
feature_list=['_c0','_c1','_c2','_c3']
#fitting vector assembler for features
featureAssembler=VectorAssembler(inputCols=feature_list,outputCol='indexedFeatures')

In [10]:
#splitting the data into 75/25 ratio for training and testing set
train_df,test_df=iris_df.randomSplit([0.75,0.25])

In [11]:
#creating DecisionTreeClassifier instance
dt=DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")

In [12]:
#creating pipeline for all stages of model creation
pipeline=Pipeline(stages=[labelIndexer, featureAssembler, dt])

In [13]:
#fitting the pipeline
dt_model=pipeline.fit(train_df)

In [14]:
#predicting for test dataset
predictions=dt_model.transform(test_df)

In [15]:
#viewing predictions
predictions.select("prediction", "indexedLabel", "indexedFeatures").show(5)

+----------+------------+-----------------+
|prediction|indexedLabel|  indexedFeatures|
+----------+------------+-----------------+
|       0.0|         0.0|[4.4,2.9,1.4,0.2]|
|       0.0|         0.0|[4.4,3.0,1.3,0.2]|
|       0.0|         0.0|[4.6,3.1,1.5,0.2]|
|       0.0|         0.0|[4.8,3.1,1.6,0.2]|
|       1.0|         1.0|[5.0,2.0,3.5,1.0]|
+----------+------------+-----------------+
only showing top 5 rows



In [17]:
#creating dataframe with prediction and actual labels of the target variable
predictionAndLabels=predictions.select("prediction", "indexedLabel")

In [25]:
#evaluating the model
evaluator=MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction")

#list of metrics
acc=evaluator.evaluate(predictionAndLabels, {evaluator.metricName: "accuracy"})
f1=evaluator.evaluate(predictionAndLabels, {evaluator.metricName: "f1"})
weightedPrecision=evaluator.evaluate(predictionAndLabels, {evaluator.metricName: "weightedPrecision"})
weightedRecall=evaluator.evaluate(predictionAndLabels, {evaluator.metricName: "weightedRecall"})

In [26]:
print('Accuracy of the model:',round(acc*100,3))

Accuracy of the model: 93.023


In [27]:
print('f1-score of the model:',round(f1*100,3))

f1-score of the model: 93.042


In [28]:
print('Precision of the model:',round(weightedPrecision*100,3)) 

Precision of the model: 93.185


In [29]:
print('Recall of the model:',round(weightedRecall*100,3)) 

Recall of the model: 93.023
