In [1]:
#importing pyspark
import findspark
findspark.init()
import pyspark

In [2]:
#creating sparksession
from pyspark.sql import SparkSession
spark=SparkSession.builder.appName('Classifier').getOrCreate()

In [3]:
#importing required libraries
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import DecisionTreeClassifier, MultilayerPerceptronClassifier
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [4]:
#loading the dataset
iris_df=spark.read.csv('Iris.csv',inferSchema=True,header=False)

In [5]:
iris_df.printSchema()

root
 |-- _c0: double (nullable = true)
 |-- _c1: double (nullable = true)
 |-- _c2: double (nullable = true)
 |-- _c3: double (nullable = true)
 |-- _c4: string (nullable = true)



In [6]:
#total data instances in the dataset
iris_df.count()

150

In [7]:
#statistical summary
iris_df.describe().show(5,False)

+-------+------------------+-------------------+------------------+------------------+--------------+
|summary|_c0               |_c1                |_c2               |_c3               |_c4           |
+-------+------------------+-------------------+------------------+------------------+--------------+
|count  |150               |150                |150               |150               |150           |
|mean   |5.843333333333335 |3.0540000000000007 |3.7586666666666693|1.1986666666666672|null          |
|stddev |0.8280661279778637|0.43359431136217375|1.764420419952262 |0.7631607417008414|null          |
|min    |4.3               |2.0                |1.0               |0.1               |Iris-setosa   |
|max    |7.9               |4.4                |6.9               |2.5               |Iris-virginica|
+-------+------------------+-------------------+------------------+------------------+--------------+



In [8]:
#fitting string indexer for target label
labelIndexer = StringIndexer(inputCol="_c4", outputCol="indexedLabel").fit(iris_df)
df=labelIndexer.transform(iris_df)
df.show(5)

+---+---+---+---+-----------+------------+
|_c0|_c1|_c2|_c3|        _c4|indexedLabel|
+---+---+---+---+-----------+------------+
|5.1|3.5|1.4|0.2|Iris-setosa|         0.0|
|4.9|3.0|1.4|0.2|Iris-setosa|         0.0|
|4.7|3.2|1.3|0.2|Iris-setosa|         0.0|
|4.6|3.1|1.5|0.2|Iris-setosa|         0.0|
|5.0|3.6|1.4|0.2|Iris-setosa|         0.0|
+---+---+---+---+-----------+------------+
only showing top 5 rows



In [9]:
feature_list=['_c0','_c1','_c2','_c3']
#fitting vector assembler for features
featureAssembler=VectorAssembler(inputCols=feature_list,outputCol='indexedFeatures')
output_df=featureAssembler.transform(df)
output_df.show(5)

+---+---+---+---+-----------+------------+-----------------+
|_c0|_c1|_c2|_c3|        _c4|indexedLabel|  indexedFeatures|
+---+---+---+---+-----------+------------+-----------------+
|5.1|3.5|1.4|0.2|Iris-setosa|         0.0|[5.1,3.5,1.4,0.2]|
|4.9|3.0|1.4|0.2|Iris-setosa|         0.0|[4.9,3.0,1.4,0.2]|
|4.7|3.2|1.3|0.2|Iris-setosa|         0.0|[4.7,3.2,1.3,0.2]|
|4.6|3.1|1.5|0.2|Iris-setosa|         0.0|[4.6,3.1,1.5,0.2]|
|5.0|3.6|1.4|0.2|Iris-setosa|         0.0|[5.0,3.6,1.4,0.2]|
+---+---+---+---+-----------+------------+-----------------+
only showing top 5 rows



In [10]:
#iris dataset with feature and output vector
iris_data=output_df.select('indexedFeatures','indexedLabel')

In [11]:
#splitting the data into 75/25 ratio for training and testing set
train_df,test_df=iris_data.randomSplit([0.8,0.2])

## Decision Tree Classifier

In [12]:
#creating DecisionTreeClassifier instance
dt=DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")

In [14]:
#fitting the model
dt_model=dt.fit(train_df)

In [15]:
#predicting for test dataset
dt_pred=dt_model.transform(test_df)

In [16]:
#viewing predictions
dt_pred.select("prediction", "indexedLabel", "indexedFeatures").show(5)

+----------+------------+-----------------+
|prediction|indexedLabel|  indexedFeatures|
+----------+------------+-----------------+
|       0.0|         0.0|[4.4,3.0,1.3,0.2]|
|       0.0|         0.0|[4.6,3.4,1.4,0.3]|
|       2.0|         1.0|[4.9,2.4,3.3,1.0]|
|       0.0|         0.0|[4.9,3.0,1.4,0.2]|
|       0.0|         0.0|[4.9,3.1,1.5,0.1]|
+----------+------------+-----------------+
only showing top 5 rows



In [17]:
#evaluating the model
evaluator=MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction")
acc=evaluator.evaluate(dt_pred, {evaluator.metricName: "accuracy"})

In [18]:
print('Accuracy of Decision tree classifier model:',round(acc*100,3))

Accuracy of Decision tree classifier model: 93.939


## Multilayer Perceptron Classifier

In [19]:
#creating MultilayerPerceptronClassifier instance
mlp=MultilayerPerceptronClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures", maxIter=200, layers=[4,3,2,3], 
                                   blockSize=64)

In [20]:
#fitting the model
mlp_model=mlp.fit(train_df)

In [21]:
#predicting for test dataset
mlp_pred=mlp_model.transform(test_df)

In [22]:
#viewing predictions
mlp_pred.select("prediction", "indexedLabel", "indexedFeatures").show(5)

+----------+------------+-----------------+
|prediction|indexedLabel|  indexedFeatures|
+----------+------------+-----------------+
|       0.0|         0.0|[4.4,3.0,1.3,0.2]|
|       0.0|         0.0|[4.6,3.4,1.4,0.3]|
|       1.0|         1.0|[4.9,2.4,3.3,1.0]|
|       0.0|         0.0|[4.9,3.0,1.4,0.2]|
|       0.0|         0.0|[4.9,3.1,1.5,0.1]|
+----------+------------+-----------------+
only showing top 5 rows



In [23]:
#evaluating the model
evaluator=MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction")
acc=evaluator.evaluate(mlp_pred, {evaluator.metricName: "accuracy"})

In [24]:
print('Accuracy of Multilayer Perceptron Classifier model:',round(acc*100,3))

Accuracy of Multilayer Perceptron Classifier model: 96.97


# Ensemble Model

## Random Forest Classifier

In [37]:
#creating RandomForestClassifier instance
rfc=RandomForestClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures", numTrees=10)

In [38]:
#fitting the model
rfc_model=rfc.fit(train_df)

In [39]:
#predicting for test dataset
rfc_pred=rfc_model.transform(test_df)

In [40]:
#viewing predictions
rfc_pred.select("prediction", "indexedLabel", "indexedFeatures").show(5)

+----------+------------+-----------------+
|prediction|indexedLabel|  indexedFeatures|
+----------+------------+-----------------+
|       0.0|         0.0|[4.4,3.0,1.3,0.2]|
|       0.0|         0.0|[4.6,3.4,1.4,0.3]|
|       1.0|         1.0|[4.9,2.4,3.3,1.0]|
|       0.0|         0.0|[4.9,3.0,1.4,0.2]|
|       0.0|         0.0|[4.9,3.1,1.5,0.1]|
+----------+------------+-----------------+
only showing top 5 rows



In [41]:
#evaluating the model
evaluator=MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction")
acc=evaluator.evaluate(rfc_pred, {evaluator.metricName: "accuracy"})

In [42]:
print('Accuracy of Random Forest Classifier model:',round(acc*100,3))

Accuracy of Random Forest Classifier model: 100.0
