# Notebook : Iris Multi-classification problem : A Spark + MLLib approach

In [1]:
# Importing relevant libraries
import pandas as pd

from pyspark import SparkConf, SparkContext,  SparkFiles
from pyspark.sql import SQLContext, Row

from pyspark.ml import Pipeline
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import StringIndexer, VectorAssembler, StandardScaler
from pyspark.ml.classification import DecisionTreeClassifier,  LogisticRegression, RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

## I. Setting Spark Configuration

In [2]:
# Spark Local Configuration 
sconf =  SparkConf()
sconf.setAppName("irisPreds")
sconf.setMaster("local[1]")
sconf.set('spark.executor.memory', '4g')
sconf.set('spark.executor.cores', 8)
sconf.set('spark.logConf', True)

#Spark Context
sc = SparkContext(conf=sconf)
sqlContext = SQLContext(sc)

## II. Data Loading & Processing Pipeline

In [4]:
irisData = sqlContext.createDataFrame(pd.read_csv("iris.csv"))
irisData.printSchema()

root
 |-- sepal_length: double (nullable = true)
 |-- sepal_width: double (nullable = true)
 |-- petal_length: double (nullable = true)
 |-- petal_width: double (nullable = true)
 |-- variety: string (nullable = true)



In [5]:
irisData.show(5)

+------------+-----------+------------+-----------+-------+
|sepal_length|sepal_width|petal_length|petal_width|variety|
+------------+-----------+------------+-----------+-------+
|         5.1|        3.5|         1.4|        0.2| Setosa|
|         4.9|        3.0|         1.4|        0.2| Setosa|
|         4.7|        3.2|         1.3|        0.2| Setosa|
|         4.6|        3.1|         1.5|        0.2| Setosa|
|         5.0|        3.6|         1.4|        0.2| Setosa|
+------------+-----------+------------+-----------+-------+
only showing top 5 rows



In [6]:
# Gathering features together into ML models supported format (inputCols)
vecAssembler = VectorAssembler( inputCols=["sepal_length", "sepal_width", "petal_length","petal_width"], 
                              outputCol="features")

In [7]:
# Numeric indexing of the target variable
labellizer = StringIndexer(inputCol="variety", 
                           outputCol="label")

In [8]:
# Scaling features set (mean + std)
scaler = StandardScaler(inputCol="features", 
                        outputCol="scaled", 
                        withStd=True, 
                        withMean=True)

In [9]:
# Splitting into training/testing sets
(train, test)   = irisData.randomSplit([0.7, 0.3])

In [10]:
# Aggregated Preprocessing Pipeline
dataPreprocessor = Pipeline( stages=[vecAssembler, labellizer, scaler]).fit(train)

## III. ML Modeling, training and evaluation

In [11]:
# ML Models
## 1 - Decision Tree
dtClassifier = DecisionTreeClassifier(labelCol="label", 
                                      featuresCol="scaled", 
                                      maxDepth=4)
## 2 - Logistic Regression
lrClassifier  = LogisticRegression(labelCol="label", 
                                   featuresCol="scaled", 
                                   maxIter=30)

## 3 - Random Forest
rfClassifier = RandomForestClassifier(labelCol="label", 
                                      featuresCol="scaled",
                                      numTrees=20,
                                      maxDepth=3,
                                      minInstancesPerNode=6)

In [12]:
# Data Preprocessing Pipeline
readyTrain  = dataPreprocessor.transform(train)
readyTest   = dataPreprocessor.transform(test)

In [13]:
# Models objects fitting
dtModel     = dtClassifier.fit(readyTrain)
lrModel     = lrClassifier.fit(readyTrain)
rfModel     = rfClassifier.fit(readyTrain)

In [14]:
# Predicting on the test data
dtPreds  = dtModel.transform(readyTest)
lrPreds  = lrModel.transform(readyTest)
rfPreds  = rfModel.transform(readyTest)

In [15]:
# Predicting on the train data
dtPredsTrain  = dtModel.transform(readyTrain)
lrPredsTrain  = lrModel.transform(readyTrain)
rfPredsTrain = rfModel.transform(readyTrain)

In [16]:
# Evaluating accuracy
accEvaluator = MulticlassClassificationEvaluator(predictionCol="prediction", 
                                                 labelCol="label", 
                                                 metricName="accuracy")

In [17]:
dtTrainAcc = accEvaluator.evaluate(dtPredsTrain)
dtTestAcc =  accEvaluator.evaluate(dtPreds)

In [18]:
lrTrainAcc = accEvaluator.evaluate(lrPredsTrain)
lrTestAcc =  accEvaluator.evaluate(lrPreds)

In [19]:
rfTrainAcc = accEvaluator.evaluate(rfPredsTrain)
rfTestAcc =  accEvaluator.evaluate(rfPreds)

In [20]:
print("[TRAIN] Decision Tree: ", dtTrainAcc)
print("[TRAIN] Logistic Regression: ",lrTrainAcc )
print("[TRAIN] Random Forest: " , rfTrainAcc )

[TRAIN] Decision Tree:  0.9831932773109243
[TRAIN] Logistic Regression:  0.9831932773109243
[TRAIN] Random Forest:  0.9663865546218487


In [21]:
print("[TEST] Decision Tree: ", dtTestAcc)
print("[TEST] Logistic Regression: ", lrTestAcc)
print("[TEST] Random Forest: " , rfTestAcc)

[TEST] Decision Tree:  0.967741935483871
[TEST] Logistic Regression:  0.967741935483871
[TEST] Random Forest:  0.9032258064516129


In [22]:
#Closing Context
sc.stop()