## Decision Tree

<strong> Classification - Decision Tree </strong>
<ul style="list-style-type:square">
  <li>Features :   PETAL_LENGTH, PETAL_WIDTH, SEPAL_LENGTH, SEPAL_WIDTH.</li>
  <li>Target : SPECIES</li>
  <li>Model : Decision Tree</li>
</ul>

## Flowers Classification - Iris Dataset

In [2]:
# Spark Session - to work with dataframes
spSession = SparkSession.builder.master("local").appName("Decision-Tree").config("some.config", "session").getOrCreate()

In [31]:
from pyspark.sql import Row
# StringIndexer - To convert labels into numeric values
from pyspark.ml.feature import StringIndexer
# Vector - To create a LabeledPoint
from pyspark.ml.linalg import Vectors
# Model
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator 


In [32]:
# Load dataset and store it in cache
irisRDD = sc.textFile("iris.csv")
irisRDD.cache()

iris.csv MapPartitionsRDD[83] at textFile at <unknown>:0

In [33]:
irisRDD.take(3)

['Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species',
 '5.1,3.5,1.4,0.2,setosa',
 '4.9,3,1.4,0.2,setosa']

In [34]:
# Remmoving header
irisRDD2 = irisRDD.filter(lambda x : 'Sepal' not in x)
irisRDD2.take(2)

['5.1,3.5,1.4,0.2,setosa', '4.9,3,1.4,0.2,setosa']

## Data Cleaning

In [35]:
# Splitting Columns
irisRDD3 = irisRDD2.map(lambda x : x.split(","))

In [36]:
# Maping all Columns 
irisRDD4 = irisRDD3.map(lambda x : Row(SEPAL_LENGTH = float(x[0]), SEPAL_WIDTH = float(x[1]), PETAL_LENGTH = float(x[2]),
                                       PETAL_WIDTH = float(x[3]), SPECIES = x[4] ) )

In [37]:
# Create dataframe
irisDF = spSession.createDataFrame(irisRDD4)

In [38]:
irisDF.take(3)

[Row(PETAL_LENGTH=1.4, PETAL_WIDTH=0.2, SEPAL_LENGTH=5.1, SEPAL_WIDTH=3.5, SPECIES='setosa'),
 Row(PETAL_LENGTH=1.4, PETAL_WIDTH=0.2, SEPAL_LENGTH=4.9, SEPAL_WIDTH=3.0, SPECIES='setosa'),
 Row(PETAL_LENGTH=1.3, PETAL_WIDTH=0.2, SEPAL_LENGTH=4.7, SEPAL_WIDTH=3.2, SPECIES='setosa')]

In [39]:
# Set numerical labels to the species
string_Indexer = StringIndexer(inputCol="SPECIES", outputCol="SPECIES_IDX") # labeled object
Str_ind_model = string_Indexer.fit(irisDF) # Fit in dataframe
irisNormDF = Str_ind_model.transform(irisDF)  # Final Dataframe

In [40]:
#Labeldd Species
irisNormDF.select("SPECIES", "SPECIES_IDX").distinct(). collect()

[Row(SPECIES='versicolor', SPECIES_IDX=0.0),
 Row(SPECIES='setosa', SPECIES_IDX=2.0),
 Row(SPECIES='virginica', SPECIES_IDX=1.0)]

## Data Exploration

In [41]:
# Statistics
irisNormDF.describe().show()

+-------+------------------+------------------+------------------+------------------+---------+------------------+
|summary|      PETAL_LENGTH|       PETAL_WIDTH|      SEPAL_LENGTH|       SEPAL_WIDTH|  SPECIES|       SPECIES_IDX|
+-------+------------------+------------------+------------------+------------------+---------+------------------+
|  count|               150|               150|               150|               150|      150|               150|
|   mean| 3.758000000000001|1.1993333333333331| 5.843333333333332|3.0573333333333337|     null|               1.0|
| stddev|1.7652982332594662|0.7622376689603467|0.8280661279778634|0.4358662849366978|     null|0.8192319205190404|
|    min|               1.0|               0.1|               4.3|               2.0|   setosa|               0.0|
|    max|               6.9|               2.5|               7.9|               4.4|virginica|               2.0|
+-------+------------------+------------------+------------------+--------------

In [42]:
# Correlation and dependence between the target variable and features
for i in irisNormDF.columns:
    if not( isinstance(irisNormDF.select(i).take(1)[0][0], str)):
        print('Correlation between Species and', i , irisNormDF.stat.corr("SPECIES_IDX", i))

Correlation between Species and PETAL_LENGTH -0.649241830764174
Correlation between Species and PETAL_WIDTH -0.5803770334306263
Correlation between Species and SEPAL_LENGTH -0.4600391565002369
Correlation between Species and SEPAL_WIDTH 0.6183715308237433
Correlation between Species and SPECIES_IDX 1.0


# Data Pre-Processing 

In [43]:
# Create a lebeledPoint (target, Vector[features])
# We can use more than one target variable
def transformaVar(row) :
    obj = (row["SPECIES"], row["SPECIES_IDX"], Vectors.dense([row["SEPAL_LENGTH"], row["SEPAL_WIDTH"], 
                                                              row["PETAL_LENGTH"], row["PETAL_WIDTH"]]))
    return obj

In [44]:
# Apply the function to the dataset
irisRDD5 = irisNormDF.rdd.map(transformVar) 

In [46]:
irisRDD5.take(5)

[('setosa', 2.0, DenseVector([5.1, 3.5, 1.4, 0.2])),
 ('setosa', 2.0, DenseVector([4.9, 3.0, 1.4, 0.2])),
 ('setosa', 2.0, DenseVector([4.7, 3.2, 1.3, 0.2])),
 ('setosa', 2.0, DenseVector([4.6, 3.1, 1.5, 0.2])),
 ('setosa', 2.0, DenseVector([5.0, 3.6, 1.4, 0.2]))]

In [47]:
# DataFrame with species, label and features
irisDF = spSession.createDataFrame(irisRDD5,["species", "label","features"])
irisDF.select("species","label","features").show(10)
irisDF.cache()

+-------+-----+-----------------+
|species|label|         features|
+-------+-----+-----------------+
| setosa|  2.0|[5.1,3.5,1.4,0.2]|
| setosa|  2.0|[4.9,3.0,1.4,0.2]|
| setosa|  2.0|[4.7,3.2,1.3,0.2]|
| setosa|  2.0|[4.6,3.1,1.5,0.2]|
| setosa|  2.0|[5.0,3.6,1.4,0.2]|
| setosa|  2.0|[5.4,3.9,1.7,0.4]|
| setosa|  2.0|[4.6,3.4,1.4,0.3]|
| setosa|  2.0|[5.0,3.4,1.5,0.2]|
| setosa|  2.0|[4.4,2.9,1.4,0.2]|
| setosa|  2.0|[4.9,3.1,1.5,0.1]|
+-------+-----+-----------------+
only showing top 10 rows



DataFrame[species: string, label: double, features: vector]

## Machine learnining 

In [48]:
# training and test dataset - 70 % training and 30 % test
(training_set, test_set) = irisDF.randomSplit([0.7,0.3])

In [49]:
training_set.count()

103

In [50]:
test_set.count()

47

In [51]:
# Model
dt_classifier = DecisionTreeClassifier(maxDepth= 2, labelCol= "label", featuresCol= "features")
model = dt_classifier.fit(training_set)

In [52]:
# Nodes of the model
model.numNodes

5

In [54]:
# Depth of the model
model.depth

2

In [56]:
# Prediction
predictions =  model.transform(test_set)
predictions.select("prediction", "species", "label").collect()

[Row(prediction=2.0, species='setosa', label=2.0),
 Row(prediction=2.0, species='setosa', label=2.0),
 Row(prediction=2.0, species='setosa', label=2.0),
 Row(prediction=2.0, species='setosa', label=2.0),
 Row(prediction=2.0, species='setosa', label=2.0),
 Row(prediction=2.0, species='setosa', label=2.0),
 Row(prediction=2.0, species='setosa', label=2.0),
 Row(prediction=2.0, species='setosa', label=2.0),
 Row(prediction=2.0, species='setosa', label=2.0),
 Row(prediction=2.0, species='setosa', label=2.0),
 Row(prediction=2.0, species='setosa', label=2.0),
 Row(prediction=2.0, species='setosa', label=2.0),
 Row(prediction=2.0, species='setosa', label=2.0),
 Row(prediction=2.0, species='setosa', label=2.0),
 Row(prediction=2.0, species='setosa', label=2.0),
 Row(prediction=2.0, species='setosa', label=2.0),
 Row(prediction=2.0, species='setosa', label=2.0),
 Row(prediction=2.0, species='setosa', label=2.0),
 Row(prediction=2.0, species='setosa', label=2.0),
 Row(prediction=2.0, species='s

In [57]:
# Accuracy of the model
accuracy = MulticlassClassificationEvaluator(predictionCol="prediction",labelCol="label", metricName= "accuracy" )
accuracy.evaluate(predictions)

0.9787234042553191

In [59]:
# Confusion matrix shows whether  the prediction is correct or not
predictions.groupBy("label", "prediction").count().show()

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  1.0|       1.0|   12|
|  2.0|       2.0|   20|
|  1.0|       0.0|    1|
|  0.0|       0.0|   14|
+-----+----------+-----+

