# Decision Tree classification with Pyspark


In [21]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('treecode').getOrCreate()


In [22]:

data = spark.read.csv('hdfs://localhost:9000/dataset/drug.csv',inferSchema=True,header=True)

In [23]:
#Afficher nom des colonnes et les types des données
data.printSchema()

root
 |-- age: integer (nullable = true)
 |-- sex: string (nullable = true)
 |-- bp: string (nullable = true)
 |-- cholesterol: string (nullable = true)
 |-- Na_to_K: double (nullable = true)
 |-- drug: string (nullable = true)



In [24]:
data.head()

Row(age=23, sex='F', bp='HIGH', cholesterol='HIGH', Na_to_K=25.355, drug='drugY')

# Mise en forme Spark des données

In [25]:
#Pour Spark accepter les données, Il doit être sous la forme de deux colonnes !("label","features")
# Import VectorAssembler and Vectors
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.feature import IndexToString, StringIndexer

In [26]:
#Les arbres de décision ne gèrent pas les variables catégorielles. Mais nous pouvons toujours
#convertir ces caractéristiques en valeurs numériques.
data.show()


+---+---+------+-----------+-------+-----+
|age|sex|    bp|cholesterol|Na_to_K| drug|
+---+---+------+-----------+-------+-----+
| 23|  F|  HIGH|       HIGH| 25.355|drugY|
| 47|  M|   LOW|       HIGH| 13.093|drugC|
| 47|  M|   LOW|       HIGH| 10.114|drugC|
| 28|  F|NORMAL|       HIGH|  7.798|drugX|
| 61|  F|   LOW|       HIGH| 18.043|drugY|
| 22|  F|NORMAL|       HIGH|  8.607|drugX|
| 49|  F|NORMAL|       HIGH| 16.275|drugY|
| 41|  M|   LOW|       HIGH| 11.037|drugC|
| 60|  M|NORMAL|       HIGH| 15.171|drugY|
| 43|  M|   LOW|     NORMAL| 19.368|drugY|
| 47|  F|   LOW|       HIGH| 11.767|drugC|
| 34|  F|  HIGH|     NORMAL| 19.199|drugY|
| 43|  M|   LOW|       HIGH| 15.376|drugY|
| 74|  F|   LOW|       HIGH| 20.942|drugY|
| 50|  F|NORMAL|       HIGH| 12.703|drugX|
| 16|  F|  HIGH|     NORMAL| 15.516|drugY|
| 69|  M|   LOW|     NORMAL| 11.455|drugX|
| 43|  M|  HIGH|       HIGH| 13.972|drugA|
| 23|  M|   LOW|       HIGH|  7.298|drugC|
| 32|  F|  HIGH|     NORMAL| 25.974|drugY|
+---+---+--

In [27]:
data.columns


['age', 'sex', 'bp', 'cholesterol', 'Na_to_K', 'drug']

In [28]:
#Nous pouvons appliquer StringIndexer à plusieurs colonnes dans un PySpark Dataframe
indexers = [StringIndexer(inputCol=column, outputCol=column+"_index").fit(data) for column in list(set(data.columns)-set(['drug','Na_to_K','age'])) ]


In [29]:
pipeline = Pipeline(stages=indexers)

In [30]:
df_r = pipeline.fit(data).transform(data)


In [31]:
df_r.show()

+---+---+------+-----------+-------+-----+--------+-----------------+---------+
|age|sex|    bp|cholesterol|Na_to_K| drug|bp_index|cholesterol_index|sex_index|
+---+---+------+-----------+-------+-----+--------+-----------------+---------+
| 23|  F|  HIGH|       HIGH| 25.355|drugY|     0.0|              0.0|      1.0|
| 47|  M|   LOW|       HIGH| 13.093|drugC|     1.0|              0.0|      0.0|
| 47|  M|   LOW|       HIGH| 10.114|drugC|     1.0|              0.0|      0.0|
| 28|  F|NORMAL|       HIGH|  7.798|drugX|     2.0|              0.0|      1.0|
| 61|  F|   LOW|       HIGH| 18.043|drugY|     1.0|              0.0|      1.0|
| 22|  F|NORMAL|       HIGH|  8.607|drugX|     2.0|              0.0|      1.0|
| 49|  F|NORMAL|       HIGH| 16.275|drugY|     2.0|              0.0|      1.0|
| 41|  M|   LOW|       HIGH| 11.037|drugC|     1.0|              0.0|      0.0|
| 60|  M|NORMAL|       HIGH| 15.171|drugY|     2.0|              0.0|      0.0|
| 43|  M|   LOW|     NORMAL| 19.368|drug

In [32]:
#Transformer les caractéristiques qui fusionne plusieurs colonnes en une colonne vectorielle
assembler = VectorAssembler(
  inputCols=['age',
             'sex_index', 
             'bp_index', 
             'cholesterol_index', 
             'Na_to_K'],
              outputCol="features")

In [33]:
output = assembler.transform(df_r)


In [34]:
output.show()

+---+---+------+-----------+-------+-----+--------+-----------------+---------+--------------------+
|age|sex|    bp|cholesterol|Na_to_K| drug|bp_index|cholesterol_index|sex_index|            features|
+---+---+------+-----------+-------+-----+--------+-----------------+---------+--------------------+
| 23|  F|  HIGH|       HIGH| 25.355|drugY|     0.0|              0.0|      1.0|[23.0,1.0,0.0,0.0...|
| 47|  M|   LOW|       HIGH| 13.093|drugC|     1.0|              0.0|      0.0|[47.0,0.0,1.0,0.0...|
| 47|  M|   LOW|       HIGH| 10.114|drugC|     1.0|              0.0|      0.0|[47.0,0.0,1.0,0.0...|
| 28|  F|NORMAL|       HIGH|  7.798|drugX|     2.0|              0.0|      1.0|[28.0,1.0,2.0,0.0...|
| 61|  F|   LOW|       HIGH| 18.043|drugY|     1.0|              0.0|      1.0|[61.0,1.0,1.0,0.0...|
| 22|  F|NORMAL|       HIGH|  8.607|drugX|     2.0|              0.0|      1.0|[22.0,1.0,2.0,0.0...|
| 49|  F|NORMAL|       HIGH| 16.275|drugY|     2.0|              0.0|      1.0|[49.0,1.0,2.

In [35]:
from pyspark.ml.feature import StringIndexer


In [36]:
#Maintenant, nous pouvons remplir la variable cible drug
indexer = StringIndexer(inputCol="drug", outputCol="drugIndex")
output_fixed = indexer.fit(output).transform(output)

In [37]:
final_data = output_fixed.select("features",'drugIndex')
final_data

DataFrame[features: vector, drugIndex: double]

In [38]:
train_data,test_data = final_data.randomSplit([0.7,0.3])

In [39]:
train_data.show()

+--------------------+---------+
|            features|drugIndex|
+--------------------+---------+
|(5,[0,4],[23.0,8....|      2.0|
|(5,[0,4],[29.0,12...|      2.0|
|(5,[0,4],[31.0,30...|      0.0|
|(5,[0,4],[39.0,9....|      2.0|
|(5,[0,4],[40.0,27...|      0.0|
|(5,[0,4],[43.0,13...|      2.0|
|(5,[0,4],[47.0,10...|      2.0|
|(5,[0,4],[50.0,7....|      2.0|
|(5,[0,4],[58.0,18...|      0.0|
|(5,[0,4],[59.0,13...|      3.0|
|(5,[0,4],[66.0,16...|      0.0|
|(5,[0,4],[68.0,11...|      3.0|
|(5,[0,4],[70.0,9....|      3.0|
|(5,[0,4],[74.0,9....|      3.0|
|[15.0,1.0,0.0,1.0...|      0.0|
|[16.0,0.0,0.0,1.0...|      0.0|
|[16.0,0.0,1.0,0.0...|      4.0|
|[16.0,1.0,0.0,1.0...|      0.0|
|[17.0,0.0,2.0,1.0...|      1.0|
|[18.0,1.0,0.0,0.0...|      0.0|
+--------------------+---------+
only showing top 20 rows



In [40]:
from pyspark.ml.classification import DecisionTreeClassifier


In [41]:
dtc = DecisionTreeClassifier(labelCol='drugIndex',featuresCol='features')


In [42]:
# Train the model
dtc_model = dtc.fit(train_data)

In [43]:
dtc_predictions = dtc_model.transform(test_data)
predicted = dtc_predictions.select("features","drugIndex", "prediction","rawPrediction")
predicted.show()

+--------------------+---------+----------+--------------------+
|            features|drugIndex|prediction|       rawPrediction|
+--------------------+---------+----------+--------------------+
|(5,[0,4],[34.0,18...|      0.0|       0.0|[63.0,0.0,0.0,0.0...|
|(5,[0,4],[51.0,18...|      0.0|       0.0|[63.0,0.0,0.0,0.0...|
|(5,[0,4],[60.0,13...|      3.0|       3.0|[0.0,0.0,0.0,11.0...|
|(5,[0,4],[70.0,13...|      3.0|       3.0|[0.0,0.0,0.0,11.0...|
|[15.0,0.0,0.0,1.0...|      0.0|       0.0|[63.0,0.0,0.0,0.0...|
|[15.0,0.0,2.0,0.0...|      1.0|       1.0|[0.0,24.0,0.0,0.0...|
|[20.0,1.0,1.0,1.0...|      1.0|       1.0|[0.0,12.0,0.0,0.0...|
|[22.0,0.0,1.0,0.0...|      4.0|       4.0|[0.0,0.0,0.0,0.0,...|
|[23.0,0.0,2.0,0.0...|      1.0|       1.0|[0.0,24.0,0.0,0.0...|
|[23.0,0.0,2.0,0.0...|      0.0|       0.0|[63.0,0.0,0.0,0.0...|
|[25.0,0.0,2.0,0.0...|      0.0|       0.0|[63.0,0.0,0.0,0.0...|
|[28.0,0.0,2.0,0.0...|      0.0|       0.0|[63.0,0.0,0.0,0.0...|
|[28.0,1.0,1.0,0.0...|   

In [44]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


In [45]:
# Select (prediction, true label) and compute test error
acc_evaluator = MulticlassClassificationEvaluator(labelCol="drugIndex", predictionCol="prediction", metricName="accuracy")

In [46]:
dtc_acc = acc_evaluator.evaluate(dtc_predictions)


In [47]:
print("Here are the results!")
print('-'*80)
print('A single decision tree had an accuracy of: {0:2.2f}%'.format(dtc_acc*100))

Here are the results!
--------------------------------------------------------------------------------
A single decision tree had an accuracy of: 96.43%
