In [1]:
'''
    @Author: King
    @Date: 2019.05.20
    @Purpose: Spark 2.1.0入门：决策树分类器(Python版)
    @Introduction:  Spark 2.1.0入门：决策树分类器(Python版)
    @Datasets: 
    @Link : http://dblab.xmu.edu.cn/blog/1776-2/
    @Reference : Spark 2.1.0入门：决策树分类器(Python版)
'''

'\n    @Author: King\n    @Date: 2019.05.20\n    @Purpose: Spark 2.1.0入门：决策树分类器(Python版)\n    @Introduction:  Spark 2.1.0入门：决策树分类器(Python版)\n    @Datasets: \n    @Link : http://dblab.xmu.edu.cn/blog/1776-2/\n    @Reference : Spark 2.1.0入门：决策树分类器(Python版)\n'

![作者](../img/bigdata-roadmap.jpg)
【版权声明】博客内容由厦门大学数据库实验室拥有版权，未经允许，请勿转载！

## 一、决策树分类器

### 1、方法简介

决策树（decision tree）是一种基本的分类与回归方法，这里主要介绍用于分类的决策树。决策树模型呈树形结构，其中每个内部节点表示一个属性上的测试，每个分支代表一个测试输出，每个叶节点代表一种类别。学习时利用训练数据，根据损失函数最小化的原则建立决策树模型；预测时，对新的数据，利用决策树模型进行分类。

## 二、示例代码

我们以iris数据集（iris）为例进行分析。iris以鸢尾花的特征作为数据来源，数据集包含150个样本，分为3类，每类50个样本，每个样本包含4个属性，是在数据挖掘、数据分类中非常常用的测试集、训练集。决策树可以用于分类和回归，接下来我们将在代码中分别进行介绍。

#### 1. 导入需要的包

In [2]:
# Import pyspark and start (or reuse) the Spark session used by the whole notebook.
from pyspark.sql import SparkSession

# Run Spark locally. The application name is the label shown for this job in the
# Spark web UI and logs, so it names this decision-tree tutorial rather than the
# copy-pasted "Word Count".
spark = (
    SparkSession.builder
    .master("local")
    .appName("Decision Tree Classifier")
    .getOrCreate()
)

from pyspark.ml.linalg import Vector,Vectors
from pyspark.sql import Row
from pyspark.ml import Pipeline
from pyspark.ml.feature import IndexToString,StringIndexer,VectorIndexer

#### 2. 读取数据，简要分析

1. 读取文本文件，
1. 第一个map把每行的数据用“,”隔开，比如在我们的数据集中，每行被分成了5部分，前4部分是鸢尾花的4个特征，最后一部分是鸢尾花的分类；
1. 我们这里把特征存储在Vector中，
1. 创建一个Iris模式的RDD，
1. 然后转化成dataframe；
1. 然后把刚刚得到的数据注册成一个表iris，
1. 注册成这个表之后，我们就可以通过sql语句进行数据查询；
1. 选出我们需要的数据后，我们可以把结果打印出来查看一下数据。

In [4]:
def f(x):
    """Convert one comma-split line of the iris file into a dict of Row fields.

    Args:
        x: Indexable sequence with at least five entries. Entries 0-3 are
            converted with ``float`` and entry 4 with ``str``.

    Returns:
        dict with two keys, in this order: ``'features'`` (a dense vector
        built from the four floats) and ``'label'`` (the fifth entry as a
        string).
    """
    # Convert the four leading entries first so a malformed number fails
    # before the label is touched, as in a plain left-to-right evaluation.
    measurements = [float(x[i]) for i in range(4)]
    return {
        'features': Vectors.dense(*measurements),
        'label': str(x[4]),
    }
 
# Read the text file, split every line on commas, wrap each record in a Row
# (its fields come from f) and convert the resulting RDD into a DataFrame.
data = (
    spark.sparkContext
    .textFile("../resources/iris.txt")
    .map(lambda line: line.split(','))
    .map(lambda p: Row(**f(p)))
    .toDF()
)

# Register the DataFrame as the temporary view "iris" so it can be queried with SQL.
data.createOrReplaceTempView("iris")

# Select every row of the view back into a DataFrame.
df = spark.sql("select * from iris")

# Collect all rows to the driver and print each one as "<label>:<features>".
rel = df.rdd.map(lambda row: str(row["label"]) + ":" + str(row["features"])).collect()
for item in rel:
    print(item)

Iris-setosa:[5.1,3.5,1.4,0.2]
Iris-setosa:[4.9,3.0,1.4,0.2]
Iris-setosa:[4.7,3.2,1.3,0.2]
Iris-setosa:[4.6,3.1,1.5,0.2]
Iris-setosa:[5.0,3.6,1.4,0.2]
Iris-setosa:[5.4,3.9,1.7,0.4]
Iris-setosa:[4.6,3.4,1.4,0.3]
Iris-setosa:[5.0,3.4,1.5,0.2]
Iris-setosa:[4.4,2.9,1.4,0.2]
Iris-setosa:[4.9,3.1,1.5,0.1]
Iris-setosa:[5.4,3.7,1.5,0.2]
Iris-setosa:[4.8,3.4,1.6,0.2]
Iris-setosa:[4.8,3.0,1.4,0.1]
Iris-setosa:[4.3,3.0,1.1,0.1]
Iris-setosa:[5.8,4.0,1.2,0.2]
Iris-setosa:[5.7,4.4,1.5,0.4]
Iris-setosa:[5.4,3.9,1.3,0.4]
Iris-setosa:[5.1,3.5,1.4,0.3]
Iris-setosa:[5.7,3.8,1.7,0.3]
Iris-setosa:[5.1,3.8,1.5,0.3]
Iris-setosa:[5.4,3.4,1.7,0.2]
Iris-setosa:[5.1,3.7,1.5,0.4]
Iris-setosa:[4.6,3.6,1.0,0.2]
Iris-setosa:[5.1,3.3,1.7,0.5]
Iris-setosa:[4.8,3.4,1.9,0.2]
Iris-setosa:[5.0,3.0,1.6,0.2]
Iris-setosa:[5.0,3.4,1.6,0.4]
Iris-setosa:[5.2,3.5,1.5,0.2]
Iris-setosa:[5.2,3.4,1.4,0.2]
Iris-setosa:[4.7,3.2,1.6,0.2]
Iris-setosa:[4.8,3.1,1.6,0.2]
Iris-setosa:[5.4,3.4,1.5,0.4]
Iris-setosa:[5.2,4.1,1.5,0.1]
Iris-setos

#### 3. 进一步处理特征和标签，以及数据分组

In [5]:
# Index the label column and the feature column, writing the results to new,
# renamed columns. Both indexers are fitted on the full DataFrame `df`, i.e.
# before the train/test split below.
# NOTE(review): presumably this is so every label value is present in the index - confirm.
labelIndexer = StringIndexer().setInputCol("label").setOutputCol("indexedLabel").fit(df)

# With maxCategories=4, VectorIndexer treats a feature as categorical only when it
# has at most 4 distinct values; all other features stay continuous.
featureIndexer = VectorIndexer().setInputCol("features").setOutputCol("indexedFeatures").setMaxCategories(4).fit(df)

# labelConverter maps the numeric "prediction" column back to the original
# string labels learned by labelIndexer.
labelConverter = IndexToString().setInputCol("prediction").setOutputCol("predictedLabel").setLabels(labelIndexer.labels)

# Randomly split the data into a training set (70%) and a test set (30%).
# A fixed seed keeps the split, and therefore every metric reported further
# down, reproducible under "Restart Kernel -> Run All".
SPLIT_SEED = 42
trainingData, testData = data.randomSplit([0.7, 0.3], seed=SPLIT_SEED)

#### 4. 构建决策树分类模型

In [6]:
# Imports required for the decision tree classifier and its evaluator
from pyspark.ml.classification import DecisionTreeClassificationModel,DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Configure the decision tree classifier. Parameters can be given through setters,
# constructor keywords (as here) or a ParamMap; explainParams() lists everything
# that can be tuned (see the Spark MLlib documentation for details).
dtClassifier = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")

# Assemble the pipeline: label indexing -> feature indexing -> tree -> label conversion.
classifier_stages = [labelIndexer, featureIndexer, dtClassifier, labelConverter]
pipelinedClassifier = Pipeline(stages=classifier_stages)

# Train the whole pipeline on the training split.
modelClassifier = pipelinedClassifier.fit(trainingData)

# Predict on the held-out test split.
predictionsClassifier = modelClassifier.transform(testData)

# Show the first 20 predictions next to the true labels and the features.
predictionsClassifier.select("predictedLabel", "label", "features").show(20)

+---------------+---------------+-----------------+
| predictedLabel|          label|         features|
+---------------+---------------+-----------------+
|    Iris-setosa|    Iris-setosa|[4.4,2.9,1.4,0.2]|
|    Iris-setosa|    Iris-setosa|[4.4,3.0,1.3,0.2]|
|    Iris-setosa|    Iris-setosa|[4.6,3.1,1.5,0.2]|
|    Iris-setosa|    Iris-setosa|[4.6,3.6,1.0,0.2]|
|    Iris-setosa|    Iris-setosa|[4.7,3.2,1.3,0.2]|
|    Iris-setosa|    Iris-setosa|[4.8,3.0,1.4,0.3]|
|    Iris-setosa|    Iris-setosa|[4.8,3.1,1.6,0.2]|
|    Iris-setosa|    Iris-setosa|[4.9,3.0,1.4,0.2]|
|Iris-versicolor|Iris-versicolor|[5.0,2.0,3.5,1.0]|
|Iris-versicolor|Iris-versicolor|[5.1,2.5,3.0,1.1]|
|    Iris-setosa|    Iris-setosa|[5.1,3.5,1.4,0.2]|
|    Iris-setosa|    Iris-setosa|[5.2,3.4,1.4,0.2]|
|    Iris-setosa|    Iris-setosa|[5.2,3.5,1.5,0.2]|
|    Iris-setosa|    Iris-setosa|[5.5,4.2,1.4,0.2]|
|Iris-versicolor|Iris-versicolor|[5.6,2.5,3.9,1.1]|
| Iris-virginica| Iris-virginica|[5.6,2.8,4.9,2.0]|
|    Iris-se

#### 5. 评估决策树分类模型

In [8]:
# Evaluator that scores the "prediction" column against "indexedLabel" using accuracy.
evaluatorClassifier = MulticlassClassificationEvaluator(
    labelCol="indexedLabel",
    predictionCol="prediction",
    metricName="accuracy",
)

accuracy = evaluatorClassifier.evaluate(predictionsClassifier)

# Report the complement of the accuracy as the test error.
test_error = 1.0 - accuracy
print("Test Error = " + str(test_error))

# Index 2 of the fitted pipeline's stages is the trained decision tree
# (it follows the label indexer and the feature indexer).
treeModelClassifier = modelClassifier.stages[2]

print("Learned classification tree model:\n" + str(treeModelClassifier.toDebugString))

Test Error = 0.052631578947368474
Learned classification tree model:
DecisionTreeClassificationModel (uid=DecisionTreeClassifier_ce3dc286ec78) of depth 5 with 11 nodes
  If (feature 2 <= 2.5999999999999996)
   Predict: 0.0
  Else (feature 2 > 2.5999999999999996)
   If (feature 3 <= 1.65)
    If (feature 2 <= 4.95)
     Predict: 1.0
    Else (feature 2 > 4.95)
     If (feature 0 <= 6.05)
      If (feature 1 <= 2.25)
       Predict: 2.0
      Else (feature 1 > 2.25)
       Predict: 1.0
     Else (feature 0 > 6.05)
      Predict: 2.0
   Else (feature 3 > 1.65)
    Predict: 2.0



从上述结果可以看到，模型在测试集上的错误率约为 0.053，即预测准确率约为 0.947；同时可以看到训练得到的决策树模型结构。

#### 6. 构建决策树回归模型

In [9]:
# Imports required for the decision tree regressor and its evaluator
from pyspark.ml.regression import DecisionTreeRegressionModel,DecisionTreeRegressor
from pyspark.ml.evaluation import RegressionEvaluator

# Configure the decision tree regressor on the indexed label and feature columns.
dtRegressor = DecisionTreeRegressor(labelCol="indexedLabel", featuresCol="indexedFeatures")

# Assemble the pipeline: label indexing -> feature indexing -> tree -> label conversion.
# NOTE(review): labelConverter turns the regressor's numeric "prediction" back into a
# label string; regression output may be non-integer - confirm how IndexToString
# handles such values.
regressor_stages = [labelIndexer, featureIndexer, dtRegressor, labelConverter]
pipelineRegressor = Pipeline(stages=regressor_stages)

# Train the whole pipeline on the training split.
modelRegressor = pipelineRegressor.fit(trainingData)

# Predict on the held-out test split.
predictionsRegressor = modelRegressor.transform(testData)

# Show the first 20 predictions next to the true labels and the features.
predictionsRegressor.select("predictedLabel", "label", "features").show(20)

+---------------+---------------+-----------------+
| predictedLabel|          label|         features|
+---------------+---------------+-----------------+
|    Iris-setosa|    Iris-setosa|[4.4,2.9,1.4,0.2]|
|    Iris-setosa|    Iris-setosa|[4.4,3.0,1.3,0.2]|
|    Iris-setosa|    Iris-setosa|[4.6,3.1,1.5,0.2]|
|    Iris-setosa|    Iris-setosa|[4.6,3.6,1.0,0.2]|
|    Iris-setosa|    Iris-setosa|[4.7,3.2,1.3,0.2]|
|    Iris-setosa|    Iris-setosa|[4.8,3.0,1.4,0.3]|
|    Iris-setosa|    Iris-setosa|[4.8,3.1,1.6,0.2]|
|    Iris-setosa|    Iris-setosa|[4.9,3.0,1.4,0.2]|
|Iris-versicolor|Iris-versicolor|[5.0,2.0,3.5,1.0]|
|Iris-versicolor|Iris-versicolor|[5.1,2.5,3.0,1.1]|
|    Iris-setosa|    Iris-setosa|[5.1,3.5,1.4,0.2]|
|    Iris-setosa|    Iris-setosa|[5.2,3.4,1.4,0.2]|
|    Iris-setosa|    Iris-setosa|[5.2,3.5,1.5,0.2]|
|    Iris-setosa|    Iris-setosa|[5.5,4.2,1.4,0.2]|
|Iris-versicolor|Iris-versicolor|[5.6,2.5,3.9,1.1]|
| Iris-virginica| Iris-virginica|[5.6,2.8,4.9,2.0]|
|    Iris-se

#### 7. 评估决策树回归模型

In [10]:
# Evaluator that scores the "prediction" column against "indexedLabel" using RMSE.
evaluatorRegressor = RegressionEvaluator(
    labelCol="indexedLabel",
    predictionCol="prediction",
    metricName="rmse",
)

rmse = evaluatorRegressor.evaluate(predictionsRegressor)

print("Root Mean Squared Error (RMSE) on test data = " + str(rmse))

# Index 2 of the fitted pipeline's stages is the trained regression tree
# (it follows the label indexer and the feature indexer).
treeModelRegressor = modelRegressor.stages[2]

print("Learned regression tree model:\n" + str(treeModelRegressor.toDebugString))

Root Mean Squared Error (RMSE) on test data = 0.2294157338705618
Learned regression tree model:
DecisionTreeRegressionModel (uid=DecisionTreeRegressor_541928b4af91) of depth 5 with 11 nodes
  If (feature 2 <= 2.5999999999999996)
   Predict: 0.0
  Else (feature 2 > 2.5999999999999996)
   If (feature 3 <= 1.65)
    If (feature 2 <= 4.95)
     Predict: 1.0
    Else (feature 2 > 4.95)
     If (feature 0 <= 6.05)
      If (feature 1 <= 2.25)
       Predict: 2.0
      Else (feature 1 > 2.25)
       Predict: 1.0
     Else (feature 0 > 6.05)
      Predict: 2.0
   Else (feature 3 > 1.65)
    Predict: 2.0

