In [32]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer,IndexToString,VectorIndexer
from pyspark.ml.linalg import Vectors
from pyspark.sql import Row
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.pipeline import Pipeline


In [33]:
# Create (or reuse) a Spark session named "decision tree" with a local[4] master.
spark = (
    SparkSession.builder
    .appName("decision tree")
    .master("local[4]")
    .getOrCreate()
)

# Load the raw text file as an RDD of lines via the session's SparkContext,
# then print the first two lines as a sanity check.
sc = spark.sparkContext
rdd = sc.textFile("./resources/iris.txt")
print(rdd.take(2))

['5.1,3.5,1.4,0.2,Iris-setosa', '4.9,3.0,1.4,0.2,Iris-setosa']


In [34]:
# Parse every comma-separated line into a Row:
#   - the first four fields are converted to floats and packed into a dense vector ("features")
#   - the fifth field is kept as the string class name ("labels")
# The parsed RDD is bound to a new name instead of overwriting `rdd`; otherwise
# re-running this cell would try to call .strip() on already-parsed Row objects.
iris_rows = (
    rdd.map(lambda line: line.strip().split(","))
       .map(lambda fields: Row(
           features=Vectors.dense([float(value) for value in fields[:4]]),
           labels=fields[4],
       ))
)
df = spark.createDataFrame(iris_rows)

In [35]:
# Preview the first two rows to confirm the "features" and "labels" columns were built.
df.show(2)

+-----------------+-----------+
|         features|     labels|
+-----------------+-----------+
|[5.1,3.5,1.4,0.2]|Iris-setosa|
|[4.9,3.0,1.4,0.2]|Iris-setosa|
+-----------------+-----------+
only showing top 2 rows



In [43]:
# Label encoder: maps each string in "labels" to a numeric index in "indexedLabel".
# It is fitted on the full DataFrame (before the train/test split).
labelIndexModel = StringIndexer(inputCol="labels",outputCol="indexedLabel").fit(df)
# Label decoder: maps the numeric "prediction" back to the original label string,
# using the label ordering learned by the encoder above.
labelConverter = IndexToString(inputCol="prediction",outputCol="predLabel",labels=labelIndexModel.labels)
# Feature indexer: decides per feature whether it is categorical or continuous.
# The features here are real-valued measurements, so maxCategories is set low:
# only a feature with at most 4 distinct values is indexed as categorical.
# With the library default, a measurement column that happens to have few distinct
# values in the training split could be indexed as categorical, and values first
# seen at prediction time would then be treated as invalid.
featureVector = VectorIndexer(inputCol="features",outputCol="featuresVec",maxCategories=4)

In [37]:
# Split the dataset into training and test sets with weights 0.7 / 0.3.
# A fixed seed keeps the split, and every metric derived from it, reproducible
# across kernel restarts and re-runs.
trainData, testData = df.randomSplit([0.7, 0.3], seed=42)

In [38]:
# Number of rows that landed in the training split.
trainData.count()

112

In [51]:
# Decision tree settings: read the indexed label / indexed feature columns,
# write "prediction" and "probability", limit depth to 5, split on Gini impurity.
tree_params = {
    "labelCol": "indexedLabel",
    "featuresCol": "featuresVec",
    "predictionCol": "prediction",
    "probabilityCol": "probability",
    "maxDepth": 5,
    "impurity": "gini",
}
decisionTreeCls = DecisionTreeClassifier(**tree_params)

# Dump every parameter with its documentation, default and current value.
print(decisionTreeCls.explainParams())

cacheNodeIds: If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees. Users can set how often should the cache be checkpointed or disable it by setting checkpointInterval. (default: False)
checkpointInterval: set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations. Note: this setting will be ignored if the checkpoint directory is not set in the SparkContext. (default: 10)
featuresCol: features column name. (default: features, current: featuresVec)
impurity: Criterion used for information gain calculation (case-insensitive). Supported options: entropy, gini (default: gini, current: gini)
labelCol: label column name. (default: label, current: indexedLabel)
leafCol: Leaf indices column name. Predicted leaf index of each instance in each tree by preorder. (default: )
maxBins: Max nu

In [54]:
# Assemble the ML pipeline, in order:
# label encoding -> feature indexing -> decision tree -> label decoding.
pipeline_stages = [labelIndexModel, featureVector, decisionTreeCls, labelConverter]
pipeline = Pipeline(stages=pipeline_stages)

# Train the whole pipeline on the training split.
pipelineModel = pipeline.fit(trainData)

# Apply the fitted pipeline to the held-out test split.
pred_df = pipelineModel.transform(testData)

In [55]:
# Inspect a few predictions: true label, features, class probabilities, predicted label.
preview_rows = pred_df.select("features","labels","probability","predLabel").head(5)
for row in preview_rows:
    # print() applies str() to every argument, so no explicit conversion is needed.
    print(row["labels"], ":", row["features"], "---->prob:", row["probability"], row["predLabel"])

Iris-setosa : [4.3,3.0,1.1,0.1] ---->prob: [1.0,0.0,0.0] Iris-setosa
Iris-setosa : [4.6,3.2,1.4,0.2] ---->prob: [1.0,0.0,0.0] Iris-setosa
Iris-setosa : [4.6,3.6,1.0,0.2] ---->prob: [1.0,0.0,0.0] Iris-setosa
Iris-setosa : [4.7,3.2,1.3,0.2] ---->prob: [1.0,0.0,0.0] Iris-setosa
Iris-setosa : [4.8,3.1,1.6,0.2] ---->prob: [1.0,0.0,0.0] Iris-setosa


In [56]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# Score the test-set predictions against the indexed true labels.
# The metric is spelled out explicitly: this evaluator reports F1 (its default metric),
# not AUC, so the result is named accordingly.
evaluator = MulticlassClassificationEvaluator(
    predictionCol="prediction",
    labelCol="indexedLabel",
    metricName="f1",
)
f1_score = evaluator.evaluate(pred_df)
print(f1_score)

0.9735839598997494


In [57]:
# 获取重要性指标
dtreeModel = pipelineModel.stages[2]
dtreeModel.featureImportances

SparseVector(4, {0: 0.0247, 1: 0.0176, 2: 0.931, 3: 0.0267})

In [58]:
## Repeat the experiment with impurity="entropy" and compare the metrics.
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Same classifier settings as the Gini run; only the impurity criterion differs.
decisionTreeCls = DecisionTreeClassifier(
    labelCol="indexedLabel",
    featuresCol="featuresVec",
    predictionCol="prediction",
    probabilityCol="probability",
    maxDepth=5,
    impurity="entropy"
)
print(decisionTreeCls.explainParams())

# Assemble the ML pipeline: label encoding -> feature indexing -> decision tree -> label decoding.
pipeline = Pipeline(stages=[
    labelIndexModel,featureVector,decisionTreeCls,labelConverter
])

# Train the pipeline on the training split.
pipelineModel = pipeline.fit(trainData)
# Predict on the held-out test split.
pred_df = pipelineModel.transform(testData)

# Inspect a few predictions: true label, features, class probabilities, predicted label.
for item in pred_df.select("features","labels","probability","predLabel").head(5):
    print(
        str(item["labels"]),":",str(item["features"]),"---->prob:",str(item["probability"]),str(item["predLabel"])
    )

# Score the predictions. The metric is named explicitly: this evaluator reports F1
# (its default metric), not AUC, so the result variable is named accordingly.
evaluator = MulticlassClassificationEvaluator(
    predictionCol="prediction",
    labelCol="indexedLabel",
    metricName="f1",
)
f1_score = evaluator.evaluate(pred_df)
print(f1_score)

# Get the feature-importance scores; index 2 is the decision tree's position in the stage list.
dtreeModel = pipelineModel.stages[2]
dtreeModel.featureImportances

cacheNodeIds: If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees. Users can set how often should the cache be checkpointed or disable it by setting checkpointInterval. (default: False)
checkpointInterval: set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations. Note: this setting will be ignored if the checkpoint directory is not set in the SparkContext. (default: 10)
featuresCol: features column name. (default: features, current: featuresVec)
impurity: Criterion used for information gain calculation (case-insensitive). Supported options: entropy, gini (default: gini, current: entropy)
labelCol: label column name. (default: label, current: indexedLabel)
leafCol: Leaf indices column name. Predicted leaf index of each instance in each tree by preorder. (default: )
maxBins: Max

SparseVector(4, {0: 0.0186, 1: 0.0136, 2: 0.9298, 3: 0.0381})