In [2]:
# -*- coding: utf-8 -*-
# {{{
from io import StringIO
import pandas
import pydotplus
from sklearn import tree
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier


def createDataSet():
    """
    创建数据集

    :return: 数据集与特征集
    """
    dataSet = [['hot', 'sunny', 'high', 'false', 'no'],
               ['hot', 'sunny', 'high', 'true', 'no'],
               ['hot', 'overcast', 'high', 'false', 'yes'],
               ['cool', 'rain', 'normal', 'false', 'yes'],
               ['cool', 'overcast', 'normal', 'true', 'yes'],
               ['mild', 'sunny', 'high', 'false', 'no'],
               ['cool', 'sunny', 'normal', 'false', 'yes'],
               ['mild', 'rain', 'normal', 'false', 'yes'],
               ['mild', 'sunny', 'normal', 'true', 'yes'],
               ['mild', 'overcast', 'high', 'true', 'yes'],
               ['hot', 'overcast', 'normal', 'false', 'yes'],
               ['mild', 'sunny', 'high', 'true', 'no'],
               ['cool', 'sunny', 'normal', 'true', 'no'],
               ['mild', 'sunny', 'high', 'false', 'yes']]
    labels = ['climate', 'weather', 'temple', 'cold']
    return dataSet, labels


if __name__ == '__main__':
    dataSet, labels = createDataSet()
    yDataList = []  # 提取每组数据的类别，保存在列表里
    for each in dataSet:
        yDataList.append(each[-1])

    dataDict = {}
    for each_label in labels:
        tempList = list()
        for each in dataSet:
            tempList.append(each[labels.index(each_label)])
        dataDict[each_label] = tempList

    dataPD = pandas.DataFrame(dataDict)

    leDict = dict()
    for col in dataPD.columns:
        leDict[col] = LabelEncoder()
        dataPD[col] = leDict[col].fit_transform(dataPD[col])

    dt = DecisionTreeClassifier()
    dt.fit(dataPD.values.tolist(), yDataList)

    dot_data = StringIO()
    tree.export_graphviz(dt, out_file=dot_data,  # 绘制决策树
                         feature_names=dataPD.keys(),
                         class_names=dt.classes_,
                         filled=True, rounded=True,
                         special_characters=True)
    graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
    graph.progs = {'dot': u"d:\\Graphviz\\bin\\dot.exe"}
    graph.write_pdf("tree.pdf")

    xTest = [['hot', 'overcast', 'high', 'false'], ['mild', 'sunny', 'high', 'true']]
    testDict = {}
    for each_label in labels:
        tempList = list()
        for each in xTest:
            tempList.append(each[labels.index(each_label)])
        testDict[each_label] = tempList
    testPD = pandas.DataFrame(testDict)  # 生成pandas.DataFrame
    for col in testPD.columns:  # 为每一列序列化
        testPD[col] = leDict[col].transform(testPD[col])

    result = dt.predict(testPD.values.tolist())
    print(result)

['yes' 'no']


In [3]:
dot_data = StringIO()
tree.export_graphviz(dt, out_file=dot_data,  # 绘制决策树
    feature_names=dataPD.keys(),
    class_names=dt.classes_,
    filled=True, rounded=True,
    special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_pdf("tree.pdf")

True

In [5]:
print(dataPD.keys())
dt.classes_

Index(['climate', 'cold', 'temple', 'weather'], dtype='object')


array(['no', 'yes'],
      dtype='<U3')

In [15]:
# -*- coding: utf-8 -*-
"""
Created on Fri Nov 23 21:26:09 2018

@author: muli
"""

from sklearn.tree import DecisionTreeClassifier
from sklearn import  datasets
from sklearn import cross_validation

# 可视化需要的包
from sklearn.tree import export_graphviz
from sklearn.externals.six import StringIO
import pydotplus


def load_data():
    '''
    加载用于分类问题的数据集。数据集采用 scikit-learn 自带的 iris 数据集

    :return: 一个元组，用于分类问题。
    元组元素依次为：训练样本集、测试样本集、训练样本集对应的标记、测试样本集对应的标记
    '''
    # scikit-learn 自带的 iris 数据集
    iris=datasets.load_iris() 
    X_train=iris.data
    y_train=iris.target
    # 分层采样拆分成训练集和测试集，测试集大小为原始数据集大小的 1/4
    return cross_validation.train_test_split(X_train, y_train,test_size=0.25,
		random_state=0,stratify=y_train)
    

def test_DecisionTreeClassifier(*data):
    '''
    测试 DecisionTreeClassifier 的用法

    :param data: 可变参数。
    它是一个元组，这里要求其元素依次为：训练样本集、测试样本集、训练样本的标记、测试样本的标记
    :return:  None
    '''
    X_train,X_test,y_train,y_test=data
    # 通过前面的测试，选择最优参数
    clf = DecisionTreeClassifier(criterion='gini',splitter='best',
                                 max_depth=3)
    clf.fit(X_train, y_train)

    print("Training score:%f"%(clf.score(X_train,y_train)))
    print("Testing score:%f"%(clf.score(X_test,y_test)))

#     feature_name = ['sepal','sepal_width','petal','petal_width'] 
#     target_name = ['setosa','versicolor','virginica']
    feature_name = iris.feature_names
    target_name = iris.target_names

  
    dot_data = StringIO() 
    export_graphviz(clf,out_file = dot_data,feature_names=feature_name, 
                         class_names=target_name,filled=True,rounded=True, 
                         special_characters=True)
   
    graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
    graph.write_pdf("DecisionTree1.pdf")
    print('Visible tree plot saved as pdf.')
    
if __name__=='__main__':
    # 产生用于分类问题的数据集
    X_train,X_test,y_train,y_test=load_data() 
    # 调用 test_DecisionTreeClassifier
    test_DecisionTreeClassifier(X_train,X_test,y_train,y_test)

Training score:0.964286
Testing score:0.947368
Visible tree plot saved as pdf.
