In [1]:
from math import log
import operator

##### Collect the data

In [2]:
def createDataSet():
    dataSet = [[1,1,'yes'],
              [1,1,'yes'],
              [1,0,'No'],
              [0,1,'No'],
              [0,1,'No']]
    FeatureNames = ['No surfacing', 'flippers']
    return dataSet, FeatureNames



##### Prepare the data
Here I use Entropy Gain as the method to select the features, which is called the ID3 algorithm, then data is required to be discrete. Here we have the discrete data already

##### Analyzing Data
1. Using ID3 method, calculating the Entropy: H(X) = - SUM P(x)*log2P(x)

In [3]:
def calcEntropy(dataSet):
    # how many entries
    numEntries = len(dataSet)
    # create a dict to store the label counts
    labelCounts = {}
    # iterate the dataset
    for featureVector in dataSet:
        # the last term in featureVecot is the label
        label = featureVector[-1]
        # if label not in labelCounts, then initialize it
        if label not in labelCounts:
            labelCounts[label] = 0
        labelCounts[label] += 1
    
    # based on the label counts, calculate the Entropy
    entropy = 0.0
    for key in labelCounts:
        prob = float(labelCounts[key]) / numEntries
        entropy -= prob * log(prob, 2)
    
    # return the entropy for the dataset
    return entropy


2. Spliting data, based on selected feature

In [4]:
def splitDataSet(dataset, index, value):
    """
    dataset: the given data set
    index: the column index that is selected feature to split the data
    value: the target value
    """
    dataset_new =[]
    for record in dataset:
        if record[index] == value:
            record_new = record[:index]
            record_new.extend(record[index+1:])
            dataset_new.append(record_new)
    return dataset_new

3. Based on the previous fucntion, we want to find the feature that gives the largest information gain

In [5]:
def FindBestFeature(dataset):
    numFeatures = len(dataset[0]) - 1
    baseEntropy = calcEntropy(dataset)
    maxInfoGain = 0.0
    bestFeature = -1
    for index in range(numFeatures):
        subEntropy = 0.0
        values = set([record[index] for record in dataset])
        for value in values:
            dataset_sub = splitDataSet(dataset, index, value)
            prob = len(dataset_sub) / numFeatures
            subEntropy -= calcEntropy(dataset_sub)*prob
        infoGain = baseEntropy - subEntropy
    
        if (infoGain > maxInfoGain):
            maxInfoGain = infoGain
            bestFeature = index
        
    print('Info Gain is: ', infoGain, 'Best Feature index is: ', bestFeature, baseEntropy, subEntropy)
    
    return bestFeature
            

##### Create a Tree

In [6]:
def createTree(dataset, FeatureNames):
    labelList = [record[-1] for record in dataset]
    # there are two stop criterion 
    # 1. when there is no other class lable
    if labelList.count(labelList[0]) == len(labelList):
        return labelList[0]
    # 2. when all features are used but there are more than one labels in the dataset
    if len(dataset[0]) == 1:
        return majorityLabel(labelList)
    
    # otherwise, we have to find the best feature to split the data set
    bestFeatureIndex = FindBestFeature(dataset)
    bestFeatureName = FeatureNames[bestFeatureIndex]
    
    # initialize the tree
    myTree = {bestFeatureName:{}}
    del(FeatureNames[bestFeatureIndex])
    
    # create sub trees for this feature with different values
    featureValues = [record[bestFeatureIndex] for record in dataset]
    featureValues = set(featureValues)
    for value in featureValues:
        subFeatures = FeatureNames[:]    
        myTree[bestFeatureName][value] = createTree(splitDataSet(dataset, bestFeatureIndex, value), subFeatures)
    
    return myTree
    
def majorityLabel(labelList):
    LableCount = {}
    for label in labelList:
        if label not in labelCount:
            labelCount[label] = 0
        labelCount[count] += 1
    labelCount_sorted = sorted(classCount.iteritms(), key=operator.itemgetter(1), reverse = True)
    
    return labelCount_sorted[0][0]

##### Using Decision Tree to do classification

In [21]:
def classify(inputTree, featName, TestVec):
    """
    inputTree
    featName
    TestVec
    """
    firstStr = list(inputTree.keys())[0]
    secondDict = inputTree[firstStr]    
    featIndex = featName.index(firstStr)
    key = TestVec[featIndex]
    valueOfFeat = secondDict[key]
    
    print('+++', firstStr, 'xxx', secondDict, '---', key, '>>>', valueOfFeat)
    # check whether valueOfFeat is dict or not
    if isinstance(valueOfFeat, dict):
        classLabel = classify(valueOfFeat, featIndex, TestVec)
    else:
        classLabel = valueOfFeat
    return classLabel
    

In [22]:
def storeTree(inputTree, filename):
    import pickle
    # -------------- 第一种方法 start --------------
    fw = open(filename, 'wb')
    pickle.dump(inputTree, fw)
    fw.close()
    
def grabTree(filename):
    import pickle
    fr = open(filename,'rb')
    return pickle.load(fr)

In [24]:
import copy

myDat, labels = createDataSet()

myTree = createTree(myDat, copy.deepcopy(labels))
print("....",myTree)
print(labels)
print(classify(myTree, labels, [1, 1]))

#print(get_tree_height(myTree))

#dtPlot.createPlot(myTree)

Info Gain is:  2.970950594454669 Best Feature index is:  1 0.9709505944546686 -2.0
Info Gain is:  1.0 Best Feature index is:  0 1.0 0.0
.... {'flippers': {0: 'No', 1: {'No surfacing': {0: 'No', 1: 'yes'}}}}
['No surfacing', 'flippers']
+++ flippers xxx {0: 'No', 1: {'No surfacing': {0: 'No', 1: 'yes'}}} --- 1 >>> {'No surfacing': {0: 'No', 1: 'yes'}}


AttributeError: 'int' object has no attribute 'index'

In [None]:
list(myTree.keys())[0]

In [None]:
labels.index(list(myTree.keys())[0])