In [12]:
from math import log  
 
def calc_shannon_ent(data_set):
    """Compute the Shannon entropy (base 2) of the class labels in a data set.

    :param data_set: sequence of samples; the last element of each sample is
        taken as its class label
    :return: the entropy as a float (0.0 when the data set is empty)
    """
    total = len(data_set)  # number of samples in the data set

    # Tally how many times each class label occurs.
    tally = {}
    for sample in data_set:
        label = sample[-1]  # the last column holds the class label
        tally[label] = tally.get(label, 0) + 1

    # Accumulate -p * log2(p) over every observed label.
    entropy = 0.0
    for count in tally.values():
        p = float(count) / total
        entropy -= p * log(p, 2)  # logarithm to base 2
    return entropy

In [13]:
#创建数据集  
def create_data_set():
    """Build the small demo data set together with its feature names.

    :return: a tuple (data_set, labels); data_set is a list of
        [feature 0, feature 1, class label] samples and labels holds the
        two feature names
    """
    feature_rows = ((1, 1), (1, 1), (1, 0), (0, 0), (0, 1))
    classes = ('y', 'y', 'n', 'n', 'n')
    # Fresh lists are built on every call, so callers may mutate the result.
    data_set = [[gill, fin, cls] for (gill, fin), cls in zip(feature_rows, classes)]
    return data_set, ['gill', 'fin']

In [21]:
# Compute the Shannon entropy of the demo data with the function defined above
my_data,labels = create_data_set()
print(my_data)
print(calc_shannon_ent(my_data))

[[1, 1, 'y'], [1, 1, 'y'], [1, 0, 'n'], [0, 0, 'n'], [0, 1, 'n']]
0.9709505944546686


In [15]:
# What happens when there are more classes?
# NOTE: this relabels the last sample of my_data in place, so any cell run
# afterwards sees the modified label until my_data is recreated.
my_data[-1][-1]='not sure'
print(my_data)
print(calc_shannon_ent(my_data) )

[[1, 1, 'y'], [1, 1, 'y'], [1, 0, 'n'], [0, 0, 'n'], [0, 1, 'not sure']]
1.5219280948873621


In [16]:
# 划分数据集
def split_data_set(data_set, axis, value):
    """Select the samples matching a feature value and drop that feature.

    :param data_set: the data set to split (a list of list samples)
    :param axis: index of the feature (column) to split on
    :param value: the feature value a sample must have to be kept
    :return: a new list with the matching samples, each without column `axis`
    """
    def without_axis(sample):
        # Slicing creates a new list, so the original sample is left untouched.
        trimmed = sample[:axis]
        trimmed.extend(sample[axis + 1:])
        return trimmed

    return [without_axis(sample) for sample in data_set if sample[axis] == value]
  
# Rebuild the demo data, then try a few splits.
my_data,labels = create_data_set()
print(my_data)
# Samples whose column 0 equals 0
print(split_data_set(my_data,0,0) )
# Samples whose column 2 (the class label) equals "y"
print(split_data_set(my_data,2,"y") )
# Samples whose column 1 equals 1
print(split_data_set(my_data,1,1) )

[[1, 1, 'y'], [1, 1, 'y'], [1, 0, 'n'], [0, 0, 'n'], [0, 1, 'n']]
[[0, 'n'], [1, 'n']]
[[1, 1], [1, 1]]
[[1, 'y'], [1, 'y'], [0, 'n']]


In [17]:
# 选择最好的划分方式
def best_feature_to_split(data_set):
    """Pick the feature whose split yields the largest information gain.

    :param data_set: list of equal-length samples (lists); the last element
        of every sample is its class label
    :return: index of the feature with the highest information gain, or -1
        when the data set is empty or no feature gives a positive gain.
        On ties the lowest feature index wins.
    """
    if len(data_set) == 0:
        return -1  # nothing to split on

    num_features = len(data_set[0]) - 1  # the last column is the label, not a feature
    base_entropy = calc_shannon_ent(data_set)  # entropy before any split
    best_info_gain = 0.0
    best_feature = -1
    for i in range(num_features):
        # Distinct values taken by feature i across the data set.
        unique_values = {example[i] for example in data_set}
        # Weighted entropy of the partitions produced by splitting on feature i.
        new_entropy = 0.0
        for value in unique_values:
            sub_data_set = split_data_set(data_set, i, value)
            prob = len(sub_data_set) / float(len(data_set))
            new_entropy += prob * calc_shannon_ent(sub_data_set)
        info_gain = base_entropy - new_entropy
        if info_gain > best_info_gain:
            best_info_gain = info_gain
            best_feature = i
    # Return only after every feature has been evaluated.
    return best_feature

# Run the feature selection on a freshly built demo data set
my_data, labels = create_data_set()
print("my_data:")
print(my_data)
print("best feature:")
print(best_feature_to_split(my_data))


my_data:
[[1, 1, 'y'], [1, 1, 'y'], [1, 0, 'n'], [0, 0, 'n'], [0, 1, 'n']]
best feature:
0


In [18]:
# Gini指数
def cal_gini_index(data_set):
    """Compute the Gini index of a data set.

    input:  data_set (list): samples whose class labels are counted by
            label_uniq_counts
    output: gini: 0 for an empty data set, otherwise the float
            1 - sum(count ** 2) / total ** 2 over the per-label counts
    """
    total_sample = len(data_set)
    if total_sample == 0:
        return 0

    # Sum of the squared per-label counts.
    squared_sum = 0
    for count in label_uniq_counts(data_set).values():
        squared_sum = squared_sum + pow(count, 2)

    return 1 - float(squared_sum) / pow(total_sample, 2)

In [19]:
#计算数据集中类别标签的个数
from math import pow  
  
def label_uniq_counts(data):
    """Count how many samples carry each class label.

    input:  data (list): samples whose last element is the class label
    output: dict mapping each distinct label to its number of occurrences
    """
    counts = {}
    for sample in data:
        label = sample[-1]  # the class label is the last element of the sample
        # get(..., 0) supplies the starting count the first time a label is seen
        counts[label] = counts.get(label, 0) + 1
    return counts

# Display the per-label counts; my_data is not defined in this cell and must
# already exist in the kernel from an earlier cell.
label_uniq_counts(my_data)

{'y': 2, 'n': 3}

In [20]:
# 训练决策树 (Geron)  
# 1. 导入模块  
from sklearn.tree import DecisionTreeClassifier  
import numpy as np  
    
#2. 创建自己的数据集  
  
#create dataset  
class Fish:
    """Toy data set of five samples with two binary features each.

    Attributes:
        data:    ndarray of shape (5, 2) holding the feature values
        label:   ndarray of shape (5,) holding the 0/1 class of each sample
        feature: ndarray holding the two feature names
    """

    def __init__(self):
        # One row per sample, one column per feature.
        self.data = np.array([[1, 1], [1, 1], [1, 0], [0, 1], [0, 1]])
        # Class of each sample, in the same order as the rows of data.
        self.label = np.array([1, 1, 0, 0, 0])
        self.feature = np.array(['鳃', '鳍'])
# Training set
fs = Fish()

print('X: ', fs.data)
print('y:',fs.label)

# Features of the training set
X = fs.data
# Labels of the training set
y = fs.label

# 3.1 Instantiate the classifier
tree_clf = DecisionTreeClassifier(max_depth=2)
# 3.2 Fit it to the training data to build the decision tree model
tree_clf.fit(X,y)

# Feed samples to the fitted tree to obtain predictions.
# predict_proba(samples): returns an ndarray with one row per sample and one column per class.
a = tree_clf.predict_proba([[1,1],[0,0]])
# predict(samples): returns a 1-D array holding one predicted label per sample.
# NOTE: b is computed here but not printed in this cell.
b = tree_clf.predict([[1,0]])
c = tree_clf.predict([[1,1],[1,0]])
print('测试样本为各类的概率:',a)
print('分类预测结果',c)

X:  [[1 1]
 [1 1]
 [1 0]
 [0 1]
 [0 1]]
y: [1 1 0 0 0]
测试样本为各类的概率: [[0. 1.]
 [1. 0.]]
分类预测结果 [1 0]
