In [28]:
import numpy as np
from collections import Counter

class ID3DecisionTree:
    """ID3 decision tree for categorical features.

    The fitted tree is stored in ``self.tree`` as nested dictionaries of the
    form ``{feature_index: {feature_value: subtree_or_label}}``; a leaf is a
    bare class label.
    """

    def __init__(self):
        # Fitted tree (nested dict or a single label); None until fit() runs.
        self.tree = None
        # Majority class of the training labels; set by fit() and used by
        # predict() when the caller does not pass default_class.
        self.default_class_ = None

    def entropy(self, y):
        """Return the Shannon entropy (base 2) of the label vector ``y``.

        Labels are counted with ``np.unique`` so any sortable label type
        (strings, negative integers, floats) is accepted.

        :param y: 1-D array-like of class labels
        :return: entropy in bits; 0.0 for an empty vector
        """
        y = np.asarray(y)
        if len(y) == 0:
            return 0.0
        _, counts = np.unique(y, return_counts=True)
        probabilities = counts / len(y)
        # Every count returned by np.unique is >= 1, so log2 never sees 0.
        return -np.sum([p * np.log2(p) for p in probabilities])

    def information_gain(self, X, y, feature_index):
        """Return the information gain of splitting ``(X, y)`` on one feature.

        :param X: 2-D array of feature values (rows are samples)
        :param y: 1-D array of class labels, aligned with the rows of X
        :param feature_index: column of X to split on
        :return: entropy(y) minus the size-weighted entropy of the partitions
        """
        column = X[:, feature_index]
        values, counts = np.unique(column, return_counts=True)
        total = np.sum(counts)
        weighted_entropy = sum(
            (count / total) * self.entropy(y[column == value])
            for value, count in zip(values, counts)
        )
        return self.entropy(y) - weighted_entropy

    def best_split(self, X, y, features):
        """Return the entry of ``features`` with the highest information gain.

        Ties are resolved in favour of the earliest entry (``np.argmax``).
        """
        gains = [self.information_gain(X, y, feature) for feature in features]
        return features[np.argmax(gains)]

    def build_tree(self, X, y, features, feature_values=None):
        """Recursively build the decision tree for the sample set ``(X, y)``.

        :param X: 2-D array of feature values
        :param y: 1-D array of class labels
        :param features: list of feature indices still available for splitting
        :param feature_values: optional ``{feature_index: array_of_values}``
            giving the full domain of every feature. When omitted it is
            computed from ``X`` at the outermost call and handed down
            unchanged, so deeper nodes still create a branch for values that
            no longer occur in their own subset.
        :return: a class label (leaf) or ``{feature: {value: subtree}}``
        """
        if feature_values is None:
            feature_values = {f: np.unique(X[:, f]) for f in features}

        # 1. All samples share one class -> leaf with that class.
        if len(np.unique(y)) == 1:
            return y[0]

        majority_class = Counter(y).most_common(1)[0][0]

        # 2. No attribute left, or the samples are indistinguishable on the
        #    remaining attributes -> leaf with the majority class.
        if len(features) == 0 or all(np.all(X[:, f] == X[0, f]) for f in features):
            return majority_class

        # 3. Pick the attribute with the highest information gain.
        best_feature = self.best_split(X, y, features)
        remaining_features = [f for f in features if f != best_feature]
        column = X[:, best_feature]

        # 4. One branch per value in the attribute's full domain.
        branches = {}
        for value in feature_values[best_feature]:
            mask = column == value
            if not mask.any():
                # Empty partition: leaf labelled with this node's majority class.
                branches[value] = majority_class
            else:
                branches[value] = self.build_tree(
                    X[mask], y[mask], remaining_features, feature_values
                )

        return {best_feature: branches}

    def fit(self, X, y):
        """Train the tree on ``(X, y)`` and store it in ``self.tree``.

        Also records the majority training class in ``self.default_class_``.

        :param X: 2-D array-like of feature values (rows are samples)
        :param y: 1-D array-like of class labels
        :raises ValueError: if X is not 2-D, the lengths differ, or the
            training set is empty
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError("X must be a 2-D array of shape (n_samples, n_features)")
        if len(X) != len(y):
            raise ValueError("X and y must contain the same number of samples")
        if len(y) == 0:
            raise ValueError("cannot fit a decision tree on an empty training set")

        features = list(range(X.shape[1]))
        self.default_class_ = Counter(y).most_common(1)[0][0]
        feature_values = {f: np.unique(X[:, f]) for f in features}
        self.tree = self.build_tree(X, y, features, feature_values)

    def predict_single(self, x, tree, default_class):
        """Classify one sample by walking ``tree`` from the root.

        :param x: one sample, indexable by feature index
        :param tree: tree (or subtree) as produced by build_tree
        :param default_class: label returned when the sample's value for a
            split feature has no branch in the tree
        :return: predicted class label
        """
        node = tree
        while isinstance(node, dict):
            feature = next(iter(node))
            node = node[feature].get(x[feature], default_class)
        return node

    def predict(self, X, default_class=None):
        """Classify every sample in ``X``.

        :param X: 2-D array-like of samples
        :param default_class: fallback label for feature values without a
            branch; defaults to the majority training class recorded by fit
        :return: numpy array of predicted labels
        :raises ValueError: if the tree has not been fitted yet
        """
        if self.tree is None:
            raise ValueError("this ID3DecisionTree is not fitted yet; call fit() first")
        if default_class is None:
            # getattr keeps instances created without __init__ usable.
            default_class = getattr(self, "default_class_", None)
        return np.array(
            [self.predict_single(x, self.tree, default_class) for x in np.asarray(X)]
        )


In [29]:
import pandas as pd
import numpy as np
from shiyan6 import ID3DecisionTree
from yanzheng import df
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns

# NOTE(review): `path` is not referenced anywhere in this cell (`df` is imported
# from `yanzheng`) and it is an absolute, machine-specific location.
# TODO confirm whether it can be dropped or replaced by a configurable path.
path = r'C:\Users\李昌峻\Desktop\实验课\No6\西瓜数据集.xlsx'

# Integer code for every category of every column.
mapping = {
    '色泽': {'青绿': 0, '乌黑': 1, '浅白': 2},
    '根蒂': {'蜷缩': 0, '稍蜷': 1, '硬挺': 2},
    '敲声': {'浊响': 0, '沉闷': 1, '清脆': 2},
    '纹理': {'清晰': 0, '稍糊': 1, '模糊': 2},
    '脐部': {'凹陷': 0, '稍凹': 1, '平坦': 2},
    '触感': {'硬滑': 0, '软粘': 1},
    '好瓜': {'是': 1, '否': 0}
}

# Encode the categorical columns as integers, in place on the imported `df`.
# The loop is idempotent: a column that already holds only codes is left alone,
# so re-running the cell no longer turns every value into NaN.
for column in df.columns:
    codes = mapping.get(column)
    if codes is None:
        # No mapping defined for this column; it is not used below.
        continue
    if df[column].isin(list(codes.values())).all():
        continue  # already encoded by a previous run of this cell
    encoded = df[column].map(codes)
    if encoded.isna().any():
        unknown = sorted(set(df.loc[encoded.isna(), column].astype(str)))
        raise ValueError(f"column {column!r} contains values without a code: {unknown}")
    df[column] = encoded

# Condition (input) attributes
attrs = ['色泽', '根蒂', '敲声', '纹理', '脐部', '触感']
# Target (decision) attribute
target = '好瓜'
target_col = -1

X = np.array(df[attrs])
y = np.array(df[target])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Features and label side by side; the label is the last column (target_col).
data = np.hstack((X, y.reshape(-1, 1)))
print(data)
# Output recorded from a previous run:
# [[0 0 0 0 0 0 1]
#  [1 0 1 0 0 0 1]
#  [1 0 0 0 0 0 1]
#  [0 0 1 0 0 0 1]
#  [2 0 0 0 0 0 1]
#  [0 1 0 0 1 1 1]
#  [1 1 0 1 1 1 1]
#  [1 1 0 0 1 0 1]
#  [1 1 1 1 1 0 0]
#  [0 2 2 0 2 1 0]
#  [2 2 2 2 2 0 0]
#  [2 0 0 2 2 1 0]
#  [0 1 0 1 0 0 0]
#  [2 1 1 1 0 0 0]
#  [1 1 0 0 1 1 0]
#  [2 0 0 2 2 0 0]
#  [0 0 1 1 1 0 0]]

ID3 = ID3DecisionTree()
all_entro = ID3.entropy(y)
print("all_entro:",all_entro) # 0.998

# =========== Check the entropies of the three subsets from experiment 1 ===========
# Partition the rows by the value of column 0 (the first attribute in `attrs`).
groups = {value: data[data[:, 0] == value] for value in np.unique(data[:, 0])}
subset_entropies = {key: ID3.entropy(group[:, target_col]) for key, group in groups.items()}
print(subset_entropies) #{0: 1.0, 1: 0.9182958340544896, 2: 0.7219280948873623}

# =========== Check the information gains from experiment 1 ===========
# `data` carries the label as an extra last column, but only the first
# len(attrs) columns are indexed here.
for ind in range(len(attrs)):
    print(attrs[ind], ID3.information_gain(data, y, ind))
# Output recorded from a previous run:
# 色泽 0.10812516526536531
# 根蒂 0.14267495956679288
# 敲声 0.14078143361499584
# 纹理 0.3805918973682686
# 脐部 0.28915878284167895
# 触感 0.006046489176565584

# =========== Train and evaluate the decision tree ===========
ID3 = ID3DecisionTree()
ID3.fit(X_train, y_train)
# Fallback label for feature values without a branch: training majority class.
default_class = Counter(y_train).most_common(1)[0][0]
print("default_class:", default_class)

train_predictions = ID3.predict(X_train, default_class)
test_predictions = ID3.predict(X_test, default_class)

train_accuracy = accuracy_score(y_train, train_predictions)
test_accuracy = accuracy_score(y_test, test_predictions)
print("训练集准确率:", train_accuracy)
print("测试集准确率:", test_accuracy)

# =========== Confusion matrix ===========
# Fix the label order explicitly: the test split is tiny, and without `labels`
# the matrix shrinks to 1x1 when only one class appears, which no longer
# matches the two tick labels below.
cm = confusion_matrix(y_test, test_predictions, labels=[0, 1])
# Plot with seaborn for better aesthetics.
plt.figure(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['N', 'Y'], yticklabels=['N', 'Y'])
plt.xlabel('pre_labels')
plt.ylabel('true_labels')
plt.title('Confusion matrix')
plt.show()

ImportError: Missing optional dependency 'openpyxl'.  Use pip or conda to install openpyxl.