 Bagging实现

In [1]:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
import numpy as np
import numpy as np  


def evaluate_classifier_multiple_times(classifier, X, y, n_iterations=10,bagging_base_model='DT'):
    """Evaluate a bagging ensemble with repeated shuffled 10-fold cross-validation.

    For each of ``n_iterations`` repetitions a fresh ``KFold`` split (seeded
    with ``42 + iteration * 10``) is generated, a new
    ``classifier.CustomBaggingClassifier`` with 11 estimators is trained on
    every training fold, and accuracy plus macro-averaged F1 are measured on
    the matching test fold.  The mean accuracy of each repetition is printed.

    Args:
        classifier: Object (for example a module) exposing a
            ``CustomBaggingClassifier`` callable that accepts
            ``base_estimator``, ``n_estimators`` and ``random_state``.
        X: Feature table; indexed with ``.iloc`` (pandas-style).
        y: Label column; indexed with ``.iloc`` (pandas-style).
        n_iterations: Number of cross-validation repetitions.
        bagging_base_model: Value forwarded as ``base_estimator``; ``'NN'``
            additionally shifts the predictions by one (see below).

    Returns:
        A tuple ``(mean_accuracy, std_accuracy, mean_f1)`` where the mean and
        standard deviation are taken over the per-repetition mean accuracies
        and ``mean_f1`` over the per-repetition mean macro-F1 scores.
    """
    run_accuracies = []
    run_f1_scores = []

    for iteration in range(n_iterations):
        # A different seed per repetition yields a different shuffle each time.
        splitter = KFold(n_splits=10, shuffle=True, random_state=42 + iteration * 10)
        fold_accuracies = []
        fold_f1_scores = []

        for train_index, test_index in splitter.split(X):
            # The training data is round-tripped through plain lists and back
            # into arrays; the test data is handed over as plain lists.
            X_train = np.array(X.iloc[train_index].values.tolist())
            y_train = np.array(y.iloc[train_index].values.tolist())
            X_test = X.iloc[test_index].values.tolist()
            y_test = y.iloc[test_index].values.tolist()

            # Build and train a fresh bagging ensemble for this fold.
            bagging_clf = classifier.CustomBaggingClassifier(
                base_estimator=bagging_base_model,
                n_estimators=11,
                random_state=42,
            )
            bagging_clf.fit(X_train, y_train)
            y_pred = bagging_clf.predict(X_test)

            # The original note attributes this shift to one-hot encoding;
            # presumably the network predicts 0-based class indices while the
            # dataset labels start at 1 -- TODO confirm.
            if bagging_base_model == 'NN':
                y_pred = y_pred + 1

            fold_accuracies.append(accuracy_score(y_true=y_test, y_pred=y_pred))
            fold_f1_scores.append(f1_score(y_test, y_pred, average='macro'))

        mean_accuracy = np.mean(fold_accuracies)
        print(f'第{iteration}次',mean_accuracy)
        run_accuracies.append(mean_accuracy)
        run_f1_scores.append(np.mean(fold_f1_scores))

    return np.mean(run_accuracies), np.std(run_accuracies), np.mean(run_f1_scores)
  
# # 示例调用  
# # classifier_instance = YourClassifier()  # 替换为你的分类器实例  
# # X = your_X_data  # 替换为你的特征数据  
# # y = your_y_data  # 替换为你的标签数据  
# # k = your_k_value  # 替换为你的k值  
# # evaluate_classifier_multiple_times(classifier_instance, X, y, k)


In [5]:
import pandas as pd
from sklearn.discriminant_analysis import StandardScaler

from package_py import Bagging8

    
# Dataset files to evaluate (relative paths written with Windows separators).
file_paths =[ "data\\win.xls"]
# Full dataset list, kept for reference:
# file_paths =[ "data\\bal.xls", "data\\gla_lisan_result.xlsx", "data\\hay.xls", "data\\iri.xls", "data\\new.xls", "data\\win_lisan_result.xls", "data\\zoo.xls"]
# mean_accuracys=[]
for file_path in file_paths:
    # The sheets have no header row: every column but the last is a feature,
    # the last column is the class label.
    data = pd.read_excel(file_path, header=None)
    X = data.iloc[:, :-1]
    y = data.iloc[:, -1]

    # Standardise the features and put them back into a DataFrame so that the
    # evaluation routine can index them with .iloc.
    # NOTE(review): the scaler is fitted on the whole dataset before the
    # cross-validation split, so test folds influence the scaling statistics.
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns)

    # Ten repetitions of 10-fold cross-validation with the 'DT' base model.
    mean_accuracy,std_accuracy,f1=evaluate_classifier_multiple_times(
        Bagging8,
        X_scaled_df,
        y,
        10,
        bagging_base_model='DT',
    )
    # mean_accuracys.append(mean_accuracy)

    # Report the aggregated metrics for this dataset.
    print(f'{file_path} \n  mean_accuracy: {mean_accuracy:.3f} std_accuracy: {std_accuracy:.3f} f1: {f1:.3f}')



第0次 0.7754901960784314
第1次 0.7692810457516339
第2次 0.7705882352941177
第3次 0.757516339869281
第4次 0.8042483660130719
第5次 0.7258169934640523
第6次 0.7584967320261438
第7次 0.7692810457516339
第8次 0.7869281045751633
第9次 0.7699346405228759
data\win.xls 
  mean_accuracy: 0.769 std_accuracy: 0.019 f1: 0.758


调库

In [3]:
import pandas as pd
from sklearn.metrics import f1_score
from sklearn.neural_network import MLPClassifier  # 使用多层感知器（神经网络）
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import KFold, cross_val_score, cross_val_predict
import numpy as np
from sklearn.tree import DecisionTreeClassifier

# 文件路径列表
# Dataset files to evaluate; add more paths to cover further datasets.
file_paths = [r"data\zoo.xls"]
# Full dataset list, kept for reference:
# file_paths = [
#     r"data\bal.xls", r"data\gla.xls", r"data\hay.xls",
#     r"data\iri.xls", r"data\new_avoid_negtive.xls", r"data\win.xls", r"data\zoo.xls"
# ]

# Per-dataset summary: file path -> {"mean_accuracy": ..., "mean_f1": ...}.
results = {}

# Run ten repetitions of 10-fold cross-validation on every dataset.
for file_path in file_paths:
    # The sheets have no header row.
    df = pd.read_excel(file_path, header=None)

    # Every column but the last is a feature; the last column is the label.
    X = df.iloc[:, :-1].values
    y = df.iloc[:, -1].values

    # Cast the labels to integers before handing them to the classifiers.
    y = y.astype(int)

    # Base learner: a small multi-layer perceptron.
    base_mlp = MLPClassifier(hidden_layer_sizes=(10,), max_iter=1000, random_state=42, solver='adam', learning_rate_init=0.2)
    # Alternative base learner (decision tree); defined but not used below.
    base_dt = DecisionTreeClassifier(random_state=42)
    # Bagging ensemble of 11 MLPs.  The base learner is passed positionally
    # because its keyword was renamed from `base_estimator` to `estimator`
    # (deprecated in scikit-learn 1.2, removed in 1.4); the positional form
    # works with both old and new releases.
    bagging_mlp = BaggingClassifier(base_mlp, n_estimators=11, random_state=42)

    # Per-repetition results.
    accuracies = []
    f1_scores_list = []

    for i in range(10):
        # A different seed per repetition yields a different shuffle each time.
        kf = KFold(n_splits=10, shuffle=True, random_state=42 + i * 10)
        scores = cross_val_score(bagging_mlp, X, y, cv=kf, scoring='accuracy')
        accuracies.append(scores.mean())

        # Out-of-fold predictions for every sample, produced with the same
        # splitter, are pooled to compute a single macro-averaged F1 score.
        y_preds = cross_val_predict(bagging_mlp, X, y, cv=kf)
        f1_scores = f1_score(y, y_preds, average='macro')
        f1_scores_list.append(f1_scores)
        print(f"数据集{file_path}第{i+1}次十折准确度: {scores.mean()}")

    # Average the ten repetitions and record the summary for this dataset.
    mean_accuracy = np.mean(accuracies)
    mean_f1 = np.mean(f1_scores_list)
    results[file_path] = {"mean_accuracy": mean_accuracy, "mean_f1": mean_f1}
    print(f"数据集{file_path}平均准确度: {mean_accuracy}")
    print(f"数据集{file_path}平均F1分数: {mean_f1}")

数据集data\zoo.xls第1次十折准确度: 0.9400000000000001
数据集data\zoo.xls第2次十折准确度: 0.9400000000000001
数据集data\zoo.xls第3次十折准确度: 0.9400000000000001
数据集data\zoo.xls第4次十折准确度: 0.9400000000000001
数据集data\zoo.xls第5次十折准确度: 0.9318181818181819
数据集data\zoo.xls第6次十折准确度: 0.9400000000000001
数据集data\zoo.xls第7次十折准确度: 0.9509090909090909
数据集data\zoo.xls第8次十折准确度: 0.9400000000000001
数据集data\zoo.xls第9次十折准确度: 0.93
数据集data\zoo.xls第10次十折准确度: 0.96
数据集data\zoo.xls平均准确度: 0.9412727272727274
数据集data\zoo.xls平均F1分数: 0.8477356854781022


In [4]:
import numpy as np
from package_py import NeuralNetworks7
from package_py import DecisionTree4
from sklearn.metrics import accuracy_score
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris

class CustomBaggingClassifier:
    """Bagging ensemble built on the project's hand-written base learners.

    Each base learner is trained on a bootstrap sample (drawn with
    replacement) of the training data; predictions are combined by majority
    vote.
    """

    def __init__(self, base_estimator=DecisionTree4,extra_base_estimator=None, n_estimators=10, random_state=42):
        """Store the ensemble configuration.

        Args:
            base_estimator: Module implementing the base learner; the
                ``NeuralNetworks7`` and ``DecisionTree4`` modules are
                supported.  Defaults to ``DecisionTree4``.
            extra_base_estimator: Reserved hook; when it is not ``None``,
                ``fit`` currently does nothing.
            n_estimators: Number of base learners to train.
            random_state: Seed of the random generator that draws the
                bootstrap samples, for reproducible runs.

        Attributes:
            models: List of trained base learners, filled by ``fit``.
        """
        self.base_estimator = base_estimator
        self.extra_base_estimator = extra_base_estimator
        self.n_estimators = n_estimators
        self.random_state = random_state
        self.models = []

    def fit(self, X, y):
        """Train ``n_estimators`` base learners on bootstrap samples of ``(X, y)``.

        Any models from a previous call are discarded first, so fitting twice
        does not accumulate stale learners.

        Raises:
            ValueError: If ``base_estimator`` is not a supported module.
        """
        if self.extra_base_estimator is not None:
            # Placeholder branch: training with an extra estimator is not
            # implemented, so nothing is trained and `models` is untouched.
            return

        if self.base_estimator is not NeuralNetworks7 and self.base_estimator is not DecisionTree4:
            raise ValueError(
                "base_estimator must be the NeuralNetworks7 or DecisionTree4 module, "
                f"got {self.base_estimator!r}"
            )

        # One shared generator, so every bootstrap draw differs while the
        # whole sequence stays reproducible for a given seed.
        rng = np.random.RandomState(self.random_state)
        self.models = []

        for _ in range(self.n_estimators):
            # Sample the training set with replacement.
            X_resample, y_resample = resample(X, y, random_state=rng)

            if self.base_estimator is NeuralNetworks7:
                model = self.base_estimator.NeuralNetworks(X_resample, y_resample)
                model.fit()
            else:
                # The tree builder receives rows of [features..., label]
                # together with the list of feature indices.
                index_list = list(range(len(X_resample[0])))
                combined_list = [[*row, label] for row, label in zip(X_resample, y_resample)]
                model = self.base_estimator.TreeGenerate(combined_list, index_list)

            self.models.append(model)

    def predict(self, X):
        """Predict the class of every sample in ``X`` by majority vote.

        The vote uses ``np.bincount``, so the base learners must predict
        non-negative integer labels; ties resolve to the smallest label.

        Returns:
            Array holding the most frequently predicted label per sample.

        Raises:
            RuntimeError: If no models have been trained yet.
            ValueError: If ``base_estimator`` is not a supported module.
        """
        if not self.models:
            raise RuntimeError("No trained models available; call fit() before predict().")

        # Collect one row of predictions per base learner.
        if self.base_estimator is NeuralNetworks7:
            predictions = np.array([model.predict(X) for model in self.models])
        elif self.base_estimator is DecisionTree4:
            predictions = []
            for tree in self.models:
                predictor = self.base_estimator.predicts(tree, X)
                predictions.append(predictor.predict(X, tree.root))
            predictions = np.array(predictions)
        else:
            raise ValueError(
                "base_estimator must be the NeuralNetworks7 or DecisionTree4 module, "
                f"got {self.base_estimator!r}"
            )

        # Column-wise majority vote across the base learners.
        majority_vote = np.apply_along_axis(lambda votes: np.bincount(votes).argmax(), axis=0, arr=predictions)

        return majority_vote

# 加载数据集
data = load_iris()
X = data.data
y = data.target

# 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(X_train)
# 创建并训练Bagging分类器
bagging_clf = CustomBaggingClassifier(n_estimators=20, random_state=42)
bagging_clf.fit(X_train, y_train)

# 预测测试集
y_pred = bagging_clf.predict(X_test)

# 计算准确率
accuracy = accuracy_score(y_test, y_pred)
print(f'Custom Bagging分类器的准确率: {accuracy}')
print(y_test)

[[4.6 3.6 1.  0.2]
 [5.7 4.4 1.5 0.4]
 [6.7 3.1 4.4 1.4]
 [4.8 3.4 1.6 0.2]
 [4.4 3.2 1.3 0.2]
 [6.3 2.5 5.  1.9]
 [6.4 3.2 4.5 1.5]
 [5.2 3.5 1.5 0.2]
 [5.  3.6 1.4 0.2]
 [5.2 4.1 1.5 0.1]
 [5.8 2.7 5.1 1.9]
 [6.  3.4 4.5 1.6]
 [6.7 3.1 4.7 1.5]
 [5.4 3.9 1.3 0.4]
 [5.4 3.7 1.5 0.2]
 [5.5 2.4 3.7 1. ]
 [6.3 2.8 5.1 1.5]
 [6.4 3.1 5.5 1.8]
 [6.6 3.  4.4 1.4]
 [7.2 3.6 6.1 2.5]
 [5.7 2.9 4.2 1.3]
 [7.6 3.  6.6 2.1]
 [5.6 3.  4.5 1.5]
 [5.1 3.5 1.4 0.2]
 [7.7 2.8 6.7 2. ]
 [5.8 2.7 4.1 1. ]
 [5.2 3.4 1.4 0.2]
 [5.  3.5 1.3 0.3]
 [5.1 3.8 1.9 0.4]
 [5.  2.  3.5 1. ]
 [6.3 2.7 4.9 1.8]
 [4.8 3.4 1.9 0.2]
 [5.  3.  1.6 0.2]
 [5.1 3.3 1.7 0.5]
 [5.6 2.7 4.2 1.3]
 [5.1 3.4 1.5 0.2]
 [5.7 3.  4.2 1.2]
 [7.7 3.8 6.7 2.2]
 [4.6 3.2 1.4 0.2]
 [6.2 2.9 4.3 1.3]
 [5.7 2.5 5.  2. ]
 [5.5 4.2 1.4 0.2]
 [6.  3.  4.8 1.8]
 [5.8 2.7 5.1 1.9]
 [6.  2.2 4.  1. ]
 [5.4 3.  4.5 1.5]
 [6.2 3.4 5.4 2.3]
 [5.5 2.3 4.  1.3]
 [5.4 3.9 1.7 0.4]
 [5.  2.3 3.3 1. ]
 [6.4 2.7 5.3 1.9]
 [5.  3.3 1.4 0.2]
 [5.  3.2 1.

调库

In [None]:
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris

class CustomBaggingClassifier:
    """Minimal bagging ensemble around a scikit-learn style base estimator.

    Each base learner is an independent clone of ``base_estimator`` trained
    on its own bootstrap sample; predictions are combined by majority vote.
    """

    def __init__(self, base_estimator=DecisionTreeClassifier(), n_estimators=10, random_state=42):
        """Store the ensemble configuration.

        Args:
            base_estimator: Prototype estimator; it is cloned for every
                ensemble member and never fitted itself.  Defaults to a
                decision tree classifier.
            n_estimators: Number of base learners to train.
            random_state: Seed of the random generator that draws the
                bootstrap samples, for reproducible runs.

        Attributes:
            models: List of trained base learners, filled by ``fit``.
        """
        self.base_estimator = base_estimator
        self.n_estimators = n_estimators
        self.random_state = random_state
        self.models = []

    def fit(self, X, y):
        """Train ``n_estimators`` clones of the base estimator on bootstrap samples.

        Any models from a previous call are discarded first, so fitting twice
        does not accumulate stale learners.
        """
        # Imported locally so the block does not depend on a new file-level import.
        from sklearn.base import clone

        # One shared generator: passing the integer seed to every resample()
        # call would produce the identical bootstrap sample each time.
        rng = np.random.RandomState(self.random_state)
        self.models = []
        for _ in range(self.n_estimators):
            # Sample the training set with replacement.
            X_resample, y_resample = resample(X, y, random_state=rng)
            # Fit a fresh copy; refitting the prototype would leave the list
            # holding N references to one and the same model.
            model = clone(self.base_estimator)
            model.fit(X_resample, y_resample)
            self.models.append(model)

    def predict(self, X):
        """Predict the class of every sample in ``X`` by majority vote.

        The vote uses ``np.bincount``, so the labels must be non-negative
        integers; ties resolve to the smallest label.

        Returns:
            Array holding the most frequently predicted label per sample.

        Raises:
            RuntimeError: If no models have been trained yet.
        """
        if not self.models:
            raise RuntimeError("No trained models available; call fit() before predict().")

        # One row of predictions per base learner.
        predictions = np.array([model.predict(X) for model in self.models])
        # Column-wise majority vote across the base learners.
        majority_vote = np.apply_along_axis(lambda votes: np.bincount(votes).argmax(), axis=0, arr=predictions)
        return majority_vote

# 加载数据集
data = load_iris()
X = data.data
y = data.target

# 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 创建并训练Bagging分类器
bagging_clf = CustomBaggingClassifier(n_estimators=10, random_state=42)
bagging_clf.fit(X_train, y_train)

# 预测测试集
y_pred = bagging_clf.predict(X_test)

# 计算准确率
accuracy = accuracy_score(y_test, y_pred)
print(f'Custom Bagging分类器的准确率: {accuracy}')
print(y_test)