In [1]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.model_selection import KFold
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, matthews_corrcoef, confusion_matrix
from sklearn.preprocessing import MinMaxScaler   # 归一化，使数据在0-1之间
from tqdm import tqdm                 

In [2]:
def score(view, model):
    
    MMS = MinMaxScaler()
    X = MMS.fit_transform(np.asarray(view.iloc[:,:-1]))   # 选取除了最后一列以外的全部作为特征，转化为numpy数组，进行归一化
    y = list(view.iloc[:,-1]) # 选取所有行的最后一列作为标签
    y_pred = model.predict(X)
    
    acc = accuracy_score(y, y_pred)  # 参数是真实标签和预测标签
    f1 = f1_score(y, y_pred, average='weighted')  # average平均，表示计算f1的方法，weighted表示在用加权的方法计算
                                                  # 加权F1分数计算方法：计算每个类的权重，以及各自的F1，对应相乘求和
    mcc = matthews_corrcoef(y, y_pred)
    
    tn, fp, fn, tp = confusion_matrix(y, y_pred, labels=[-1, 1]).ravel() # 根据标签+1，-1得到tn，fp，fn，tp组成的混淆矩阵，
                                                                         # 根据ravel把矩阵展平变成一组序列
    sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0  # else0表示如果fp+fn=0直接让敏感度和特异度为0
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0

    return acc, f1, mcc, sensitivity, specificity

In [6]:
if __name__ == '__main__':  # 只有直接运行的时候才会执行这串代码，如果作为模块导入到其他代码内，则这串代码不会运行。
    filename_list = ['15(7维)']
    n_components = [7] # PCA降维后保留的主成分数是多少个
    
    
    for k in range(len(filename_list)):   # len(filename_list)返回了filename_list列表的长度，共计14个
        view = pd.read_csv(filename_list[k] + '.csv', header=None)  # 读取filename_list第【k】个数据集，+csv之后组成完整路径
                                                                    # 例如：australian.csv
        #模型一
        
        kf1 = KFold(n_splits=5, shuffle=True, random_state=10)      # 5折交叉验证，每次交叉实验中80%数据用于训练，20%数据用于验证
                                                                    # 参数shuffle表示数据顺序打乱
        kf2 = KFold(n_splits=5, shuffle=True, random_state=110)     # kf1是外部交叉验证，kf2是内部交叉验证
                                                            # 内部交叉验证是在外部交叉验证中的训练集内部，再进行交叉验证，调出最优超参数

        
        # 定义超参数范围
        C_list = [2**i for i in range(-8, 8)]  # 正则化系数，range(-8,8)生成-8到7的整数
        kernel_list = ['linear']

        result_dict = {}  # 初始化 result_dict

        with tqdm(total=len(C_list) * len(kernel_list) * 25) as pbar:  # tqdm库可实现进度条
                                                                       # 进度条长度是16（C的长度）*1（核的个数）*25（外部和内部五次的交叉验证）
                                                                       # 将结果赋值为pbar，可用pbar来调用这个进度条
            acc_list = []
            f1_list = []
            mcc_list = []
            sensitivity_list = []
            specificity_list = []
            
            # 外部交叉检验
            for train_valid_index, test_index in kf1.split(view):   # kf.split(数据集)对数据集信息训练测试集的分割，返回各自的索引
                view_train_valid = view.iloc[train_valid_index] # 训练集
                view_test = view.iloc[test_index] # 测试集
               
            # 内部交叉验证
                for C in C_list:
                    for kernel in kernel_list:
                        result_list = []
                        for train_index, valid_index in kf2.split(view_train_valid): # 选择外部交叉验证得到的训练集进行内部交叉验证
                            view_train = view_train_valid.iloc[train_index]
                            view_valid = view_train_valid.iloc[valid_index]
                            
                            MMS = MinMaxScaler()
                            view_train_scaled = MMS.fit_transform(np.asarray(view_train.iloc[:,:-1]))
                            view_train_labels = view_train.iloc[:,-1].values  # .values 将 pandas.Series 转换为 numpy.ndarray。
                        
                            model = SVC(C=C, kernel=kernel, random_state=10)     # 创立SVC（Support Vector Classifier）赋值给model
                            model.fit(view_train_scaled, view_train_labels) # 训练模型，选取的数据是除了最后一列（标签）的所有数据

                            acc, f1, mcc, sensitivity, specificity = score(view_valid, model)  # score接收内部交叉的验证集view_valid和训练好的模型model
                            pbar.update(1) # 更新进度条
                            result_list.append((acc, f1, mcc, sensitivity, specificity))
             # 计算各项均值
                        acc_mean = np.mean([x[0] for x in result_list])
                        f1_mean = np.mean([x[1] for x in result_list])
                        mcc_mean = np.mean([x[2] for x in result_list])
                        sensitivity_mean = np.mean([x[3] for x in result_list])
                        specificity_mean = np.mean([x[4] for x in result_list])

                        result_dict[(C, kernel)] = (acc_mean, f1_mean, mcc_mean, sensitivity_mean, specificity_mean)
                                                        # key(C, kernel)，value(acc_mean, f1_mean, mcc_mean, sensitivity_mean, specificity_mean)

                para = sorted(result_dict.items(), key=lambda x: x[1], reverse=True) # result_dict.items()返回键值对的列表，每个元素是一个元组(key,value)。
                                                                                     # 每个元素是一个元组 (key, value)
                                                                                     # key=lambda x: x[1]表示对数据集的值排序，具体来说是按acc_mean进行排序，例如：key=lambda x: x[1][1]：按 f1_mean 进行排序。
                                                                                    # reverse=True是降序
                best_C = para[0][0][0] # para[0] 是排序后的第一个元素，即性能最好的键值对。
                                       # para[0][0] 是最佳的超参数组合 (C, kernel)。
                                       # para[0][0][0] 是最佳的 C 值。
                best_kernel = para[0][0][1] # 最佳的kernel值。

                with open('svm_beast_11.txt', mode='a') as param_file:
                    param_file.write(f'{filename_list[k]}, 最优参数: C={best_C}, kernel={best_kernel}\n')
                
                MMS = MinMaxScaler()
                view_train_valid_scaled = MMS.fit_transform(np.asarray(view_train_valid.iloc[:,:-1]))
                view_train_valid_labels = view_train_valid.iloc[:,-1].values
                    
                model = SVC(C=best_C, kernel=best_kernel, random_state=10) # 选取最好参数重新进行SVC
                model.fit(view_train_valid_scaled, view_train_valid_labels)
                acc, f1, mcc, sensitivity, specificity = score(view_test, model)  # 直接用测试集数据和最好的参数模型进行计算指标

                acc_list.append(acc)
                f1_list.append(f1)
                mcc_list.append(mcc)
                sensitivity_list.append(sensitivity)
                specificity_list.append(specificity)

            acc_mean, acc_std = np.mean(acc_list), np.std(acc_list)
            f1_mean, f1_std = np.mean(f1_list), np.std(f1_list)
            mcc_mean, mcc_std = np.mean(mcc_list), np.std(mcc_list)
            sensitivity_mean, sensitivity_std = np.mean(sensitivity_list), np.std(sensitivity_list)
            specificity_mean, specificity_std = np.mean(specificity_list), np.std(specificity_list)

            with open('svm_n1.txt', mode='a') as Note:
                Note.write(f'{filename_list[k]}:\n')
                Note.write(f'Acc mean: {acc_mean}, Acc std: {acc_std}\n')
                Note.write(f'F1 mean: {f1_mean}, F1 std: {f1_std}\n')
                Note.write(f'MCC mean: {mcc_mean}, MCC std: {mcc_std}\n')
                Note.write(f'Sensitivity mean: {sensitivity_mean}, Sensitivity std: {sensitivity_std}\n')
                Note.write(f'Specificity mean: {specificity_mean}, Specificity std: {specificity_std}\n')


100%|████████████████████████████████████████████████████████████████████████████████| 400/400 [00:06<00:00, 63.26it/s]


In [7]:
if __name__ == '__main__':
    filename_list = ['15(7维)']
    n_components = [7]
    
    for k in range(len(filename_list)):
##############################################################################################
        # 读取模型2的数据
        view1 = pd.read_csv(filename_list[k] + '.csv', header=None)

        # 使用PCA进行降维，创建第二个视图的数据
        modelPCA = PCA(n_components=n_components[k])                             # 开始创立PCA模型，n_components=设置降维后的主成分个数
        PCAdata = modelPCA.fit_transform(view1.copy().iloc[:, :-1])              # copy()设置一个副本，防止对view1做出修改
                                                                                 # fit_transform进行降维
        view = pd.concat([pd.DataFrame(PCAdata), view1.iloc[:, -1]], axis=1)     # 将降维之后的数据和原数据最后一列按列拼接起来

###################################################################################################


        kf1 = KFold(n_splits=5, shuffle=True, random_state=10)
        kf2 = KFold(n_splits=5, shuffle=True, random_state=110)

        C_list = [2**i for i in range(-8, 8)]
        kernel_list = ['linear']

        result_dict = {}  # 初始化 result_dict

        with tqdm(total=len(C_list) * len(kernel_list) * 25) as pbar:
            acc_list = []
            f1_list = []
            mcc_list = []
            sensitivity_list = []
            specificity_list = []

            for train_valid_index, test_index in kf1.split(view):
                view_train_valid = view.iloc[train_valid_index]
                view_test = view.iloc[test_index]

                for C in C_list:
                    for kernel in kernel_list:
                        result_list = []
                        for train_index, valid_index in kf2.split(view_train_valid):
                            view_train = view_train_valid.iloc[train_index]
                            view_valid = view_train_valid.iloc[valid_index]

                            MMS = MinMaxScaler()
                            view_train_scaled = MMS.fit_transform(np.asarray(view_train.iloc[:,:-1]))
                            view_train_labels = view_train.iloc[:,-1].values # .values 将 pandas.Series 转换为 numpy.ndarray。
                        
                            model = SVC(C=C, kernel=kernel, random_state=10)     # 创立SVC（Support Vector Classifier）赋值给model
                            model.fit(view_train_scaled, view_train_labels) # 训练模型，选取的数据是除了最后一列（标签）的所有数据

                            acc, f1, mcc, sensitivity, specificity = score(view_valid, model)  # 确保定义了 score 函数
                            pbar.update(1)
                            result_list.append((acc, f1, mcc, sensitivity, specificity))

                        acc_mean = np.mean([x[0] for x in result_list])
                        f1_mean = np.mean([x[1] for x in result_list])
                        mcc_mean = np.mean([x[2] for x in result_list])
                        sensitivity_mean = np.mean([x[3] for x in result_list])
                        specificity_mean = np.mean([x[4] for x in result_list])

                        result_dict[(C, kernel)] = (acc_mean, f1_mean, mcc_mean, sensitivity_mean, specificity_mean)

                para = sorted(result_dict.items(), key=lambda x: x[1], reverse=True)
                best_C = para[0][0][0]
                best_kernel = para[0][0][1]

                with open('svm_beast_2.txt', mode='a') as param_file:
                    param_file.write(f'{filename_list[k]}, 最优参数: C={best_C}, kernel={best_kernel}\n')

                MMS = MinMaxScaler()
                view_train_valid_scaled = MMS.fit_transform(np.asarray(view_train_valid.iloc[:,:-1]))
                view_train_valid_labels = view_train_valid.iloc[:,-1].values
                    
                model = SVC(C=best_C, kernel=best_kernel, random_state=10) # 选取最好参数重新进行SVC
                model.fit(view_train_valid_scaled, view_train_valid_labels)
                acc, f1, mcc, sensitivity, specificity = score(view_test, model)  # 直接用测试集数据和最好的参数模型进行计算指标

                acc_list.append(acc)
                f1_list.append(f1)
                mcc_list.append(mcc)
                sensitivity_list.append(sensitivity)
                specificity_list.append(specificity)

            acc_mean, acc_std = np.mean(acc_list), np.std(acc_list)
            f1_mean, f1_std = np.mean(f1_list), np.std(f1_list)
            mcc_mean, mcc_std = np.mean(mcc_list), np.std(mcc_list)
            sensitivity_mean, sensitivity_std = np.mean(sensitivity_list), np.std(sensitivity_list)
            specificity_mean, specificity_std = np.mean(specificity_list), np.std(specificity_list)

            with open('svm_n2.txt', mode='a') as Note:
                Note.write(f'{filename_list[k]}:\n')
                Note.write(f'Acc mean: {acc_mean}, Acc std: {acc_std}\n')
                Note.write(f'F1 mean: {f1_mean}, F1 std: {f1_std}\n')
                Note.write(f'MCC mean: {mcc_mean}, MCC std: {mcc_std}\n')
                Note.write(f'Sensitivity mean: {sensitivity_mean}, Sensitivity std: {sensitivity_std}\n')
                Note.write(f'Specificity mean: {specificity_mean}, Specificity std: {specificity_std}\n')

100%|████████████████████████████████████████████████████████████████████████████████| 400/400 [00:05<00:00, 67.19it/s]


In [8]:
if __name__ == '__main__':
    filename_list = ['15(7维)']
    n_components = [7]

    for k in range(len(filename_list)):
##############################################################################################
        # 读取模型3的数据
        view1 = pd.read_csv(filename_list[k] + '.csv', header=None)

        # 使用PCA进行降维，创建第二个视图的数据
        modelPCA = PCA(n_components=n_components[k])
        PCAdata = modelPCA.fit_transform(view1.copy().iloc[:, :-1])
        view2 = pd.concat([pd.DataFrame(PCAdata), view1.iloc[:, -1]], axis=1)

        # 合并原始数据和 PCA 数据
        view = pd.concat([view1.iloc[:, :-1], view2], axis=1) #保持最后的变量是view即可                      对比合并看结果！
###################################################################################################

        # 数据最小最大缩放
        MMS = MinMaxScaler()
        view_scaled = MMS.fit_transform(view.iloc[:, :-1])
        view_scaled = pd.DataFrame(view_scaled, columns=view.columns[:-1])
        view_scaled[view.columns[-1]] = view.iloc[:, -1]  # 保持标签列不变
        

        kf1 = KFold(n_splits=5, shuffle=True, random_state=10)
        kf2 = KFold(n_splits=5, shuffle=True, random_state=110)

        C_list = [2**i for i in range(-8, 8)]
        kernel_list = ['linear']

        result_dict = {}  # 初始化 result_dict

        with tqdm(total=len(C_list) * len(kernel_list) * 25) as pbar:
            acc_list = []
            f1_list = []
            mcc_list = []
            sensitivity_list = []
            specificity_list = []

            for train_valid_index, test_index in kf1.split(view):
                view_train_valid = view.iloc[train_valid_index]
                view_test = view.iloc[test_index]

                for C in C_list:
                    for kernel in kernel_list:
                        result_list = []
                        for train_index, valid_index in kf2.split(view_train_valid):
                            view_train = view_train_valid.iloc[train_index]
                            view_valid = view_train_valid.iloc[valid_index]

                            MMS = MinMaxScaler()
                            view_train_scaled = MMS.fit_transform(np.asarray(view_train.iloc[:,:-1]))
                            view_train_labels = view_train.iloc[:,-1].values
                        
                        
                            model = SVC(C=C, kernel=kernel, random_state=10)     # 创立SVC（Support Vector Classifier）赋值给model
                            model.fit(view_train_scaled, view_train_labels) # 训练模型，选取的数据是除了最后一列（标签）的所有数据


                            acc, f1, mcc, sensitivity, specificity = score(view_valid, model)  # 确保定义了 score 函数
                            pbar.update(1)
                            result_list.append((acc, f1, mcc, sensitivity, specificity))

                        acc_mean = np.mean([x[0] for x in result_list])
                        f1_mean = np.mean([x[1] for x in result_list])
                        mcc_mean = np.mean([x[2] for x in result_list])
                        sensitivity_mean = np.mean([x[3] for x in result_list])
                        specificity_mean = np.mean([x[4] for x in result_list])

                        result_dict[(C, kernel)] = (acc_mean, f1_mean, mcc_mean, sensitivity_mean, specificity_mean)

                para = sorted(result_dict.items(), key=lambda x: x[1], reverse=True)
                best_C = para[0][0][0]
                best_kernel = para[0][0][1]

                with open('svm_beast_3.txt', mode='a') as param_file:
                    param_file.write(f'{filename_list[k]}, 最优参数: C={best_C}, kernel={best_kernel}\n')

                MMS = MinMaxScaler()
                view_train_valid_scaled = MMS.fit_transform(np.asarray(view_train_valid.iloc[:,:-1]))
                view_train_valid_labels = view_train_valid.iloc[:,-1].values
                    
                model = SVC(C=best_C, kernel=best_kernel, random_state=10) # 选取最好参数重新进行SVC
                model.fit(view_train_valid_scaled, view_train_valid_labels)
                acc, f1, mcc, sensitivity, specificity = score(view_test, model)  # 直接用测试集数据和最好的参数模型进行计算指标

                acc_list.append(acc)
                f1_list.append(f1)
                mcc_list.append(mcc)
                sensitivity_list.append(sensitivity)
                specificity_list.append(specificity)

            acc_mean, acc_std = np.mean(acc_list), np.std(acc_list)
            f1_mean, f1_std = np.mean(f1_list), np.std(f1_list)
            mcc_mean, mcc_std = np.mean(mcc_list), np.std(mcc_list)
            sensitivity_mean, sensitivity_std = np.mean(sensitivity_list), np.std(sensitivity_list)
            specificity_mean, specificity_std = np.mean(specificity_list), np.std(specificity_list)

            with open('svm_n3.txt', mode='a') as Note:
                Note.write(f'{filename_list[k]}:\n')
                Note.write(f'Acc mean: {acc_mean}, Acc std: {acc_std}\n')
                Note.write(f'F1 mean: {f1_mean}, F1 std: {f1_std}\n')
                Note.write(f'MCC mean: {mcc_mean}, MCC std: {mcc_std}\n')
                Note.write(f'Sensitivity mean: {sensitivity_mean}, Sensitivity std: {sensitivity_std}\n')
                Note.write(f'Specificity mean: {specificity_mean}, Specificity std: {specificity_std}\n')

100%|████████████████████████████████████████████████████████████████████████████████| 400/400 [00:06<00:00, 58.05it/s]


In [9]:
run_time = end_time - start_time    # 程序的运行时间，单位为秒
print(run_time)

NameError: name 'end_time' is not defined