In [1]:
from scipy.io import arff
import nbimporter
import pandas as pd
import numpy as np
from scipy.stats import norm
import random
# random.seed(42)
from Logistic_Regression import LogisticRegression
from Naive_Bayes import NaiveBayes
from prepare_for_training import prepare_for_training
from sigmoid import sigmoid
from operator import itemgetter
import copy
import time





# 讀取.arff資料集

In [2]:
def read_arff(file):
    """Load an ARFF dataset into a pandas DataFrame.

    Args:
        file: Path or open file-like object accepted by
            ``scipy.io.arff.loadarff``.

    Returns:
        pandas.DataFrame built from the ARFF records. The attribute metadata
        returned by ``loadarff`` is discarded.
    """
    records, _meta = arff.loadarff(file)
    return pd.DataFrame(records)

# 讀取.data的資料集

In [3]:
def read_data(file,sep_type = ",",header = 0):
    """Read a delimited text dataset (e.g. a ``.data`` file) into a DataFrame.

    Args:
        file: Path or file-like object handed to ``pandas.read_csv``.
        sep_type: Field separator; defaults to a comma.
        header: Row number to use as column names (``None`` for no header
            row); defaults to the first row.

    Returns:
        pandas.DataFrame with the parsed contents.
    """
    return pd.read_csv(file, sep=sep_type, header=header)

# 資料切割

In [None]:
def data_split(X,y,k):
    """Split X and y into k contiguous, equally sized cross-validation folds.

    Fold ``i`` uses rows ``[i * fold_size, (i + 1) * fold_size)`` as the test
    set, where ``fold_size = len(X) // k``; every other row is training data.
    When ``len(X)`` is not a multiple of ``k`` the trailing remainder rows are
    never part of a test set and always stay in the training data.

    Args:
        X: pandas DataFrame of features (positional ``.iloc`` slicing is used).
        y: pandas DataFrame/Series of labels aligned row-by-row with ``X``.
        k: Number of folds; must be at least 1.

    Returns:
        List of ``k`` entries ``[x_train, y_train, x_test, y_test]``, each a
        numpy array.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be a positive integer, got %r" % (k,))

    fold_size = X.shape[0] // k
    result = []

    for fold in range(k):
        start = fold * fold_size
        end = start + fold_size

        # Training set: everything outside the current test window.
        x_train = pd.concat([X.iloc[:start], X.iloc[end:]], axis=0).values
        y_train = pd.concat([y.iloc[:start], y.iloc[end:]], axis=0).values

        # Test set: the current window.
        x_test = X.iloc[start:end].values
        y_test = y.iloc[start:end].values

        result.append([x_train, y_train, x_test, y_test])

    return result
    
    
    

# 羅吉斯迴歸

In [4]:
def LR(X,y,k,num_iterations,alpha):
    """Evaluate a single logistic-regression model with k-fold cross-validation.

    Folds are contiguous blocks of ``len(X) // k`` rows; trailing remainder
    rows are never tested and always belong to the training data.

    Args:
        X: pandas DataFrame of features.
        y: pandas DataFrame of 0/1 labels aligned with ``X``
            (presumably a single column so that it compares element-wise with
            the prediction column — TODO confirm).
        k: Number of folds.
        num_iterations: Iteration count forwarded to ``LogisticRegression.train``.
        alpha: Learning rate forwarded to ``LogisticRegression.train``.

    Returns:
        Tuple ``(accuracies, avg_accuracy)``: the per-fold test accuracies and
        their mean.
    """
    num_examples = X.shape[0]
    fold_size = num_examples // k
    accuracies = []

    for i in range(k):
        start = i * fold_size
        end = (i + 1) * fold_size

        # Test fold.
        x_test = X.iloc[start:end].values
        y_test = y.iloc[start:end].values

        # Preprocess the test fold.
        # NOTE(review): the test fold is preprocessed on its own here; whether
        # the training data gets the same treatment inside LogisticRegression
        # (and with which statistics) is not visible in this cell — confirm.
        x_test, _, _ = prepare_for_training(x_test, True)

        # Training data: every row outside the test fold.
        x_train = pd.concat([X.iloc[:start], X.iloc[end:]], axis=0).values
        y_train = pd.concat([y.iloc[:start], y.iloc[end:]], axis=0).values

        # Uniform per-example weights.
        data_weight = np.ones((x_train.shape[0], 1))

        # Fit the logistic-regression model.
        logistic_regression = LogisticRegression(x_train, y_train, data_weight)
        theta, _ = logistic_regression.train(alpha, num_iterations)

        print("-"*79)

        # Predict on the test fold. A probability of exactly 0.5 is assigned
        # to class 1 so that every prediction is a valid label (previously it
        # stayed at 0.5 and could match neither class).
        probabilities = LogisticRegression.hypothesis(x_test, theta)
        prediction = (probabilities >= 0.5).astype(float)

        # Fold accuracy.
        accuracies.append(np.mean(prediction == y_test))

    avg_accuracy = sum(accuracies) / len(accuracies)
    return accuracies,avg_accuracy

# 袋裝法-羅吉斯迴歸

In [5]:
def bagging(X,y,k,num_iterations,alpha,size):
    """Evaluate a bagged logistic-regression ensemble with k-fold cross-validation.

    For every fold, ``size`` base models are trained on bootstrap samples
    (drawn with replacement, same size as the training data) and combined by
    majority vote; ties go to class 1.

    Args:
        X: pandas DataFrame of features.
        y: pandas DataFrame of 0/1 labels (two-dimensional: rows are selected
            with ``y_train[random_indices, :]``).
        k: Number of folds (contiguous blocks of ``len(X) // k`` rows).
        num_iterations: Iteration count forwarded to ``LogisticRegression.train``.
        alpha: Learning rate forwarded to ``LogisticRegression.train``.
        size: Number of base models per fold.

    Returns:
        Tuple ``(accuracies, avg_accuracy, base_model_record,
        base_model_prediction_record)``: per-fold test accuracies, their mean,
        the coefficient vectors of every base model per fold, and the
        ``(fold_size, size)`` matrix of base-model test predictions per fold.
    """
    num_examples = X.shape[0]
    fold_size = num_examples // k
    accuracies = []
    # Base models of every fold.
    base_model_record = []
    # Base-model test predictions of every fold.
    base_model_prediction_record = []

    for i in range(k):
        start = i * fold_size
        end = (i + 1) * fold_size

        # Test fold.
        x_test = X.iloc[start:end].values
        y_test = y.iloc[start:end].values

        # Training data: every row outside the test fold.
        x_train = pd.concat([X.iloc[:start], X.iloc[end:]], axis=0).values
        y_train = pd.concat([y.iloc[:start], y.iloc[end:]], axis=0).values

        # Preprocess the test fold.
        # NOTE(review): preprocessed independently of the training data;
        # confirm this matches what LogisticRegression does internally.
        x_test, _, _ = prepare_for_training(x_test, True)

        # Bootstrap sample size: 100% of the training data.
        sample_size = x_train.shape[0]

        # Uniform weights, shared by every bootstrap sample.
        data_weight = np.ones((sample_size, 1))

        # Base models of the current fold.
        temp_base_model_record = []
        for _ in range(size):
            # Draw a bootstrap sample (with replacement).
            random_indices = np.random.choice(x_train.shape[0], size=sample_size, replace=True)
            x_train_subset = x_train[random_indices, :]
            y_train_subset = y_train[random_indices, :]

            # Fit one logistic-regression base model.
            logistic_regression = LogisticRegression(x_train_subset, y_train_subset, data_weight)
            theta, _ = logistic_regression.train(alpha, num_iterations)
            temp_base_model_record.append(theta)

        # Every base model predicts the test fold; a probability of exactly
        # 0.5 counts as class 1 (it was already counted as a "1" vote before,
        # but used to be stored as 0.5 in the prediction record).
        final_ensemble_base_learner_prediction = np.zeros((fold_size, size))
        for current_index, theta in enumerate(temp_base_model_record):
            probabilities = sigmoid(np.dot(x_test, theta))
            final_ensemble_base_learner_prediction[:, current_index] = (probabilities >= 0.5).astype(float).ravel()

        base_model_prediction_record.append(final_ensemble_base_learner_prediction)

        # Majority vote per test row; class 0 wins only with a strict majority.
        one_votes = np.count_nonzero(final_ensemble_base_learner_prediction, axis=1)
        zero_votes = size - one_votes
        final_erlr_prediction = np.where(zero_votes > one_votes, 0.0, 1.0).reshape(-1, 1)

        # Fold accuracy and bookkeeping.
        accuracies.append(np.mean(final_erlr_prediction == y_test))
        base_model_record.append(temp_base_model_record)

    avg_accuracy = sum(accuracies) / len(accuracies)
    return accuracies,avg_accuracy,base_model_record,base_model_prediction_record

# ERLR

In [6]:
def erlr_base_model(X,features_deviation):
    """Draw a random logistic-regression coefficient vector.

    One uniform number in [0, 1) is drawn per column of ``X`` and mapped
    through the inverse standard-normal CDF, so every coefficient is a
    standard-normal sample.

    Args:
        X: 2-D array; only ``X.shape[1]`` (the number of coefficients) is used.
        features_deviation: Unused. Kept so existing callers keep working; the
            earlier implementation reshaped it but never used the result.

    Returns:
        numpy array of shape ``(X.shape[1], 1)`` with the sampled coefficients.
    """
    # Uniform samples in [0, 1), one per coefficient.
    uniform_samples = np.random.rand(X.shape[1], 1)

    # Inverse standard-normal CDF turns them into regression coefficients.
    return norm.ppf(uniform_samples)

In [7]:
#初始分類門檻值
def initial_threshold(X,y,size = 5000):
    """Estimate the initial accuracy threshold for ERLR.

    Generates ``size`` random logistic-regression models and returns the best
    accuracy any of them reaches on the (preprocessed) data.

    Args:
        X: pandas DataFrame of features.
        y: pandas DataFrame of 0/1 labels aligned with ``X``.
        size: Number of random models to try.

    Returns:
        The highest accuracy observed among the random models.

    Raises:
        ValueError: If ``size`` is 0 (there is no accuracy to take the
            maximum of).
    """
    X = X.values
    y = y.values

    # Preprocess the data set.
    (X, features_mean, features_deviation) = prepare_for_training(X)

    # Only the accuracies are kept; the sampled coefficient vectors were
    # previously copied into a list that was never read.
    temp_accuracies = []
    for _ in range(size):
        theta = erlr_base_model(X, features_deviation)

        # A probability of exactly 0.5 is assigned to class 1.
        predictions = (sigmoid(np.dot(X, theta)) >= 0.5).astype(float)
        temp_accuracies.append(np.mean(predictions == y))

    return max(temp_accuracies)

In [8]:
def erlr(X,y,k,size,num_iterations):
    """Evaluate the ERLR ensemble (random logistic-regression models) with k-fold CV.

    For every fold the accuracy threshold starts at ``initial_threshold(X, y)``.
    In each of ``num_iterations`` rounds, random base models are generated
    until ``size`` of them beat the threshold on the training data; the
    threshold is then raised to the mean training accuracy of the accepted
    models. The round whose ensemble has the best training accuracy is used to
    predict the test fold by majority vote (ties go to class 1).

    Args:
        X: pandas DataFrame of features.
        y: pandas DataFrame of 0/1 labels aligned with ``X``.
        k: Number of folds (contiguous blocks of ``len(X) // k`` rows).
        size: Number of base models per ensemble.
        num_iterations: Number of threshold-raising rounds per fold.

    Returns:
        Tuple ``(accuracies, avg_accuracy, r_accuracies_record)``: per-fold
        test accuracies, their mean, and for every fold the list of ensemble
        training accuracies of each round. (The last element used to be
        returned empty because nothing was ever appended to it.)
    """
    # Number of rows and size of one fold.
    num_examples = X.shape[0]
    fold_size = num_examples // k
    # Per-fold list of the ensemble training accuracy of every round.
    r_accuracies_record = []
    # Test accuracy of every fold.
    accuracies = []

    for i in range(k):
        start = i * fold_size
        end = (i + 1) * fold_size

        # Test fold, preprocessed on its own.
        x_test = X.iloc[start:end].values
        y_test = y.iloc[start:end].values
        x_test, _, _ = prepare_for_training(x_test, True)

        # Training data: every row outside the test fold, preprocessed.
        x_train = pd.concat([X.iloc[:start], X.iloc[end:]], axis=0).values
        y_train = pd.concat([y.iloc[:start], y.iloc[end:]], axis=0).values
        (x_train, features_mean, features_deviation) = prepare_for_training(x_train, True)

        # Initial accuracy threshold.
        # NOTE(review): computed on the full X/y, i.e. including the current
        # test fold — confirm this is intended.
        threshold = initial_threshold(X, y)

        # Ensemble training accuracy and base models of every round.
        r_accuracies = []
        r_model_record = []

        # Raise the threshold round by round.
        for iteration in range(num_iterations):
            # Coefficients and training accuracies of the accepted models.
            temp_r_model = []
            x_train_accuracies = []

            # Training predictions of the accepted models, one column each.
            current_base_learners_prediction = np.zeros((x_train.shape[0], size))

            print("當前分類正確率門檻值:",threshold)

            # Keep sampling until `size` models beat the threshold.
            # NOTE(review): there is no cap on the number of attempts, so a
            # threshold that random models cannot beat loops forever.
            while len(temp_r_model) < size:
                # Sample a random logistic-regression model.
                theta = erlr_base_model(x_train, features_deviation)

                # Score it on the training data (0.5 counts as class 1).
                x_train_predictions = (sigmoid(np.dot(x_train, theta)) >= 0.5).astype(float)
                x_train_acc = np.mean(x_train_predictions == y_train)

                # Accept the model only if it beats the threshold.
                if x_train_acc > threshold:
                    current_base_learners_prediction[:, len(temp_r_model)] = x_train_predictions.ravel()
                    temp_r_model.append(theta)

                    print(iteration,len(temp_r_model))
                    print("-"*70)

                    x_train_accuracies.append(x_train_acc)

            # Remember this round's ensemble.
            r_model_record.append(temp_r_model)

            # Majority vote on the training data; class 0 needs a strict majority.
            one_votes = np.count_nonzero(current_base_learners_prediction, axis=1)
            zero_votes = size - one_votes
            current_erlr_prediction = np.where(zero_votes > one_votes, 0.0, 1.0).reshape(-1, 1)

            # Ensemble accuracy on the training data.
            acc = np.mean(current_erlr_prediction == y_train)
            print(f"第{iteration}次更新門檻值，當前門檻值:{threshold} ,當前集成模型在訓練集上的正確率:{acc}")
            r_accuracies.append(acc)

            # New threshold: mean training accuracy of the accepted models.
            threshold = sum(x_train_accuracies)/len(x_train_accuracies)

        # Keep the per-round ensemble accuracies of this fold.
        r_accuracies_record.append(r_accuracies)

        # Pick the round whose ensemble scored best on the training data.
        best_ensemble_model_acc = max(r_accuracies)
        max_index = r_accuracies.index(best_ensemble_model_acc)
        best_ensemble = r_model_record[max_index]

        # Every base model of that ensemble predicts the test fold.
        final_ensemble_base_learner_prediction = np.zeros((fold_size, size))
        for current_index in range(size):
            probabilities = sigmoid(np.dot(x_test, best_ensemble[current_index]))
            final_ensemble_base_learner_prediction[:, current_index] = (probabilities >= 0.5).astype(float).ravel()

        # Final majority vote on the test fold.
        one_votes = np.count_nonzero(final_ensemble_base_learner_prediction, axis=1)
        zero_votes = size - one_votes
        final_erlr_prediction = np.where(zero_votes > one_votes, 0.0, 1.0).reshape(-1, 1)

        # Test accuracy of the final ensemble.
        final_ensemble_acc = np.mean(final_erlr_prediction == y_test)
        print(f"當前交叉驗證:{i} ,最終集成模型在測試集上的正確率:{final_ensemble_acc}")

        accuracies.append(final_ensemble_acc)
    avg_accuracy = sum(accuracies) / len(accuracies)
    return accuracies,avg_accuracy,r_accuracies_record

# 粒子群優化演算法 - 羅吉斯迴歸

In [9]:
def particle_swarm_optimization(X,y,n,threshold,best_model = None):
    """Search logistic-regression coefficients with particle swarm optimisation.

    Each particle is a coefficient vector in ``[-2, 2]^d``; its fitness is the
    classification accuracy on ``(X, y)``. The swarm runs for at most 100
    generations and stops early once every particle's personal best position
    is identical.

    The inertia weight decreases linearly from 0.9 to 0.4 over the
    generations. The earlier expression ``0.5 / generation-1`` evaluated to
    the constant -0.995 because of operator precedence, which made the
    inertia negative from the second generation on.

    Args:
        X: 2-D array of preprocessed features, shape ``(m, d)``.
        y: Array of 0/1 labels, compared element-wise with the ``(m, 1)``
            prediction column.
        n: Number of particles.
        threshold: Unused; kept for interface compatibility with callers.
        best_model: Optional coefficient vector used as the first particle's
            starting position (e.g. the best particle of a previous run).

    Returns:
        Tuple ``(gbest, gbest_fitness)``: the best coefficient vector found
        (1-D, length ``d``) and its accuracy. If no particle ever reaches an
        accuracy above 0, a zero vector and 0 are returned.
    """
    # Search range of the particle positions.
    x_min, x_max = -2, 2
    # Velocity limits.
    v_min, v_max = -2, 2
    # Inertia weight schedule (linear decrease).
    w_start, w_end = 0.9, 0.4
    # Cognitive (self) and social (swarm) learning factors.
    c1 = 2
    c2 = 2
    # Dimensionality of a particle.
    d = X.shape[1]
    # Maximum number of generations.
    generation = 100

    # Position of every particle.
    particle_x_record = np.random.uniform(x_min, x_max, size=(n, d))

    # Optionally seed the swarm with the best particle of a previous run.
    if best_model is not None:
        particle_x_record[0] = best_model

    # Velocity of every particle.
    particle_v_record = np.random.uniform(v_min, v_max, size=(n, d))
    # Personal best position / fitness of every particle.
    pbest = np.zeros((n, d))
    pbest_fitness = np.zeros(n)
    # Best position / fitness of the whole swarm.
    gbest = np.zeros(d)
    gbest_fitness = 0

    for gen in range(generation):
        # Inertia weight for this generation: 0.9 at the start, 0.4 at the end.
        w = w_start - (w_start - w_end) * gen / (generation - 1)

        # Evaluate the fitness (training accuracy) of every particle.
        for i in range(n):
            probabilities = sigmoid(np.dot(X, particle_x_record[i].reshape(d, 1)))
            # A probability of exactly 0.5 is assigned to class 1.
            predictions = (probabilities >= 0.5).astype(float)
            fitness = np.mean(predictions == y)
            # Update the personal best on strict improvement.
            if fitness > pbest_fitness[i]:
                pbest_fitness[i] = fitness
                pbest[i] = particle_x_record[i]

        # Update the swarm best on strict improvement.
        max_index = np.argmax(pbest_fitness)
        if pbest_fitness[max_index] > gbest_fitness:
            # Copy: pbest rows are overwritten in later generations.
            gbest = pbest[max_index].copy()
            gbest_fitness = pbest_fitness[max_index]

        # Converged once every personal best equals the first one.
        if np.all(pbest == pbest[0]):
            break

        # Velocity update; r1/r2 are shared by all particles and dimensions.
        r1 = np.random.random()
        r2 = np.random.random()
        particle_v_record = (
            w * particle_v_record
            + c1 * r1 * (pbest - particle_x_record)
            + c2 * r2 * (gbest - particle_x_record)
        )

        # Components outside the velocity limits are re-drawn uniformly.
        out_of_range = (particle_v_record < v_min) | (particle_v_record > v_max)
        particle_v_record[out_of_range] = np.random.uniform(
            v_min, v_max, size=np.count_nonzero(out_of_range)
        )

        # Position update.
        particle_x_record = particle_x_record + particle_v_record

        # Components outside the search range are re-drawn uniformly.
        out_of_range = (particle_x_record < x_min) | (particle_x_record > x_max)
        particle_x_record[out_of_range] = np.random.uniform(
            x_min, x_max, size=np.count_nonzero(out_of_range)
        )

    return gbest,gbest_fitness

# PSO-ERLR

In [10]:
def pso_erlr(X, y, k = 5, size = 25, num_iterations = 5):
    """Evaluate the PSO-ERLR ensemble with k-fold cross-validation.

    For every fold the accuracy threshold starts at 0. In each of
    ``num_iterations`` rounds, base models are produced by
    ``particle_swarm_optimization`` until ``size`` of them beat the threshold
    on the training data. A base model gets up to 5 PSO runs (each seeded
    with the best particle of the previous run); if none beats the threshold
    the round is abandoned and no further rounds are run for the fold. After
    a complete round the threshold is raised to the mean training accuracy of
    the accepted models. The complete round with the best ensemble training
    accuracy predicts the test fold by majority vote (ties go to class 1).

    Args:
        X: pandas DataFrame of features.
        y: pandas DataFrame of 0/1 labels aligned with ``X``.
        k: Number of folds (contiguous blocks of ``len(X) // k`` rows).
        size: Number of base models per ensemble.
        num_iterations: Maximum number of threshold-raising rounds per fold.

    Returns:
        Tuple ``(accuracies, avg_accuracy, r_accuracies_record,
        base_model_record, base_model_prediction_record)``: per-fold test
        accuracies, their mean, an empty list (per-round recording is
        disabled), the base models of the chosen ensemble per fold, and the
        ``(fold_size, size)`` matrix of their test predictions per fold.
        (``base_model_record`` used to be returned empty because nothing was
        ever appended to it.)

    Raises:
        ValueError: If no round of a fold produced a complete ensemble.
    """
    # Number of rows and size of one fold.
    num_examples = X.shape[0]
    fold_size = num_examples // k
    # Per-round ensemble accuracies; recording is intentionally left disabled.
    r_accuracies_record = []
    # Test accuracy of every fold.
    accuracies = []
    # Base models of the chosen ensemble of every fold.
    base_model_record = []
    # Test predictions of those base models, per fold.
    base_model_prediction_record = []

    # k-fold cross-validation.
    for i in range(k):
        start = i * fold_size
        end = (i + 1) * fold_size

        # Test fold.
        x_test = X.iloc[start:end].values
        y_test = y.iloc[start:end].values

        # Training data: every row outside the test fold.
        x_train = pd.concat([X.iloc[:start], X.iloc[end:]], axis=0).values
        y_train = pd.concat([y.iloc[:start], y.iloc[end:]], axis=0).values

        # Preprocess the test fold and the training data (each on its own).
        x_test, _, _ = prepare_for_training(x_test, True)
        x_train, _, _ = prepare_for_training(x_train, True)

        # Accuracy threshold.
        threshold = 0

        # Ensemble training accuracy and base models of every complete round.
        r_accuracies = []
        r_model_record = []

        # Raise the threshold round by round.
        for iteration in range(num_iterations):
            # Coefficients and training accuracies of the accepted models.
            temp_r_model = []
            x_train_accuracies = []

            # Training predictions of the accepted models, one column each.
            current_base_learners_prediction = np.zeros((x_train.shape[0], size))

            # Collect base models until the ensemble is complete.
            while len(temp_r_model) < size:
                best_model = None
                accepted = False

                # Up to 5 PSO runs for one base model.
                for _ in range(5):
                    best_model_theta, best_model_acc = particle_swarm_optimization(
                        x_train, y_train, 50, threshold, best_model
                    )
                    if best_model_acc > threshold:
                        # Training predictions of the accepted model
                        # (a probability of exactly 0.5 counts as class 1).
                        probabilities = sigmoid(np.dot(x_train, best_model_theta.T))
                        current_base_learners_prediction[:, len(temp_r_model)] = (
                            (probabilities >= 0.5).astype(float).ravel()
                        )
                        temp_r_model.append(best_model_theta)
                        x_train_accuracies.append(best_model_acc)
                        accepted = True
                        break
                    # Seed the next run with the best particle found so far.
                    best_model = best_model_theta

                # Give up on this round when 5 runs failed to beat the threshold.
                if not accepted:
                    break

            # An incomplete ensemble ends the search for this fold.
            if len(temp_r_model) < size:
                break

            # Remember this round's ensemble.
            r_model_record.append(temp_r_model)

            # Majority vote on the training data; class 0 needs a strict majority.
            one_votes = np.count_nonzero(current_base_learners_prediction, axis=1)
            zero_votes = size - one_votes
            current_pso_erlr_prediction = np.where(zero_votes > one_votes, 0.0, 1.0).reshape(-1, 1)

            # Ensemble accuracy on the training data.
            acc = np.mean(current_pso_erlr_prediction == y_train)
            r_accuracies.append(acc)

            # New threshold: mean training accuracy of the accepted models.
            threshold = sum(x_train_accuracies)/len(x_train_accuracies)

        # Without a single complete round there is nothing to evaluate.
        if not r_accuracies:
            raise ValueError(
                "pso_erlr: no complete ensemble of %d base models was found in fold %d" % (size, i)
            )

        # Pick the round whose ensemble scored best on the training data.
        best_ensemble_model_acc = max(r_accuracies)
        max_index = r_accuracies.index(best_ensemble_model_acc)
        best_ensemble = r_model_record[max_index]

        # Every base model of that ensemble predicts the test fold.
        final_ensemble_base_learner_prediction = np.zeros((fold_size, size))
        for current_index in range(size):
            probabilities = sigmoid(np.dot(x_test, best_ensemble[current_index].T))
            final_ensemble_base_learner_prediction[:, current_index] = (
                (probabilities >= 0.5).astype(float).ravel()
            )

        base_model_record.append(best_ensemble)
        base_model_prediction_record.append(final_ensemble_base_learner_prediction)

        # Final majority vote on the test fold.
        one_votes = np.count_nonzero(final_ensemble_base_learner_prediction, axis=1)
        zero_votes = size - one_votes
        final_pso_erlr_prediction = np.where(zero_votes > one_votes, 0.0, 1.0).reshape(-1, 1)

        # Test accuracy of the final ensemble.
        final_ensemble_acc = np.mean(final_pso_erlr_prediction == y_test)

        accuracies.append(final_ensemble_acc)
    avg_accuracy = sum(accuracies) / len(accuracies)
    return accuracies,avg_accuracy,r_accuracies_record,base_model_record,base_model_prediction_record

# Naive Bayes

In [11]:
def NB(X,y,k,is_categorical = False):
    """Evaluate a single Naive Bayes model with k-fold cross-validation.

    Args:
        X: pandas DataFrame of features.
        y: pandas DataFrame/Series of class labels aligned with ``X``.
        k: Number of folds (contiguous blocks of ``len(X) // k`` rows).
        is_categorical: When True, the possible values of each feature are the
            distinct values found in ``X``; otherwise every feature is taken
            to have the ten values 0..9 (10 equal-width bins).

    Returns:
        Tuple ``(accuracies, avg_accuracy)``: the per-fold test accuracies and
        their mean.
    """
    row_count = X.shape[0]
    column_count = X.shape[1]

    # Possible values of every feature, keyed by column position.
    if is_categorical:
        feature_info = {col: list(X.iloc[:, col].unique()) for col in range(column_count)}
    else:
        feature_info = {col: list(range(10)) for col in range(column_count)}

    fold_size = row_count // k
    accuracies = []

    for fold in range(k):
        lo = fold * fold_size
        hi = (fold + 1) * fold_size

        # Hold out the current fold; train on everything else.
        x_test = X.iloc[lo:hi].values
        y_test = y.iloc[lo:hi].values
        x_train = pd.concat([X.iloc[:lo], X.iloc[hi:]], axis=0).values
        y_train = pd.concat([y.iloc[:lo], y.iloc[hi:]], axis=0).values

        # Fit the Naive Bayes model.
        model = NaiveBayes(x_train, y_train, feature_info)
        class_prior, likelihood, all_class = model.fit()

        # Predict the held-out fold and score it.
        prediction = NaiveBayes.predict(x_test, class_prior, likelihood, all_class)
        accuracies.append(np.mean(prediction == y_test))

    avg_accuracy = sum(accuracies) / len(accuracies)
    return accuracies,avg_accuracy

# 袋裝法 - 簡易貝氏

In [12]:
def nb_bagging(X,y,k,size,is_categorical = False):
    """Evaluate a bagged Naive Bayes ensemble with k-fold cross-validation.

    For every fold, ``size`` base models are trained on bootstrap samples
    (drawn with replacement, same size as the training data) and combined by
    plurality vote; on a tie the class listed first in ``all_class`` wins.

    Args:
        X: pandas DataFrame of features.
        y: pandas DataFrame of class labels (two-dimensional; the first column
            holds the label). Labels must be numeric because predictions are
            stored in a float array and converted with ``int`` when voting.
        k: Number of folds (contiguous blocks of ``len(X) // k`` rows).
        size: Number of base models per fold.
        is_categorical: When True, the possible values of each feature are the
            distinct values found in ``X``; otherwise every feature is taken
            to have the ten values 0..9 (10 equal-width bins).

    Returns:
        Tuple ``(accuracies, avg_accuracy, base_model_record,
        base_model_prediction_record)``: per-fold test accuracies, their mean,
        the ``[class_prior, likelihood]`` pair of every base model per fold,
        and the ``(fold_size, size)`` matrix of base-model test predictions
        per fold.
    """
    num_examples = X.shape[0]
    fold_size = num_examples // k
    accuracies = []
    # Base models of every fold.
    base_model_record = []
    # Base-model test predictions of every fold.
    base_model_prediction_record = []

    # Possible values of every feature, keyed by column position.
    if is_categorical:
        feature_info = {col: list(X.iloc[:, col].unique()) for col in range(X.shape[1])}
    else:
        feature_info = {col: list(range(10)) for col in range(X.shape[1])}

    # All class labels; replaced by the labels reported by each fitted model.
    all_class = y.iloc[:,0].unique()

    for i in range(k):
        start = i * fold_size
        end = (i + 1) * fold_size

        # Test fold.
        x_test = X.iloc[start:end].values
        y_test = y.iloc[start:end].values

        # Training data: every row outside the test fold.
        x_train = pd.concat([X.iloc[:start], X.iloc[end:]], axis=0).values
        y_train = pd.concat([y.iloc[:start], y.iloc[end:]], axis=0).values

        # Bootstrap sample size: 100% of the training data.
        sample_size = x_train.shape[0]

        # Base models of the current fold.
        temp_base_model_record = []
        for _ in range(size):
            # Draw a bootstrap sample (with replacement).
            random_indices = np.random.choice(x_train.shape[0], size=sample_size, replace=True)
            x_train_subset = x_train[random_indices, :]
            y_train_subset = y_train[random_indices, :]

            # Fit one Naive Bayes base model.
            naive_bayes = NaiveBayes(x_train_subset, y_train_subset, feature_info)
            class_prior, likelihood, all_class = naive_bayes.fit()

            temp_base_model_record.append([class_prior, likelihood])

        # Every base model predicts the test fold.
        # NOTE(review): this uses the notebook-level `predict` helper rather
        # than NaiveBayes.predict — confirm both expect the same likelihood
        # layout.
        final_ensemble_base_learner_prediction = np.zeros((fold_size, size))
        final_prediction = np.zeros((fold_size, 1))
        for current_index in range(size):
            current_class_prior, current_likelihoods = temp_base_model_record[current_index]
            base_prediction = predict(x_test, all_class, current_likelihoods, current_class_prior)
            final_ensemble_base_learner_prediction[:, current_index] = base_prediction.ravel()

        base_model_prediction_record.append(final_ensemble_base_learner_prediction)

        # Plurality vote per test row.
        for row in range(final_ensemble_base_learner_prediction.shape[0]):
            votes = {key: 0 for key in all_class}
            for ele in final_ensemble_base_learner_prediction[row]:
                votes[int(ele)] += 1
            # max() returns the first key with the highest count.
            final_prediction[row] = max(votes, key=votes.get)

        # Fold accuracy and bookkeeping.
        accuracies.append(np.mean(final_prediction == y_test))
        base_model_record.append(temp_base_model_record)

    avg_accuracy = sum(accuracies) / len(accuracies)
    return accuracies,avg_accuracy,base_model_record,base_model_prediction_record

# TRENB

In [13]:
#隨機產生prior probability #add的目的用來產生粒子群的速度
def random_prior_probability(all_class,add = False):
    """Draw a random, unnormalised prior weight for every class.

    Args:
        all_class: Iterable of class labels.
        add: When True, each draw is scaled by 0.1 (used to build small
            increments such as particle velocities).

    Returns:
        Dict mapping each class label to ``rand + 1e-10`` (times 0.1 when
        ``add`` is True), so no weight is ever exactly zero.
    """
    if add:
        return {label: (np.random.rand() + 1e-10)*1e-1 for label in all_class}
    return {label: np.random.rand() + 1e-10 for label in all_class}

In [14]:
#隨機產生likelihood #add的目的用來產生粒子群的速度
def random_likelihood(num_features,feature_info,all_class,add = False):
    """Draw a random, unnormalised likelihood table.

    Args:
        num_features: Number of features; features are indexed 0..num_features-1.
        feature_info: Mapping from feature index to the list of values that
            feature can take.
        all_class: Iterable of class labels.
        add: When True, each draw is scaled by 0.1 (used to build small
            increments such as particle velocities).

    Returns:
        Dict keyed by ``"<feature>_<value>"`` whose values are dicts mapping
        each class label to ``rand + 1e-10`` (times 0.1 when ``add`` is True).
    """
    def draw():
        # One strictly positive random weight.
        if add:
            return (np.random.rand() + 1e-10)*1e-1
        return np.random.rand() + 1e-10

    table = {}
    # Draw order: feature, then feature value, then class.
    for feature in range(num_features):
        for value in feature_info[feature]:
            key = "_".join((str(feature), str(value)))
            table[key] = {label: draw() for label in all_class}
    return table

In [15]:
#將prior probability做正規化
def prior_probability_normalization(class_prior):
    """Normalise the prior weights in place so that they sum to 1.

    Args:
        class_prior: Dict mapping class label to a weight; modified in place.

    Returns:
        None.
    """
    normalizer = sum(class_prior.values())
    for label in list(class_prior):
        class_prior[label] = class_prior[label] / normalizer

In [16]:
#將likelihood做正規化
def likelihood_normalization(likelihoods,num_features,feature_info,all_class):
    """Normalise a likelihood table in place.

    After the call, for every feature and every class the weights over that
    feature's values sum to 1.

    Args:
        likelihoods: Dict keyed by ``"<feature>_<value>"`` whose values are
            dicts mapping class label to a weight; modified in place.
        num_features: Number of features; features are indexed 0..num_features-1.
        feature_info: Mapping from feature index to the list of values that
            feature can take.
        all_class: Iterable of class labels.

    Returns:
        None.
    """
    def entry(feature, value):
        # Per-class weights of one (feature, value) pair.
        return likelihoods[str(feature)+"_"+str(value)]

    # Pass 1: total weight per (feature, class), taken before any division.
    totals = {}
    for feature in range(num_features):
        per_class = {}
        for label in all_class:
            running = 0
            for value in feature_info[feature]:
                running += entry(feature, value)[label]
            per_class[label] = running
        totals[feature] = per_class

    # Pass 2: divide every weight by its (feature, class) total.
    for feature in range(num_features):
        for value in feature_info[feature]:
            weights = entry(feature, value)
            for label in all_class:
                weights[label] /= totals[feature][label]
    

In [17]:
# 隨機產生簡易貝氏分類模型
def trenb_base_model(X,feature_info,all_class):
    """Generate one random Naive Bayes model.

    Args:
        X: 2-D array; only ``X.shape[1]`` (the number of features) is used.
        feature_info: Mapping from feature index to the list of values that
            feature can take.
        all_class: Iterable of class labels.

    Returns:
        Tuple ``(class_prior, likelihoods)``, both normalised: the random
        class priors and the random likelihood table keyed by
        ``"<feature>_<value>"``.
    """
    feature_count = X.shape[1]

    # Draw the raw weights: priors first, then likelihoods.
    prior = random_prior_probability(all_class)
    table = random_likelihood(feature_count, feature_info, all_class)

    # Normalise both in place.
    prior_probability_normalization(prior)
    likelihood_normalization(table, feature_count, feature_info, all_class)

    return prior,table

In [18]:
#(隨機)簡易貝氏進行預測
def predict(X,all_class,likelihoods,class_prior):
    """Classify every row of X with a (random) Naive Bayes model.

    Args:
        X: 2-D array of discrete feature values; ``str(X[row][col])`` must
            match the value part of the likelihood keys.
        all_class: Iterable of class labels.
        likelihoods: Dict keyed by ``"<feature>_<value>"`` whose values are
            dicts mapping class label to the likelihood.
        class_prior: Dict mapping class label to its prior.

    Returns:
        numpy array of shape ``(X.shape[0], 1)`` holding, per row, the class
        with the largest ``prior * product of likelihoods``; on a tie the
        class listed first in ``all_class`` wins.
    """
    row_count, column_count = X.shape[0], X.shape[1]
    predictions = np.zeros((row_count, 1))

    for row in range(row_count):
        scores = {}
        for label in all_class:
            # Product of the per-feature likelihoods, then times the prior.
            joint = 1
            for col in range(column_count):
                joint *= likelihoods[str(col)+"_"+str(X[row][col])][label]
            scores[label] = joint * class_prior[label]

        # The class with the highest score is the prediction.
        predictions[row][0] = max(scores, key=scores.get)

    return predictions

In [19]:
# 初始分類門檻值
def trenb_initial_threshold(X,y,feature_info,all_class,num_iter = 20,size = 1000): 
    """Estimate a starting accuracy threshold from purely random models.

    ``num_iter`` rounds are run. In each round ``size`` random naive Bayes
    models are generated and scored on ``(X, y)``, and the best accuracy of the
    round is kept. The threshold is the mean of those per-round best values.

    Parameters
    ----------
    X, y : objects exposing ``.values``
        Features and labels. ``y.values`` is compared element-wise with the
        ``(n, 1)`` prediction array -- presumably a one-column DataFrame;
        verify against the caller.
    feature_info : mapping
        Possible values of each feature, forwarded to ``trenb_base_model``.
    all_class : iterable
        Class labels.
    num_iter : int
        Number of rounds.
    size : int
        Random models per round.

    Returns
    -------
    float
        Mean over rounds of the best accuracy in the round.
    """
    features = X.values
    labels = y.values

    def random_model_accuracy():
        # One random model, scored on the supplied data.
        class_prior,likelihoods = trenb_base_model(features,feature_info,all_class)
        return np.mean(predict(features,all_class,likelihoods,class_prior) == labels)

    best_per_round = [
        max(random_model_accuracy() for _ in range(size))
        for _ in range(num_iter)
    ]

    return sum(best_per_round) / len(best_per_round)

In [20]:
def trenb(X,y,k,size,num_iterations,is_categorical = False):
    """Cross-validate an ensemble of random naive Bayes models with a rising accuracy threshold.

    For each of the ``k`` folds:

    1. An initial accuracy threshold is estimated on that fold's training
       split only, so the held-out labels do not influence model selection.
    2. ``num_iterations`` times, random base models are drawn until ``size`` of
       them beat the current threshold on the training split; the ensemble's
       majority-vote training accuracy is recorded, and the threshold is raised
       to the mean training accuracy of the accepted models.
    3. The ensemble with the best training accuracy is evaluated on the
       held-out fold.

    Parameters
    ----------
    X : pandas.DataFrame
        Features (``.iloc`` and ``.shape`` are used).
    y : pandas object
        Labels, sliced with ``.iloc``. Their ``.values`` are compared
        element-wise with ``(n, 1)`` predictions -- presumably a one-column
        DataFrame; verify against the caller. Labels must be integer-valued
        because votes are tallied under ``int(prediction)``.
    k : int
        Number of folds. Rows beyond ``k * (len(X) // k)`` are never tested.
    size : int
        Base models per ensemble.
    num_iterations : int
        Number of threshold updates per fold.
    is_categorical : bool
        If True, each feature's possible values are its unique values in ``X``;
        otherwise every feature is taken to have the values ``0..9``.

    Returns
    -------
    tuple
        ``(accuracies, avg_accuracy, r_accuracies_record)``: per-fold test
        accuracy, their mean, and for each fold the list of ensemble training
        accuracies, one per threshold update.
    """
    # Number of rows
    num_examples = X.shape[0]
    # Size of one fold
    fold_size = num_examples//k
    # Per fold: ensemble training accuracy after each threshold update
    r_accuracies_record = []
    # Test accuracy of each fold
    accuracies = []

    # Possible values of every feature
    feature_info = {}
    for i in range(X.shape[1]):
        if is_categorical:
            # Categorical data: the values are whatever occurs in the dataset
            feature_info[i] = list(X.iloc[:,i].unique())
        else:
            # Continuous data: ten bins, labelled 0..9
            feature_info[i] = list(range(10))

    # All class labels
    all_class = np.unique(y)

    def majority_vote(base_predictions):
        """Return an (n, 1) array with the most-voted class of each row."""
        voted = np.zeros((base_predictions.shape[0],1))
        for row in range(base_predictions.shape[0]):
            ballots = {key: 0 for key in all_class}
            for ele in base_predictions[row]:
                ballots[int(ele)] += 1
            # Ties go to the class that comes first in all_class
            voted[row] = max(ballots, key=ballots.get)
        return voted

    for i in range(k):
        print(f"交叉驗證{i}")
        start = i * fold_size
        end = (i + 1) * fold_size

        # Test split
        x_test = X.iloc[start:end].values
        y_test = y.iloc[start:end].values

        # Training split, kept as pandas objects too because
        # trenb_initial_threshold reads `.values` itself
        x_train_frame = pd.concat([X.iloc[:start],X.iloc[end:]],axis = 0)
        y_train_frame = pd.concat([y.iloc[:start],y.iloc[end:]],axis = 0)
        x_train = x_train_frame.values
        y_train = y_train_frame.values

        # Initial threshold, from the training split only
        threshold = trenb_initial_threshold(x_train_frame,y_train_frame,feature_info,all_class,num_iter = 20,size = 1000)

        # Ensemble training accuracy after each threshold update
        r_accuracies = []

        # Base models of each iteration's ensemble
        r_model_record = []

        # Raise the threshold step by step
        for iteration in range(num_iterations):
            # Models accepted in this iteration
            temp_r_model = []

            # Their training accuracies
            x_train_accuracies = []

            # Number of accepted models so far
            base_learner_size = 0

            # One column of training predictions per accepted model
            current_base_learners_prediction = np.zeros((x_train.shape[0], size))

            print("當前分類正確率門檻值:",threshold)
            # NOTE: there is no cap on attempts; the loop runs until `size`
            # random models beat the threshold.
            while base_learner_size < size: 
                # Draw a random naive Bayes model
                class_prior,likelihoods = trenb_base_model(X,feature_info,all_class)      

                # Score it on the training split
                x_train_predictions = predict(x_train,all_class,likelihoods,class_prior)
                x_train_acc = np.mean(x_train_predictions == y_train)

                # Keep it only if it beats the threshold
                if(x_train_acc > threshold):
                    temp_r_model.append([class_prior,likelihoods])
                    current_base_learners_prediction[:,base_learner_size] = x_train_predictions.ravel()
                    base_learner_size += 1
                    print(iteration,base_learner_size)
                    print("-"*70)
                    x_train_accuracies.append(x_train_acc)

            # Remember this iteration's ensemble
            r_model_record.append(temp_r_model)

            # Majority vote on the training split
            current_trenb_prediction = majority_vote(current_base_learners_prediction)

            acc = np.mean(current_trenb_prediction == y_train)
            print(f"第{iteration}次更新門檻值，當前門檻值:{threshold} ,當前集成模型在訓練集上的正確率:{acc}")
            r_accuracies.append(acc)

            # New threshold: mean training accuracy of the accepted models
            threshold = sum(x_train_accuracies)/len(x_train_accuracies)

        # Expose this fold's per-iteration accuracies to the caller
        r_accuracies_record.append(r_accuracies)

        # Ensemble with the best training accuracy (first one on ties)
        best_ensemble_model_acc = max(r_accuracies)
        max_index = r_accuracies.index(best_ensemble_model_acc)

        # Predict the test split with every base model of that ensemble
        final_ensemble_base_learner_prediction = np.zeros((x_test.shape[0],size))
        for current_index in range(size):
            current_class_prior,current_likelihoods = r_model_record[max_index][current_index]
            final_ensemble_base_learner_x_test_prediction = predict(x_test,all_class,current_likelihoods,current_class_prior)
            final_ensemble_base_learner_prediction[:,current_index] = final_ensemble_base_learner_x_test_prediction.ravel()

        # Majority vote on the test split
        final_trenb_prediction = majority_vote(final_ensemble_base_learner_prediction)

        # Test accuracy of the chosen ensemble
        final_ensemble_acc = np.mean(final_trenb_prediction == y_test)
        print(f"當前交叉驗證:{i} ,最終集成模型在測試集上的正確率:{final_ensemble_acc}")

        accuracies.append(final_ensemble_acc)
    avg_accuracy = sum(accuracies) / len(accuracies)
    print("="*70)
    return accuracies,avg_accuracy,r_accuracies_record
        

# 粒子群優化演算法-簡易貝氏


In [21]:
def trenb_particle_swarm_optimization(X,y,n,feature_info,all_class,threshold,best_model = None):
    """Search for an accurate naive Bayes model with particle swarm optimisation.

    Every particle is a model ``{"likelihood": ..., "class_prior": ...}`` and
    its fitness is its accuracy on ``(X, y)``. The swarm runs for at most 100
    generations and stops early once all personal bests are identical.

    Parameters
    ----------
    X : 2-D array
        Training features; ``X.shape[1]`` is the number of features.
    y : array
        Training labels, compared element-wise with the ``(n_rows, 1)`` output
        of ``predict``.
    n : int
        Number of particles.
    feature_info : mapping
        ``feature_info[f]`` lists the possible values of feature ``f``.
    all_class : iterable
        Class labels.
    threshold : float
        Accepted for interface compatibility; not used by the search.
    best_model : dict, optional
        A model from a previous run. A copy of it replaces particle 0, so the
        caller's object is never modified.

    Returns
    -------
    tuple
        ``(best_particle, best_fitness)``: the best model found and its
        accuracy on ``(X, y)``.
    """
    # Allowed range of a position coordinate
    x_min = 0 + 1e-10
    x_max = 1
    # Allowed range of a velocity coordinate
    v_min = 0 + 1e-10
    v_max = 1
    # Inertia weight decays linearly from w_start to w_end over the run.
    # (The earlier expression `0.5 / generation-1` evaluated to -0.995.)
    w_start = 0.9
    w_end = 0.4
    # Cognitive (own best) factor
    c1 = 2
    # Social (swarm best) factor
    c2 = 2
    # Number of features
    d = X.shape[1]
    # Maximum number of generations
    generation = 100

    # Likelihood table keys, in feature-then-value order
    likelihood_keys = [
        str(feature)+"_"+str(feature_value)
        for feature in range(d)
        for feature_value in feature_info[feature]
    ]

    def next_velocity(inertia, v, x, own_best, swarm_best):
        # Standard PSO velocity update for one coordinate
        return (inertia * v +
                c1*r1*(own_best - x) +
                c2*r2*(swarm_best - x)
                )

    def out_of_range(value, low, high):
        return value < low or value > high

    # Positions: n random naive Bayes models
    particle_x_record = []
    for i in range(n):
        class_prior,likelihoods = trenb_base_model(X,feature_info,all_class)  
        particle_x_record.append({"likelihood": likelihoods, "class_prior": class_prior})

    # Optionally seed the swarm with the previous run's best particle
    if best_model is not None:
        print("加入上一輪的最佳粒子")
        particle_x_record[0] = copy.deepcopy(best_model)

    # Velocities: n random tables of the same structure
    particle_v_record = []
    for i in range(n):
        velocity_likelihoods = random_likelihood(d,feature_info,all_class,True)
        velocity_prior = random_prior_probability(all_class,True)
        particle_v_record.append({"likelihood": velocity_likelihoods, "class_prior": velocity_prior})

    # Personal bests. Fitness starts below any accuracy so that every particle
    # gets a personal best in the first generation, even one scoring 0.
    pbest = [{} for _ in range(n)]
    pbest_fitness = [-1] * n

    # Swarm best
    gbest = {}
    gbest_fitness = -1

    for gen in range(generation):
        print("當前回合為:", gen+1)
        # Evaluate every particle
        for i in range(n):      
            likelihoods = particle_x_record[i]["likelihood"]
            class_prior = particle_x_record[i]["class_prior"]
            x_train_predictions = predict(X,all_class,likelihoods,class_prior)
            x_train_acc = np.mean(x_train_predictions == y) 
            if x_train_acc > pbest_fitness[i]:
                pbest_fitness[i] = x_train_acc
                # Snapshot: the particle itself keeps moving
                pbest[i] = copy.deepcopy(particle_x_record[i])

        # Update the swarm best from the best personal best
        max_index = pbest_fitness.index(max(pbest_fitness))
        if pbest_fitness[max_index] > gbest_fitness:
            gbest = copy.deepcopy(pbest[max_index])
            gbest_fitness = pbest_fitness[max_index]

        print("最佳適應值:",gbest_fitness)

        # Converged when every personal best equals the first one
        if all(particle == pbest[0] for particle in pbest):
            print("收斂了")
            break

        # Velocity update; r1 and r2 are drawn once per generation
        w = w_start - (w_start - w_end) * gen / max(generation - 1, 1)
        r1 = np.random.random()
        r2 = np.random.random()

        for i in range(n):
            velocity = particle_v_record[i]
            position = particle_x_record[i]
            own_best = pbest[i]
            for unique_class in all_class:
                velocity["class_prior"][unique_class] = next_velocity(
                    w,
                    velocity["class_prior"][unique_class],
                    position["class_prior"][unique_class],
                    own_best["class_prior"][unique_class],
                    gbest["class_prior"][unique_class],
                )
                for key in likelihood_keys:
                    velocity["likelihood"][key][unique_class] = next_velocity(
                        w,
                        velocity["likelihood"][key][unique_class],
                        position["likelihood"][key][unique_class],
                        own_best["likelihood"][key][unique_class],
                        gbest["likelihood"][key][unique_class],
                    )

        # Velocity limits: out-of-range coordinates are re-drawn as small positive values
        for i in range(n):
            velocity = particle_v_record[i]
            for unique_class in all_class:
                if out_of_range(velocity["class_prior"][unique_class], v_min, v_max):
                    velocity["class_prior"][unique_class] = (np.random.rand() + 1e-10)*1e-1
                for key in likelihood_keys:
                    if out_of_range(velocity["likelihood"][key][unique_class], v_min, v_max):
                        velocity["likelihood"][key][unique_class] = (np.random.rand() + 1e-10)*1e-1

        # Move the particles
        for i in range(n):
            velocity = particle_v_record[i]
            position = particle_x_record[i]
            for unique_class in all_class:
                position["class_prior"][unique_class] += velocity["class_prior"][unique_class]
                for key in likelihood_keys:
                    position["likelihood"][key][unique_class] += velocity["likelihood"][key][unique_class]

        # Position limits: out-of-range coordinates are re-drawn at random
        for i in range(n):
            position = particle_x_record[i]
            for unique_class in all_class:
                if out_of_range(position["class_prior"][unique_class], x_min, x_max):
                    position["class_prior"][unique_class] = np.random.rand() + 1e-10
                for key in likelihood_keys:
                    if out_of_range(position["likelihood"][key][unique_class], x_min, x_max):
                        position["likelihood"][key][unique_class] = np.random.rand() + 1e-10

        # Re-normalise each particle (both helpers are called for their in-place effect)
        for i in range(n):
            prior_probability_normalization(particle_x_record[i]["class_prior"])
            likelihood_normalization(particle_x_record[i]["likelihood"],d,feature_info,all_class)

    return gbest,gbest_fitness

# PSO-TRENB

In [22]:
def pso_trenb(X, y, k = 5, size = 25, num_iterations = 5, is_categorical = False):
    """Cross-validate an ensemble of PSO-optimised naive Bayes models with a rising threshold.

    For each of the ``k`` folds the threshold starts at 0. In every iteration
    particle swarm optimisation is run repeatedly until ``size`` models beat
    the threshold on the training split. Each model gets at most 5 PSO runs,
    and the best particle of a failed run seeds the next one. If a model
    cannot be found, the remaining iterations of the fold are skipped.
    After a complete iteration the threshold becomes the mean training
    accuracy of the accepted models. The ensemble with the best majority-vote
    training accuracy is then scored on the held-out fold.

    Parameters
    ----------
    X : pandas.DataFrame
        Features.
    y : pandas.DataFrame
        Labels in the first column (``y.iloc[:, 0]`` is read). Labels must be
        integer-valued because votes are tallied under ``int(prediction)``.
    k : int
        Number of folds. Rows beyond ``k * (len(X) // k)`` are never tested.
    size : int
        Base models per ensemble.
    num_iterations : int
        Maximum number of threshold updates per fold.
    is_categorical : bool
        If True, each feature's possible values are its unique values in ``X``;
        otherwise every feature is taken to have the values ``0..9``.

    Returns
    -------
    tuple
        ``(accuracies, avg_accuracy, r_accuracies_record,
        base_model_prediction_record)``: per-fold test accuracy, their mean,
        per fold the ensemble training accuracy of each completed iteration,
        and per fold the ``(fold_size, size)`` matrix of base-model test
        predictions.

    Raises
    ------
    ValueError
        If a fold finishes without a single complete ensemble.
    """
    # Number of rows
    num_examples = X.shape[0]
    # Size of one fold
    fold_size = num_examples//k

    # Possible values of every feature
    feature_info = {}
    for i in range(X.shape[1]):
        if is_categorical:
            # Categorical data: the values are whatever occurs in the dataset
            feature_info[i] = list(X.iloc[:,i].unique())
        else:
            # Continuous data: ten bins, labelled 0..9
            feature_info[i] = list(range(10))

    # All class labels, in order of first appearance
    all_class = y.iloc[:,0].unique()

    def majority_vote(base_predictions):
        """Return an (n, 1) array with the most-voted class of each row."""
        voted = np.zeros((base_predictions.shape[0],1))
        for row in range(base_predictions.shape[0]):
            ballots = {key: 0 for key in all_class}
            for ele in base_predictions[row]:
                ballots[int(ele)] += 1
            # Ties go to the class that comes first in all_class
            voted[row] = max(ballots, key=ballots.get)
        return voted

    # Per fold: ensemble training accuracy after each threshold update
    r_accuracies_record = []
    # Test accuracy of each fold
    accuracies = []
    # Per fold: test predictions of every base model in the chosen ensemble
    base_model_prediction_record = []

    for i in range(k):
        start_time = time.time()
        print(f"交叉驗證{i}")
        start = i * fold_size
        end = (i + 1) * fold_size

        # Test split
        x_test = X.iloc[start:end].values
        y_test = y.iloc[start:end].values

        # Training split
        x_train = pd.concat([X.iloc[:start],X.iloc[end:]],axis = 0).values
        y_train = pd.concat([y.iloc[:start],y.iloc[end:]],axis = 0).values

        # Accuracy threshold a base model has to beat
        threshold = 0

        # Ensemble training accuracy after each threshold update
        r_accuracies = []

        # Base models of each iteration's ensemble
        r_model_record = []

        # Raise the threshold step by step
        for iteration in range(num_iterations):
            print(f"第{iteration}次更新分類正確率門檻值")

            # Models accepted in this iteration
            temp_r_model = []

            # Their training accuracies
            x_train_accuracies = []

            # Number of accepted models so far
            base_learner_size = 0

            # One column of training predictions per accepted model
            current_base_learners_prediction = np.zeros((x_train.shape[0], size))

            print("當前分類正確率門檻值:",threshold)

            while base_learner_size < size: 
                # Best particle of the previous failed PSO run, if any
                best_model = None
                # Failed PSO runs for the current model (give up after 5)
                count_repeat_num = 0

                while count_repeat_num < 5:
                    best_nb_model, best_modle_acc = trenb_particle_swarm_optimization(x_train,y_train,50,feature_info,all_class,threshold,best_model) 

                    if(best_modle_acc > threshold):
                        # Accept the model and store its training predictions
                        temp_r_model.append(best_nb_model)  
                        x_train_predictions = predict(x_train,all_class,best_nb_model["likelihood"],best_nb_model["class_prior"])
                        current_base_learners_prediction[:,base_learner_size] = x_train_predictions.ravel()
                        base_learner_size += 1

                        print(iteration,base_learner_size)
                        print(f"已經找到{base_learner_size}個基本模型")
                        print("-"*70)
                        x_train_accuracies.append(best_modle_acc)
                        break
                    else:
                        print("保留最佳粒子")
                        # Seed the next PSO run with this run's best particle
                        best_model = best_nb_model
                        count_repeat_num += 1

                # Five failed runs: stop looking for further base models
                if count_repeat_num == 5:
                    print(f"已經嘗試過{count_repeat_num}次尋找通過門檻值的基本模型了")
                    break

            # An incomplete ensemble ends the threshold updates of this fold
            if base_learner_size < size:
                break

            # Remember this iteration's ensemble
            r_model_record.append(temp_r_model)

            # Majority vote on the training split
            current_pso_trenb_prediction = majority_vote(current_base_learners_prediction)

            acc = np.mean(current_pso_trenb_prediction == y_train)
            print(f"第{iteration}次更新門檻值，當前門檻值:{threshold} ,當前集成模型在訓練集上的正確率:{acc}")
            r_accuracies.append(acc)

            # New threshold: mean training accuracy of the accepted models
            threshold = sum(x_train_accuracies)/len(x_train_accuracies)

        # Expose this fold's per-iteration accuracies to the caller
        r_accuracies_record.append(r_accuracies)

        if not r_accuracies:
            raise ValueError(
                f"fold {i}: no complete ensemble of {size} base models was built, "
                "so there is nothing to evaluate on the test split"
            )

        # Ensemble with the best training accuracy (first one on ties)
        best_ensemble_model_acc = max(r_accuracies)
        max_index = r_accuracies.index(best_ensemble_model_acc)

        # Predict the test split with every base model of that ensemble
        final_pso_trenb_base_learner_prediction = np.zeros((x_test.shape[0],size))
        for current_index in range(size):
            current_model = r_model_record[max_index][current_index]
            final_ensemble_base_learner_x_test_prediction = predict(x_test,all_class,current_model["likelihood"],current_model["class_prior"])
            final_pso_trenb_base_learner_prediction[:,current_index] = final_ensemble_base_learner_x_test_prediction.ravel()

        base_model_prediction_record.append(final_pso_trenb_base_learner_prediction)

        # Majority vote on the test split
        final_pso_trenb_prediction = majority_vote(final_pso_trenb_base_learner_prediction)

        # Test accuracy of the chosen ensemble
        final_ensemble_acc = np.mean(final_pso_trenb_prediction == y_test)
        print(f"當前交叉驗證:{i} ,最終集成模型在測試集上的正確率:{final_ensemble_acc}")
        end_time = time.time()
        execution_time = end_time - start_time
        print(f"第{i}次交叉驗證所花費的時間：{execution_time/60} 分鐘")

        accuracies.append(final_ensemble_acc)
    avg_accuracy = sum(accuracies) / len(accuracies)
    print("="*70)
    return accuracies,avg_accuracy,r_accuracies_record,base_model_prediction_record

# 取得最終結果

In [23]:
# 取得結果
def get_result(accuracies,avg_accuracy):
    """Print the per-fold accuracies and their average, separated by a rule.

    Parameters
    ----------
    accuracies : object
        Per-fold accuracies; printed as-is.
    avg_accuracy : object
        Average accuracy; printed as-is.
    """
    report_lines = (
        "五次的模型正確率",
        accuracies,
        "="*79,
        "模型平均正確率",
        avg_accuracy,
    )
    # One item per output line, exactly as five separate print calls would give.
    print(*report_lines, sep="\n")