In [72]:
import numpy as np
import math
from collections import Counter
from process_data import load_and_process_data
from evaluation import get_micro_F1,get_macro_F1,get_acc

class NaiveBayes:
    """Naive Bayes classifier: one discrete column plus Gaussian columns.

    Column 0 of the feature matrix is treated as a categorical attribute
    taking the values in ``DISCRETE_VALUES``; every other column is modelled
    with a per-class univariate Gaussian.

    Attributes:
        Pc: ``{class: P(c)}`` -- Laplace-smoothed class priors.
        Pxc: ``{(column, class): (mean, variance)}`` -- Gaussian parameters
            of P(x|c) for each continuous column (column index >= 1).
        P1c: ``{(value, class): P(x0 = value | c)}`` -- Laplace-smoothed
            conditional probabilities of the discrete column 0.
    """

    # Domain of the discrete feature stored in column 0.
    DISCRETE_VALUES = (1, 2, 3)
    # Floor applied to a variance at prediction time so that a feature that
    # is constant within a class does not cause a division by zero.
    MIN_VARIANCE = 1e-9

    def __init__(self):
        self.Pc = {}
        self.Pxc = {}
        self.P1c = {}

    def fit(self, traindata, trainlabel, featuretype):
        """Estimate the priors P(c) and the conditionals P(x|c).

        Args:
            traindata: 2-D array, shape ``(n_samples, n_features)``.
            trainlabel: labels, shape ``(n_samples, 1)`` (a 1-D array of
                length ``n_samples`` is accepted as well).
            featuretype: 0/1 list describing the columns (0 = discrete,
                1 = continuous). Accepted for interface compatibility only;
                it is not consulted -- column 0 is always handled as
                discrete and the remaining columns as continuous.
        """
        traindata = np.asarray(traindata)
        labels = np.asarray(trainlabel)
        if labels.ndim > 1:
            labels = labels[:, 0]

        n_samples = traindata.shape[0]
        # Sorted, plain-Python class labels; predict() iterates them in
        # this order.
        classes = np.unique(labels).tolist()
        n_classes = len(classes)
        n_values = len(self.DISCRETE_VALUES)

        # Start from a clean slate so a second fit() leaves no stale keys.
        self.Pc, self.P1c, self.Pxc = {}, {}, {}

        first_col = traindata[:, 0]
        for c in classes:
            in_class = labels == c
            class_count = int(np.sum(in_class))

            # Laplace-smoothed prior.
            self.Pc[c] = (class_count + 1) / (n_samples + n_classes)

            # Laplace-smoothed conditional of the discrete column.
            for value in self.DISCRETE_VALUES:
                joint_count = int(np.sum(in_class & (first_col == value)))
                self.P1c[(value, c)] = (joint_count + 1) / (class_count + n_values)

            # Gaussian parameters of every continuous column.
            for col in range(1, traindata.shape[1]):
                column = traindata[in_class, col]
                self.Pxc[(col, c)] = (np.mean(column), np.var(column))

        # Kept from the original implementation: shows the fitted table for
        # the discrete column.
        print(self.P1c)

    def predict(self, features, featuretype):
        """Predict a class for every row of ``features``.

        Scores are accumulated in log space, and the Gaussian log-density is
        evaluated directly (never as ``log(exp(...))``), so samples far from
        a class mean cannot underflow to ``log(0)``.

        Args:
            features: 2-D array, shape ``(test_num, n_features)``, laid out
                like the training data.
            featuretype: unused; see ``fit``.

        Returns:
            np.ndarray of shape ``(test_num, 1)`` holding the predicted
            class labels. When several classes share the best score, the
            first one in sorted label order is returned.

        Raises:
            ValueError: if there are rows to classify but the model has not
                been fitted.
            KeyError: if column 0 holds a value outside ``DISCRETE_VALUES``.
        """
        features = np.asarray(features)
        classes = list(self.Pc)
        if features.shape[0] and not classes:
            raise ValueError("predict() called before fit()")

        two_pi = 2 * math.pi
        result = []
        for row in features:
            scores = []
            for c in classes:
                score = math.log(self.Pc[c]) + math.log(self.P1c[(row[0], c)])
                for col in range(1, features.shape[1]):
                    mean, var = self.Pxc[(col, c)]
                    var = max(var, self.MIN_VARIANCE)
                    # log N(x; mean, var)
                    score += (-0.5 * math.log(two_pi * var)
                              - (row[col] - mean) ** 2 / (2 * var))
                scores.append(score)
            result.append(classes[int(np.argmax(scores))])
        return np.array(result).reshape(features.shape[0], 1)
                

In [76]:
# Load the train/test features and labels.
train_data, train_label, test_data, test_label = load_and_process_data()

# Per-column feature kind: 0 = discrete, 1 = continuous.
feature_type = [0] + [1] * 7

Nayes = NaiveBayes()
# Estimate the prior and conditional probabilities on the training set.
Nayes.fit(train_data, train_label, feature_type)
pred = Nayes.predict(test_data, feature_type)

# Each metric is evaluated immediately before its own line is printed, so
# any output produced inside a metric function keeps its position.
for prefix, metric in (
    ("Acc: ", get_acc),
    ("macro-F1: ", get_macro_F1),
    ("micro-F1: ", get_micro_F1),
):
    print(prefix + str(metric(test_label, pred)))

train_num: 3554
test_num: 983
train_feature's shape:(3554, 8)
test_feature's shape:(983, 8)
{(1, 1): 0.14907508161044614, (2, 1): 0.07616974972796518, (3, 1): 0.7747551686615887, (1, 2): 0.36265320836337417, (2, 2): 0.28406633020908434, (3, 2): 0.35328046142754144, (1, 3): 0.4630071599045346, (2, 3): 0.39697692919649963, (3, 3): 0.14001591089896578}
Acc: 0.6134282807731435
0.7137404580152672
0.4725111441307578
0.6684005201560468
macro-F1: 0.6182173741006906
micro-F1: 0.6134282807731435
