## 使用朴素贝叶斯分类器对鸢尾花数据集进行分类

In [101]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

# Load the iris data and hold out 20% of the samples for evaluation; the
# fixed random_state keeps the split reproducible between runs.
iris_dataset = load_iris()
iris_X_train, iris_X_test, iris_y_train, iris_y_test = train_test_split(
    iris_dataset.data,
    iris_dataset.target,
    test_size=0.2,
    random_state=0,
)

# Fit a Gaussian naive Bayes model and predict the held-out samples.
gnb = GaussianNB()
gnb.fit(iris_X_train, iris_y_train)
iris_test_predictions = gnb.predict(iris_X_test)

# Show the first ten predictions side by side with the true labels.
pd.DataFrame(
    np.column_stack((iris_test_predictions, iris_y_test)),
    columns=["Predicted", "Actual"],
).head(10)

Unnamed: 0,Predicted,Actual
0,2,2
1,1,1
2,0,0
3,2,2
4,0,0
5,2,2
6,0,0
7,1,1
8,1,1
9,1,1


In [102]:
# accuracy_score is declared as (y_true, y_pred); pass the ground truth first
# so the call matches the documented signature. The accuracy value itself is
# unaffected because the metric is symmetric in its two arguments.
accuracy_score(iris_y_test, iris_test_predictions)

0.9666666666666667

## 三种贝叶斯算法对比

In [3]:
def build_table(thead, tbody):
    """Return a ``<table>`` element containing ``thead`` followed by ``tbody``.

    Both arguments are inserted verbatim (no HTML escaping).
    """
    return "<table>{}{}</table>".format(thead, tbody)


def build_thead(ths):
    """Return a ``<thead>`` element wrapping the concatenated ``ths`` strings.

    ``ths`` is any iterable of already-rendered HTML strings.
    """
    return "<thead>{}</thead>".format("".join(ths))


def build_tbody(trs):
    """Return a ``<tbody>`` element wrapping the concatenated ``trs`` strings.

    ``trs`` is any iterable of already-rendered HTML strings.
    """
    return "<tbody>{}</tbody>".format("".join(trs))


def build_th(text):
    """Return a ``<th>`` header cell containing ``text`` verbatim (unescaped)."""
    return "<th>{}</th>".format(text)


def build_td(text):
    """Return a ``<td>`` data cell containing ``text`` verbatim (unescaped)."""
    return "<td>{}</td>".format(text)


def build_tr(tds):
    """Return a ``<tr>`` row wrapping the concatenated ``tds`` strings.

    ``tds`` is any iterable of already-rendered HTML strings.
    """
    return "<tr>{}</tr>".format("".join(tds))


In [99]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_wine, load_iris, load_breast_cancer
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from IPython.core.display import display_html


def extract_data(dataset):
    """Return the ``(data, target)`` pair stored on ``dataset``.

    ``dataset`` is any object exposing ``data`` and ``target`` attributes.
    """
    samples, targets = dataset.data, dataset.target
    return samples, targets


# Benchmark datasets, keyed by the label shown in the results table.
datasets = {
    label: loader()
    for label, loader in (
        ("Wine", load_wine),
        ("Iris", load_iris),
        ("Breast Cancer", load_breast_cancer),
    )
}
# Naive Bayes variants under comparison, keyed by their column label.
nb_clfs = {
    label: estimator_cls()
    for label, estimator_cls in (
        ("Gaussian", GaussianNB),
        ("Multinomial", MultinomialNB),
        ("Bernoulli", BernoulliNB),
    )
}


def calculate_metrics(dataset_name, clf_name):
    """Train one classifier on one dataset and score it on a held-out split.

    Args:
        dataset_name: Key into the module-level ``datasets`` mapping.
        clf_name: Key into the module-level ``nb_clfs`` mapping.

    Returns:
        A ``(accuracy, confusion_matrix)`` tuple computed on the 20% test
        split (``random_state=0``).
    """
    features, targets = extract_data(datasets[dataset_name])
    model = nb_clfs[clf_name]
    split = train_test_split(features, targets, test_size=0.2, random_state=0)
    train_features, test_features, train_targets, test_targets = split
    # The estimator stored in nb_clfs is fitted in place on every call.
    model.fit(train_features, train_targets)
    predictions = model.predict(test_features)
    return (
        accuracy_score(test_targets, predictions),
        confusion_matrix(test_targets, predictions),
    )


def build_td_content(dataset_name, clf_name):
    """Render the HTML shown in one result cell of the comparison table.

    The cell holds the accuracy (two decimals) followed by the confusion
    matrix rendered as an HTML table whose rows and columns are labelled
    with the dataset's ``target_names``.
    """
    score, counts = calculate_metrics(dataset_name, clf_name)
    class_labels = datasets[dataset_name].target_names
    cm_frame = pd.DataFrame(counts, index=class_labels, columns=class_labels)
    accuracy_html = f"<div>Accuracy: {score:.2f}</div>"
    matrix_html = (
        '<div style="display: flex; justify-content: end;">'
        + cm_frame.to_html()
        + "</div>"
    )
    return accuracy_html + matrix_html


# Header row: a corner cell followed by one column per classifier.
header_cells = [
    build_th(label) for label in ("Dataset\\Classifier", *nb_clfs)
]

# One body row per dataset: the dataset name, then one metrics cell for each
# classifier (every cell trains and evaluates that classifier).
body_rows = [
    build_tr(
        tds=[build_td(dataset_name)]
        + [
            build_td(text=build_td_content(dataset_name, clf_name))
            for clf_name in nb_clfs
        ]
    )
    for dataset_name in datasets
]

result_html = build_table(
    thead=build_thead(ths=header_cells),
    tbody=build_tbody(trs=body_rows),
)


display_html(result_html, raw=True)

Dataset\Classifier,Gaussian,Multinomial,Bernoulli
Unnamed: 0_level_1,class_0,class_1,class_2
Unnamed: 0_level_2,class_0,class_1,class_2
Unnamed: 0_level_3,class_0,class_1,class_2
Unnamed: 0_level_4,setosa,versicolor,virginica
Unnamed: 0_level_5,setosa,versicolor,virginica
Unnamed: 0_level_6,setosa,versicolor,virginica
Unnamed: 0_level_7,malignant,benign,Unnamed: 3_level_7
Unnamed: 0_level_8,malignant,benign,Unnamed: 3_level_8
Unnamed: 0_level_9,malignant,benign,Unnamed: 3_level_9
Wine,Accuracy: 0.92  class_0  class_1  class_2  class_0  14  0  0  class_1  2  13  1  class_2  0  0  6,Accuracy: 0.86  class_0  class_1  class_2  class_0  13  1  0  class_1  0  13  3  class_2  1  0  5,Accuracy: 0.44  class_0  class_1  class_2  class_0  0  14  0  class_1  0  16  0  class_2  0  6  0
,class_0,class_1,class_2
class_0,14,0,0
class_1,2,13,1
class_2,0,0,6
,class_0,class_1,class_2
class_0,13,1,0
class_1,0,13,3
class_2,1,0,5
,class_0,class_1,class_2

Unnamed: 0,class_0,class_1,class_2
class_0,14,0,0
class_1,2,13,1
class_2,0,0,6

Unnamed: 0,class_0,class_1,class_2
class_0,13,1,0
class_1,0,13,3
class_2,1,0,5

Unnamed: 0,class_0,class_1,class_2
class_0,0,14,0
class_1,0,16,0
class_2,0,6,0

Unnamed: 0,setosa,versicolor,virginica
setosa,11,0,0
versicolor,0,13,0
virginica,0,1,5

Unnamed: 0,setosa,versicolor,virginica
setosa,11,0,0
versicolor,0,0,13
virginica,0,0,6

Unnamed: 0,setosa,versicolor,virginica
setosa,0,0,11
versicolor,0,0,13
virginica,0,0,6

Unnamed: 0,malignant,benign
malignant,43,4
benign,4,63

Unnamed: 0,malignant,benign
malignant,36,11
benign,1,66

Unnamed: 0,malignant,benign
malignant,0,47
benign,0,67


## 对手写字体数据集进行分类

In [5]:
import gzip
import pickle

data_path = "mnist.pkl.gz"
# Open the archive in a context manager so the file handle is always closed.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load this file from a trusted source.
with gzip.open(data_path, "rb") as minst:
    minst_data = pickle.load(minst, encoding="bytes")
# The pickle unpacks into three (features, labels) pairs: train, validation
# and test.
(
    (X_train_minst, y_train_minst),
    (X_valid_minst, y_valid_minst),
    (X_test_minst, y_test_minst),
) = minst_data

# Binarize the images: values >= 0.5 become 1, everything else becomes 0.
X_train_minst_bin = np.where(X_train_minst >= 0.5, 1, 0)
X_valid_minst_bin = np.where(X_valid_minst >= 0.5, 1, 0)
X_test_minst_bin = np.where(X_test_minst >= 0.5, 1, 0)

In [72]:
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_is_fitted


class MyNaiveBayesClassifier(BaseEstimator, ClassifierMixin):
    """Naive Bayes classifier for discrete features with add-one smoothing.

    After ``fit`` the estimator exposes:

    * ``classes_`` - sorted unique class labels.
    * ``prior_probabilities_`` - relative frequency of each class, aligned
      with ``classes_``.
    * ``feature_values_`` - per feature, the unique values seen in training.
    * ``conditional_probabilities_`` - nested list indexed as
      ``[class_index][feature_index]``; each entry maps a feature value to
      its smoothed conditional probability, with the extra key
      ``"__not_exists__"`` holding the probability used for values that
      were never seen during training.
    """

    def fit(self, X, y):
        """Estimate class priors and smoothed per-feature conditionals.

        Args:
            X: 2-D array-like of discrete feature values, one row per sample.
            y: 1-D array-like of class labels, one per row of ``X``.

        Returns:
            ``self``, so calls can be chained.

        Raises:
            ValueError: If ``check_X_y`` rejects the inputs.
        """
        # Keep the validated arrays so the indexing below works on ndarrays.
        X, y = check_X_y(X, y)
        self.classes_, class_counts = np.unique(y, return_counts=True)
        self.prior_probabilities_ = class_counts / len(y)
        self.feature_values_ = [np.unique(X[:, i]) for i in range(X.shape[1])]

        # Tables are stored by class *position* (not by label value), so
        # labels do not have to be the integers 0..K-1.
        self.conditional_probabilities_ = []
        for klass in self.classes_:
            X_class = X[y == klass]
            m = X_class.shape[0]
            per_feature = []
            for feature_index, values in enumerate(self.feature_values_):
                A_i = len(values)
                column = X_class[:, feature_index]
                # Add-one smoothing: (N_k + 1) / (m + A_i).
                table = {
                    value: ((column == value).sum() + 1) / (m + A_i)
                    for value in values
                }
                table["__not_exists__"] = 1 / (m + A_i)
                per_feature.append(table)
            self.conditional_probabilities_.append(per_feature)

        return self

    def predict(self, X):
        """Return the most probable class label for every row of ``X``."""
        check_is_fitted(self)
        return np.apply_along_axis(self.__predict_one, axis=1, arr=X)

    def __predict_one(self, X_row):
        """Return the class label with the highest posterior score for one row."""
        # Sum log-probabilities instead of multiplying raw probabilities: the
        # product of many small factors can underflow to 0.0 for every class.
        # np.log returns a new array, so the stored priors are not modified.
        log_scores = np.log(self.prior_probabilities_)
        for class_index, per_feature in enumerate(self.conditional_probabilities_):
            probabilities = [
                table.get(feature_value, table["__not_exists__"])
                for table, feature_value in zip(per_feature, X_row)
            ]
            log_scores[class_index] += np.log(probabilities).sum()

        return self.classes_[np.argmax(log_scores)]



In [77]:
# Train the hand-written classifier on the binarized training images, then
# report its mean accuracy on the binarized test images.
my_nb_clf = MyNaiveBayesClassifier()
my_nb_clf.fit(X_train_minst_bin, y_train_minst)
my_nb_clf.score(X_test_minst_bin, y_test_minst)

0.8437

## 修改 naivebayes.py 的实现

In [81]:
# Names of the four feature columns of wb_dataset, in column order.
wb_features = ["Outlook", "Temp", "Humidity", "Windy"]

# One row per observation: the four feature values, then the class label
# ("Yes" / "No") in the last column.
wb_dataset = np.array(
    (
        ("Sunny", "Hot", "High", "Weak", "No"),
        ("Sunny", "Hot", "High", "Strong", "No"),
        ("Overcast", "Hot", "High", "Weak", "Yes"),
        ("Rain", "Mild", "High", "Weak", "Yes"),
        ("Rain", "Cool", "Normal", "Weak", "Yes"),
        ("Rain", "Cool", "Normal", "Strong", "No"),
        ("Overcast", "Cool", "Normal", "Strong", "Yes"),
        ("Sunny", "Mild", "High", "Weak", "No"),
        ("Sunny", "Cool", "Normal", "Weak", "Yes"),
        ("Rain", "Mild", "Normal", "Weak", "Yes"),
        ("Sunny", "Mild", "Normal", "Strong", "Yes"),
        ("Overcast", "Mild", "High", "Strong", "Yes"),
        ("Overcast", "Hot", "Normal", "Weak", "Yes"),
        ("Rain", "Mild", "High", "Strong", "No"),
    )
)

# Feature values of the single sample to classify (same column order).
wb_to_predict = np.array(("Rain", "Hot", "High", "Weak"))

In [79]:
class NaiveBayes(object):
    """Naive Bayes classifier for categorical features (no smoothing)."""

    def getTrainSet(self):
        """Split the module-level ``wb_dataset`` into features and labels.

        Returns:
            A ``(trainData, labels)`` pair: every column except the last,
            and the last column.
        """
        trainData = wb_dataset[:, :-1]  # feature columns
        labels = wb_dataset[:, -1]  # class label of each row
        return trainData, labels

    def classify(self, trainData, labels, features):
        """Return the most probable label for one sample.

        Args:
            trainData: 2-D array indexable as ``trainData[:, j]``; one row
                per training sample, one column per feature.
            labels: Sequence of class labels, one per row of ``trainData``.
            features: Feature values of the sample to classify, in the same
                column order as ``trainData``.

        Returns:
            The label maximizing ``P(y) * prod_j P(x_j | y)``. On ties the
            label that appears first in ``labels`` wins.

        Raises:
            ValueError: If ``labels`` is empty.
        """
        labels = list(labels)
        total = len(labels)
        if total == 0:
            raise ValueError("labels must not be empty")

        # Count each class in a single pass; dict insertion order keeps the
        # labels in order of first appearance.
        class_counts = {}
        for label in labels:
            class_counts[label] = class_counts.get(label, 0) + 1

        # P(y | x) is proportional to P(y) * prod_j P(x_j | y); the evidence
        # P(x) is the same for every class, so comparing numerators suffices.
        F = {}
        for y, y_count in class_counts.items():
            score = y_count / total  # prior P(y)
            for j, value in enumerate(features):
                # Count per column *position* so that two columns sharing a
                # value string cannot overwrite each other's statistics.
                xy_count = sum(
                    1
                    for cell, label in zip(trainData[:, j], labels)
                    if label == y and cell == value
                )
                score *= xy_count / y_count  # likelihood P(x_j | y)
            F[y] = score

        return max(F, key=F.get)  # label with the largest posterior score

In [82]:
# Classify the held-out weather sample with the hand-written classifier.
nb = NaiveBayes()
trainData, labels = nb.getTrainSet()
result = nb.classify(
    trainData,
    labels,
    wb_to_predict,
)
# Print the sample followed by the label it was assigned.
print(wb_to_predict, "属于", result)

['Rain' 'Hot' 'High' 'Weak'] 属于 No
