```
 Name: 張宸愷
 ID: 0710018
```


In [20]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder

mush = pd.read_csv("data/agaricus-lepiota.data", sep=",", header=None)
mush = mush.drop(columns=11) # drop features with '?'

# encode the categorical features
encoder = OrdinalEncoder(dtype=np.int8)
encoder.fit(mush)
data = encoder.transform(mush) # encode the dataset into integers
np.random.shuffle(data)



In [21]:
from sklearn.naive_bayes import CategoricalNB
from sklearn.metrics import confusion_matrix

# define a single run


def single_run(training: np.ndarray, validation: np.ndarray):
    cat_NB = CategoricalNB(alpha=10)
    cat_NB.fit(training[:, 1:], training[:, 0])

    prediction = cat_NB.predict(validation[:, 1:])
    confm = confusion_matrix(validation[:, 0], prediction).ravel()
    tn, fp, fn, tp = confm
    sensitivity = tp / (tp + fn)
    precision = tp / (tp + fp)
    accuracy = (tp + tn) / (tn + tp + tn + fn)
    return confm, sensitivity, precision, accuracy


In [22]:
import math
from sklearn.model_selection import KFold

mode = "1"
training = data.copy()


if mode == "holdout":

    ratio = 0.7

    validation = training[int(training.shape[0]*ratio):].copy()
    training = training[0:int(training.shape[0]*ratio)]
    confm, sensitivity, precision, accuracy = single_run(training, validation)
else:
    kf = KFold(n_splits=3)
    sum = (0,0,0,0)
    for train_i, valid_i in kf.split(X=training[:, 1:]):
        confm, sensitivity, precision, accuracy = single_run(
            training[train_i], training[valid_i])


print("tn, fp, fn, tp =", confm)
print("(sensitivity, precision, accuracy) =",
      (sensitivity, precision, accuracy))


tn, fp, fn, tp = [1363   13  174 1158]
(sensitivity, precision, accuracy) = (0.8693693693693694, 0.9888983774551665, 0.6212419911286348)


In [23]:
# data visualization

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import math

visual_data: pd.DataFrame = pd.read_csv(
    "data/agaricus-lepiota.data", sep=",", header=None)

titles = ["cap-shape",
          "cap-surface",
          "cap-color",
          "bruises",
          "odor",
          "gill-attachment",
          "gill-spacing",
          "gill-size",
          "gill-color",
          "stalk-shape",
          "stalk-root",
          "stalk-surface-above-ring",
          "stalk-surface-below-ring",
          "stalk-color-above-ring",
          "stalk-color-below-ring",
          "veil-type",
          "veil-color",
          "ring-number",
          "ring-type",
          "spore-print-color",
          "population",
          "habitat"]

generate_graph = False

if (generate_graph):

    # plot mixed
    fig, axs = plt.subplots(6, 4, figsize=(15, 22), dpi=100)
    for c in range(1, 23):
        cur = visual_data[c].value_counts()

        axs[int((c-1)/4), (c-1) % 4].bar(cur.index, cur.values)
        axs[int((c-1)/4), (c-1) % 4].set_title(titles[c-1])

    fig.suptitle("Mixed targets")

    # plot edible
    selector = visual_data[0] == 'e'
    fig, axs = plt.subplots(6, 4, figsize=(15, 22), dpi=100)
    for c in range(1, 23):
        aa = visual_data[selector][c].value_counts()
        axs[int((c-1)/4), (c-1) % 4].bar(aa.index, aa.values)
        axs[int((c-1)/4), (c-1) % 4].set_title(titles[c-1])

    fig.suptitle("Edible targets")

    # plot poisonous
    selector = visual_data[0] == 'p'
    fig, axs = plt.subplots(6, 4, figsize=(15, 22), dpi=100)
    for c in range(1, 23):
        aa = visual_data[selector][c].value_counts()
        axs[int((c-1)/4), (c-1) % 4].bar(aa.index, aa.values)
        axs[int((c-1)/4), (c-1) % 4].set_title(titles[c-1])

    fig.suptitle("Poisonous targets")
