```
 Name: 張宸愷
 ID: 0710018
```


In [78]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import KFold
from sklearn.naive_bayes import CategoricalNB
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# Load the agaricus-lepiota data file. It has no header row, so columns are
# addressed by integer position; column 0 is later used as the class label.
mush = pd.read_csv(
    "data/agaricus-lepiota.data", sep=",", header=None
).drop(columns=11)  # column 11 is dropped because it contains missing values

# Fit the encoder and turn every categorical column into int8 codes in one
# step. The fitted `encoder` is kept so codes can be decoded again later.
encoder = OrdinalEncoder(dtype=np.int8)
data = encoder.fit_transform(mush)

# Shuffle the rows in place (unseeded, so the order differs between runs).
np.random.shuffle(data)


# The one I wrote myself
def get_metrics(confm: np.ndarray):
    """Compute sensitivity, precision and accuracy from a binary confusion matrix.

    Args:
        confm: The four counts of a binary confusion matrix, either as a
            2x2 array laid out as ``[[tn, fp], [fn, tp]]`` or as a flat
            sequence ``[tn, fp, fn, tp]``.

    Returns:
        A tuple ``(sensitivity, precision, accuracy)``. A ratio whose
        denominator is zero evaluates to ``nan`` (NumPy emits a
        ``RuntimeWarning``) instead of raising.

    Raises:
        ValueError: If ``confm`` does not contain exactly four values.
    """
    # Flatten so that both the 2x2 layout and a flat 4-sequence are accepted;
    # unpacking a 2x2 array directly would fail because it only has two rows.
    counts = np.asarray(confm, dtype=float).ravel()
    if counts.size != 4:
        raise ValueError(
            "expected a binary confusion matrix with 4 entries, got {}".format(
                counts.size))
    tn, fp, fn, tp = counts

    sensitivity = tp / (tp + fn)
    precision = tp / (tp + fp)
    # The denominator is the total number of samples: every cell exactly once.
    accuracy = (tp + tn) / (tn + fp + fn + tp)

    return sensitivity, precision, accuracy


def single_run(training: np.ndarray, validation: np.ndarray):
    """Train a categorical naive Bayes model and score it on a validation set.

    Both arrays are ordinal-encoded; column 0 is used as the class label and
    the remaining columns as features.

    Args:
        training: Encoded rows used to fit the classifier.
        validation: Encoded rows used to evaluate the classifier.

    Returns:
        A tuple ``(confm, r)`` where ``confm`` is the confusion matrix of the
        decoded labels and ``r`` is the ``classification_report`` dictionary.
    """
    train_features, train_labels = training[:, 1:], training[:, 0]

    model = CategoricalNB(
        alpha=1e-10,
        # Per-feature category counts, one entry for each of the 21 features.
        min_categories=[6, 4, 10, 2, 9, 4, 3, 2, 12, 2, 4,
                        4, 9, 9, 2, 4, 3, 8, 9, 6, 7],
    )
    model.fit(train_features, train_labels)
    encoded_pred = model.predict(validation[:, 1:])

    # The module-level encoder decodes whole rows only, so the predicted codes
    # are placed in column 0 of a zero-filled array of the validation shape.
    padded_pred = np.zeros(validation.shape)
    padded_pred[:, 0] = encoded_pred

    predicted_labels = encoder.inverse_transform(padded_pred)[:, 0]
    true_labels = encoder.inverse_transform(validation)[:, 0]

    confm = confusion_matrix(true_labels, predicted_labels)
    r = classification_report(
        true_labels, predicted_labels, digits=8, output_dict=True)

    return confm, r


def print_metrics(r: dict) -> None:
    """Print recall, precision and accuracy as a Markdown table.

    Args:
        r: Report dictionary that provides ``r["p"]`` and ``r["e"]`` (each
            with ``"recall"`` and ``"precision"`` entries) and an overall
            ``r["accuracy"]`` value.

    Raises:
        KeyError: If any of the entries listed above is missing.
    """
    # Accuracy is a single overall figure, so it is repeated on both rows.
    accuracy = r["accuracy"]

    print("""
    | category | recall | precision | accuracy |
    |----|----|----|----|
    | p | {} | {} | {} |
    | e | {} | {} | {} |
      """.format(r["p"]["recall"], r["p"]["precision"], accuracy,
                 r["e"]["recall"], r["e"]["precision"], accuracy))


# Evaluation mode: "holdout" uses a single train/validation split; any other
# value runs K-fold cross-validation.
mode = "holdout"
training = data.copy()

if mode == "holdout":
    ratio = 0.7  # fraction of the rows used for training

    # Compute the split point once so both slices share the same boundary.
    split_at = int(training.shape[0] * ratio)
    validation = training[split_at:].copy()
    training = training[:split_at]

    confm, r = single_run(training, validation)
    print_metrics(r)
else:
    n_splits = 3
    kf = KFold(n_splits=n_splits)
    fold_confms = []
    for train_i, valid_i in kf.split(X=training[:, 1:]):
        # single_run returns (confusion matrix, report); only the matrix is
        # averaged here, so the report is discarded.
        fold_confm, _ = single_run(training[train_i], training[valid_i])
        fold_confms.append(fold_confm)
    # Element-wise mean of the per-fold confusion matrices.
    confm = np.sum(fold_confms, axis=0) / n_splits

print("tn, fp, fn, tp =", confm)



    | category | recall | precision | accuracy |
    |----|----|----|----|
    | p | 1.0 | 0.995857497928749 | 0.9979491386382281 |
    | e | 0.9959546925566343 | 1.0 | 0.9979491386382281 |
      
tn, fp, fn, tp = [[1231    5]
 [   0 1202]]


  
  # sdggj
  
  | category | recall | precision | accuracy |
  |----|----|----|----|
  | p | 0.9991511035653651 | 0.9957698815566836 | 0.9975389663658737 |
  | e | 0.996031746031746 | 0.9992038216560509 | 0.9975389663658737 |
      

      

In [79]:
# data visualization

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import math

# Raw (un-encoded) copy of the data file; column 0 holds the class label
# ('e' or 'p') and columns 1-22 hold the features named in `titles`.
visual_data: pd.DataFrame = pd.read_csv(
    "data/agaricus-lepiota.data", sep=",", header=None)

# Display name of each feature column; titles[i] labels column i + 1.
titles = ["cap-shape",
          "cap-surface",
          "cap-color",
          "bruises",
          "odor",
          "gill-attachment",
          "gill-spacing",
          "gill-size",
          "gill-color",
          "stalk-shape",
          "stalk-root",
          "stalk-surface-above-ring",
          "stalk-surface-below-ring",
          "stalk-color-above-ring",
          "stalk-color-below-ring",
          "veil-type",
          "veil-color",
          "ring-number",
          "ring-type",
          "spore-print-color",
          "population",
          "habitat"]

# Set to True to draw the figures; plotting is skipped by default.
generate_graph = False


def _plot_value_counts(frame: pd.DataFrame, heading: str):
    """Draw one bar chart of value counts per feature column of `frame`.

    Args:
        frame: Rows to summarise; feature columns are labelled 1..len(titles).
        heading: Text used as the figure's super-title.

    Returns:
        The created matplotlib figure.
    """
    fig, axs = plt.subplots(6, 4, figsize=(15, 22), dpi=100)
    # axs.flat walks the 6x4 grid row by row, so feature i lands on row
    # (i - 1) // 4, column (i - 1) % 4; the grid cells beyond the last
    # feature are left empty.
    for ax, (column, title) in zip(axs.flat, enumerate(titles, start=1)):
        counts = frame[column].value_counts()
        ax.bar(counts.index, counts.values)
        ax.set_title(title)

    fig.suptitle(heading)
    return fig


if generate_graph:
    # All samples, then each class on its own. Each subset is filtered once
    # up front rather than once per plotted column.
    _plot_value_counts(visual_data, "Mixed targets")
    _plot_value_counts(visual_data[visual_data[0] == 'e'], "Edible targets")
    _plot_value_counts(visual_data[visual_data[0] == 'p'], "Poisonous targets")
