In [4]:
import math
import os
from typing import List, Dict

import pandas as pd

class NB:
    """Naive Bayes classifier for categorical data (binary by default).

    Every record consists of ``NV - 1`` feature variables followed by the
    class variable X_0 in the last column, and every variable takes integer
    values in ``range(NC)``.  Constructing an instance runs the whole
    pipeline: read ``LD.csv`` / ``TD.csv``, build the frequency tables,
    estimate the parameters and print the accuracy on the test data.
    """

    # Number of categories each variable (features and class) can take.
    NC = 2
    # Number of variables (columns), including the class column.
    NV = 0
    # Number of learning (training) records.
    N = 0
    # Learning data.
    LD = pd.DataFrame()
    # Test data.
    TD = pd.DataFrame()
    # Frequency tables.
    # ft_c[c] holds N_{X_0=c}, the number of learning records of class c
    # (X_0 denotes the class variable).
    ft_c: List[int] = []
    # ft_ic[i][k][c] holds N_{X_i=k, X_0=c}, the number of learning records
    # of class c whose i-th feature equals k.
    ft_ic: List[List[List[int]]] = []

    def __init__(self, dataset: str = "spam", data_dir: str = "data") -> None:
        """Load *dataset*, train on it and report the test accuracy.

        Args:
            dataset: Name of the sub-directory of *data_dir* that contains
                ``LD.csv`` and ``TD.csv``.
            data_dir: Directory that contains the dataset directories.
        """
        print("Dataset: " + dataset)
        # Load the learning and test data.
        self.readData(os.path.join(data_dir, dataset))
        print(f"Number of Test Data: {len(self.TD)}")
        print(f"Number of Learning Data: {len(self.LD)}")
        print(f"Number of Variables: {self.NV}")
        # Build the frequency tables (the method resets them itself).
        self.setFrequencyTable()
        print("Frequency Table has been set.")
        self._print_frequency_check()
        self.theta_k, self.theta_kc = self._get_parameters()
        print("Parameters have been set.")
        self.classification(self.theta_k, self.theta_kc)

    def _print_frequency_check(self, columns=("i", "to"),
                               label: str = "CLASS(spam)") -> None:
        """Print frequency-table cells next to counts recomputed with pandas.

        This is a sanity check of ``setFrequencyTable``: each ``ft_*`` line
        is followed by a ``true :`` line computed directly from ``LD``.

        Args:
            columns: Feature columns to check; names absent from ``LD`` are
                skipped so that other datasets do not raise ``KeyError``.
            label: Name of the class column; falls back to the last column
                of ``LD`` when no column has this name.
        """
        names = list(self.LD.columns)
        if not names:
            return
        if label not in names:
            label = names[-1]
        col_to_idx = {name: idx for idx, name in enumerate(names)}
        classes = range(self.NC)
        separator = "=" * 50

        print(separator)
        print(", ".join(f"ft_c[{c}]:{self.ft_c[c]}" for c in classes))
        print("true : " + ", ".join(
            str(int((self.LD[label] == c).sum())) for c in classes))
        print(separator)
        for name in columns:
            j = col_to_idx.get(name)
            # Only feature columns have an entry in ft_ic.
            if j is None or j >= len(self.ft_ic):
                continue
            for k in classes:
                print(", ".join(
                    f"ft_ic['{name}'][{k}][{c}]:{self.ft_ic[j][k][c]}"
                    for c in classes))
                print("true : " + ", ".join(
                    str(int(((self.LD[name] == k)
                             & (self.LD[label] == c)).sum()))
                    for c in classes))
            print(separator)

    def _category(self, value) -> int:
        """Return *value* as a category index in ``range(NC)``.

        Raises:
            ValueError: If *value* is not an integer in ``range(NC)``.
                Without this check a negative value would silently index
                the tables from the end.
        """
        k = int(value)
        if k != value or not 0 <= k < self.NC:
            raise ValueError(
                f"category value {value!r} is outside range({self.NC})")
        return k

    def _get_parameters(self, alpha: float = 1.0) -> tuple:
        """Estimate the class priors and the conditional probabilities.

        Args:
            alpha: Additive (Laplace) smoothing constant.  With the default
                of 1.0 a (value, class) combination that never occurs in
                the learning data gets a small positive probability rather
                than an extreme one.  ``alpha=0`` gives the plain
                relative-frequency estimate.

        Returns:
            A pair ``(theta_k, theta_kc)`` where ``theta_k[c]`` estimates
            P(X_0=c) and ``theta_kc[i][k][c]`` estimates P(X_i=k | X_0=c).

        Raises:
            ValueError: If *alpha* is negative.
        """
        if alpha < 0:
            raise ValueError("alpha must be non-negative")
        classes = range(self.NC)
        uniform = 1.0 / self.NC

        # Without learning data there is no evidence: use a uniform prior
        # instead of dividing by zero.
        theta_k = [
            self.ft_c[c] / self.N if self.N > 0 else uniform for c in classes
        ]

        theta_kc = []
        for j in range(max(self.NV - 1, 0)):
            table = [[0.0 for _ in classes] for _ in classes]
            for k in classes:
                for c in classes:
                    denominator = self.ft_c[c] + alpha * self.NC
                    if denominator == 0:
                        # Class never observed and no smoothing requested.
                        table[k][c] = uniform
                    else:
                        table[k][c] = (self.ft_ic[j][k][c] + alpha) / denominator
            theta_kc.append(table)

        return theta_k, theta_kc

    def classification(self, theta_k, theta_kc, *, verbose: bool = True) -> None:
        """Classify every test record and print the resulting accuracy.

        A record is assigned to the class with the largest posterior score;
        ties go to the lowest class index.  Scores are accumulated as sums
        of log-probabilities because the product of many probabilities can
        underflow to 0.0 for every class.

        Args:
            theta_k: Class priors, ``theta_k[c]`` = P(X_0=c).
            theta_kc: Conditional probabilities,
                ``theta_kc[i][k][c]`` = P(X_i=k | X_0=c).
            verbose: Print a progress line for every test record.

        The accuracy is printed and also stored in ``self.accuracy``
        (0.0 when there is no test data).

        Raises:
            ValueError: If a feature value of a test record is not in
                ``range(NC)``.
        """
        def safe_log(p: float) -> float:
            # A zero probability rules the class out: log(0) -> -inf.
            return math.log(p) if p > 0 else -math.inf

        classes = range(self.NC)
        n_features = max(self.NV - 1, 0)
        label_col = self.NV - 1
        log_prior = [safe_log(theta_k[c]) for c in classes]
        log_cond = [
            [[safe_log(p) for p in per_value] for per_value in per_variable]
            for per_variable in theta_kc
        ]

        n_test = len(self.TD)
        num_correct_prediction = 0
        for i, row in enumerate(self.TD.to_numpy(), start=1):
            if verbose:
                print(f"Predicting Test Data {i}/{n_test}")
            scores = list(log_prior)
            for j in range(n_features):
                log_p = log_cond[j][self._category(row[j])]
                for c in classes:
                    scores[c] += log_p[c]
            # max() returns the first maximal element, so ties -> class 0.
            y_pred = max(classes, key=scores.__getitem__)
            # The true class label is stored in the last column.
            if row[label_col] == y_pred:
                num_correct_prediction += 1

        accuracy = num_correct_prediction / float(n_test) if n_test > 0 else 0.0
        self.accuracy = accuracy
        print(f"Classification accuracy: {accuracy}")

    def readData(self, filePath: str) -> None:
        """Read ``LD.csv`` and ``TD.csv`` from directory *filePath*.

        Sets ``LD`` and ``TD`` to the loaded frames, ``NV`` to the number
        of columns of ``LD`` and ``N`` to its number of rows.
        """
        self.LD = pd.read_csv(os.path.join(filePath, "LD.csv"))
        self.TD = pd.read_csv(os.path.join(filePath, "TD.csv"))
        self.NV = len(self.LD.columns)
        self.N = len(self.LD)

    def setFrequencyTable(self) -> None:
        """Count N_{X_0=c} and N_{X_i=k, X_0=c} into ``ft_c`` and ``ft_ic``.

        The tables are reset first, so calling this method repeatedly does
        not double-count.

        Raises:
            ValueError: If a value in the learning data is not in
                ``range(NC)``.
        """
        n_features = max(self.NV - 1, 0)
        self.ft_c = [0 for _ in range(self.NC)]
        self.ft_ic = [
            [[0 for _ in range(self.NC)] for _ in range(self.NC)]
            for _ in range(n_features)
        ]
        if self.NV < 1:
            return

        label_col = self.NV - 1
        # A single pass over a NumPy view of the data avoids one
        # DataFrame.iat call per cell.
        for row in self.LD.to_numpy():
            c = self._category(row[label_col])
            self.ft_c[c] += 1
            for j in range(n_features):
                self.ft_ic[j][self._category(row[j])][c] += 1

# Run the whole pipeline: constructing NB reads the data, builds the frequency
# tables, estimates the parameters and prints the classification accuracy.
NB()

Dataset: spam
Number of Test Data: 375
Number of Learning Data: 1000
Number of Variables: 301
Frequency Table has been set.
ft_c[0]:451, ft_c[1]:549
true : 451, 549
ft_ic['i'][0][0]:272, ft_ic['i'][0][1]:521
true : 272, 521
ft_ic['i'][1][0]:179, ft_ic['i'][1][1]:28
true : 179, 28
ft_ic['to'][0][0]:348, ft_ic['to'][0][1]:199
true : 348, 199
ft_ic['to'][1][0]:103, ft_ic['to'][1][1]:350
true : 103, 350
Parameters have been set.
Predicting Test Data 1/375
Predicting Test Data 2/375
Predicting Test Data 3/375
Predicting Test Data 4/375
Predicting Test Data 5/375
Predicting Test Data 6/375
Predicting Test Data 7/375
Predicting Test Data 8/375
Predicting Test Data 9/375
Predicting Test Data 10/375
Predicting Test Data 11/375
Predicting Test Data 12/375
Predicting Test Data 13/375
Predicting Test Data 14/375
Predicting Test Data 15/375
Predicting Test Data 16/375
Predicting Test Data 17/375
Predicting Test Data 18/375
Predicting Test Data 19/375
Predicting Test Data 20/375
Predicting Test Data

<__main__.NB at 0x21c54bd5890>