# 문제 

categorical feature로 구성된 입력 값을 사용하는 Naive Bayes Classifier를 구현  
models.py에서 Fill-in으로 처리된 부분(fit과 predict)을 채워주면 됩니다.  
Categorical feature의 경우 Naive Bayes Classifier를 어떻게 만들 수 있는지는 수식을 참고

(참고로, Naive Bayes Classifier는 확률을 count 기반으로 추정하기 때문에 학습 시 랜덤한 특성이 없음. 따라서, source code와 동일한 결과를 내야 함.)

모든 결과는 smoothing factor m=1로 설정

In [16]:
import numpy as np

class NaiveBayesClassifier:
    """Naive Bayes classifier for categorical features with additive smoothing.

    Conditional probabilities are estimated as
    ``(count(value, class) + m) / (count(class) + m * K)`` where ``K`` is the
    number of distinct values the feature takes in the whole training set.
    """

    def __init__(self):
        # Prior probability of each class, aligned with ``self.classes``.
        self.prior = None
        # Object array of shape (n_classes, n_features); every cell is a dict
        # mapping a feature value to P(value | class).
        self.conditional_prob = None
        # Sorted unique class labels; set by ``fit``.
        self.classes = None
        self.m = 1  # smoothing factor

    def fit(self, X, y):
        """Estimate class priors and smoothed per-feature conditionals.

        Args:
            X: 2-D array-like of categorical values, one row per sample.
            y: 1-D array-like of class labels, one per row of ``X``.

        Returns:
            The fitted classifier (``self``).
        """
        X = np.asarray(X)
        y = np.asarray(y)
        n_samples, n_features = X.shape
        self.classes, class_counts = np.unique(y, return_counts=True)
        n_classes = len(self.classes)

        # Prior probability of each class: 'self.prior'
        self.prior = class_counts / n_samples

        # The category set of a feature is taken from the whole training set,
        # not from one class, so every class shares the same K and a value
        # that never co-occurs with a class still gets m / (N_c + m * K).
        categories = [np.unique(X[:, f_idx]) for f_idx in range(n_features)]

        # Conditional probability of each feature value given each class:
        # 'self.conditional_prob' holds one dict per (class, feature) cell.
        self.conditional_prob = np.empty((n_classes, n_features), dtype=object)
        for c_idx, c in enumerate(self.classes):
            samples_in_class = X[y == c]
            n_in_class = len(samples_in_class)
            for f_idx, feature_categories in enumerate(categories):
                seen_vals, seen_counts = np.unique(
                    samples_in_class[:, f_idx], return_counts=True
                )
                observed = dict(zip(seen_vals, seen_counts))
                denom = n_in_class + self.m * len(feature_categories)
                self.conditional_prob[c_idx, f_idx] = {
                    val: (observed.get(val, 0) + self.m) / denom
                    for val in feature_categories
                }
        return self

    def predict(self, X):
        """Return the most probable class label for every row of ``X``.

        Scores are accumulated in log space. A value that was never seen for
        a feature during training is skipped for every class alike, so it
        does not favour any class.

        Args:
            X: 2-D array-like with the same feature order used in ``fit``.

        Returns:
            1-D array of predicted labels with the dtype of ``self.classes``.
        """
        n_classes = len(self.classes)
        # Use the label dtype so string labels or large integers are not
        # truncated or rejected.
        y_pred = np.empty(len(X), dtype=self.classes.dtype)

        # With m == 0 a probability can be exactly 0; log(0) = -inf is the
        # intended score in that case, so silence the divide warning.
        with np.errstate(divide="ignore"):
            log_prior = np.log(self.prior)
            for i, sample in enumerate(X):
                scores = log_prior.copy()
                for c_idx in range(n_classes):
                    for f_idx, val in enumerate(sample):
                        p = self.conditional_prob[c_idx, f_idx].get(val)
                        if p is not None:
                            scores[c_idx] += np.log(p)
                y_pred[i] = self.classes[np.argmax(scores)]

        return y_pred

In [11]:
import numpy as np
import math

class NaiveBayesClassifier:
    """Naive Bayes classifier for categorical features (pure-Python version).

    Conditional probabilities use additive smoothing:
    ``(count(value, class) + m) / (count(class) + m * K)`` where ``K`` is the
    number of distinct values of the feature in the training data.
    """

    def __init__(self):
        # {class label: P(class)}
        self.priors = {}
        # {(feature index, class label): {feature value: P(value | class)}}
        self.likelihoods = {}
        self.m = 1  # smoothing factor

    @staticmethod
    def _log(p):
        """Return log(p), mapping p == 0 to -inf instead of raising."""
        return math.log(p) if p > 0 else -math.inf

    def fit(self, X, y):
        """Estimate priors and smoothed likelihoods from the training data.

        Args:
            X: Sequence of rows; each row is a sequence of categorical values.
            y: Sequence of class labels, one per row of ``X``.

        Returns:
            The fitted classifier (``self``).
        """
        n = len(y)
        # Start from a clean state so a second fit does not keep stale entries.
        self.priors = {}
        self.likelihoods = {}

        # Calculate priors
        class_counts = {}
        for label in y:
            class_counts[label] = class_counts.get(label, 0) + 1
        for c, n_c in class_counts.items():
            self.priors[c] = n_c / n

        # Calculate likelihoods with a single pass over the rows per feature.
        n_features = len(X[0]) if n else 0
        for i in range(n_features):
            feature_values = set()
            joint_counts = {}
            for row, label in zip(X, y):
                value = row[i]
                feature_values.add(value)
                key = (value, label)
                joint_counts[key] = joint_counts.get(key, 0) + 1

            n_values = len(feature_values)
            for c, n_c in class_counts.items():
                # The denominator adds m once per category of this feature
                # (not per class) so the conditionals sum to 1.
                denom = n_c + self.m * n_values
                self.likelihoods[(i, c)] = {
                    f: (joint_counts.get((f, c), 0) + self.m) / denom
                    for f in feature_values
                }
        return self

    def predict(self, X):
        """Return the most probable class label for every row of ``X``.

        Scores are summed in log space so that many features do not
        underflow to zero. A value never seen for a feature during training
        is skipped for every class alike.

        Args:
            X: Sequence of rows with the same feature order used in ``fit``.

        Returns:
            List of predicted class labels.
        """
        y_pred = []
        for example in X:
            posteriors = {}
            for c, prior in self.priors.items():
                score = self._log(prior)
                for i, value in enumerate(example):
                    table = self.likelihoods.get((i, c))
                    if table is None:
                        continue
                    p = table.get(value)
                    if p is not None:
                        score += self._log(p)
                posteriors[c] = score
            y_pred.append(max(posteriors, key=posteriors.get))
        return y_pred


In [17]:
import numpy as np
import pandas as pd
import sys

# Make the project folder on Google Drive importable, then load the model.
sys.path.append('/content/drive/My Drive/응용통계학과/기계학습/Project1_Final')
from models import NaiveBayesClassifier

# Load the data frames.
# NOTE(review): te_df is read from train_data.csv as well, so the score
# printed below is measured on the training rows — confirm this is intended.
tr_df = pd.read_csv('/content/drive/My Drive/응용통계학과/기계학습/Project1_Final/train_data.csv')
te_df = pd.read_csv('/content/drive/My Drive/응용통계학과/기계학습/Project1_Final/train_data.csv')

# Feature columns and the label column.
feat_list = [
    'age',
    'workclass',
    'education',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'capitalgain',
    'capitalloss',
    'hoursperweek',
]
label = 'class'

# Convert features and labels to arrays.
tr_X, tr_y = np.asarray(tr_df[feat_list]), np.asarray(tr_df[label])
te_X, te_y = np.asarray(te_df[feat_list]), np.asarray(te_df[label])

# Fit the classifier, show the learned table, then print the accuracy.
model = NaiveBayesClassifier()
model.fit(tr_X, tr_y)

print(model.conditional_prob)
print(np.mean(model.predict(te_X) == te_y))

[[0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0]]
0.7619327924653853


In [14]:
import numpy as np
import pandas as pd

# This cell imports only numpy and pandas above, so bring `sys` into scope
# here; otherwise the cell only runs if an earlier cell already imported it.
import sys

sys.path.append('/content/drive/My Drive/응용통계학과/기계학습/Project1_Final')                                      # add the Google Drive project path

from models import NaiveBayesClassifier

# import data
tr_df = pd.read_csv('/content/drive/My Drive/응용통계학과/기계학습/Project1_Final/train_data.csv')
# te_df = pd.read_csv('./test_data.csv')

# Hold out 10% of the training file as the evaluation set (fixed seed).
from sklearn.model_selection import train_test_split
tr_df, te_df = train_test_split(tr_df, test_size=0.1, random_state=48)

# split feature/labels
feat_list = ['age', 'workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capitalgain', 'capitalloss', 'hoursperweek']
label     = 'class'

tr_X      = np.asarray(tr_df[feat_list])
tr_y      = np.asarray(tr_df[label])

te_X      = np.asarray(te_df[feat_list])
te_y      = np.asarray(te_df[label])



model = NaiveBayesClassifier()

model.fit(tr_X,tr_y)
# Fraction of held-out rows whose predicted label equals the true label.
print(np.mean(model.predict(te_X) == te_y))

0.7648413510747185
