In [None]:
import numpy as np 
import matplotlib.pyplot as plt

# install
## numpy
## matplotlib

In [None]:
# Mount Google Drive into the Colab runtime, print the current directory,
# then switch the working directory to the assignment folder.
# NOTE(review): presumably the later `from dataset.mnist import ...` relies on
# this working directory — confirm the folder contains the `dataset` package.
from google.colab import drive
drive.mount('/content/drive')
!pwd
%cd /content/drive/MyDrive/Colab Notebooks/assign08

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content
/content/drive/MyDrive/Colab Notebooks/assign08


## data load & preprocessing

#### MNIST 손글씨 데이터셋

In [None]:
# Load the MNIST train/test splits through the project-local `dataset.mnist` helper.
# NOTE(review): flatten=False / normalize=False presumably keep the image layout
# and the raw pixel values — confirm against dataset/mnist.py.
from dataset.mnist import load_mnist

(train_raw_img, train_y), (test_raw_img, test_y) = load_mnist(flatten=False, normalize=False)

In [None]:
# Flatten every image into a single row: (N, ...) -> (N, number_of_pixels).
# The row count is taken from the leading axis directly; calling squeeze()
# first would also drop a batch axis of size 1 and yield a wrong row count.
train_X = train_raw_img.reshape(len(train_raw_img), -1)
test_X = test_raw_img.reshape(len(test_raw_img), -1)

### multinomial naive bayes

In [None]:
def train_MNB(X, y):
    """Fit a multinomial naive Bayes model with add-one smoothing.

    Args:
        X: 2-D numpy array, one row per sample; feature values are summed
            per class and treated as counts.
        y: 1-D numpy array of class labels aligned with the rows of X.

    Returns:
        (class_priors, feature_parameters), both dicts keyed by class label.
        class_priors[c] is the fraction of samples labelled c.
        feature_parameters[c] is the smoothed per-feature count vector of
        class c divided by its own total, so it sums to 1.
    """
    labels = np.unique(y)
    n_total = len(y)

    # Prior of a class = its relative frequency among the labels.
    class_priors = {label: np.sum(y == label) / n_total for label in labels}

    # Likelihoods: per-class feature totals, +1 so that no feature ends up
    # with probability zero, then normalized over all features.
    feature_parameters = {}
    for label in labels:
        smoothed = X[y == label].sum(axis=0) + 1
        feature_parameters[label] = smoothed / smoothed.sum()

    return class_priors, feature_parameters

In [None]:
def MLE_with_MNB(X, y, class_priors, feature_parameters):
    """Predict class labels with a fitted multinomial naive Bayes model.

    For every sample the score of class c is
    ``log(prior[c]) + sum_k x_k * log(theta[c][k])`` and the class with the
    highest score is returned.

    Args:
        X: 2-D array of feature values, one row per sample.
        y: Kept for backward compatibility; not used. The candidate classes
            are taken from ``class_priors`` so that predictions never depend
            on the labels being predicted and labels unseen during fitting
            cannot cause a lookup failure.
        class_priors: dict mapping class label -> prior probability.
        feature_parameters: dict mapping class label -> 1-D array of
            per-feature probabilities (same labels as ``class_priors``).

    Returns:
        list with one predicted class label per row of X.

    Raises:
        ValueError: if X has samples but ``class_priors`` is empty.
    """
    X = np.asarray(X, dtype=np.float64)  # float copy: avoids small-int arithmetic on raw pixels
    if X.shape[0] == 0:
        return []

    # Sorted so that ties are broken towards the smallest label, exactly as
    # when iterating over np.unique(...) of the labels.
    classes = sorted(class_priors)
    if not classes:
        raise ValueError("class_priors is empty; fit the model before predicting")

    log_prior = np.log(np.array([class_priors[c] for c in classes], dtype=np.float64))
    log_theta = np.log(np.array([feature_parameters[c] for c in classes], dtype=np.float64))

    # (n_samples, n_features) @ (n_features, n_classes) -> all scores at once,
    # replacing the per-sample / per-class / per-feature Python loops.
    scores = X @ log_theta.T + log_prior
    best = np.argmax(scores, axis=1)

    return [classes[i] for i in best]

In [None]:
# Fit and evaluate multinomial NB on the first n_samples examples of each split.
n_samples = 100

class_priors, feature_parameters = train_MNB(train_X[:n_samples], train_y[:n_samples])

# The predictor is handed the labels the model was fitted on, so any class set
# it derives from them matches the fitted parameters (the full label array may
# contain classes that are absent from the fitted subset).
train_preds = MLE_with_MNB(train_X[:n_samples], train_y[:n_samples], class_priors, feature_parameters)
train_acc = np.mean(np.asarray(train_preds) == train_y[:n_samples])
print(train_acc)

test_pred = MLE_with_MNB(test_X[:n_samples], train_y[:n_samples], class_priors, feature_parameters)
test_acc = np.mean(np.asarray(test_pred) == test_y[:n_samples])
print(test_acc)

0.97
0.73


In [None]:
# Fit and evaluate multinomial NB on the first n_samples examples of each split.
n_samples = 1000

class_priors, feature_parameters = train_MNB(train_X[:n_samples], train_y[:n_samples])

# The predictor is handed the labels the model was fitted on, so any class set
# it derives from them matches the fitted parameters (the full label array may
# contain classes that are absent from the fitted subset).
train_preds = MLE_with_MNB(train_X[:n_samples], train_y[:n_samples], class_priors, feature_parameters)
train_acc = np.mean(np.asarray(train_preds) == train_y[:n_samples])
print(train_acc)

test_pred = MLE_with_MNB(test_X[:n_samples], train_y[:n_samples], class_priors, feature_parameters)
test_acc = np.mean(np.asarray(test_pred) == test_y[:n_samples])
print(test_acc)

0.865
0.779


In [None]:
# Fit and evaluate multinomial NB on the first n_samples examples of each split.
n_samples = 10000

class_priors, feature_parameters = train_MNB(train_X[:n_samples], train_y[:n_samples])

# The predictor is handed the labels the model was fitted on, so any class set
# it derives from them matches the fitted parameters (the full label array may
# contain classes that are absent from the fitted subset).
train_preds = MLE_with_MNB(train_X[:n_samples], train_y[:n_samples], class_priors, feature_parameters)
train_acc = np.mean(np.asarray(train_preds) == train_y[:n_samples])
print(train_acc)

test_pred = MLE_with_MNB(test_X[:n_samples], train_y[:n_samples], class_priors, feature_parameters)
test_acc = np.mean(np.asarray(test_pred) == test_y[:n_samples])
print(test_acc)

0.834
0.8341


Multinomial Naive Bayes는 다중 클래스 분류에 사용될 수 있는데, 각 클래스에 대한 확률을 추정하고 가장 높은 확률을 갖는 클래스로 데이터를 분류한다.

Multinomial Naive Bayes는 텍스트 분류와 같은 다중 클래스가 필요할 때 적합하다

Train과 Test 모두 크기를 증가시킬수록 두 Accuracy의 차이가 줄어들기 때문에 더욱 정확해진다.

Train Accuracy가 Test Accuracy 보다 클 때는 Overfitting(과적합)이 발생할 가능성이 있으므로, Case 1과 2는 과적합 가능성이 있다.

### bernoulli naive bayes

In [None]:
def train_BNB(X, y, binarize_threshold=0):
    """Fit a Bernoulli naive Bayes model.

    Bernoulli NB models each feature as an on/off event, so the features are
    binarized before estimation. Averaging raw intensities (e.g. 0-255 pixel
    values) would produce "probabilities" far outside [0, 1].

    Args:
        X: 2-D array, one row per sample.
        y: 1-D array of class labels aligned with the rows of X.
        binarize_threshold: a feature counts as "on" when its value is
            strictly greater than this threshold. The default 0 leaves
            already-binary 0/1 input unchanged.

    Returns:
        (class_priors, feature_parameters) where, with classes ordered as in
        ``np.unique(y)``:
        class_priors has shape (n_classes,) and holds each class's relative
        frequency; feature_parameters has shape (n_classes, n_features) and
        holds, per class, the fraction of its samples in which each feature
        is on.
    """
    X = np.asarray(X)
    y = np.asarray(y)

    classes = np.unique(y)
    num_classes = len(classes)
    num_features = X.shape[1]

    # Boolean on/off view of the features; the mean of a boolean column is
    # the fraction of samples in which the feature is on.
    X_bin = X > binarize_threshold

    class_priors = np.zeros(num_classes)
    feature_parameters = np.zeros((num_classes, num_features))

    for i, c in enumerate(classes):
        in_class = y == c

        # Class prior: relative frequency of the class.
        class_priors[i] = np.sum(in_class) / len(y)

        # Feature parameters: P(feature is on | class).
        feature_parameters[i] = np.mean(X_bin[in_class], axis=0)

    return class_priors, feature_parameters

In [None]:
def MLE_with_BNB(X, y, class_priors, feature_parameters, binarize_threshold=0):
    """Predict class labels with a fitted Bernoulli naive Bayes model.

    The score of class j for a binarized sample x is
    ``log(prior[j]) + sum_k [x_k*log(p_jk) + (1-x_k)*log(1-p_jk)]`` and the
    class with the highest score is returned.

    Args:
        X: 2-D array, one row per sample.
        y: label array used only to recover the class labels via
            ``np.unique(y)``; row j of the parameters is interpreted as the
            j-th unique label, so y must contain exactly the classes the
            model was fitted on.
        class_priors: array of shape (n_classes,) with prior probabilities.
        feature_parameters: array of shape (n_classes, n_features) with
            P(feature is on | class).
        binarize_threshold: a feature counts as "on" when its value is
            strictly greater than this threshold. Binarizing here also
            avoids the wrap-around of ``1 - sample`` on unsigned integer
            input such as uint8 pixels.

    Returns:
        float64 array of shape (n_samples,) with the predicted class labels.

    Raises:
        ValueError: if the number of unique labels in y differs from the
            number of fitted classes.
    """
    classes = np.unique(y)
    class_priors = np.asarray(class_priors, dtype=np.float64)
    feature_parameters = np.asarray(feature_parameters, dtype=np.float64)

    if len(classes) != len(class_priors):
        raise ValueError(
            "y contains %d distinct classes but the model was fitted with %d"
            % (len(classes), len(class_priors))
        )

    # Floor for probabilities before taking the log, so log(0) never occurs.
    epsilon = 1e-10

    X_bin = (np.asarray(X) > binarize_threshold).astype(np.float64)

    log_on = np.log(np.maximum(feature_parameters, epsilon))
    log_off = np.log(np.maximum(1 - feature_parameters, epsilon))

    # Scores for all samples and classes at once: shape (n_samples, n_classes).
    scores = X_bin @ log_on.T + (1.0 - X_bin) @ log_off.T + np.log(class_priors)

    # Pick the label with the highest log-likelihood for every sample.
    preds = classes[np.argmax(scores, axis=1)].astype(np.float64)

    return preds

In [None]:
# Fit and evaluate Bernoulli NB on the first n_samples examples of each split.
n_samples = 100

class_priors, feature_parameters = train_BNB(train_X[:n_samples], train_y[:n_samples])

# The predictor maps parameter rows to labels through the unique values of the
# label array it receives, so it must get the labels the model was fitted on;
# the full label array could contain classes missing from the fitted subset.
train_preds = MLE_with_BNB(train_X[:n_samples], train_y[:n_samples], class_priors, feature_parameters)
train_acc = np.mean(train_preds == train_y[:n_samples])
print(train_acc)

test_pred = MLE_with_BNB(test_X[:n_samples], train_y[:n_samples], class_priors, feature_parameters)
test_acc = np.mean(test_pred == test_y[:n_samples])
print(test_acc)

0.69
0.47


In [None]:
# Fit and evaluate Bernoulli NB on the first n_samples examples of each split.
n_samples = 1000

class_priors, feature_parameters = train_BNB(train_X[:n_samples], train_y[:n_samples])

# The predictor maps parameter rows to labels through the unique values of the
# label array it receives, so it must get the labels the model was fitted on;
# the full label array could contain classes missing from the fitted subset.
train_preds = MLE_with_BNB(train_X[:n_samples], train_y[:n_samples], class_priors, feature_parameters)
train_acc = np.mean(train_preds == train_y[:n_samples])
print(train_acc)

test_pred = MLE_with_BNB(test_X[:n_samples], train_y[:n_samples], class_priors, feature_parameters)
test_acc = np.mean(test_pred == test_y[:n_samples])
print(test_acc)

0.529
0.439


In [None]:
# Fit and evaluate Bernoulli NB on the first n_samples examples of each split.
n_samples = 10000

class_priors, feature_parameters = train_BNB(train_X[:n_samples], train_y[:n_samples])

# The predictor maps parameter rows to labels through the unique values of the
# label array it receives, so it must get the labels the model was fitted on;
# the full label array could contain classes missing from the fitted subset.
train_preds = MLE_with_BNB(train_X[:n_samples], train_y[:n_samples], class_priors, feature_parameters)
train_acc = np.mean(train_preds == train_y[:n_samples])
print(train_acc)

test_pred = MLE_with_BNB(test_X[:n_samples], train_y[:n_samples], class_priors, feature_parameters)
test_acc = np.mean(test_pred == test_y[:n_samples])
print(test_acc)

0.3532
0.3428


Train과 Test의 크기가 커질 수록 두 Accuracy의 차이는 줄어든다.

하지만 반대로 두 Accuracy의 절대적인 값은 감소한다.

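그 이유로는 Bernoulli Naive Bayes가 이진 값으로 분류하는 특성상, 특정 클래스의 데이터가 다른 클래스보다 훨씬 많거나 적은 경우 모델이 데이터가 많은 클래스에 더 치우쳐져 훈련될 가능성이 있다는 점을 들 수 있다. 또한 Bernoulli Naive Bayes는 0/1의 이진 특징을 가정하는데, 여기서는 0~255 범위의 픽셀 값을 이진화하지 않고 그대로 사용했기 때문에 정확도가 낮게 나왔을 가능성도 있다.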