In [24]:
import numpy as np
import pandas as pd

# Naive Bayes Classifier
It is a conditional probability model, with formula: <br>
$ P(C| x_1, x_2, x_3, ...) = \frac{P(C)P(X|C)}{P(X)}$ <br>
It is called *naive* because it makes the naive assumption that the features are conditionally independent of one another given the class C.<br>
The denominator $P(X)$ is the same for every class, so it can be dropped and the formula rewritten as: <br>
$ P(C| x_1, x_2, x_3, ...) \propto P(C)P(x_1|C)P(x_2|C)... = P(C)\prod^{n}_{i=1} P(x_i|C)$

In [25]:
class Naive_Bayes():
    """
    Naive Bayes classifier for discrete (categorical) features.

    The predicted label is the class C that maximises
    P(C) * prod_j P(x_j | C). The evidence P(x_1, ..., x_d) is never
    computed: it is identical for every class, so it cannot change which
    class obtains the highest score.

    Attributes:
        model_name: human-readable name of the model.
        unseen_prob: probability substituted for a feature value that was
            never observed together with a given class during training.
        prior: dict mapping each class label to P(Y = label); None until
            fit() has been called.
        likelihood: nested dict such that likelihood[label][j][value] is
            P(X_j = value | Y = label); None until fit() has been called.
    """

    def __init__(self, unseen_prob=1e-6):
        """
            Create an unfitted classifier.

            unseen_prob is the probability used in place of P(x_j | y) when
            the value x_j was never seen with class y in the training data.
            It must lie in (0, 1]; a value of 0 would make log() diverge.

            Raises ValueError if unseen_prob is outside (0, 1].
        """

        if not 0.0 < unseen_prob <= 1.0:
            raise ValueError("unseen_prob must be in the interval (0, 1]")

        self.model_name = 'Naive Bayes'
        self.unseen_prob = unseen_prob
        self.prior = None
        self.likelihood = None

    @staticmethod
    def _as_rows(X):
        """
            Return X in a form whose iteration yields one feature vector per instance.
        """

        if isinstance(X, np.ndarray):
            return X
        # Iterating a pandas DataFrame yields column labels, not rows,
        # so unwrap it into its underlying 2-D array first.
        if hasattr(X, "to_numpy"):
            return X.to_numpy()
        return X

    def fit(self, X_train, y_train):

        """
            Fit the model by estimating P(Y) and P(X_j | Y) from relative
            frequencies in the training data. All features are treated as
            **discrete** values.

            X_train is a 2-D numpy array, a pandas DataFrame or a sequence of
            equally long feature vectors; each row is one training instance.

            y_train contains the corresponding labels. There may be more
            than two classes.

            Raises ValueError if the training set is empty, if X_train and
            y_train differ in length, or if the rows differ in length.
        """

        rows = list(self._as_rows(X_train))
        labels = list(y_train)

        if len(rows) == 0 or len(labels) == 0:
            raise ValueError("cannot fit Naive_Bayes on an empty training set")
        if len(rows) != len(labels):
            raise ValueError(
                "X_train and y_train have different lengths: %d != %d"
                % (len(rows), len(labels))
            )

        n_features = len(rows[0])
        total_count = len(labels)

        # Prior P(Y): relative frequency of each class label.
        class_counts = {label: 0 for label in np.unique(y_train)}
        for y in labels:
            class_counts[y] += 1
        prior = {label: count / total_count for label, count in class_counts.items()}

        # Count how often each value of feature j occurs together with each class.
        feature_counts = {label: [{} for _ in range(n_features)] for label in class_counts}
        for x, y in zip(rows, labels):
            if len(x) != n_features:
                raise ValueError("all training instances must have the same number of features")
            per_feature = feature_counts[y]
            for j, value in enumerate(x):
                per_feature[j][value] = per_feature[j].get(value, 0) + 1

        # Likelihood P(X_j | Y): every instance of a class contributes exactly
        # one value per feature, so the class count is the normaliser.
        likelihood = {}
        for label, per_feature in feature_counts.items():
            n_label = class_counts[label]
            likelihood[label] = {
                j: {value: count / n_label for value, count in counts.items()}
                for j, counts in enumerate(per_feature)
            }

        # Publish the fitted state only after everything succeeded, so a
        # failed fit never leaves the model half-initialised.
        self.prior = prior
        self.likelihood = likelihood


    def ind_predict(self, x : list):

        """
            Predict the most likely class label of one test instance based
            on its feature vector x.

            Scores are accumulated as sums of logarithms rather than
            products of probabilities, so that long feature vectors do not
            underflow to zero. A feature value never seen with a class
            contributes log(unseen_prob) for that class.

            Returns the label with the highest score; on a tie the class
            that comes first in self.prior wins.

            Raises AttributeError if the model has not been fitted.
        """

        if self.prior is None or self.likelihood is None:
            raise AttributeError("Naive_Bayes is not fitted yet; call fit() before predicting")

        # Loop-invariant penalty for feature values unknown to a class.
        log_unseen = np.log(self.unseen_prob)

        best_label, max_log_prob = None, float('-inf')
        for y, prior_y in self.prior.items():
            log_prob = np.log(prior_y)  # start from the log prior
            feature_tables = self.likelihood[y]
            for j, xj in enumerate(x):
                p = feature_tables[j].get(xj)
                log_prob += np.log(p) if p is not None else log_unseen
            # Strict comparison keeps the earliest class on ties.
            if log_prob > max_log_prob:
                max_log_prob = log_prob
                best_label = y
        return best_label

    def predict(self, X):

        """
            X is a 2-D numpy array, a pandas DataFrame or a sequence of
            feature vectors, representing testing instances.

            Return the predictions of all instances in a list, in the same
            order as the rows of X.
        """

        return [self.ind_predict(x) for x in self._as_rows(X)]

In [26]:
# Column names are passed explicitly, so every line of the file is read as data.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/balance-scale/balance-scale.data'
col = [
    'class_name',
    'left_weight',
    'left_distance',
    'right_weight',
    'right_distance',
]
data = pd.read_csv(url, sep=',', names=col)

In [27]:
# Display the loaded frame to check that the columns were parsed as intended.
data

Unnamed: 0,class_name,left_weight,left_distance,right_weight,right_distance
0,B,1,1,1,1
1,R,1,1,1,2
2,R,1,1,1,3
3,R,1,1,1,4
4,R,1,1,1,5
...,...,...,...,...,...
620,L,5,5,5,1
621,L,5,5,5,2
622,L,5,5,5,3
623,L,5,5,5,4


In [28]:
# Class balance: number of instances per label.
data['class_name'].value_counts()

class_name
R    288
L    288
B     49
Name: count, dtype: int64

In [29]:
from sklearn.model_selection import train_test_split

# Column 0 holds the class label; the remaining columns are the features.
y = data['class_name']
X = data.iloc[:, 1:].to_numpy()

# Fixed random_state keeps the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=88
)

In [30]:
# Plain array of true labels, so it compares element-wise with the prediction list.
y_test = np.array(y_test)

# Train on the training split, then predict the held-out instances.
clf = Naive_Bayes()
clf.fit(X_train, y_train)
y_hat = clf.predict(X_test)

Overall Accuracy

In [31]:
# Fraction of test instances predicted correctly. The denominator is taken
# from the data rather than hard-coded, so it stays right if the split changes.
sum(y_hat == y_test) / len(y_test)  # you should get something like 0.88

0.8840579710144928

In [32]:
# Number of instances in the held-out test split.
len(y_test)

207

In [33]:
# Number of predictions; should equal the number of test instances.
len(y_hat)

207