In [4]:
import numpy as np
import random
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow import keras
import pandas as pd
import json

# Naive Bayes classifier
 Only accepts discrete data as features and labels  

In [3]:

class NaiveBayes:
    """Naive Bayes classifier for discrete (categorical) features and labels.

    After ``train`` the model state is:

    * ``unique_labels`` -- the sorted distinct labels found in the training
      data (as returned by ``np.unique``).
    * ``probabilities`` -- a dict keyed by column index.  For every feature
      column ``i`` the entry is ``{feature_value: [score per label]}`` with
      the scores ordered like ``unique_labels``.  The entry under the last
      key (the label column index) is the array of label priors.

    Features and labels are joined into a single array during training, so
    NumPy coerces them to a common dtype; labels passed to ``evaluate`` are
    compared with ``==`` against the coerced ``unique_labels``.
    """

    def __init__(self):
        self.probabilities = {}
        self.unique_labels = []

    def evaluate(self, features, labels):
        """Return ``(accuracy, loss)`` of the model on a labelled data set."""
        return self.predict(features, labels)

    def predict(self, features, labels=None):
        """Score every instance in *features* against the learned tables.

        Args:
            features: iterable of instances; each instance is an iterable of
                feature values in the same column order used for training.
            labels: optional true labels.  When given, the method returns
                accuracy figures instead of the raw predictions.

        Returns:
            ``(accuracy, loss)`` with ``loss = 1 - accuracy`` when *labels*
            is given; otherwise ``(predictions, predictions_prob)`` where
            ``predictions`` holds indices into ``unique_labels`` and
            ``predictions_prob`` holds the normalised per-label scores.
        """
        # The priors are stored under the last key of the table.
        priors = self.probabilities[len(self.probabilities) - 1]

        predictions_prob = []
        for instance in features:
            # Start from a copy so the stored priors are never modified.
            scores = np.array(priors, dtype=float)
            for column, value in enumerate(instance):
                likelihoods = self.probabilities[column]
                # Values never seen during training are skipped.
                if value in likelihoods:
                    scores = np.multiply(likelihoods[value], scores)

            # Renormalise with the total computed once.  Dividing element by
            # element while re-summing the partly updated array yields
            # scores that do not sum to 1 and favour the first label.
            total = np.sum(scores)
            if total > 0:
                scores = scores / total
            predictions_prob.append(scores)

        predictions = [np.argmax(scores) for scores in predictions_prob]

        if labels is not None:
            correct = sum(
                1
                for prediction, target in zip(predictions, labels)
                if self.unique_labels[prediction] == target
            )
            accuracy = correct / len(predictions_prob)
            loss = 1 - accuracy
            return accuracy, loss

        return predictions, predictions_prob

    def _feature_prob(self, data):
        """Count how often each known label occurs in the last column of *data*.

        If any label does not occur at all, every count is incremented by
        one so that no label ends up with a zero score (which would make the
        product in ``predict`` zero for that label regardless of the other
        features).

        Returns:
            dict mapping each entry of ``unique_labels`` to its count.
        """
        label_column = data[:, -1]
        count = {
            label: int(np.count_nonzero(label_column == label))
            for label in self.unique_labels
        }

        if 0 in count.values():
            count = {label: seen + 1 for label, seen in count.items()}

        return count

    def _get_probability(self, data):
        """Return the relative frequency of each distinct label in *data*."""
        label_counts = self._get_unique(data)[1]
        return label_counts / len(data)

    def _split(self, data, split_column, split_value):
        """Return the rows of *data* whose *split_column* entry equals *split_value*."""
        return data[split_column == split_value]

    def train(self, features, labels):
        """Fit the probability tables and evaluate on the training data.

        Args:
            features: 2-D array-like of discrete feature values.
            labels: array-like with one label per row of *features*.

        Returns:
            ``(accuracy, loss)`` measured on the training data itself.
        """
        features = np.asarray(features)
        labels = np.asarray(labels)
        # Append the labels as the last column so rows can be filtered by a
        # feature value and still carry their label.
        labels_reshaped = labels.reshape(labels.shape[0], -1)
        train_features = np.append(features, labels_reshaped, 1)
        self.probabilities = self._build_probabilities(train_features)

        return self.evaluate(features, labels)

    def _get_unique(self, data):
        """Return ``(unique_labels, counts)`` for the last column of *data*."""
        return np.unique(data[:, -1], return_counts=True)

    def _build_probabilities(self, data):
        """Build the per-column score tables from *data* (labels in last column).

        Also sets ``self.unique_labels`` and prints the resulting table via
        ``print_tree``.

        Returns:
            dict keyed by column index as described in the class docstring.
        """
        label_index = data.shape[1] - 1
        probabilities = {column: {} for column in range(data.shape[1])}
        probabilities[label_index] = self._get_probability(data)
        self.unique_labels, label_count = self._get_unique(data)

        for column in range(label_index):
            split_column = data[:, column]
            for value in np.unique(split_column):
                count = self._feature_prob(self._split(data, split_column, value))
                # Score of this value for each label: (possibly smoothed)
                # co-occurrence count divided by the label's total count.
                probabilities[column][value] = [
                    count[label] / label_count[j]
                    for j, label in enumerate(self.unique_labels)
                ]

        self.print_tree(probabilities)

        return probabilities

    def print_tree(self, data):
        """Pretty-print *data* (normally the probability table) as JSON.

        ``json.dumps`` only accepts ``str``/``int``/``float``/``bool``/``None``
        dict keys and its ``default`` hook is not applied to keys, so keys of
        any other type (for example ``numpy.int64`` feature values) are
        converted with ``str`` first.
        """
        def with_json_keys(node):
            if not isinstance(node, dict):
                return node
            return {
                (key if key is None or isinstance(key, (str, int, float, bool))
                 else str(key)): with_json_keys(value)
                for key, value in node.items()
            }

        print(json.dumps(with_json_keys(data), indent=4, default=str))