In [1]:
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as sts
import pandas as pd
import seaborn as sns


In [13]:
data_train = np.load('data/fashion_train.npy', mmap_mode='r')
data_test = np.load('data/fashion_test.npy', mmap_mode='r')

In [14]:
X_train = data_train[:, :-1]
y_train = data_train[:, -1]


In [16]:
import numpy as np

class NaiveBayes():
    def __init__(self):
        # Initialize the NaiveBayes object with None for prior, mean, and var
        self.prior = None
        self.mean = None
        self.var = None
    
    def predict(self, X):
        # Make predictions for each sample in X using the fit method
        y_pred = [self.fit(x) for x in X]
        return np.array(y_pred)

    def fit(self, X, y):
        # Get unique class labels and dimensions of the feature matrix
        class_labels = np.unique(y)
        n, n_features = X.shape[0], X.shape[1]
        
        # Prior class probabilities
        prior = [np.sum(y == c) / n for c in class_labels]
        
        # Calculate means and variances for each class and feature using list comprehensions
        mean = np.array([np.mean(X[y == c], axis=0) for c in class_labels])
        var = np.array([np.var(X[y == c], axis=0) for c in class_labels])

        # Store prior, mean, and var as class variables
        self.prior = prior
        self.mean = mean
        self.var = var

        # Posterior probabilities using Gaussian
        posteriors = []
        for i, c in enumerate(class_labels):
            posterior = np.sum(np.log(self._pdf(i, X)))
            posterior = prior[i] + posterior
            posteriors.append(posterior)
        
        # Return the class with the highest posterior probability
        return class_labels[np.argmax(posteriors)]

    def _pdf(self, class_idx, x):
        # Calculate the probability density function (pdf) for a given class
        mean = self.mean[class_idx]
        var = self.var[class_idx]
        numerator = np.exp(-((x - mean) ** 2) / (2 * var)) # the likelihood of the given feature values
        denominator = np.sqrt(2 * np.pi * var) #normalizes the likelihood
        return numerator / denominator 


In [5]:
NB = NaiveBayes()

In [8]:
NB.fit(X_train,y_train)

1

In [18]:
# Testing
if __name__ == "__main__":
    # Imports
    from sklearn.model_selection import train_test_split
    from sklearn import datasets

    def accuracy_score(y_true, y_pred):
        accuracy = np.sum(y_true == y_pred) / len(y_true)
        return accuracy

    X, y = datasets.make_classification(
        n_samples=1000, n_features=10, n_classes=2, random_state=123
    )
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=123
    )


    nb = NaiveBayes()
    nb.fit(X_train, y_train)
    predictions = nb.fit(X_test,y_test)

    print("Naive Bayes classification accuracy", accuracy_score(y_test, predictions))

Naive Bayes classification accuracy 0.505
