# 2 Naive Bayes Classifier

Assume that all classes are equally likely, i.e. the priors are $p(y=k)=1/C$ with $C$ the number of classes.  
The decision rule is defined by  
$$
\hat{y}=\argmax_k\Bigg(\prod_{j=1}p_j(x_j|y=k)\Bigg)
$$
where $p_j(x_j|y=k)$ are 1-dimensional histograms for each feature $j$ and class $k$.  
Rewrite to
$$
\hat{y}=\argmax_k\Bigg(\sum_{j=1}\log p_j(x_j|y=k)\Bigg)
$$
since tiny numbers are prone to numerical inaccuracy.

> Implement training of the naive Bayes classifier as a function  
> `histograms, binning = fit_naive_bayes(features, labels, bincount)`  
> where `histograms` is the $C×D×L$ array if histograms ($D$ is the number of feature dimensions, $L$ the number of bins), and `binning` is a $C×D×2$ array describing the bin layout.

In [53]:
from typing import Tuple
import numpy as np

In [54]:
from sklearn.datasets import load_digits


digits = load_digits()

print(digits.keys())

data = digits["data"]
images = digits["images"]
target = digits["target"]
target_names = digits["target_names"]

dict_keys(['data', 'target', 'frame', 'feature_names', 'target_names', 'images', 'DESCR'])


In [55]:
def IQR(features) -> float:
    """Calculates the IQR of the given features."""
    q75, q25 = np.percentile(
        features, [0.75, 0.25], axis=0, interpolation="nearest")
    return q75 - q25


def freedman_diaconis(features, labels) -> Tuple[np.ndarray, np.ndarray]:
    """Returns the bins and their respective bin widths."""
    klasses = np.unique(labels)
    bins = np.zeros((klasses.size, features.shape[1]))
    binwidths = np.zeros((klasses.size, features.shape[1]))
    for k, klass in enumerate(klasses):
        klass_features = features[labels == klass]
        h = (2*IQR(klass_features)/np.cbrt(klass_features.shape[0]))
       
        h[h==0] = np.inf  # cant divide by 0

        binwidths[k] = h
        bins[k] = np.ceil(
            (np.max(klass_features, axis=0)-np.min(klass_features, axis=0))/h
        )
    binwidths[binwidths==np.inf] = 0  # done with dividing, 
    bins[bins==0] = 1  # We always need at least one bin

    return bins, binwidths


In [56]:
features = np.array(
    [
        [3, 4],
        [1, 0],
        [12, 2]
    ]
)
freedman_diaconis(data[:10], target[:10])


array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0

In [57]:
def fit_naive_bayes(
        features: np.ndarray,
        labels: np.ndarray,
        bincount: int
) -> Tuple[np.ndarray, np.ndarray]:
    """Fit the given features and labels for the naive bayes algorithm.

    Parameters
    ----------
    features : numpy.ndarray
        `X×D` dimensional array, 
        with `D` beeing the number of feature dimensions.
    labels : numpy.ndarray
        `X×1` dimensional array.
    bincount : int
        Number of bins for the histograms.

    Returns
    -------
    histograms : numpy.ndarray
        `C×D×L` dimensional array, with
        `C` as the number of unique classes,
        `D` as the number of feature dimensions and
        `L` as the number of bins.
    binning : numpy.ndarray
        `C×D×2` dimensional array describing the bin layout.
    """
    N: int = labels.size
    C: int = np.unique(labels).size
    D: int = features.shape[1]
    L: int = bincount

    bincounts: np.ndarray
    binwidths: np.ndarray
    if L==0:
        bincounts, binwidths = freedman_diaconis(features, labels)
        # ToDo: Calculate a reasonable L

    priors: np.ndarray
    priors = labels[]

    histograms = ""
    binning = 5
    np.sum
    return histograms, binning


SyntaxError: invalid syntax (2858211734.py, line 34)