<a href="https://colab.research.google.com/github/jcs-lambda/CS-Unit1-Build/blob/master/naive_bayes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Scikit-Learn GaussianNB

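Baseline: fit scikit-learn's `GaussianNB` on the [wine dataset](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_wine.html#sklearn.datasets.load_wine) to get a reference accuracy and reference predictions for the custom estimator defined later in this notebook.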

In [1]:
import pandas as pd

from sklearn.datasets import load_wine
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

In [2]:
# Load scikit-learn's wine dataset and print its bundled description text.
wine = load_wine()
print(wine.DESCR)

.. _wine_dataset:

Wine recognition dataset
------------------------

**Data Set Characteristics:**

    :Number of Instances: 178 (50 in each of three classes)
    :Number of Attributes: 13 numeric, predictive attributes and the class
    :Attribute Information:
 		- Alcohol
 		- Malic acid
 		- Ash
		- Alcalinity of ash  
 		- Magnesium
		- Total phenols
 		- Flavanoids
 		- Nonflavanoid phenols
 		- Proanthocyanins
		- Color intensity
 		- Hue
 		- OD280/OD315 of diluted wines
 		- Proline

    - class:
            - class_0
            - class_1
            - class_2
		
    :Summary Statistics:
    
                                   Min   Max   Mean     SD
    Alcohol:                      11.0  14.8    13.0   0.8
    Malic Acid:                   0.74  5.80    2.34  1.12
    Ash:                          1.36  3.23    2.36  0.27
    Alcalinity of Ash:            10.6  30.0    19.5   3.3
    Magnesium:                    70.0 162.0    99.7  14.3
    Total Phenols:                0

In [3]:
# Show the feature column names and the human-readable class names.
print(wine.feature_names)
print(wine.target_names)

['alcohol', 'malic_acid', 'ash', 'alcalinity_of_ash', 'magnesium', 'total_phenols', 'flavanoids', 'nonflavanoid_phenols', 'proanthocyanins', 'color_intensity', 'hue', 'od280/od315_of_diluted_wines', 'proline']
['class_0' 'class_1' 'class_2']


In [4]:
# Integer class label per sample; rows are grouped by class (all 0s, then 1s, then 2s).
wine.target

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2])

In [5]:
# Combine the feature matrix and the labels into a single frame.
df = pd.DataFrame(wine.data, columns=wine.feature_names)
# wine.target holds integer codes that index into the wine.target_names array,
# so fancy indexing yields the class name for every row without hard-coding
# how many classes the dataset has.
df['class'] = wine.target_names[wine.target]
df

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,class
0,14.23,1.71,2.43,15.6,127.0,2.80,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,class_0
1,13.20,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.40,1050.0,class_0
2,13.16,2.36,2.67,18.6,101.0,2.80,3.24,0.30,2.81,5.68,1.03,3.17,1185.0,class_0
3,14.37,1.95,2.50,16.8,113.0,3.85,3.49,0.24,2.18,7.80,0.86,3.45,1480.0,class_0
4,13.24,2.59,2.87,21.0,118.0,2.80,2.69,0.39,1.82,4.32,1.04,2.93,735.0,class_0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,13.71,5.65,2.45,20.5,95.0,1.68,0.61,0.52,1.06,7.70,0.64,1.74,740.0,class_2
174,13.40,3.91,2.48,23.0,102.0,1.80,0.75,0.43,1.41,7.30,0.70,1.56,750.0,class_2
175,13.27,4.28,2.26,20.0,120.0,1.59,0.69,0.43,1.35,10.20,0.59,1.56,835.0,class_2
176,13.17,2.59,2.37,20.0,120.0,1.65,0.68,0.53,1.46,9.30,0.60,1.62,840.0,class_2


In [6]:
# Check column types: every feature is float64, only the label column holds strings.
df.dtypes

alcohol                         float64
malic_acid                      float64
ash                             float64
alcalinity_of_ash               float64
magnesium                       float64
total_phenols                   float64
flavanoids                      float64
nonflavanoid_phenols            float64
proanthocyanins                 float64
color_intensity                 float64
hue                             float64
od280/od315_of_diluted_wines    float64
proline                         float64
class                            object
dtype: object

In [7]:
# Count missing values per column; the NaiveBayes docstrings below require NaN-free input.
df.isna().sum()

alcohol                         0
malic_acid                      0
ash                             0
alcalinity_of_ash               0
magnesium                       0
total_phenols                   0
flavanoids                      0
nonflavanoid_phenols            0
proanthocyanins                 0
color_intensity                 0
hue                             0
od280/od315_of_diluted_wines    0
proline                         0
class                           0
dtype: int64

In [8]:
# Name of the label column; every other column is used as a feature.
target = 'class'
features = df.columns.drop(target)

# Hold out 20% of the rows for testing. Stratifying on the label keeps the
# class proportions the same in both splits, and the fixed seed makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df[features],
    df[target],
    test_size=0.2,
    stratify=df[target],
    random_state=42,
)

X_train.shape, y_train.shape, X_test.shape, y_test.shape

((142, 13), (142,), (36, 13), (36,))

In [9]:
# Class proportions in the training split.
y_train.value_counts(normalize=True)

class_1    0.401408
class_0    0.330986
class_2    0.267606
Name: class, dtype: float64

In [10]:
# Reference model: scikit-learn's Gaussian naive Bayes, fitted on the training
# split and scored on the held-out split.
gnb = GaussianNB()
gnb.fit(X_train, y_train)

y_pred_sk = gnb.predict(X_test)
acc_sk = accuracy_score(y_test, y_pred_sk)
print(f'Test accuracy: {acc_sk * 100:.02f}%')

Test accuracy: 97.22%


In [11]:
# Predictions from scikit-learn's GaussianNB, kept for comparison with the custom estimator.
y_pred_sk

array(['class_0', 'class_2', 'class_0', 'class_1', 'class_1', 'class_0',
       'class_0', 'class_0', 'class_1', 'class_2', 'class_1', 'class_2',
       'class_0', 'class_2', 'class_0', 'class_1', 'class_1', 'class_0',
       'class_1', 'class_0', 'class_1', 'class_1', 'class_0', 'class_0',
       'class_1', 'class_1', 'class_0', 'class_2', 'class_1', 'class_2',
       'class_0', 'class_2', 'class_1', 'class_2', 'class_2', 'class_2'],
      dtype='<U7')

## My estimator

In [12]:
import math

import numpy as np

from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils import check_X_y, check_array
from sklearn.utils.validation import check_is_fitted

In [13]:
class NaiveBayes(BaseEstimator, ClassifierMixin):
    """Gaussian Naive Bayes classifier.

    Every feature is modelled as an independent normal distribution per
    class. A row is assigned to the class that maximises
    ``log P(class) + sum_i log N(x_i | mean_i, stdev_i)``.

    :param var_smoothing: fraction of the largest per-feature variance of the
    training data that is added to every per-class variance, so that a feature
    which is constant within a class does not yield a zero standard deviation

    Attributes set by ``fit``:

    * ``num_features_`` -- number of columns in the training data
    * ``classes_`` -- sorted array of the distinct class labels
    * ``class_priors_`` -- dict mapping each label to its share of the
      training rows
    * ``feature_summaries_`` -- dict mapping each label to a list of
      (mean, standard deviation) tuples, one tuple per feature
    """

    def __init__(self, var_smoothing=1e-9):
        """Store hyper-parameters unchanged (scikit-learn estimator convention)."""
        self.var_smoothing = var_smoothing

    def _validate_input(self, X, y=None):
        """Returns validated input.

        :param X: 2d array-like of numeric values with no NaNs or infinite values

        :param y: 1d array-like of hashable values with no NaNs or infinite values

        :return: ``(X, y)`` as numpy arrays when ``y`` is given (fitting),
        otherwise ``X`` as a numpy array (predicting)

        :raises ValueError: when predicting and the number of columns in ``X``
        differs from the number of columns seen during ``fit``
        """
        if y is not None:
            # fitting the model, validate X and y together
            return check_X_y(X, y)

        # predicting: the model must be fitted and X must match its width
        check_is_fitted(
            self, ['num_features_', 'feature_summaries_', 'class_priors_']
        )
        X = check_array(X)
        if X.shape[1] != self.num_features_:
            # f-string so the actual and expected widths appear in the message
            raise ValueError(
                f'unexpected input shape: (x, {X.shape[1]}); '
                f'must be (x, {self.num_features_})'
            )
        return X

    def fit(self, X, y):
        """Fit the model with training data. X and y must be of equal length.

        :param X: 2d array-like of numeric values with no NaNs or infinite values

        :param y: 1d array-like of hashable values with no NaNs or infinite values

        :return: fitted instance
        """
        X, y = self._validate_input(X, y)
        self.num_features_ = X.shape[1]

        self.classes_, class_counts = np.unique(y, return_counts=True)

        # Absolute amount added to every variance; scaled by the widest
        # feature so the correction is negligible for well-behaved data.
        epsilon = self.var_smoothing * np.var(X, axis=0).max()

        self.class_priors_ = {}
        self.feature_summaries_ = {}
        for label, count in zip(self.classes_, class_counts):
            rows = X[y == label]
            means = rows.mean(axis=0)
            # population variance (ddof=0), matching np.std's default
            stdevs = np.sqrt(rows.var(axis=0) + epsilon)
            self.class_priors_[label] = count / len(y)
            self.feature_summaries_[label] = list(zip(means, stdevs))

        return self

    def _liklihood(self, x, mean, stdev):
        """Calculate the Gaussian probability density of a single value.

        Not used by ``predict`` (which works in log space to avoid underflow);
        kept, under its original spelling, for existing callers.

        :param x: float

        :param mean: float, mean of the distribution

        :param stdev: float, standard deviation of the distribution; must be
        non-zero

        :return: float
        """
        exponent = math.exp(-(math.pow(x - mean, 2) / (2 * math.pow(stdev, 2))))
        return (1 / (math.sqrt(2 * math.pi) * stdev)) * exponent

    def _joint_log_likelihood(self, X):
        """Return log prior + summed per-feature log densities for each class.

        :param X: validated 2d numpy array with ``num_features_`` columns

        :return: array of shape (rows of X, number of classes); columns follow
        the iteration order of ``feature_summaries_``
        """
        scores = np.empty((X.shape[0], len(self.feature_summaries_)))
        for col, (label, summaries) in enumerate(self.feature_summaries_.items()):
            stats = np.asarray(summaries, dtype=float)
            means, variances = stats[:, 0], stats[:, 1] ** 2
            # Log of the normal density written out directly; taking the log
            # of exp(...) would underflow to log(0) for far-away points.
            log_density = -0.5 * (
                np.log(2 * np.pi * variances) + (X - means) ** 2 / variances
            )
            scores[:, col] = (
                np.log(self.class_priors_[label]) + log_density.sum(axis=1)
            )
        return scores

    def predict(self, X):
        """Returns class predictions for each row in X.

        :param X: 2d array-like of numeric values with no NaNs or infinite values
        whose .shape[1] == .shape[1] of fitted data

        :return: np.array of class predictions
        """
        X = self._validate_input(X)

        labels = list(self.feature_summaries_)
        # argmax returns the first maximum, so ties go to the earlier label
        best = self._joint_log_likelihood(X).argmax(axis=1)

        return np.array([labels[i] for i in best])


In [14]:
# Custom estimator: same training split and same metric as the scikit-learn
# baseline so the two accuracies are directly comparable.
nb = NaiveBayes()
nb.fit(X_train, y_train)

y_pred_mine = nb.predict(X_test)
acc_mine = accuracy_score(y_test, y_pred_mine)
print(f'Test accuracy: {acc_mine * 100:.02f}%')

Test accuracy: 97.22%


In [15]:
# Predictions from the custom NaiveBayes estimator, for comparison with y_pred_sk.
y_pred_mine

array(['class_0', 'class_2', 'class_0', 'class_1', 'class_1', 'class_0',
       'class_0', 'class_0', 'class_1', 'class_2', 'class_1', 'class_2',
       'class_0', 'class_2', 'class_0', 'class_1', 'class_1', 'class_0',
       'class_1', 'class_0', 'class_1', 'class_1', 'class_0', 'class_0',
       'class_1', 'class_1', 'class_0', 'class_2', 'class_1', 'class_2',
       'class_0', 'class_2', 'class_1', 'class_2', 'class_2', 'class_2'],
      dtype='<U7')