In [1]:
import warnings
import random
import numpy as np
import scipy.sparse as sp
import os
import math
import nltk
import re

from abc import ABCMeta, abstractmethod
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.preprocessing import binarize
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import label_binarize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.utils import check_X_y, check_array, deprecated
from sklearn.utils.extmath import safe_sparse_dot
from scipy.special import logsumexp
from sklearn.utils.multiclass import _check_partial_fit_first_call
from sklearn.utils.validation import check_is_fitted, check_non_negative, column_or_1d
from sklearn.utils.validation import _check_sample_weight

from sklearn.datasets import load_breast_cancer, load_iris
from sklearn.multiclass import OneVsRestClassifier

In [2]:
class _BaseNB(ClassifierMixin, BaseEstimator, metaclass=ABCMeta):
    """Abstract base class for naive Bayes estimators"""

    @abstractmethod
    def _joint_log_likelihood(self, X):
        """Compute the unnormalized posterior log probability of X
        I.e. ``log P(c) + log P(x|c)`` for all rows x of X, as an array-like of
        shape [n_classes, n_samples].
        Input is passed to _joint_log_likelihood as-is by predict,
        predict_proba and predict_log_proba.
        """

    def _check_X(self, X):
        """To be overridden in subclasses with the actual checks."""
        # Note that this is not marked @abstractmethod as long as the
        # deprecated public alias sklearn.naive_bayes.BayesNB exists
        # (until 0.24) to preserve backward compat for 3rd party projects
        # with existing derived classes.
        return X

    def predict(self, X):
        """
        Perform classification on an array of test vectors X.
        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        Returns
        -------
        C : ndarray of shape (n_samples,)
            Predicted target values for X
        """
        check_is_fitted(self)
        X = self._check_X(X)
        jll = self._joint_log_likelihood(X)
        return self.classes_[np.argmax(jll, axis=1)]

    def predict_log_proba(self, X):
        """
        Return log-probability estimates for the test vector X.
        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        Returns
        -------
        C : array-like of shape (n_samples, n_classes)
            Returns the log-probability of the samples for each class in
            the model. The columns correspond to the classes in sorted
            order, as they appear in the attribute :term:`classes_`.
        """
        check_is_fitted(self)
        X = self._check_X(X)
        jll = self._joint_log_likelihood(X)
        # normalize by P(x) = P(f_1, ..., f_n)
        log_prob_x = logsumexp(jll, axis=1)
        return jll - np.atleast_2d(log_prob_x).T

    def predict_proba(self, X):
        """
        Return probability estimates for the test vector X.
        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        Returns
        -------
        C : array-like of shape (n_samples, n_classes)
            Returns the probability of the samples for each class in
            the model. The columns correspond to the classes in sorted
            order, as they appear in the attribute :term:`classes_`.
        """
        return np.exp(self.predict_log_proba(X))

In [3]:
_ALPHA_MIN = 1e-10

class _BaseDiscreteNB(_BaseNB):
    """Abstract base class for naive Bayes on discrete/categorical data
    Any estimator based on this class should provide:
    __init__
    _joint_log_likelihood(X) as per _BaseNB
    """

    def _check_X(self, X):
        return check_array(X, accept_sparse='csr')

    def _check_X_y(self, X, y):
        return check_X_y(X, y, accept_sparse='csr')

    def _update_class_log_prior(self, class_prior=None):
        n_classes = len(self.classes_)
        if class_prior is not None:
            if len(class_prior) != n_classes:
                raise ValueError("Number of priors must match number of"
                                 " classes.")
            self.class_log_prior_ = np.log(class_prior)
  
        elif self.fit_prior:
            with warnings.catch_warnings():
                # silence the warning when count is 0 because class was not yet
                # observed
                warnings.simplefilter("ignore", RuntimeWarning)
                log_class_count = np.log(self.class_count_)

            # empirical prior, with sample_weight taken into account
            self.class_log_prior_ = (log_class_count -
                                     np.log(self.class_count_.sum()))
            
        else:
            self.class_log_prior_ = np.full(n_classes, -np.log(n_classes))

    def _check_alpha(self):
        if np.min(self.alpha) < 0:
            raise ValueError('Smoothing parameter alpha = %.1e. '
                             'alpha should be > 0.' % np.min(self.alpha))
        if isinstance(self.alpha, np.ndarray):
            if not self.alpha.shape[0] == self.n_features_:
                raise ValueError("alpha should be a scalar or a numpy array "
                                 "with shape [n_features]")
        if np.min(self.alpha) < _ALPHA_MIN:
            warnings.warn('alpha too small will result in numeric errors, '
                          'setting alpha = %.1e' % _ALPHA_MIN)
            return np.maximum(self.alpha, _ALPHA_MIN)
        return self.alpha

    def fit(self, X, y, sample_weight=None):

        self.X = X
        self.y = y
        X, y = self._check_X_y(X, y)
        _, n_features = X.shape
        self.n_features_ = n_features

        labelbin = LabelBinarizer()
        Y = labelbin.fit_transform(y)
        self.classes_ = labelbin.classes_
        if Y.shape[1] == 1:
            Y = np.concatenate((1 - Y, Y), axis=1)

        # LabelBinarizer().fit_transform() returns arrays with dtype=np.int64.
        # We convert it to np.float64 to support sample_weight consistently;
        # this means we also don't have to cast X to floating point
        if sample_weight is not None:
            Y = Y.astype(np.float64, copy=False)
            sample_weight = np.asarray(sample_weight)
            sample_weight = np.atleast_2d(sample_weight)
            Y *= check_array(sample_weight).T

        class_prior = self.class_prior

        # Count raw events from data before updating the class log prior
        # and feature log probas
        n_effective_classes = Y.shape[1]

        self._init_counters(n_effective_classes, n_features)
        # added y here as a parameter, to be used in LSNB implementation of _count()
        self._count(X, Y, y)
        alpha = self._check_alpha()
        self._update_feature_log_prob(alpha)
        self._update_class_log_prior(class_prior=class_prior)
        return self

    def _init_counters(self, n_effective_classes, n_features):
        self.class_count_ = np.zeros(n_effective_classes, dtype=np.float64)
        self.feature_count_ = np.zeros((n_effective_classes, n_features),
                                       dtype=np.float64)

    # XXX The following is a stopgap measure; we need to set the dimensions
    # of class_log_prior_ and feature_log_prob_ correctly.
    def _get_coef(self):
        return (self.feature_log_prob_[1:]
                if len(self.classes_) == 2 else self.feature_log_prob_)

    def _get_intercept(self):
        return (self.class_log_prior_[1:]
                if len(self.classes_) == 2 else self.class_log_prior_)

    coef_ = property(_get_coef)
    intercept_ = property(_get_intercept)

    def _more_tags(self):
        return {'poor_score': True}

In [4]:
class LooselySymmetricNB(_BaseDiscreteNB):
    
    def __init__(self, alpha=1.0, fit_prior=True, class_prior=None, enhance=False):
        self.alpha = alpha
        self.fit_prior = fit_prior
        self.class_prior = class_prior
        self.enhance = enhance
        
    def _count(self, X, Y, y):
        """Count and smooth feature occurrences."""
        
        check_non_negative(X, "LooselySymmetricNB (input X)")
        self.feature_count_ += safe_sparse_dot(Y.T, X)
        self.class_count_ += Y.sum(axis=0)
        
        # we need these two values in order to calculate document frequency in _calculate_df()
        self.X = X
        self.y = y
        
    def _update_feature_log_prob(self, alpha):

        self.smoothed_fc = self.feature_count_ + alpha
        self.smoothed_cc = self.smoothed_fc.sum(axis=1)
        
        self._calculate_df()
        self._calculate_abcd(self.smoothed_fc, self.smoothed_cc.reshape(-1, 1), self.enhance)
        
        # HAM
        self.bd = (self.b * self.d) / (self.b + self.d)
        self.ac = (self.a * self.c) / (self.a + self.c)
        bd = (self.b * self.d) / (self.b + self.d)
        ac = (self.a * self.c) / (self.a + self.c)
        numerator = self.a + bd
        denumerator = self.a + self.b + ac + bd
        
        # index 0 is for ham, index 1 is for spam
        self.feature_log_prob_ = np.empty(self.feature_count_.shape) 
        self.feature_log_prob_[0] = np.log(numerator) - np.log(denumerator)
        
        # SPAM
        numerator = self.c + bd
        denumerator = self.c + self.d + ac + bd
        
        self.feature_log_prob_[1] = np.log(numerator) - np.log(denumerator)
    
    def _calculate_df(self):
        
        self.df = np.zeros(self.feature_count_.shape, dtype=np.int32)
        for mail_idx, mail in enumerate(self.X):
            for word_idx, word in enumerate(mail):
                if word >= 1:
                    self.df[self.y[mail_idx]][word_idx] += 1
    
    def _calculate_abcd(self, fc, cc, enhance):
        
        # at 0 is ham info, at 1 is spam info
        if enhance:
            word_density_ham = fc[0] / cc[0]
            word_density_spam = fc[1] / cc[1]
        
        else:
            word_density_ham = 1
            word_density_spam = 1
        
        self.a = (self.df[0] / self.class_count_[0]) * word_density_ham
        self.b = (1 - self.a) * word_density_spam
        self.c = (self.df[1] / self.class_count_[1]) * word_density_spam
        self.d = (1 - self.c) * word_density_ham
        
        
    def _joint_log_likelihood(self, X):
       
        return (safe_sparse_dot(X, self.feature_log_prob_.T) + 
                self.class_log_prior_)

In [30]:
data, labels = load_breast_cancer(return_X_y=True)
# data, labels = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.25, shuffle=True)

In [31]:
from sklearn.naive_bayes import MultinomialNB, BernoulliNB

mnb = MultinomialNB()
bnb = BernoulliNB()
mnb.fit(X_train, y_train)
bnb.fit(X_train, y_train)
print(X_test[:5])
print(y_test[:10])

print(mnb.predict(X_test[:10]))
print(bnb.predict(X_test[:10]))

[[1.481e+01 1.470e+01 9.466e+01 6.807e+02 8.472e-02 5.016e-02 3.416e-02
  2.541e-02 1.659e-01 5.348e-02 2.182e-01 6.232e-01 1.677e+00 2.072e+01
  6.708e-03 1.197e-02 1.482e-02 1.056e-02 1.580e-02 1.779e-03 1.561e+01
  1.758e+01 1.017e+02 7.602e+02 1.139e-01 1.011e-01 1.101e-01 7.955e-02
  2.334e-01 6.142e-02]
 [1.169e+01 2.444e+01 7.637e+01 4.064e+02 1.236e-01 1.552e-01 4.515e-02
  4.531e-02 2.131e-01 7.405e-02 2.957e-01 1.978e+00 2.158e+00 2.095e+01
  1.288e-02 3.495e-02 1.865e-02 1.766e-02 1.560e-02 5.824e-03 1.298e+01
  3.219e+01 8.612e+01 4.877e+02 1.768e-01 3.251e-01 1.395e-01 1.308e-01
  2.803e-01 9.970e-02]
 [1.530e+01 2.527e+01 1.024e+02 7.324e+02 1.082e-01 1.697e-01 1.683e-01
  8.751e-02 1.926e-01 6.540e-02 4.390e-01 1.012e+00 3.498e+00 4.350e+01
  5.233e-03 3.057e-02 3.576e-02 1.083e-02 1.768e-02 2.967e-03 2.027e+01
  3.671e+01 1.493e+02 1.269e+03 1.641e-01 6.110e-01 6.335e-01 2.024e-01
  4.027e-01 9.876e-02]
 [1.916e+01 2.660e+01 1.262e+02 1.138e+03 1.020e-01 1.453e-01 1.921

In [12]:
clf = OneVsRestClassifier(LooselySymmetricNB()).fit(X_train, y_train)
print(X_test[:5])
print(y_test[:5])

print(clf.predict(X_test[:5]))

[[4.7 3.2 1.6 0.2]
 [4.4 3.  1.3 0.2]
 [5.1 3.8 1.5 0.3]
 [4.9 2.4 3.3 1. ]
 [7.7 3.  6.1 2.3]]
[0 0 0 1 2]
[0 0 0 0 0]




In [22]:
# DF se pocita pro vetsi nebo rovno 1, coz u takovychto hodnot dela bordel
# vyzkouset jeste to same pro bernoulli/neco takoveho pro podporu argumentu
print(clf.df)
print(clf.a)
print(clf.b)
print(clf.c)
print(clf.d)

[[165 165 165 165   0   0   0   0   0   0  19 110 165 165   0   0   0   0
    0   0 165 165 165 165   0   1   2   0   0   0]
 [261 261 261 261   0   0   0   0   0   0   0 152 254 261   0   0   0   0
    0   0 261 261 261 261   0   0   1   0   0   0]]
[1.         1.         1.         1.         0.         0.
 0.         0.         0.         0.         0.11515152 0.66666667
 1.         1.         0.         0.         0.         0.
 0.         0.         1.         1.         1.         1.
 0.         0.00606061 0.01212121 0.         0.         0.        ]
[0.         0.         0.         0.         1.         1.
 1.         1.         1.         1.         0.88484848 0.33333333
 0.         0.         1.         1.         1.         1.
 1.         1.         0.         0.         0.         0.
 1.         0.99393939 0.98787879 1.         1.         1.        ]
[1.         1.         1.         1.         0.         0.
 0.         0.         0.         0.         0.         0.58237548

In [14]:
clf = LooselySymmetricNB()
clf.fit(X_train, y_train)
print(X_test[:5])
print(y_test[:5])

print(clf.predict(X_test[:5]))

[[1.296e+01 1.829e+01 8.418e+01 5.252e+02 7.351e-02 7.899e-02 4.057e-02
  1.883e-02 1.874e-01 5.899e-02 2.357e-01 1.299e+00 2.397e+00 2.021e+01
  3.629e-03 3.713e-02 3.452e-02 1.065e-02 2.632e-02 3.705e-03 1.413e+01
  2.461e+01 9.631e+01 6.219e+02 9.329e-02 2.318e-01 1.604e-01 6.608e-02
  3.207e-01 7.247e-02]
 [8.878e+00 1.549e+01 5.674e+01 2.410e+02 8.293e-02 7.698e-02 4.721e-02
  2.381e-02 1.930e-01 6.621e-02 5.381e-01 1.200e+00 4.277e+00 3.018e+01
  1.093e-02 2.899e-02 3.214e-02 1.506e-02 2.837e-02 4.174e-03 9.981e+00
  1.770e+01 6.527e+01 3.020e+02 1.015e-01 1.248e-01 9.441e-02 4.762e-02
  2.434e-01 7.431e-02]
 [1.404e+01 1.598e+01 8.978e+01 6.112e+02 8.458e-02 5.895e-02 3.534e-02
  2.944e-02 1.714e-01 5.898e-02 3.892e-01 1.046e+00 2.644e+00 3.274e+01
  7.976e-03 1.295e-02 1.608e-02 9.046e-03 2.005e-02 2.830e-03 1.566e+01
  2.158e+01 1.012e+02 7.500e+02 1.195e-01 1.252e-01 1.117e-01 7.453e-02
  2.725e-01 7.234e-02]
 [1.120e+01 2.937e+01 7.067e+01 3.860e+02 7.449e-02 3.558e-02 0.000



In [17]:
bunch = load_breast_cancer()
# bunch.target_names
bunch.feature_names

array(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error',
       'fractal dimension error', 'worst radius', 'worst texture',
       'worst perimeter', 'worst area', 'worst smoothness',
       'worst compactness', 'worst concavity', 'worst concave points',
       'worst symmetry', 'worst fractal dimension'], dtype='<U23')