# LOOSELY SYMMETRIC NAIVE BAYES

## Imports

In [1]:
import warnings
import random
import numpy as np
import scipy.sparse as sp
import os
import math
import nltk
import re

from abc import ABCMeta, abstractmethod
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.preprocessing import binarize
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import label_binarize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.utils import check_X_y, check_array, deprecated
from sklearn.utils.extmath import safe_sparse_dot
from scipy.special import logsumexp
from sklearn.utils.multiclass import _check_partial_fit_first_call
from sklearn.utils.validation import check_is_fitted, check_non_negative, column_or_1d
from sklearn.utils.validation import _check_sample_weight
# from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB

## Load dataset

In [14]:
# Locate the Enron-1 corpus and its two per-class sub-directories.
mailsDir = "datasets/enron1/"
hamDir = os.path.join(mailsDir, "ham")
spamDir = os.path.join(mailsDir, "spam")
print(spamDir)
print(hamDir)

mails = []     # raw message texts
spaminfo = []  # labels aligned with `mails`: 0 = ham, 1 = spam

# Read at most the first 100 entries reported by os.listdir() for ham ...
hamDirList = os.listdir(hamDir)
for file in hamDirList[:100]:
    with open(os.path.join(hamDir, file), "r", encoding="latin-1") as f:
        mails.append(f.read())
    spaminfo.append(0)
i = min(len(hamDirList), 100)  # number of ham mails actually read
print(i)

# ... and do the same for spam.
spamDirList = os.listdir(spamDir)
for file in spamDirList[:100]:
    with open(os.path.join(spamDir, file), "r", encoding="latin-1") as f:
        mails.append(f.read())
    spaminfo.append(1)
i = min(len(spamDirList), 100)  # number of spam mails actually read
print(i)

datasets/enron1/spam
datasets/enron1/ham
100
100


## Preprocessing

In [15]:
porter_stemmer = nltk.stem.porter.PorterStemmer()

def tokenize(text, stemmer=porter_stemmer):
    """Split *text* into lower-cased, stemmed, purely alphabetic tokens.

    Parameters
    ----------
    text : str
        Raw text to tokenize.
    stemmer : object, default=porter_stemmer
        Any object exposing ``stem(token) -> str``. It is applied to every
        token produced by ``nltk.wordpunct_tokenize``.

    Returns
    -------
    list of str
        The stems that consist only of ASCII letters; punctuation, digits
        and mixed tokens are dropped.
    """
    tokens = nltk.wordpunct_tokenize(text.lower())
    # Use the stemmer that was passed in rather than always falling back to
    # the module-level Porter stemmer.
    stems = (stemmer.stem(token) for token in tokens)
    return [stem for stem in stems
            if re.match(r'^[a-zA-Z]+$', stem) is not None]

# stopwords.txt was apparently generated once with the snippet below
# (kept for reference):
# stopwords = nltk.corpus.stopwords.words("english")
# with open("./stopwords.txt", "w") as outf:
#     outf.write("\n".join(stopwords))

with open("./stopwords.txt", "r") as inf:
    stopwords = inf.read().splitlines()

# Run every stop word through the same tokenizer as the documents so that the
# stop list matches what the vectorizer actually sees. Only the first token
# of each entry is kept.
stop_words = []
for word in stopwords:
    word_tokens = tokenize(word)
    # A blank line or an entry without any alphabetic token yields an empty
    # list; skip it instead of failing with an IndexError.
    if word_tokens:
        stop_words.append(word_tokens[0])
# NOTE(review): presumably added because re-stemming "becaus" yields "becau",
# which CountVectorizer's stop-word consistency check would otherwise flag
# - confirm.
stop_words.append("becau")
stop_words = list(dict.fromkeys(stop_words))  # remove duplicates, keep order

# Bag-of-words vectorizer using the custom tokenizer and stop list above.
# Terms must occur in at least 2 documents (min_df) and in at most 99% of
# them (max_df) to enter the vocabulary.
vec = CountVectorizer(
    encoding="latin-1",
    decode_error="replace",
    strip_accents="unicode",
    analyzer="word",
    binary=False,
    stop_words=stop_words,
    tokenizer=tokenize,
    ngram_range=(1, 1),
    max_df=0.99,
    min_df=2,
)

## Train/test split

In [16]:
# Hold out half of the sampled mails for testing.
split = train_test_split(mails, spaminfo, test_size=0.5)
X_train, X_test, y_train, y_test = split

# Learn the vocabulary on the training half and encode it as term counts.
X_train_count = vec.fit_transform(X_train)
print(X_train_count.shape)

# Encode the test half with the vocabulary learned above.
X_test_count = vec.transform(X_test)
print(X_test_count.shape)

(100, 1161)
(100, 1161)


## Scikit-learn code

In [17]:
class _BaseNB(ClassifierMixin, BaseEstimator, metaclass=ABCMeta):
    """Shared prediction logic for the naive Bayes estimators.

    Subclasses implement :meth:`_joint_log_likelihood`; the public
    ``predict*`` methods here turn its output into labels or probabilities.
    """

    @abstractmethod
    def _joint_log_likelihood(self, X):
        """Return the unnormalized posterior log probability of X.

        That is ``log P(c) + log P(x|c)`` for every row x of X. The
        ``predict*`` methods reduce the result along ``axis=1``, so they
        expect one row per sample and one column per class.
        X is handed over exactly as returned by :meth:`_check_X`.
        """

    def _check_X(self, X):
        """Validation hook for prediction input; this default is a no-op."""
        # Intentionally not abstract: kept overridable-but-optional, as in
        # the scikit-learn code this class was taken from (backward compat
        # for third-party subclasses of the deprecated BayesNB alias).
        return X

    def predict(self, X):
        """
        Classify each row of X.
        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        Returns
        -------
        C : ndarray of shape (n_samples,)
            Entry of ``classes_`` with the highest joint log likelihood
            for each sample.
        """
        check_is_fitted(self)
        scores = self._joint_log_likelihood(self._check_X(X))
        winners = np.argmax(scores, axis=1)
        return self.classes_[winners]

    def predict_log_proba(self, X):
        """
        Return normalized log-probabilities for each row of X.
        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        Returns
        -------
        C : array-like of shape (n_samples, n_classes)
            Log-probability of every sample under every class; columns
            follow the order of the attribute :term:`classes_`.
        """
        check_is_fitted(self)
        scores = self._joint_log_likelihood(self._check_X(X))
        # log P(x): the per-sample normalizer, turned into a column vector so
        # that it broadcasts across the class columns.
        log_evidence = logsumexp(scores, axis=1)
        log_evidence_column = np.atleast_2d(log_evidence).T
        return scores - log_evidence_column

    def predict_proba(self, X):
        """
        Return normalized probabilities for each row of X.
        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        Returns
        -------
        C : array-like of shape (n_samples, n_classes)
            Probability of every sample under every class; columns follow
            the order of the attribute :term:`classes_`.
        """
        log_proba = self.predict_log_proba(X)
        return np.exp(log_proba)

In [18]:
_ALPHA_MIN = 1e-10

class _BaseDiscreteNB(_BaseNB):
    """Base class for naive Bayes models on discrete/categorical features.

    Concrete estimators must provide ``__init__``, ``_count``,
    ``_update_feature_log_prob`` and ``_joint_log_likelihood(X)`` (see
    _BaseNB); :meth:`fit` drives them in that order.
    """

    def _check_X(self, X):
        """Validate prediction input, accepting CSR sparse matrices."""
        return check_array(X, accept_sparse='csr')

    def _check_X_y(self, X, y):
        """Validate training input, accepting CSR sparse matrices for X."""
        return check_X_y(X, y, accept_sparse='csr')

    def _update_class_log_prior(self, class_prior=None):
        """Set ``class_log_prior_``.

        Priority: an explicit *class_prior*, then the empirical class
        frequencies when ``fit_prior`` is true, otherwise a uniform prior.

        Raises
        ------
        ValueError
            If *class_prior* does not have one entry per class.
        """
        n_classes = len(self.classes_)

        if class_prior is not None:
            if len(class_prior) != n_classes:
                raise ValueError("Number of priors must match number of"
                                 " classes.")
            self.class_log_prior_ = np.log(class_prior)
            return

        if not self.fit_prior:
            self.class_log_prior_ = np.full(n_classes, -np.log(n_classes))
            return

        with warnings.catch_warnings():
            # log(0) for a class that has not been observed yet is expected
            # here, so the resulting RuntimeWarning is silenced
            warnings.simplefilter("ignore", RuntimeWarning)
            log_class_count = np.log(self.class_count_)

        # empirical prior, with sample_weight taken into account
        log_total = np.log(self.class_count_.sum())
        self.class_log_prior_ = log_class_count - log_total

    def _check_alpha(self):
        """Validate ``self.alpha`` and return the value to smooth with.

        Values below ``_ALPHA_MIN`` are clipped up to it (with a warning).

        Raises
        ------
        ValueError
            If alpha is negative, or is an array whose length differs from
            the number of features.
        """
        smallest = np.min(self.alpha)
        if smallest < 0:
            raise ValueError('Smoothing parameter alpha = %.1e. '
                             'alpha should be > 0.' % smallest)
        if isinstance(self.alpha, np.ndarray):
            if self.alpha.shape[0] != self.n_features_:
                raise ValueError("alpha should be a scalar or a numpy array "
                                 "with shape [n_features]")
        if smallest < _ALPHA_MIN:
            warnings.warn('alpha too small will result in numeric errors, '
                          'setting alpha = %.1e' % _ALPHA_MIN)
            return np.maximum(self.alpha, _ALPHA_MIN)
        return self.alpha

    def fit(self, X, y, sample_weight=None):
        """Fit the model on training data.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
        y : array-like of shape (n_samples,)
        sample_weight : array-like of shape (n_samples,), default=None
            Optional per-sample weights; they scale the class indicator
            matrix before counting.

        Returns
        -------
        self
        """
        # References to the inputs exactly as passed in (before validation).
        self.X = X
        self.y = y
        X, y = self._check_X_y(X, y)
        n_features = X.shape[1]
        self.n_features_ = n_features

        binarizer = LabelBinarizer()
        Y = binarizer.fit_transform(y)
        self.classes_ = binarizer.classes_
        if Y.shape[1] == 1:
            # A single indicator column is expanded to one column per class.
            Y = np.hstack((1 - Y, Y))

        # LabelBinarizer().fit_transform() returns integer arrays. They are
        # converted to float64 so that sample weights can be applied
        # in place; X itself does not need to be cast.
        if sample_weight is not None:
            Y = Y.astype(np.float64, copy=False)
            weights = np.atleast_2d(np.asarray(sample_weight))
            Y *= check_array(weights).T

        class_prior = self.class_prior

        # Raw event counts come first; the class log prior and the feature
        # log probabilities are derived from them afterwards.
        n_effective_classes = Y.shape[1]
        self._init_counters(n_effective_classes, n_features)
        # y is passed in addition to Y so that the LSNB implementation of
        # _count() can use it
        self._count(X, Y, y)
        self._update_feature_log_prob(self._check_alpha())
        self._update_class_log_prior(class_prior=class_prior)
        return self

    def _init_counters(self, n_effective_classes, n_features):
        """Allocate zeroed float64 class and feature counters."""
        counter_shape = (n_effective_classes, n_features)
        self.class_count_ = np.zeros(n_effective_classes, dtype=np.float64)
        self.feature_count_ = np.zeros(counter_shape, dtype=np.float64)

    # XXX The following is a stopgap measure; we need to set the dimensions
    # of class_log_prior_ and feature_log_prob_ correctly.
    def _get_coef(self):
        """Return ``feature_log_prob_``, dropping row 0 for two classes."""
        if len(self.classes_) == 2:
            return self.feature_log_prob_[1:]
        return self.feature_log_prob_

    def _get_intercept(self):
        """Return ``class_log_prior_``, dropping entry 0 for two classes."""
        if len(self.classes_) == 2:
            return self.class_log_prior_[1:]
        return self.class_log_prior_

    coef_ = property(_get_coef)
    intercept_ = property(_get_intercept)

    def _more_tags(self):
        """Estimator tags reported by this class."""
        return {'poor_score': True}

In [19]:
class LooselySymmetricNB(_BaseDiscreteNB):
    """Two-class naive Bayes with loosely symmetric (LS) word likelihoods.

    Per-word likelihoods are built from per-class document frequencies
    (the quantities ``a``, ``b``, ``c``, ``d``) instead of raw term counts.
    Row 0 of every per-class array refers to the first entry of
    ``classes_`` ("ham" in this notebook) and row 1 to the second ("spam").

    Parameters
    ----------
    alpha : float or ndarray of shape (n_features,), default=1.0
        Additive smoothing applied to the raw feature counts. The smoothed
        counts only influence the likelihoods when ``enhance=True``.
    fit_prior : bool, default=True
        Whether to learn class priors from the data (see
        ``_update_class_log_prior``).
    class_prior : array-like of shape (n_classes,), default=None
        Explicit class priors; takes precedence over ``fit_prior``.
    enhance : bool, default=False
        If True, scale the document frequencies by per-class word densities
        (the "eLSNB" variant).
    """

    def __init__(self, alpha=1.0, fit_prior=True, class_prior=None, enhance=False):
        self.alpha = alpha
        self.fit_prior = fit_prior
        self.class_prior = class_prior
        self.enhance = enhance

    def _count(self, X, Y, y):
        """Accumulate feature and class counts and remember the data.

        Parameters
        ----------
        X : {ndarray, sparse matrix} of shape (n_samples, n_features)
            Validated, non-negative training matrix.
        Y : ndarray of shape (n_samples, n_classes)
            Class indicator matrix (possibly scaled by sample weights).
        y : ndarray of shape (n_samples,)
            Validated target labels.
        """
        check_non_negative(X, "LooselySymmetricNB (input X)")
        self.feature_count_ += safe_sparse_dot(Y.T, X)
        self.class_count_ += Y.sum(axis=0)

        # Kept so that _calculate_df() can derive document frequencies.
        self.X = X
        self.y = y

    def _update_feature_log_prob(self, alpha):
        """Compute ``feature_log_prob_`` from the LS quantities.

        Raises
        ------
        ValueError
            If the model was given more than two classes; the LS formulas
            below are only defined for the ham/spam pair of rows.
        """
        n_rows = self.feature_count_.shape[0]
        if n_rows != 2:
            # Previously rows >= 2 were silently left uninitialized.
            raise ValueError("LooselySymmetricNB only supports binary "
                             "classification, got %d classes." % n_rows)

        self.smoothed_fc = self.feature_count_ + alpha
        self.smoothed_cc = self.smoothed_fc.sum(axis=1)

        self._calculate_df()
        self._calculate_abcd(self.smoothed_fc,
                             self.smoothed_cc.reshape(-1, 1),
                             self.enhance)

        # Harmonic-style mixing terms shared by both classes.
        self.bd = (self.b * self.d) / (self.b + self.d)
        self.ac = (self.a * self.c) / (self.a + self.c)
        bd, ac = self.bd, self.ac

        # NOTE(review): with enhance=False, a word present in every document
        # of one class and in none of the other makes a numerator 0 and the
        # log -inf - confirm the vectorizer settings rule this out.
        log_ham = np.log(self.a + bd) - np.log(self.a + self.b + ac + bd)
        log_spam = np.log(self.c + bd) - np.log(self.c + self.d + ac + bd)

        # index 0 is for ham, index 1 is for spam
        self.feature_log_prob_ = np.vstack((log_ham, log_spam))

    def _calculate_df(self):
        """Set ``self.df`` to the per-class document frequency of each word.

        ``df[k, j]`` is the number of training documents of class
        ``classes_[k]`` in which feature ``j`` has a value of at least 1.
        Labels are mapped to row indices through ``classes_`` so that any
        label values (not only 0/1) work, and the count is done with one
        matrix product instead of a Python loop over a dense copy of X.
        """
        n_rows = self.feature_count_.shape[0]
        labels = np.asarray(self.y)
        # classes_ is sorted, so searchsorted yields each label's row index.
        class_index = np.searchsorted(self.classes_, labels)
        membership = (class_index[:, np.newaxis]
                      == np.arange(n_rows)).astype(np.int32)
        # 1 where the word occurs in the document, 0 elsewhere; works for
        # both CSR matrices and dense arrays.
        presence = (self.X >= 1).astype(np.int32)
        self.df = np.asarray(safe_sparse_dot(membership.T, presence),
                             dtype=np.int32)

    def _calculate_abcd(self, fc, cc, enhance):
        """Set the LS quantities ``a``, ``b``, ``c`` and ``d``.

        Parameters
        ----------
        fc : ndarray of shape (2, n_features)
            Smoothed feature counts per class.
        cc : ndarray of shape (2, 1)
            Smoothed total feature count per class.
        enhance : bool
            Whether to weight the quantities by the per-class word density
            ``fc / cc``; without it both densities are 1.
        """
        # at 0 is ham info, at 1 is spam info
        if enhance:
            word_density_ham = fc[0] / cc[0]
            word_density_spam = fc[1] / cc[1]
        else:
            word_density_ham = 1
            word_density_spam = 1

        # a / c: fraction of ham / spam documents containing each word.
        self.a = (self.df[0] / self.class_count_[0]) * word_density_ham
        self.b = (1 - self.a) * word_density_spam
        self.c = (self.df[1] / self.class_count_[1]) * word_density_spam
        self.d = (1 - self.c) * word_density_ham

    def _joint_log_likelihood(self, X):
        """Return ``X @ feature_log_prob_.T + class_log_prior_``."""
        return (safe_sparse_dot(X, self.feature_log_prob_.T) +
                self.class_log_prior_)

## Fitting

In [20]:
# Plain LSNB; fit() returns the estimator itself, so the call can be chained.
lsnb = LooselySymmetricNB().fit(X_train_count, y_train)

# Enhanced variant (eLSNB), trained on the same data.
elsnb = LooselySymmetricNB(enhance=True)
elsnb.fit(X_train_count, y_train)

LooselySymmetricNB(enhance=True)

## Classification

In [21]:
# Two hand-written messages: the first reads like ham, the second like spam.
test_mails = [
    "Hello Fred, I'm writing to inform you about the problem you wrote to me about before. Please send me the files today.",
    "ONE TIME OFFER NOW!!! Click here and don't miss because you won a million dollars!",
]

# Encode them with the already fitted vocabulary and show each model's labels.
test_mails_count = vec.transform(test_mails)
for model in (lsnb, elsnb):
    print(model.predict(test_mails_count))

[1 1]
[0 1]


## Evaluation

In [22]:
# Locate the Enron-1 corpus and its two per-class sub-directories.
mailsDir = "datasets/enron1/"
hamDir = os.path.join(mailsDir, "ham")
spamDir = os.path.join(mailsDir, "spam")
print(spamDir)
print(hamDir)

mails = []     # raw message texts
spaminfo = []  # labels aligned with `mails`: 0 = ham, 1 = spam

# Every ham message ...
hamDirList = os.listdir(hamDir)
for file in hamDirList:
    with open(os.path.join(hamDir, file), "r", encoding="latin-1") as f:
        mails.append(f.read())
spaminfo.extend([0] * len(hamDirList))

# ... followed by every spam message.
spamDirList = os.listdir(spamDir)
for file in spamDirList:
    with open(os.path.join(spamDir, file), "r", encoding="latin-1") as f:
        mails.append(f.read())
spaminfo.extend([1] * len(spamDirList))

# Shuffle texts and labels together so that prefixes of the data contain both
# classes instead of all ham followed by all spam.
ordered = list(zip(mails, spaminfo))
random.shuffle(ordered)
mails, spaminfo = zip(*ordered)
print("loaded")

datasets/enron1/spam
datasets/enron1/ham
loaded


### BernoulliNB

In [23]:
from sklearn.naive_bayes import BernoulliNB
# BernoulliNB models word presence, so switch the vectorizer to 0/1 output.
vec.binary = True

# Evaluate on growing prefixes of the shuffled corpus, split 50/50 each time.
for size in [50, 100, 200, 300, 400, 500, 600]:
    X_train, X_test, y_train, y_test = train_test_split(mails[:size], spaminfo[:size], test_size=0.5)

    # Vocabulary is learned on the training half only.
    X_train_count = vec.fit_transform(X_train)
    X_test_count = vec.transform(X_test)

    bnb = BernoulliNB(class_prior=[0.5, 0.5])
    bnb.fit(X_train_count, y_train)

    # Densify the test matrix and predict once, then reuse the results for
    # every metric instead of repeating both per print statement.
    X_test_dense = X_test_count.toarray()
    y_pred = bnb.predict(X_test_dense)

    print(f"Dataset size: {X_train_count.shape[0] *2}")

    print(f"BernoulliNB accuracy: {bnb.score(X_test_dense, y_test)}")
    print(f"BernoulliNB precision: {precision_score(y_test, y_pred)}")
    print(f"BernoulliNB recall: {recall_score(y_test, y_pred)}")
    print(f"BernoulliNB f1 score: {f1_score(y_test, y_pred)}")
    print("-----------")

Dataset size: 50
BernoulliNB accuracy: 0.8
BernoulliNB precision: 0.6666666666666666
BernoulliNB recall: 0.75
BernoulliNB f1 score: 0.7058823529411765
-----------
Dataset size: 100
BernoulliNB accuracy: 0.82
BernoulliNB precision: 0.8
BernoulliNB recall: 0.6666666666666666
BernoulliNB f1 score: 0.7272727272727272
-----------
Dataset size: 200
BernoulliNB accuracy: 0.78
BernoulliNB precision: 0.7647058823529411
BernoulliNB recall: 0.41935483870967744
BernoulliNB f1 score: 0.5416666666666666
-----------
Dataset size: 300
BernoulliNB accuracy: 0.86
BernoulliNB precision: 0.7647058823529411
BernoulliNB recall: 0.6666666666666666
BernoulliNB f1 score: 0.7123287671232877
-----------
Dataset size: 400
BernoulliNB accuracy: 0.73
BernoulliNB precision: 0.6923076923076923
BernoulliNB recall: 0.391304347826087
BernoulliNB f1 score: 0.5
-----------
Dataset size: 500
BernoulliNB accuracy: 0.88
BernoulliNB precision: 0.8260869565217391
BernoulliNB recall: 0.76
BernoulliNB f1 score: 0.791666666666666

### GaussianNB, MultinomialNB, LSNB, eLSNB

In [24]:
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
# These models work on term counts, so switch the vectorizer back.
vec.binary = False

# Evaluate on growing prefixes of the shuffled corpus, split 50/50 each time.
for size in [50, 100, 200, 300, 400, 500, 600]:
    X_train, X_test, y_train, y_test = train_test_split(mails[:size], spaminfo[:size], test_size=0.5)

    # Vocabulary is learned on the training half only.
    X_train_count = vec.fit_transform(X_train)
    X_test_count = vec.transform(X_test)

    # All models use the same fixed uniform class prior.
    gnb = GaussianNB(priors=[0.5, 0.5])
    gnb.fit(X_train_count.toarray(), y_train)  # GaussianNB is fitted on dense input
    mnb = MultinomialNB(class_prior=[0.5, 0.5])
    mnb.fit(X_train_count, y_train)
    lsnb = LooselySymmetricNB(class_prior=[0.5, 0.5])
    lsnb.fit(X_train_count, y_train)
    elsnb = LooselySymmetricNB(class_prior=[0.5, 0.5], enhance=True)
    elsnb.fit(X_train_count, y_train)

    # Densify the test matrix once and share it between all models.
    X_test_dense = X_test_count.toarray()

    print(f"Dataset size: {X_train_count.shape[0] * 2}")

    # (label, fitted model, separator printed after its metrics)
    reports = [
        ("LSNB", lsnb, "-----------"),
        ("eLSNB", elsnb, "-----------"),
        ("GaussianNB", gnb, "-----------"),
        ("MultinomialNB", mnb, "=============="),
    ]
    for name, model, separator in reports:
        # One prediction per model, reused for precision, recall and F1.
        y_pred = model.predict(X_test_dense)
        print(f"{name} accuracy: {model.score(X_test_dense, y_test)}")
        print(f"{name} precision: {precision_score(y_test, y_pred)}")
        print(f"{name} recall: {recall_score(y_test, y_pred)}")
        print(f"{name} f1 score: {f1_score(y_test, y_pred)}")
        print(separator)

Dataset size: 50
LSNB accuracy: 0.8
LSNB precision: 1.0
LSNB recall: 0.4444444444444444
LSNB f1 score: 0.6153846153846153
-----------
eLSNB accuracy: 0.96
eLSNB precision: 1.0
eLSNB recall: 0.8888888888888888
eLSNB f1 score: 0.9411764705882353
-----------
GaussianNB accuracy: 0.68
GaussianNB precision: 0.6666666666666666
GaussianNB recall: 0.2222222222222222
GaussianNB f1 score: 0.3333333333333333
-----------
MultinomialNB accuracy: 0.96
MultinomialNB precision: 1.0
MultinomialNB recall: 0.8888888888888888
MultinomialNB f1 score: 0.9411764705882353
Dataset size: 100
LSNB accuracy: 0.88
LSNB precision: 0.9375
LSNB recall: 0.75
LSNB f1 score: 0.8333333333333334
-----------
eLSNB accuracy: 0.9
eLSNB precision: 0.8260869565217391
eLSNB recall: 0.95
eLSNB f1 score: 0.8837209302325583
-----------
GaussianNB accuracy: 0.72
GaussianNB precision: 0.875
GaussianNB recall: 0.35
GaussianNB f1 score: 0.4999999999999999
-----------
MultinomialNB accuracy: 0.86
MultinomialNB precision: 0.76
Multinomi