# LOOSELY SYMMETRIC NAIVE BAYES

## Imports

In [50]:
import warnings
import random
import numpy as np
import scipy.sparse as sp
import os
import math
import nltk
import re
import sys

from abc import ABCMeta, abstractmethod
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.preprocessing import binarize
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import label_binarize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.utils import check_X_y, check_array, deprecated
from sklearn.utils.extmath import safe_sparse_dot
from scipy.special import logsumexp
from sklearn.utils.multiclass import _check_partial_fit_first_call
from sklearn.utils.validation import check_is_fitted, check_non_negative, column_or_1d
from sklearn.utils.validation import _check_sample_weight
# from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB

## Evaluation

In [58]:
mailsDir = "../datasets/enron1/"
spamDir = os.path.join(mailsDir, "spam")
hamDir = os.path.join(mailsDir, "ham")


def _read_mails(directory, names):
    """Return the text of every file in `names` found under `directory`.

    Files are decoded as latin-1, which maps every byte to a character and
    therefore never raises a decoding error.
    """
    texts = []
    for name in names:
        with open(os.path.join(directory, name), "r", encoding="latin-1") as handle:
            texts.append(handle.read())
    return texts


# Ham mails come first and are labelled 0 ...
hamDirList = os.listdir(hamDir)
mails = _read_mails(hamDir, hamDirList)
spaminfo = [0] * len(mails)

# ... followed by the spam mails, labelled 1.
spamDirList = os.listdir(spamDir)
spam_texts = _read_mails(spamDir, spamDirList)
mails.extend(spam_texts)
spaminfo.extend([1] * len(spam_texts))

# Shuffle texts and labels together so that the `mails[:size]` prefixes taken
# below mix both classes instead of containing all ham first, then all spam.
shuffled_pairs = list(zip(mails, spaminfo))
random.shuffle(shuffled_pairs)
mails, spaminfo = zip(*shuffled_pairs)  # both become tuples here
print("loaded")

porter_stemmer = nltk.stem.porter.PorterStemmer()

# Accepts stems consisting solely of ASCII letters. Compiled once because
# tokenize() is the CountVectorizer tokenizer and tests every stem with it.
_ALPHA_ONLY = re.compile('^[a-zA-Z]+$')


def tokenize(text, stemmer=porter_stemmer):
    """Split `text` into lower-cased, stemmed, purely alphabetic tokens.

    Parameters
    ----------
    text : str
        Raw text to tokenize.
    stemmer : object, optional
        Any object exposing ``stem(token) -> str``. Defaults to the shared
        module-level Porter stemmer.

    Returns
    -------
    list of str
        The stems, in order of appearance, that consist only of the letters
        a-z / A-Z; punctuation, digits and mixed tokens are dropped.
    """
    tokens = nltk.wordpunct_tokenize(text.lower())
    # Use the stemmer that was passed in; the previous version always used the
    # global `porter_stemmer`, silently ignoring the argument.
    stems = (stemmer.stem(token) for token in tokens)
    return [stem for stem in stems if _ALPHA_ONLY.match(stem) is not None]

# stopwords = nltk.corpus.stopwords.words("english")
# with open("./stopwords.txt", "w") as outf:
#     outf.write("\n".join(stopwords))

with open("../stop_words.txt", "r") as inf:
    stopwords = inf.read().splitlines()

# Run each stop word through the same tokenizer the vectorizer uses, so the
# stop list is expressed as stems rather than surface forms.
stop_words = []
for word in stopwords:
    word_tokens = tokenize(word)
    # A blank line, or an entry with no purely alphabetic token, yields an
    # empty list; skip it instead of failing with IndexError on `[0]`.
    if word_tokens:
        stop_words.append(word_tokens[0])
# NOTE(review): "becau" is added by hand; presumably it is the form produced
# when an already-stemmed stop word is stemmed again - confirm.
stop_words.append("becau")
stop_words = list(dict.fromkeys(stop_words))  # remove duplicates, keep order

# Settings for the shared bag-of-words vectorizer. The evaluation cells below
# flip `vec.binary` before fitting, depending on the classifier being tested.
vec_options = {
    # Input decoding / normalisation.
    "encoding": "latin-1",
    "decode_error": "replace",
    "strip_accents": "unicode",
    # Tokenisation: word-level unigrams, stemmed by `tokenize`, with the
    # stemmed stop list removed.
    "analyzer": "word",
    "tokenizer": tokenize,
    "stop_words": stop_words,
    "ngram_range": (1, 1),
    # Vocabulary pruning by document frequency.
    "max_df": 0.99,
    "min_df": 2,
    # Feature values.
    "binary": False,
}
vec = CountVectorizer(**vec_options)

loaded


### BernoulliNB

In [49]:
from sklearn.naive_bayes import BernoulliNB
# BernoulliNB is evaluated on binary features, so switch the shared vectorizer
# to binary output before fitting.
vec.binary = True

for size in [50, 100, 200, 300, 400, 500, 600]:
    # Take the first `size` (already shuffled) mails and split them in half.
    X_train, X_test, y_train, y_test = train_test_split(
        mails[:size], spaminfo[:size], test_size=0.5
    )

    # Count vectorizer: fitted on the training half, then applied to the
    # test half.
    X_train_count = vec.fit_transform(X_train)
    X_test_count = vec.transform(X_test)

    bnb = BernoulliNB(class_prior=[0.5, 0.5])
    bnb.fit(X_train_count, y_train)

    print(f"Dataset size: {X_train_count.shape[0] *2}")

    # Densify the test matrix and predict once; every metric below reuses the
    # result instead of repeating the conversion and the prediction.
    X_test_dense = X_test_count.toarray()
    print(f"BernoulliNB accuracy: {bnb.score(X_test_dense, y_test)}")

    y_pred = bnb.predict(X_test_dense)
    print(f"BernoulliNB precision: {precision_score(y_test, y_pred)}")
    print(f"BernoulliNB recall: {recall_score(y_test, y_pred)}")
    print(f"BernoulliNB f1 score: {f1_score(y_test, y_pred)}")
    print("-----------")

Dataset size: 50
BernoulliNB accuracy: 0.52
BernoulliNB precision: 0.3684210526315789
BernoulliNB recall: 1.0
BernoulliNB f1 score: 0.5384615384615384
-----------
Dataset size: 100
BernoulliNB accuracy: 0.74
BernoulliNB precision: 0.3333333333333333
BernoulliNB recall: 0.08333333333333333
BernoulliNB f1 score: 0.13333333333333333
-----------
Dataset size: 200
BernoulliNB accuracy: 0.78
BernoulliNB precision: 0.8461538461538461
BernoulliNB recall: 0.3548387096774194
BernoulliNB f1 score: 0.5
-----------
Dataset size: 300
BernoulliNB accuracy: 0.8
BernoulliNB precision: 0.8666666666666667
BernoulliNB recall: 0.3170731707317073
BernoulliNB f1 score: 0.4642857142857143
-----------
Dataset size: 400
BernoulliNB accuracy: 0.805
BernoulliNB precision: 0.8863636363636364
BernoulliNB recall: 0.5342465753424658
BernoulliNB f1 score: 0.6666666666666666
-----------
Dataset size: 500
BernoulliNB accuracy: 0.856
BernoulliNB precision: 0.8545454545454545
BernoulliNB recall: 0.6266666666666667
Bernoul

### GaussianNB, MultinomialNB, LSNB, eLSNB

In [64]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from lsnb.LooselySymmetricNB import LooselySymmetricNB
# These classifiers are evaluated on raw term counts rather than on binary
# presence flags.
vec.binary = False

for size in [50, 100, 200, 300, 400, 500, 600]:
    # Take the first `size` (already shuffled) mails and split them in half.
    X_train, X_test, y_train, y_test = train_test_split(
        mails[:size], spaminfo[:size], test_size=0.5
    )

    # Count vectorizer: fitted on the training half, then applied to the
    # test half.
    X_train_count = vec.fit_transform(X_train)
    X_test_count = vec.transform(X_test)

    gnb = GaussianNB(priors=[0.5, 0.5])
    gnb.fit(X_train_count.toarray(), y_train)
    mnb = MultinomialNB(class_prior=[0.5, 0.5])
    mnb.fit(X_train_count, y_train)
    # Bound to `lsnb`, the name the report below uses. It was previously
    # bound to `clf`, so the report hit an undefined (or stale) `lsnb`.
    lsnb = LooselySymmetricNB(class_prior=[0.5, 0.5])
    lsnb.fit(X_train_count, y_train)
    elsnb = LooselySymmetricNB(class_prior=[0.5, 0.5], enhance=True)
    elsnb.fit(X_train_count, y_train)

    print(f"Dataset size: {X_train_count.shape[0] * 2}")

    # Densify the test matrix once and share it between all models.
    X_test_dense = X_test_count.toarray()

    # (label, fitted model, separator printed after that model's metrics)
    reports = [
        ("LSNB", lsnb, "-----------"),
        ("eLSNB", elsnb, "-----------"),
        ("GaussianNB", gnb, "-----------"),
        ("MultinomialNB", mnb, "=============="),
    ]
    for label, model, separator in reports:
        print(f"{label} accuracy: {model.score(X_test_dense, y_test)}")
        # Predict once per model and reuse the result for every metric.
        y_pred = model.predict(X_test_dense)
        print(f"{label} precision: {precision_score(y_test, y_pred)}")
        print(f"{label} recall: {recall_score(y_test, y_pred)}")
        print(f"{label} f1 score: {f1_score(y_test, y_pred)}")
        print(separator)

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all().