# Naive Bayes Classifier from Scratch
## Import some library that we need

In [14]:
import pandas as pd
import numpy as np
import re

## Initialize the data

Data is used to classify spam or not spam email. The data is taken from [here](https://archive.ics.uci.edu/ml/datasets/spambase).

In [15]:
# Preparing the data for Naive Bayes
spam_dataframe = pd.read_csv('./emails.csv')
spam_dataframe

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1
...,...,...
5723,Subject: re : research and development charges...,0
5724,"Subject: re : receipts from visit jim , than...",0
5725,Subject: re : enron case study update wow ! a...,0
5726,"Subject: re : interest david , please , call...",0


In [16]:
# calculating the probability of spam and ham
spam_probability = len(spam_dataframe[spam_dataframe['spam'] == 1]) / len(spam_dataframe)
ham_probability = len(spam_dataframe[spam_dataframe['spam'] == 0]) / len(spam_dataframe)

print('Spam Probability: ', spam_probability)
print('Ham Probability: ', ham_probability)

Spam Probability:  0.2388268156424581
Ham Probability:  0.7611731843575419


## Preprocessing

Using [CountVectorizer](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html) to convert text into a matrix of token counts.

The result of the preprocessing is a matrix of token counts and a list of vocabulary.

### preprocess_text

This function is used to preprocess text. It will remove punctuation, stopwords, and convert text into lowercase.

### build_vocabulary

This function is used to build vocabulary from the text. It will return a dictionary of vocabulary.

### convert_text_to_vector

This function is used to convert text into a vector. It will return a vector of text.

In [17]:
# Text preprocessing function
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    tokens = text.split()
    return ' '.join(tokens)

def build_vocabulary(texts):
    vocabulary = set()
    for text in texts:
        tokens = text.split()
        vocabulary.update(tokens)
    return list(vocabulary)

def create_bow(texts, vocabulary):
    bow_matrix = []
    for text in texts:
        tokens = text.split()
        bow_vector = [tokens.count(word) for word in vocabulary]
        bow_matrix.append(bow_vector)
    return bow_matrix

In [18]:
# Preprocess the text data
preprocessed_texts = [preprocess_text(text) for text in spam_dataframe['text']]

# Build vocabulary
vocabulary = build_vocabulary(preprocessed_texts)

# Create Bag of Words (BoW) representation
bow_matrix = create_bow(preprocessed_texts, vocabulary)

# Convert bow_matrix to a NumPy array
X = np.array(bow_matrix)
y = np.array(spam_dataframe['spam'])

In [19]:
# Split the data into train and test sets 0.8/0.2
X_train = X[:int(len(X) * 0.8)]
X_test = X[int(len(X) * 0.8):]
y_train = y[:int(len(y) * 0.8)]
y_test = y[int(len(y) * 0.8):]

## Using Naive Bayes Classifier

The Naive Bayes Classifier is used to classify spam or not spam email. 

The Naive Bayes Classifier is implemented from scratch.

### train

This function is used to train the Naive Bayes Classifier. It will return a dictionary of probability.

Mathematically, the probability is calculated as follows:

$$P(y) = \frac{count(y)}{count(Y)}$$

$$P(x_i|y) = \frac{count(x_i, y)}{count(y)}$$

### predict

This function is used to predict the class of the data. It will return a list of prediction.

Mathematically, the probability is calculated as follows:

$$P(y|x_1, x_2, ..., x_n) = \frac{P(y) \times P(x_1|y) \times P(x_2|y) \times ... \times P(x_n|y)}{P(x_1) \times P(x_2) \times ... \times P(x_n)}$$

$$P(y|x_1, x_2, ..., x_n) = P(y) \times P(x_1|y) \times P(x_2|y) \times ... \times P(x_n|y)$$

$$P(y|x_1, x_2, ..., x_n) = log(P(y)) + log(P(x_1|y)) + log(P(x_2|y)) + ... + log(P(x_n|y))$$

In [20]:
# Naive Bayes implementation from scratch
class CustomMultinomialNB:
    def __init__(self, alpha=1):
        self.alpha = alpha

    def fit(self, X, y):
        self.X = X
        self.y = y
        self.classes = np.unique(y)
        self.parameters = {}
        for i, c in enumerate(self.classes):
            X_c = X[np.where(y == c)]
            self.parameters["phi_" + str(c)] = len(X_c) / len(X)
            self.parameters["theta_" + str(c)] = (X_c.sum(axis=0) + self.alpha) / (np.sum(X_c.sum(axis=0) + self.alpha))

    def predict(self, X):
        predictions = []
        for x in X:
            phi_list = []
            for i, c in enumerate(self.classes):
                phi = np.log(self.parameters["phi_" + str(c)])
                theta = np.sum(np.log(self.parameters["theta_" + str(c)]) * x)
                phi_list.append(phi + theta)
            predictions.append(self.classes[np.argmax(phi_list)])
        return predictions

In [21]:
NB_classifier=CustomMultinomialNB()
NB_classifier.fit(X_train,y_train)

In [22]:
y_predict_test=NB_classifier.predict(X_test)

## Evaluation

Originally, we use classification_report from sklearn.metrics to evaluate the model.

Here, we just calculate the accuracy of the model.

In [23]:
# accuracy
print('Accuracy: ', np.sum(y_predict_test == y_test) / len(y_test))

Accuracy:  0.9912739965095986
