# Email classification based on Bayesian analysis

## 1. Prepare

### 1.1 Import package

In [33]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O
import plotly.graph_objects as go 
from matplotlib import pyplot as plt
import os
import re  

from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, cross_validate # split train and test
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, ComplementNB
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Fixed seed passed as random_state to train_test_split so the split is reproducible.
RANDOM_SEED = 2022

### 1.2. Show datasets in project 

In [34]:
# List every file found under the local data directory.
# The path is assembled with os.path.join instead of the literal '.\data':
# that literal contains the invalid escape sequence '\d' and hard-codes the
# Windows separator, so the walk silently finds nothing on POSIX systems.
data_dir = os.path.join('.', 'data')
for dirname, _, filenames in os.walk(data_dir):
    for filename in filenames:
        print(os.path.join(dirname, filename))

.\data\spam_ham_dataset.csv


In [35]:
# Load the spam/ham e-mail dataset into a DataFrame.
# os.path.join keeps the path portable and avoids the invalid '\d' / '\s'
# escape sequences of the original '.\data\spam_ham_dataset.csv' literal.
csv_path = os.path.join('.', 'data', 'spam_ham_dataset.csv')
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0


### Data cleaning

In [8]:
# Normalise each e-mail body: lower-case it, then replace every run of
# non-letter characters (digits, punctuation, whitespace, line breaks) with a
# single space.
#
# This single substitution yields exactly the same strings as the previous
# three-step version. The old "remove tags" step ran after all non-letters had
# already been turned into spaces, so its pattern could never match; the old
# final step only collapsed the runs of spaces left behind by the first step.
#
# Iterating over the column values (rather than df['text'][w] for w in
# range(len(...))) also avoids label-based lookups, which would fail if the
# DataFrame index were not the default 0..n-1 range.
non_letter_run = re.compile(r'[^a-zA-Z]+')
clean_txt = [non_letter_run.sub(' ', text.lower()) for text in df['text']]
df['clean'] = clean_txt
df.head()

Unnamed: 0.1,Unnamed: 0,label,text,label_num,clean
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0,subject enron methanol meter this is a follow ...
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0,subject hpl nom for january see attached file ...
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0,subject neon retreat ho ho ho we re around to ...
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1,subject photoshop windows office cheap main tr...
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0,subject re indian springs this deal is to book...


## 2. Split the data and build bag-of-words features

In [28]:
# Hold out 20% of the cleaned e-mails for testing; the fixed seed makes the
# split reproducible.
split_parts = train_test_split(
    df['clean'],
    df['label_num'],
    test_size=0.2,
    random_state=RANDOM_SEED,
)
text_train, text_test, label_train, label_test = split_parts

# Bag-of-words counts. The vocabulary is learned from the training texts only
# and then reused, unchanged, to encode the test texts.
vectorizer_text = CountVectorizer(min_df=10, max_df=0.9)
x_train = vectorizer_text.fit_transform(text_train)
x_test = vectorizer_text.transform(text_test)

# (number of training documents, vocabulary size)
print(x_train.shape)

(4136, 3871)


In [None]:
## 3. Train and evaluate the Naive Bayes classifiers

In [32]:
# Fit each Naive Bayes variant on the training counts and report how it does
# on the held-out test set.
#   accuracy  - fraction of test e-mails classified correctly
#   precision - (spam and predicted spam) / (predicted spam)
#   recall    - (spam and predicted spam) / (all spam)
candidate_models = (BernoulliNB(), MultinomialNB(), ComplementNB())
score_functions = (accuracy_score, precision_score, recall_score)
report_template = 'accuracy: {:.4f}, precision: {:.4f}, recall: {:.4f}'

for NB in candidate_models:
    NB.fit(x_train, label_train)
    p_test = NB.predict(x_test)
    test_acc, test_precision, test_recall = (
        score(label_test, p_test) for score in score_functions
    )
    print(NB)
    print(report_template.format(test_acc, test_precision, test_recall))  # print performance

BernoulliNB()
accuracy: 0.9101, precision: 0.8632, recall: 0.8200
MultinomialNB()
accuracy: 0.9507, precision: 0.8807, recall: 0.9600
ComplementNB()
accuracy: 0.9527, precision: 0.8815, recall: 0.9667


Reference: scikit-learn user guide, "Naive Bayes" — https://scikit-learn.org/stable/modules/naive_bayes.html