# Email classification based on Bayesian analysis

## 1 Prepare

### 1.1. Import package

In [121]:
import os  # I/O
import re  # regular expression

import numpy as np # linear algebra
import pandas as pd # data processing and CSV file I/O

from sklearn.model_selection import train_test_split  # split train and test
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, ComplementNB
from sklearn.metrics import accuracy_score, precision_score, recall_score

RANDOM_SEED = 2022

### 1.2. Show datasets in project 

In [109]:
for dirname, _, filenames in os.walk(f'.\Data'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

.\Data\SMSSpamCollection
.\Data\spam_ham_dataset.csv


You only need to run one of cells following. If you want to use other Datasets, you should organize your dataset to include at least 'text' and 'label_num' as in 'spam_ham_dataset.csv'. If not, as 1.2.2. DataSet2, you should add 'text' and 'label_num' by yourself. 

#### 1.2.1. DataSet 1

In [110]:
df=pd.read_csv(f'.\Data\spam_ham_dataset.csv')
df.drop([df.columns[0]], axis=1,inplace=True)
df.head()

Unnamed: 0,label,text,label_num
0,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,spam,"Subject: photoshop , windows , office . cheap ...",1
4,ham,Subject: re : indian springs\r\nthis deal is t...,0


#### 1.2.2. DataSet 2

In [66]:
df = pd.read_csv('./data/SMSSpamCollection',sep='\t')
df.head()
label = []
for w in range(len(df.text)):
    if df['label'][w] == 'ham':
        label.append(0)
    else:
        label.append(1)
df['label_num'] = label
df.head()

Unnamed: 0,label,text,label_num
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


### 1.3 Data Clean

In [111]:
def clean_data(text):
    #remove punctuation
    desc = re.sub('[^a-zA-Z]', ' ', text)
    #remove tags
    desc = re.sub("&lt;/?.*?&gt;"," &lt;&gt; ",desc)
    #remove digits and special chars
    desc = re.sub("(\\d|\\W)+"," ",desc) 
    return desc

In [112]:
clean_txt = []
for w in range(len(df.text)):
   text = df['text'][w].lower()
   desc = clean_data(text)
   clean_txt.append(desc)
df['clean'] = clean_txt
df.head()

Unnamed: 0,label,text,label_num,clean
0,ham,Subject: enron methanol ; meter # : 988291\r\n...,0,subject enron methanol meter this is a follow ...
1,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0,subject hpl nom for january see attached file ...
2,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0,subject neon retreat ho ho ho we re around to ...
3,spam,"Subject: photoshop , windows , office . cheap ...",1,subject photoshop windows office cheap main tr...
4,ham,Subject: re : indian springs\r\nthis deal is t...,0,subject re indian springs this deal is to book...


## 2 Split Training data and Test data

**CountVectorizer** is a great tool provided by the scikit-learn library in Python. It is used to transform a given text into a **vector on the basis of the frequency (count) of each word** that occurs in the entire text.

In [113]:
text_train, text_test, label_train, label_test = train_test_split(df['clean'], df['label_num'], test_size=0.2, random_state=RANDOM_SEED)
vectorizer_text = CountVectorizer(max_df=0.9, min_df=10)   
x_train = vectorizer_text.fit_transform(text_train)
x_test = vectorizer_text.transform(text_test)

## 3 Build in implementation and it's performance

In [114]:
for NB in [BernoulliNB(), MultinomialNB(), ComplementNB()]:
    NB.fit(x_train, label_train)
    p_test = NB.predict(x_test)
    test_acc = accuracy_score(label_test, p_test)        # accuracy
    test_precision = precision_score(label_test, p_test) # (spam and label as spam)/(labeled as spam)
    test_recall = recall_score(label_test, p_test)       # (spam and label as spam)/(all spam)
    print(NB)
    print('accuracy: {:.4f}, precision: {:.4f}, recall: {:.4f}'.format(test_acc, test_precision, test_recall)) # print performace

BernoulliNB()
accuracy: 0.9101, precision: 0.8632, recall: 0.8200
MultinomialNB()
accuracy: 0.9507, precision: 0.8807, recall: 0.9600
ComplementNB()
accuracy: 0.9527, precision: 0.8815, recall: 0.9667


## 4 Our Result

In [115]:
def fit(X_train, label_train):
    x_data = x_train.toarray()
    m,n = x_data.shape
    spam_num = len(label_train[label_train==1])
    ham_num = m - spam_num
    p_spam = spam_num/len(label_train)
    
    p_spam_cond = np.zeros(n)
    p_ham_cond = np.zeros(n)
    p_x = np.zeros(n)
    for j in range(n):
        for i in range(m):
            if x_data[i,j]>0:
                p_x[j] = p_x[j]+1/m
                if label_train[i]==1:
                    p_spam_cond[j] = p_spam_cond[j]+1/spam_num
                if label_train[i]==0:
                    p_ham_cond[j] = p_ham_cond[j]+1/ham_num
    p_ham_cond[p_ham_cond==0]=0.0001
    p_spam_cond[p_spam_cond==0]=0.0001
    return (p_spam, p_spam_cond, p_ham_cond)

In [116]:
def predict(x_test, p_spam, p_spam_cond, p_ham_cond):
    m,n = x_test.shape
    label_pre = np.zeros(m)
    for i in range(m):
        px_spam = p_spam
        px_ham = 1-p_spam
        for j in range(len(p_spam_cond)):
            if x_test[i,j]>0:
                px_spam = px_spam*p_spam_cond[j]
                px_ham = px_ham*p_ham_cond[j]
        label_pre[i] = int(px_spam>=px_ham)
    return label_pre

In [117]:
p_spam, p_spam_cond, p_ham_cond = fit(x_train, label_train.values)
label_pre = predict(x_test, p_spam, p_spam_cond, p_ham_cond)

In [118]:
test_acc = accuracy_score(label_test, label_pre)
test_precision = precision_score(label_test, label_pre)
test_recall = recall_score(label_test, label_pre)
print('accuracy: {:.4f}, precision: {:.4f}, recall: {:.4f}'.format(test_acc, test_precision, test_recall)) # print performace

accuracy: 0.9353, precision: 0.8416, recall: 0.9567


## 5 Example of our mail

##### **Please paste your email content here!**

In [None]:
spam = '''
Better English handwriting support
We are delighted to announce improvements to our English handwriting recognition. We used spell check and word frequency information to bias the algorithm towards more likely words when our algorithm is uncertain. For example, take this input image:

Before spell check:

Dolution:
Ans 24). In developing a chart to flot a course of action, with many of the events or milestones, we will we Process deciscon pirogram chart.
so, optison (A) is cossect ansuver.

After spell check:

Solution:
Ans 24). In developing a chart to plot a course of action, with many of the events or milestones, we will we Process decision program chart.
so, option (A) is correct answer.

Notice the big improvement! The spelling-aware improvements are only live for English and have no impact on other languages. These improvements will be coming soon for Spanish, French, and German.

Please note that incorrectly spelled words that are clearly written will not be changed: this predictive mode is only enabled when the underlying handwritten word is visually ambiguous.
Better English handwriting support
You can now select text directly from an opened PDF in the Snip web app. You can hover over a piece of text or math and click the clipboard button to copy the Markdown to your clipboard:

You can also click and drag to highlight the region of interest or double click any section to get access to selectable and copyable text:


You can use this feature on printed and handwritten PDFs:


Scribe: human-powered document conversion
AI has become very powerful at converting PDFs and images to editable text. But it’s still not perfect, and probably never will be (although we are getting closer day by day!).

To alleviate the gap, we are now offering human-powered services for document conversion that combine our AI with human LaTeX experts, so that you can get a perfect translation of your document to your desired format.

Simply send your PDF to scribe@mathpix.com with any special instructions on what you need and when you need it converted by. You can accept our standard price of $5 per input page, or request a discount if, for example, there is low text density per page. We then charge your card or payment method on file at accounts.mathpix.com.

This service is ideal for lecture notes (including handwritten ones) or older documents that you want to have perfectly transcribed.

More information about our Scribe service is available on our website.
Spectra writing competition update
We are extending the current Spectra writing competition deadline to January 1st. We will pick just 3 winners, with cash prizes of $2K for each winner. We will rank the winners as in previous competitions. More improvements to Spectra are coming soon.

We are also still giving away the yearly Pro plan for free to all those that submit content!
'''


In [120]:
# No input needed!
spam = clean_data(spam)
x = vectorizer_text.transform(np.array([spam]))
label_pre = predict(x, p_spam, p_spam_cond, p_ham_cond)
if label_pre == 1:
    print('It’s a spam email!')
else:
    print('It’s a ham email!')

It’s a spam email!
