### Naive Bayes classifier: spam vs. ham message classification

In [2]:
## Importing the libraries
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [3]:
## load the data set
# Every backslash is escaped explicitly: the earlier literal contained "\H",
# an invalid escape sequence that newer Python versions warn about. The
# resulting path string is unchanged.
# NOTE(review): this is a machine-specific absolute path; point DATA_PATH at a
# project-relative location before sharing the notebook.
DATA_PATH = "C:\\Users\\Hi\\Downloads\\mail_data.csv"
df = pd.read_csv(DATA_PATH)

In [4]:
# Preview the first rows of the loaded frame (label in Category, text in Message).
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
# Summarise columns, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [6]:
# Missing values: count the nulls in each column.

df.isna().sum()

Category    0
Message     0
dtype: int64

In [7]:
# Class distribution of the label column; shows how imbalanced ham vs. spam is.
df['Category'].value_counts()

ham     4825
spam     747
Name: Category, dtype: int64

In [8]:
## Handling imbalanced data
# Separate the two classes so the smaller one can be resampled on its own.
ham = df.loc[df['Category'].eq('ham')]
spam = df.loc[df['Category'].eq('spam')]

In [9]:
# Oversample the spam rows (with replacement) up to the number of ham rows.
# - random_state pins the draw so a fresh Restart & Run All yields the same rows.
# - Sampling from df rather than from `spam` keeps this cell idempotent:
#   re-running it no longer resamples an already-resampled frame.
# NOTE(review): the oversampling happens before the train/test split further
# down, so copies of one spam message can land in both partitions and inflate
# the test scores. Consider splitting first and resampling only the training part.
spam = df[df['Category'] == 'spam'].sample(
    n=ham.shape[0],
    replace=True,
    random_state=101,
)

In [10]:
# Both class frames should now report the same number of rows.
print(ham.shape, spam.shape, sep='\n')

(4825, 2)
(4825, 2)


In [11]:
# Stack the ham rows and the resampled spam rows into one frame with a fresh
# 0..n-1 index. DataFrame.append was deprecated in pandas 1.4 and removed in
# 2.0; pd.concat is the supported equivalent.
data = pd.concat([ham, spam], ignore_index=True)
data.shape

(9650, 2)

In [12]:
# Features are the raw message text; the target is the ham/spam label.
x, y = data['Message'], data['Category']

In [14]:
# Display the message Series for a quick visual check.
x

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       U dun say so early hor... U c already then say...
3       Nah I don't think he goes to usf, he lives aro...
4       Even my brother is not like to speak with me. ...
                              ...                        
9645    PRIVATE! Your 2003 Account Statement for 07753...
9646    from www.Applausestore.com MonthlySubscription...
9647    FREE for 1st week! No1 Nokia tone 4 ur mobile ...
9648    sports fans - get the latest sports news str* ...
9649    1000's of girls many local 2 u who r virgins 2...
Name: Message, Length: 9650, dtype: object

In [31]:
## Bag of Words
# Turn each message into a vector of token counts, keeping at most 100 vocabulary terms.
# NOTE(review): the vectoriser is fitted on every row before the train/test
# split, so the vocabulary is learned from rows that later become test rows.
# NOTE(review): the saved output of the shape cell further down reports 8708
# columns, which does not match max_features=100; the stored outputs appear
# stale and the notebook should be re-run from a fresh kernel.

from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(max_features=100)

bow_sparse = cv.fit_transform(data['Message'])
x1 = bow_sparse.toarray()  # dense ndarray, as consumed by the cells below

In [15]:
# Display the label Series for a quick visual check.
y

0        ham
1        ham
2        ham
3        ham
4        ham
        ... 
9645    spam
9646    spam
9647    spam
9648    spam
9649    spam
Name: Category, Length: 9650, dtype: object

In [32]:
# Display the dense count matrix produced by the vectoriser.
x1

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [33]:
# (rows, vocabulary size) of the count matrix.
x1.shape

(9650, 8708)

In [34]:
## Split the data into training and test sets
# 25% of the rows are held out; stratify=y keeps the ham/spam ratio the same
# in both parts, and random_state fixes the shuffle.
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    x1,
    y,
    test_size=0.25,
    random_state=101,
    stratify=y,
)

In [36]:
## Building the Naive Bayes models
# Only the estimator classes are imported here. The GaussianNB instantiate-and-fit
# that used to sit in this cell was an exact duplicate of the Gaussian NB cell
# that follows, so the model was trained twice for no benefit.
from sklearn.naive_bayes import BernoulliNB, GaussianNB

### Gaussian NB

In [51]:
# Fit a Gaussian Naive Bayes classifier on the training counts.
# NOTE(review): GaussianNB models each feature as a continuous Gaussian; for
# word-count features MultinomialNB is the more usual choice. Worth comparing.
gnb = GaussianNB()
gnb.fit(X=x_train, y=y_train)

In [52]:
# Predict labels for both partitions so train and test scores can be compared.
y_pred_gnb_train, y_pred_gnb_test = gnb.predict(x_train), gnb.predict(x_test)

In [53]:
# Display the predicted labels for the test partition.
y_pred_gnb_test

array(['ham', 'ham', 'spam', ..., 'spam', 'ham', 'ham'], dtype='<U4')

In [54]:
# evaluation metrics

from sklearn.metrics import confusion_matrix,classification_report,accuracy_score



In [55]:
# Training-set report first, then a blank line, then the test-set report.
print(
    classification_report(y_train, y_pred_gnb_train),
    classification_report(y_test, y_pred_gnb_test),
    sep='\n\n',
)

              precision    recall  f1-score   support

         ham       1.00      0.93      0.96      3619
        spam       0.94      1.00      0.97      3618

    accuracy                           0.97      7237
   macro avg       0.97      0.97      0.97      7237
weighted avg       0.97      0.97      0.97      7237


              precision    recall  f1-score   support

         ham       1.00      0.89      0.94      1206
        spam       0.90      1.00      0.95      1207

    accuracy                           0.95      2413
   macro avg       0.95      0.95      0.95      2413
weighted avg       0.95      0.95      0.95      2413



In [56]:
# Training accuracy first, then a blank line, then test accuracy.
print(
    accuracy_score(y_train, y_pred_gnb_train),
    accuracy_score(y_test, y_pred_gnb_test),
    sep='\n\n',
)

0.9660080143705956

0.9457107335267302


### Bernoulli NB

In [57]:
# Fit a Bernoulli Naive Bayes classifier on the same training matrix.
bnb = BernoulliNB()
bnb.fit(X=x_train, y=y_train)

In [58]:
# Predict labels for both partitions so train and test scores can be compared.
y_pred_bnb_train, y_pred_bnb_test = bnb.predict(x_train), bnb.predict(x_test)

In [59]:
# Training-set report first, then a blank line, then the test-set report.
print(
    classification_report(y_train, y_pred_bnb_train),
    classification_report(y_test, y_pred_bnb_test),
    sep='\n\n',
)

              precision    recall  f1-score   support

         ham       0.97      1.00      0.98      3619
        spam       1.00      0.97      0.98      3618

    accuracy                           0.98      7237
   macro avg       0.99      0.98      0.98      7237
weighted avg       0.99      0.98      0.98      7237


              precision    recall  f1-score   support

         ham       0.97      1.00      0.99      1206
        spam       1.00      0.97      0.99      1207

    accuracy                           0.99      2413
   macro avg       0.99      0.99      0.99      2413
weighted avg       0.99      0.99      0.99      2413



In [60]:
# Training accuracy first, then a blank line, then test accuracy.
print(
    accuracy_score(y_train, y_pred_bnb_train),
    accuracy_score(y_test, y_pred_bnb_test),
    sep='\n\n',
)

0.9846621528257565

0.9859096560298384


                         ***END***