# Classification of spam and ham(good) email with the help of Naive Bayes classification algorithms


In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer      #CountVectorizer convert text into matrics
from sklearn.naive_bayes import MultinomialNB

In [2]:
#read the spam.csv file with the help of pandas and form a dataframe
spam_and_ham = pd.read_csv("D:\DataSetfor MLandDL\spam.csv")
spam_and_ham.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


## Perform some exploratory data analysis on our dataset for the better understanding and to perform some preprocessing (cleaning)

In [3]:
spam_and_ham.columns
# Means we have 2 categories in our dataset --> email will be spam or ham

Index(['Category', 'Message'], dtype='object')

In [4]:
spam_and_ham.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [5]:
spam_and_ham.describe()

Unnamed: 0,Category,Message
count,5572,5572
unique,2,5157
top,ham,"Sorry, I'll call later"
freq,4825,30


### from the above result it is clear that we have some duplicate row.. I will used (duplicated method) if rows are duplicated it will return "True" otherwise "False"

In [6]:
print(spam_and_ham.duplicated().to_string())

# False means

0       False
1       False
2       False
3       False
4       False
5       False
6       False
7       False
8       False
9       False
10      False
11      False
12      False
13      False
14      False
15      False
16      False
17      False
18      False
19      False
20      False
21      False
22      False
23      False
24      False
25      False
26      False
27      False
28      False
29      False
30      False
31      False
32      False
33      False
34      False
35      False
36      False
37      False
38      False
39      False
40      False
41      False
42      False
43      False
44      False
45      False
46      False
47      False
48      False
49      False
50      False
51      False
52      False
53      False
54      False
55      False
56      False
57      False
58      False
59      False
60      False
61      False
62      False
63      False
64      False
65      False
66      False
67      False
68      False
69      False
70      False
71    

#####  Drop dulpicate rows

In [7]:
spam_and_ham.drop_duplicates(inplace=True)

In [8]:
spam_and_ham.describe()

Unnamed: 0,Category,Message
count,5157,5157
unique,2,5157
top,ham,"Go until jurong point, crazy.. Available only ..."
freq,4516,1


In [9]:
# to check empty cell in our dataset
spam_and_ham.isnull().sum()

# from below result it is clear we have no empty cell

Category    0
Message     0
dtype: int64

In [10]:
# to check NaN cell in our dataset
spam_and_ham.isna().sum()

# from below result it is clear we have NaN empty cell

Category    0
Message     0
dtype: int64

###### As we known that Ml only understand number not text and not images ..convert our category and message into number

In [11]:
# convert category into binary(0/1)---> spam = 1 and ham = 0
# Lambda function--> syntax
# lambda argument : expression

spam_and_ham['Spam'] = spam_and_ham['Category'].apply(lambda x : 1 if x=='spam' else 0)
spam_and_ham.head(5)
pd.set_option('display.max_rows', None)
spam_and_ham


Unnamed: 0,Category,Message,Spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0
5,spam,FreeMsg Hey there darling it's been 3 week's n...,1
6,ham,Even my brother is not like to speak with me. ...,0
7,ham,As per your request 'Melle Melle (Oru Minnamin...,0
8,spam,WINNER!! As a valued network customer you have...,1
9,spam,Had your mobile 11 months or more? U R entitle...,1


###### Split data into Train and Test

In [12]:
x_train, x_test, y_train, y_test = train_test_split(spam_and_ham.Message, spam_and_ham.Spam, test_size=0.2)

In [13]:
len(y_test)

1032

###### First i convert category into binary and after i will convert messaage into vector

In [14]:
countvictorizer = CountVectorizer()
x_train_count = countvictorizer.fit_transform(x_train.values)
x_train_count.toarray()[:3]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

##### 😒  3 types of Naive Bayes Algorithms:  1----> Bernolli NB:it will be used when feature are of binary 0/1:e.g 1 when word are represent in document , 0 when word are not present in documents    2----> Mulitnomial NB: it will be used when data is discrete(e.g movie rating from 1 to 5) or Text learning count the number of words to predict class   3--->Gaussian NB:B/z of the assumption of the Normal distribution(ball like shape) when data are in continous form: e.g Iris dataset,, sepal length , sepal width , petal lenght and petal width --> this can vary , we can,t represent it in term of other occrance

In [15]:
# i will used MultinomialNB
model = MultinomialNB()
model.fit(x_train_count, y_train)

MultinomialNB()

In [16]:
# lets check score of our model
test_count = countvictorizer.transform(x_test)
model.score(test_count, y_test)

0.9854651162790697

In [17]:
# Actual class label of our test dataset
y_test.head(10)

2720    0
5142    0
628     0
4102    1
2726    0
1475    0
4178    0
1359    0
2450    0
2994    0
Name: Spam, dtype: int64

In [18]:
model.predict(test_count)[:10]

array([0, 0, 0, 1, 0, 0, 0, 0, 0, 0], dtype=int64)

In [26]:
emails =[
    '[nltk-users] Log Tokenization',
    'Dropbox bonus space received!',
    'Lost files are a thing of the past',
    'A Liberation Theology, for Children | Sarah Stankorb',
    'Take a Career Track in 2 Months for the Price of 1'
]

emails_count = countvictorizer.transform(emails)
model.predict(emails_count)

array([0, 1, 0, 0, 0], dtype=int64)

#####  It is very difficult to convert message into matric with the help of  CountVictorizer() ... so we made piplines

In [30]:
from sklearn.pipeline import Pipeline
clf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB())
    
])

In [35]:
clf.fit(x_train,y_train)

Pipeline(steps=[('vectorizer', CountVectorizer()), ('nb', MultinomialNB())])

In [36]:
clf.score(x_train,y_train)

0.992969696969697

In [37]:
clf.predict(x_test)

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [44]:
emails =[
    '[nltk-users] Log Tokenization',
    'Dropbox bonus space received!',
    'Lost files are a thing of the past',
    'A Liberation Theology, for Children | Sarah Stankorb',
    'Take a Career Track in 2 Months for the Price of 1'
]


clf.predict(emails)

array([0, 1, 0, 0, 0], dtype=int64)