In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

In [None]:
# Load the spam dataset from CSV into a DataFrame and preview the first rows.
data = pd.read_csv(filepath_or_buffer="/content/spam.csv")
data.head(n=5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


# **Data analysis and exploration**

In [None]:
# Summary statistics (count / unique / top / freq) of the messages, per category.
messages_by_category = data.groupby(by="Category")
messages_by_category.describe()

Unnamed: 0_level_0,Message,Message,Message,Message
Unnamed: 0_level_1,count,unique,top,freq
Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,641,Please call our customer service representativ...,4


In [None]:
# Count the missing values in each column.
missing_mask = data.isnull()
missing_mask.sum()

Unnamed: 0,0
Category,0
Message,0


In [None]:
# Print a concise summary of the DataFrame: index range, column names,
# non-null counts, dtypes and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


# **Data cleaning and conversion**

In [None]:
# Encode the label as a numeric target column: 1 for 'spam', 0 for anything else.
# A vectorized comparison replaces the row-by-row `apply` with a Python lambda;
# the resulting values are identical.
data['spam'] = (data['Category'] == 'spam').astype('int64')
data.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [None]:
# Split the messages and labels into train (75%) and test (25%) sets.
# - random_state fixes the shuffle so the split (and every score reported
#   below) is reproducible after Restart & Run All.
# - stratify keeps the ham/spam ratio the same in both subsets, which matters
#   because the classes are imbalanced.
RANDOM_STATE = 42
x_train, x_test, y_train, y_test = train_test_split(
    data.Message,
    data.spam,
    test_size=0.25,
    random_state=RANDOM_STATE,
    stratify=data.spam,
)

In [None]:
# Convert the training messages into a numeric document-term matrix using
# count vectorization. The vectorizer is fitted on the training data only.
vect = CountVectorizer()
x_train_count = vect.fit_transform(x_train.values)
# Show only the first three rows. Slice the sparse matrix BEFORE densifying so
# that the full (documents x vocabulary) dense array is never allocated.
x_train_count[:3].toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [None]:
# Convert the test messages with the vectorizer already fitted on the training
# data (transform only, no re-fitting).
x_test_count = vect.transform(x_test)
# Slice the sparse matrix first, then densify just the three rows being shown.
x_test_count[:3].toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [None]:
# Train a multinomial Naive Bayes text classifier on the count-vectorized
# training messages.
model = MultinomialNB()
model.fit(X=x_train_count, y=y_train)



In [None]:
# Predict whether two example emails are spam (1) or not (0).
emails = [
    'Hey mohan, can we get together to watch footbal game tomorrow?',
    'Upto 20% discount on parking, exclusive offer just for you. Dont miss this reward!',
]
# The raw text must go through the same fitted vectorizer before prediction.
emails_count = vect.transform(emails)
model.predict(X=emails_count)  # the output shows that the second email is spam

array([0, 1])

In [None]:
# Evaluate the model on the held-out test set.
# `score` returns the mean accuracy for a classifier, so a separate
# accuracy function from sklearn.metrics is not needed here.
model.score(X=x_test_count, y=y_test)

0.9842067480258435

# **Sklearn pipeline**

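A scikit-learn `Pipeline` simplifies the code by chaining the vectorizer and the classifier into a single estimator, so raw text can be passed directly to `fit`, `score`, and `predict`.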

In [None]:
# Chain the vectorizer and the classifier into a single estimator.
pipeline_steps = [
    ('vectorizer', CountVectorizer()),
    ('classifier', MultinomialNB()),
]
model2 = Pipeline(steps=pipeline_steps)


In [None]:
# Fit the pipeline on the unconverted data (x_train and y_train): the pipeline
# includes the vectorizer, which performs the vectorization before the
# classifier is fitted.
model2.fit(X=x_train, y=y_train)

In [None]:
# The accuracy is the same with the pipeline as without it.
model2.score(X=x_test, y=y_test)

0.9842067480258435

In [None]:
# The pipeline accepts the raw email text directly; vectorization happens inside it.
model2.predict(X=emails)

array([0, 1])