**Importing required libraries to perform analysis**

In [20]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

**Importing Dataframe**

In [21]:
df=pd.read_csv("/content/drive/MyDrive/Treue_Internship/4.Email_spam_detection/spam.csv")

In [22]:
df #displaying data

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [23]:
 df.head() # This HEAD method will display only first 5 rows of the dataset.

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [24]:
df.tail() # This TAIL method will display only last 5 rows of the dataset.

Unnamed: 0,v1,v2
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...
5571,ham,Rofl. Its true to its name


In [25]:
df.shape # This SHAPE is an attribute which shows count of rows and columns of dataset.

(5572, 2)

In [26]:
df.info() # This method shows all physical properties of the dataset.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   v1      5572 non-null   object
 1   v2      5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [27]:
df.isnull() #This ISNULL method indicate True only if value is missing

Unnamed: 0,v1,v2
0,False,False
1,False,False
2,False,False
3,False,False
4,False,False
...,...,...
5567,False,False
5568,False,False
5569,False,False
5570,False,False


In [28]:
df.isnull().sum() # Thsi SUM method will show the total of null values in column fields.

v1    0
v2    0
dtype: int64

In [29]:
df.describe() # This DESCRIBE method works only on statistical column and shows aggregate values.

Unnamed: 0,v1,v2
count,5572,5572
unique,2,5157
top,ham,"Sorry, I'll call later"
freq,4825,30


In [30]:
df.describe(include ='all') # This DESCRIBE method works only on statistical column but (include ='all') gives statistic for numeric and non numeric columns.

Unnamed: 0,v1,v2
count,5572,5572
unique,2,5157
top,ham,"Sorry, I'll call later"
freq,4825,30


In [31]:
df.columns # columns is an attribute which shows columns name.

Index(['v1', 'v2'], dtype='object')

Grouping of SPAM and HAM emails


In [32]:
df.groupby('v1').describe()

Unnamed: 0_level_0,v2,v2,v2,v2
Unnamed: 0_level_1,count,unique,top,freq
v1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,641,Please call our customer service representativ...,4


Applying categorization and creating new spam column

In [33]:
df['spam']=df['v1'].apply (lambda x:1 if x== 'spam' else 0)

In [34]:
df

Unnamed: 0,v1,v2,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0
...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,1
5568,ham,Will ü b going to esplanade fr home?,0
5569,ham,"Pity, * was in mood for that. So...any other s...",0
5570,ham,The guy did some bitching but I acted like i'd...,0


Splitting the dataset into Training and Testing

In [35]:
x_train, x_test, y_train, y_test=train_test_split(df.v2,df.spam,test_size=0.25)

In [36]:
#Using CountVectorizer to convert text format into values for feature extraction in Machine learning.
cv=CountVectorizer()
x_train_count=cv.fit_transform(x_train.values)

In [37]:
#using multinomialNB, it assign classes based on statistical analysis of content
model=MultinomialNB()
model.fit(x_train_count, y_train)

In [40]:
 #Testing the model
ham_email=["Hello Students, Report at school playground tomorrow morning at 5.0A.M."]
ham_email_count=cv.transform(ham_email)
model.predict(ham_email_count)

array([0])

Above array([0]) shows email is not spam

In [41]:
spam_email=["Lucky Draw!!! You have won lottery ticket"]
spam_email_count=cv.transform(spam_email)
model.predict(spam_email_count)

array([1])

Above array([1]) shows email is spam

In [44]:
#Model accuracy
x_test_count=cv.transform(x_test)
model.score(x_test_count,y_test)

0.9834888729361091