# EMAIL SPAM DETECTION

### Importing libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

### Reading the dataset

In [2]:
df = pd.read_csv(r"C:\Users\hanaa\Documents\hanan\DUK\internships\oasis infobyte\email spam detection\spam.csv",encoding=('ISO-8859-1'),low_memory=False)

In [3]:
df

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


### Basic information about the data

In [4]:
df.head()   # Displays the first 5 entries

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [5]:
df.tail()   # Displays the last 5 entries

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,
5571,ham,Rofl. Its true to its name,,,


In [6]:
df.shape   # Displays the number of rows and columns in the dataset

(5572, 5)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [8]:
df.describe()   # Basic statistical information about the data

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
count,5572,5572,50,12,6
unique,2,5169,43,10,5
top,ham,"Sorry, I'll call later","bt not his girlfrnd... G o o d n i g h t . . .@""","MK17 92H. 450Ppw 16""","GNT:-)"""
freq,4825,30,3,2,2


In [9]:
df.columns   # Lists out all the columns in the dataset

Index(['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')

In [10]:
df.rename(columns={'v1':'ham or spam','v2':'mail'}, inplace=True)

In [11]:
df

Unnamed: 0,ham or spam,mail,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


In [12]:
df.isna().sum()   # Get the number of null or missing values in the data

ham or spam       0
mail              0
Unnamed: 2     5522
Unnamed: 3     5560
Unnamed: 4     5566
dtype: int64

Since more than 99% of the entries in the latter 3 columns are missing, they are dropped.

In [13]:
df.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1,inplace=True)

In [14]:
df.duplicated().sum()   # Get the total number of duplicated entries in the data

403

All the duplicate entries are dropped from the dataframe.

In [15]:
df=df.drop_duplicates()

In [16]:
df

Unnamed: 0,ham or spam,mail
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


### Encoding the target column

In [17]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
df['ham or spam'] = encoder.fit_transform(df['ham or spam'])
df

Unnamed: 0,ham or spam,mail
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...
5568,0,Will Ì_ b going to esplanade fr home?
5569,0,"Pity, * was in mood for that. So...any other s..."
5570,0,The guy did some bitching but I acted like i'd...


### Model building

In [18]:
# Splitting the dataset

from sklearn.model_selection import train_test_split

x = df['mail']
y = df['ham or spam']

X_test, X_train, y_test, y_train = train_test_split(x,y,random_state=42, test_size=0.25)


In [19]:
X_train.shape, y_train.shape

((1293,), (1293,))

In [20]:
X_test.shape, y_test.shape

((3876,), (3876,))

In [21]:
# Vectorizing

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf=TfidfVectorizer()

X_train_count = tfidf.fit_transform(X_train.values)
X_train_count.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [22]:
# Building and fitting the model

from sklearn.naive_bayes import MultinomialNB

model=MultinomialNB()
model.fit(X_train_count, y_train)

X_test_count = tfidf.transform(X_test)

### Evaluating the performance of the model

In [23]:
print("Accuracy =", model.score(X_test_count,y_test))

Accuracy = 0.936016511867905


The model has an accuracy of 93.6%