# Spam Mail Detection using Deep Learning

## Importing the dependencies

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

## Data Collection & Pre-Processing

In [2]:
data = pd.read_csv("spam_emails.csv")
data.head()

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


In [3]:
print(data)

                                                   text  spam
0     Subject: naturally irresistible your corporate...     1
1     Subject: the stock trading gunslinger  fanny i...     1
2     Subject: unbelievable new homes made easy  im ...     1
3     Subject: 4 color printing special  request add...     1
4     Subject: do not have money , get software cds ...     1
...                                                 ...   ...
5723  Subject: re : research and development charges...     0
5724  Subject: re : receipts from visit  jim ,  than...     0
5725  Subject: re : enron case study update  wow ! a...     0
5726  Subject: re : interest  david ,  please , call...     0
5727  Subject: news : aurora 5 . 2 update  aurora ve...     0

[5728 rows x 2 columns]


In [4]:
data = data.where((pd.notnull(data)), '') 
# Replace the null values

In [5]:
data['spam'].value_counts()

spam
0    4360
1    1368
Name: count, dtype: int64

In [6]:
X = data['text']
y = data['spam']

In [7]:
X

0       Subject: naturally irresistible your corporate...
1       Subject: the stock trading gunslinger  fanny i...
2       Subject: unbelievable new homes made easy  im ...
3       Subject: 4 color printing special  request add...
4       Subject: do not have money , get software cds ...
                              ...                        
5723    Subject: re : research and development charges...
5724    Subject: re : receipts from visit  jim ,  than...
5725    Subject: re : enron case study update  wow ! a...
5726    Subject: re : interest  david ,  please , call...
5727    Subject: news : aurora 5 . 2 update  aurora ve...
Name: text, Length: 5728, dtype: object

In [8]:
y

0       1
1       1
2       1
3       1
4       1
       ..
5723    0
5724    0
5725    0
5726    0
5727    0
Name: spam, Length: 5728, dtype: int64

## Splitting the data

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=data.spam)

## Feature Extraction

In [10]:
# Transform the text data into feature vectors. So, that can be used as input to the ANN.

feature_extraction = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)

In [11]:
X_train = feature_extraction.fit_transform(X_train)
X_test = feature_extraction.transform(X_test)

## Training the Model

In [26]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, r2_score, recall_score, confusion_matrix

In [24]:
lor = LogisticRegression()
lor.fit(X_train, y_train)
y_pred = lor.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy Score: {accuracy}\nR2 Score: {r2}")

Accuracy Score: 0.9825479930191972
R2 Score: 0.9040715194535592


In [27]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy Score: {accuracy}\nR2 Score: {r2}")

Accuracy Score: 0.9834205933682374
R2 Score: 0.9088679434808813


In [34]:
conf_matrix = confusion_matrix(y_test, y_pred)
conf_matrix

array([[867,   5],
       [ 14, 260]], dtype=int64)

## Predictive System

In [32]:
input_mail = ["interview schedules for tony hamilton and damian likely  please find the interview packets for the above - referenced candidates . the  interviews will occur on friday january 26 , 2001 . please print all documents  for your reference . hardcopies of their resumes will be delivered via  runner . if you have any questions , or conflicts of schedule , please do not  hesitate to contact me .  shawn grady  58701"]
input_features = feature_extraction.transform(input_mail)

prediction = rf.predict(input_features)

if prediction[0] == 1:
    print("It is a Ham Mail")
else:
    print("It is a Spam Mail")

It is a Spam Mail


## Save the Model

In [35]:
import joblib

In [37]:
with open('spam_model.pkl', 'wb') as file:
    joblib.dump(rf, file)

## Load and test the saved model

In [38]:
with open('spam_model.pkl', 'rb') as file:
    loaded_model = joblib.load(file)

In [39]:
tem_pred = loaded_model.predict(input_features)
print(tem_pred)

[0]
