In [None]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
import string

Display Header

In [13]:
df = pd.read_csv("emails.txt")
df.head()

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


1) Exploratory Data Analysis:

a) Get Total no of rows and columns

In [14]:
df.shape

(5728, 2)

b) Get Column Names:

In [15]:
df.columns

Index(['text', 'spam'], dtype='object')

c) Check for duplicates

In [16]:
df.drop_duplicates(inplace=True)

In [17]:
print(df.shape)

(5695, 2)


d)  Check for missing values

In [18]:
print(df.isnull().sum())

text    0
spam    0
dtype: int64


2) Data Cleansing:

a) Remove unwanted words

In [19]:
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hemap\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [20]:
def processes(text):
    nopunc= [char for char in text if char not in string.punctuation]
    nopunc = ''.join(nopunc)
        
    clean = [word for word in nopunc.split() if word.lower() not in stopwords.words('english')]
    return clean

df['text'].head().apply(processes)


0    [Subject, naturally, irresistible, corporate, ...
1    [Subject, stock, trading, gunslinger, fanny, m...
2    [Subject, unbelievable, new, homes, made, easy...
3    [Subject, 4, color, printing, special, request...
4    [Subject, money, get, software, cds, software,...
Name: text, dtype: object

b) Convert text into matrix of token counts:

In [21]:
from sklearn.feature_extraction.text import CountVectorizer
message = CountVectorizer(analyzer=processes).fit_transform(df['text'])
print(message.shape)

(5695, 37229)


3) Split Data: Training: 80%; Testing: 20%

In [23]:
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(message, df['spam'], test_size=0.20, random_state=0)
print(message.shape)

(5695, 37229)


4) Model Development and prediction:
    Create and train Multinomial Naive Bayes classifier to classify discrete features

In [24]:
from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB().fit(xtrain, ytrain)
print(classifier.predict(xtrain))
print(ytrain.values)

[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]


5) Model Evaluation on training dataset:

In [28]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
predict_xtrain=classifier.predict(xtrain)
print("Model Evaluation Results on training dataset:")
print("Classification Report:")
print(classification_report(ytrain,predict_xtrain))
print("\n Confusion Metrics:")
print(confusion_matrix(ytrain,predict_xtrain))
print("\n Accuracy Score:")
print(accuracy_score(ytrain,predict_xtrain))

Model Evaluation Results on training dataset:
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3457
           1       0.99      1.00      0.99      1099

    accuracy                           1.00      4556
   macro avg       0.99      1.00      1.00      4556
weighted avg       1.00      1.00      1.00      4556


 Confusion Metrics:
[[3445   12]
 [   1 1098]]

 Accuracy Score:
0.9971466198419666


Model is 99.71% accurate. 

Let's evaluate model on test data, by printing the predicted value and the actual value to see if the model can accurately classify email text

In [29]:
print(classifier.predict(xtest))
print(ytest.values)

[1 0 0 ... 0 0 0]
[1 0 0 ... 0 0 0]


In [30]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
predict_xtest=classifier.predict(xtest)
print("Model Evaluation Results on training dataset:")
print("Classification Report:")
print(classification_report(ytest,predict_xtest))
print("\n Confusion Metrics:")
print(confusion_matrix(ytest,predict_xtest))
print("\n Accuracy Score:")
print(accuracy_score(ytest,predict_xtest))

Model Evaluation Results on training dataset:
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.99      0.99       870
           1       0.97      1.00      0.98       269

    accuracy                           0.99      1139
   macro avg       0.98      0.99      0.99      1139
weighted avg       0.99      0.99      0.99      1139


 Confusion Metrics:
[[862   8]
 [  1 268]]

 Accuracy Score:
0.9920983318700615


Prediction Results: The classifier model accurately identified the email messages as spam or email with 99.2% accuracy on test data