In [2]:
# Following youtube lesson https://www.youtube.com/watch?v=cNLPt02RwF0 
# Email Spam Detection Using Python & Machine Learning
# Dataset is here https://www.kaggle.com/venky73/spam-mails-dataset

# This data set was to big, I was not able to download it https://www.kaggle.com/balaka18/email-spam-classification-dataset-csv

# Import libraries
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
import string



In [3]:
 # load the data
 from google.colab import files
 uploaded = files.upload()
 #The csv file contains 5171 rows, each row for each email. There are 4 columns. 
 #The first column indicates Email name. The name has been set with numbers and not recipients' name to protect privacy. 
 #The last column has the labels for prediction : 1 for spam, 0 for not spam. The remaining 3000 columns are the 3000 most common words in all the emails, 
 #after excluding the non-alphabetical characters/words. For each row, the count of each word(column) in that email(row) is stored in the respective cells. 
 #Thus, information regarding all 5172 emails are stored in a compact dataframe rather than as separate text files.


Saving emails.csv to emails.csv


In [13]:
# read the csv file
df = pd.read_csv('emails.csv')
df.head(5)
df.shape

(5171, 4)

In [14]:
# get the column name
df.columns
#label_num = 1 for spam and 0 for non-spam


Index(['Unnamed: 0', 'label', 'text', 'label_num'], dtype='object')

In [15]:
# check for duplicates and remove them
df.drop_duplicates(inplace = True)
# Show the new set
df.shape

(5171, 4)

In [16]:
# Show the number of missing data (NA) for each column
df.isnull().sum()

Unnamed: 0    0
label         0
text          0
label_num     0
dtype: int64

In [17]:
 # Download the stopwords package
 nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [25]:
def process_text(text):
  #1 remove punktuation
  #2 remove stopwords
  #3 return a list of clean text words

  #1
  nopunc = [char for char in text if char not in string.punctuation]
  nopunc = ''.join(nopunc)
  #2 
  clean_words = [word for word in nopunc.split() if word.lower() not in stopwords.words('english')]
  #3
  return clean_words


In [26]:
# Show the tokenization (a list of tokens also called lemmas)
df['text'].head().apply(process_text)


0    [Subject, enron, methanol, meter, 988291, foll...
1    [Subject, hpl, nom, january, 9, 2001, see, att...
2    [Subject, neon, retreat, ho, ho, ho, around, w...
3    [Subject, photoshop, windows, office, cheap, m...
4    [Subject, indian, springs, deal, book, teco, p...
Name: text, dtype: object

In [27]:
# Show an example what we are doing here

message4 = 'hello world hello hello world play'
message5 = 'test test test test one hello'
print(message4)
print()

#Convert the text to a matrix of token counts
from sklearn.feature_extraction.text import CountVectorizer
bow4 = CountVectorizer(analyzer=process_text).fit_transform([message4], [message5])
print(bow4)
print()
print(bow4.shape)


hello world hello hello world play

  (0, 0)	3
  (0, 2)	2
  (0, 1)	1

(1, 3)


In [28]:
#Convert a collection of text to a matrix of tokens
from sklearn.feature_extraction.text import CountVectorizer
messages_bow = CountVectorizer(analyzer=process_text).fit_transform(df['text'])


In [33]:
# Split the data into 80% trining and 20% testing
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(messages_bow, df['label_num'], test_size=0.2, random_state=0)



In [34]:
# Get the shape of messages_bow

messages_bow.shape

(5171, 50381)

In [35]:
# Create and train the Naive Bayes Classifier
from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB().fit(X_train, y_train)

In [39]:
# Print the predictions
print(classifier.predict(X_train))

# Print the actual values
print(y_train.values)

print(y_train)



[0 0 0 ... 1 0 0]
3628    0
2491    0
3262    0
2972    0
2481    1
       ..
4931    1
3264    1
1653    1
2607    0
2732    0
Name: label_num, Length: 4136, dtype: int64
[0 0 0 ... 1 0 0]


In [42]:
# Evaluate the model on the training dataset
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
pred = classifier.predict(X_train)
print(classification_report(y_train, pred))
print()
print('Confusion Matrix: \n', confusion_matrix(y_train, pred))
print()
print('Accuracy: ', accuracy_score(y_train, pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      2940
           1       0.98      0.97      0.98      1196

    accuracy                           0.99      4136
   macro avg       0.99      0.98      0.98      4136
weighted avg       0.99      0.99      0.99      4136


Confusion Matrix: 
 [[2918   22]
 [  30 1166]]

Accuracy:  0.9874274661508704


In [43]:
# Print the predictions
print(classifier.predict(X_test))

# Print the actual values
print(y_test.values)


[0 0 0 ... 0 1 0]
[0 0 0 ... 0 1 0]


In [44]:
# Evaluate the model on the training dataset
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
pred = classifier.predict(X_test)
print(classification_report(y_test, pred))
print()
print('Confusion Matrix: \n', confusion_matrix(y_test, pred))
print()
print('Accuracy: ', accuracy_score(y_test, pred))

              precision    recall  f1-score   support

           0       0.98      0.98      0.98       732
           1       0.95      0.96      0.96       303

    accuracy                           0.97      1035
   macro avg       0.97      0.97      0.97      1035
weighted avg       0.97      0.97      0.97      1035


Confusion Matrix: 
 [[718  14]
 [ 13 290]]

Accuracy:  0.9739130434782609
