<a href="https://colab.research.google.com/github/itzrubyy/spam_email_detector/blob/main/spam_email_detector.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('email_classification.csv')

In [3]:
df.head(5)

Unnamed: 0,email,label
0,Upgrade to our premium plan for exclusive acce...,ham
1,Happy holidays from our team! Wishing you joy ...,ham
2,We're hiring! Check out our career opportuniti...,ham
3,Your Amazon account has been locked. Click her...,spam
4,Your opinion matters! Take our survey and help...,ham


In [4]:
import nltk

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [5]:
stopwards = set(nltk.corpus.stopwords.words('english'))
wordnet = nltk.WordNetLemmatizer()

In [7]:
import re
corpus = []

for i in range(len(df)):
    review = re.sub('[^a-zA-Z]', ' ', df['email'][i])
    review = review.lower()
    review = review.split()
    review = [wordnet.lemmatize(word) for word in review if word not in stopwards]
    review = ' '.join(review)
    corpus.append(review)

In [8]:
corpus[0]

'upgrade premium plan exclusive access premium content feature'

In [12]:
from nltk.tokenize import sent_tokenize
from gensim.utils import simple_preprocess
from gensim.models import Word2Vec

words = []

for sent in corpus:
    sent_tokens = sent_tokenize(sent)
    for sent in sent_tokens:
        words.append(simple_preprocess(sent))

model = Word2Vec(words)

In [13]:
model.wv.most_similar('free')

[('premium', 0.16782471537590027),
 ('excited', 0.16361083090305328),
 ('congratulation', 0.13955314457416534),
 ('prize', 0.13200700283050537),
 ('quick', 0.1290973722934723),
 ('password', 0.11548753827810287),
 ('unlock', 0.10651745647192001),
 ('secure', 0.09859161823987961),
 ('activity', 0.098051518201828),
 ('help', 0.09625499695539474)]

In [14]:
def avg_word2vec(doc):
    words = simple_preprocess(doc)
    vectors = [model.wv[word] for word in words if word in model.wv]
    if vectors:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(model.vector_size)


In [15]:
X = []
y = []

for i in range(len(df)):
    avg_doc = avg_word2vec(df['email'][i])
    if avg_doc is not None:
        X.append(avg_doc)
        y.append(df['label'][i])

In [16]:
X = np.array(X)

In [17]:
X_new = pd.DataFrame(X)
y_new = pd.Series(y).map({'spam': 1, 'ham': 0})

In [20]:
# X_new.isnull().sum()
y_new.isnull().sum()

0

In [21]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_new, y_new, test_size=0.2, random_state=42)

In [22]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [24]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

naive_bayes_hyper_params = {
    'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1.0]
}

nb = GaussianNB()
nb.fit(X_train, y_train)

nb_grid = GridSearchCV(estimator=nb, param_grid=naive_bayes_hyper_params, cv=5)
nb_grid.fit(X_train, y_train)

print(f'Best parameters: {nb_grid.best_params_}')
print(f'Best score: {nb_grid.best_score_}')

nb.set_params(**nb_grid.best_params_)
nb.fit(X_train, y_train)

y_pred = nb.predict(X_test)
print(f'Accuracy: {accuracy_score(y_test, y_pred)*100}%')
print(f'Confusion Matrix: \n{confusion_matrix(y_test, y_pred)}')
print(f'Classification Report: \n{classification_report(y_test, y_pred)})')



Best parameters: {'var_smoothing': 1e-09}
Best score: 0.8955665024630542
Accuracy: 100.0%
Confusion Matrix: 
[[14  0]
 [ 0 22]]
Classification Report: 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        14
           1       1.00      1.00      1.00        22

    accuracy                           1.00        36
   macro avg       1.00      1.00      1.00        36
weighted avg       1.00      1.00      1.00        36
)


In [25]:
test_email = "Congratulations you've won an iphone 13 pro max, to claim it visit our website"

In [27]:
import re

test_email = re.sub('[^a-zA-Z]', ' ', test_email)
test_email = test_email.lower()
test_email

'congratulations you ve won an iphone    pro max  to claim it visit our website'

In [28]:
wordnet = nltk.WordNetLemmatizer()

test_email = test_email.split()
test_email = [wordnet.lemmatize(word) for word in test_email if word not in stopwards]
test_email = ' '.join(test_email)
test_email

'congratulation iphone pro max claim visit website'

In [34]:
test_email = avg_word2vec(test_email)
test_email = scaler.transform([test_email])
test_email

array([[ 0.21541937, -0.21186628, -0.24938755, -0.418155  ,  0.1640353 ,
         0.66999153, -0.28568536, -0.61130994,  0.32426662,  0.54929264,
        -0.27297506,  0.58803264,  0.00731757, -0.50920855, -0.1896107 ,
        -0.00724346, -0.39533281, -0.11198318,  0.31663994,  0.5540882 ,
        -0.46189823, -0.19145693, -0.61951159,  0.4535632 , -0.45859911,
         0.02300024,  0.43679886, -0.18715128,  0.39594293,  0.06143347,
        -0.29552911,  0.23158485, -0.3876269 ,  0.74252835,  0.22399041,
        -0.21796362, -0.36277334, -0.02742018,  0.03195796,  0.17265942,
         0.09852243, -0.03330202,  0.41243823,  0.36838835, -0.06588696,
         0.03447258,  0.27999463, -0.10583765, -0.22213565, -0.50948667,
        -0.13266446,  0.30807964,  0.35693907, -0.05409474, -0.28597423,
        -0.01734135, -0.47669753,  0.08021907,  0.2088663 , -0.32771489,
         0.29138407,  0.1826612 , -0.18528875,  0.15840626,  0.16990259,
        -0.7019429 ,  0.22025098, -0.1436678 ,  0.2

In [35]:
result = nb.predict(test_email)

if result == 1:
    print('Spam')
else:
    print('Ham')

Spam
