IMPORTING LIBRARIES

In [126]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

IMPORTING DATASET

In [127]:
# Read the raw corpus from the CSV file that sits next to the notebook.
dataset = pd.read_csv("spam_ham_dataset.csv")

# Named column views; these are what the following cells actually consume.
emails = dataset["text"]
label = dataset["label"]

# Positional NumPy views: everything except the second column, and the second
# column on its own (presumably the label column - TODO confirm against the CSV).
# NOTE(review): `y` is re-assigned by the label-encoding cell and `X` is not
# referenced by any visible cell; both are kept so later cells cannot break.
X = dataset.drop(columns=dataset.columns[1]).values
y = dataset.iloc[:, 1].values



ENCODING THE DEPENDENT VARIABLE (LABEL)

In [128]:
from sklearn.preprocessing import LabelEncoder

# Learn the set of distinct label strings, then map every label to its
# integer class code. `le` stays in scope so later cells can translate
# between label strings and codes in both directions.
le = LabelEncoder()
le.fit(label)
y = le.transform(label)

SPLIT THE DATASET INTO TRAINING SET AND TEST SET

In [129]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the emails for evaluation. The fixed random_state makes the
# shuffle (and therefore the split) repeatable from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    emails,
    y,
    test_size=0.20,
    random_state=0,
)

CONVERT TEXT TO NUMBERS

In [130]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over unigrams and bigrams. English stop words are removed, and any
# term present in more than 70% of the training documents is ignored.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    max_df=0.7,
    stop_words='english',
)

# The vocabulary and IDF weights are learned from the training emails only;
# the test emails are projected onto that already-fitted vocabulary.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

TRAINING THE LOGISTIC REGRESSION CLASSIFICATION MODEL

In [131]:
from sklearn.linear_model import LogisticRegression

# Logistic-regression classifier trained on the TF-IDF features of the
# training emails. max_iter=1000 gives the solver an explicit iteration budget.
classifier = LogisticRegression(max_iter=1000)

# Kept as the cell's final expression so the fitted estimator is displayed.
classifier.fit(X=X_train_tfidf, y=y_train)

PREDICTING THE TEST SET RESULTS

In [132]:
# Predict a class code for every held-out email.
y_pred = classifier.predict(X_test_tfidf)

# Two-column view: predicted code on the left, true code on the right.
print(np.column_stack((y_pred, y_test)))

[[0 0]
 [0 0]
 [0 0]
 ...
 [0 0]
 [0 1]
 [0 0]]


MAKING THE CONFUSION MATRIX

In [133]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# Compute every metric exactly once and reuse the stored value. The original
# cell evaluated accuracy_score() as a bare statement whose result was thrown
# away, and recomputed the confusion matrix and the accuracy when printing.
cm = confusion_matrix(y_test, y_pred)  # rows: true class, columns: predicted class
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Raw matrix first, then the labelled summary (same printed text as before).
print(cm)
print("CONFUSION MATRIX: \n", cm)
print("\nACCURACY SCORE: \n", accuracy)
print("\nCLASSIFICATION REPORT: \n", report)

[[725   7]
 [ 14 289]]
CONFUSION MATRIX: 
 [[725   7]
 [ 14 289]]

ACCURACY SCORE: 
 0.9797101449275363

CLASSIFICATION REPORT: 
               precision    recall  f1-score   support

           0       0.98      0.99      0.99       732
           1       0.98      0.95      0.96       303

    accuracy                           0.98      1035
   macro avg       0.98      0.97      0.98      1035
weighted avg       0.98      0.98      0.98      1035



PREDICTING HARDER CASES

In [134]:
# Hand-written probe emails. The first five are labelled spam and the last
# five ham (see new_y below).
new_x_text = [
    "Please review the attached invoice and confirm payment details. Log in to our secure portal here: http://bit.ly/3XfFaP",
    "Hello, Your monthly report is attached. Also, you’ve been selected to receive a $50 Amazon gift card.",
    "We detected unusual login activity on your account. Reset your password immediately.",
    "Remote position available with flexible hours. Apply now.",
    "You are eligible for a tax refund of $1,245.34. Complete the attached form.",

    "Thinking about getting Dad that $200 fishing rod for his birthday.",
    "Can we push tomorrow’s meeting to 4 PM? Also, I’ve attached the sales report you asked for.",
    "Please find attached the updated remote work guidelines, effective next month.",
    "Your hotel booking for Dec 15–18 is confirmed. Please review the details.",
    "You recently requested a password change for your company account."
]

# Expected label for each probe, in the same order as new_x_text.
new_y = ["spam"] * 5 + ["ham"] * 5

# Reuse the already-fitted encoder, vectorizer and classifier; nothing is refit.
new_y_encoded = le.transform(new_y)
new_x_tfidf = vectorizer.transform(new_x_text)
new_y_pred = classifier.predict(new_x_tfidf)
new_y_proba = classifier.predict_proba(new_x_tfidf)

# Decode all class codes back to label strings in one call each, and take the
# largest class probability of every row as the confidence (in percent).
predicted_labels = le.inverse_transform(new_y_pred)
actual_labels = le.inverse_transform(new_y_encoded)
confidences = new_y_proba.max(axis=1) * 100

print("\npredictions on hard test cases:")
for text, predicted_label, actual_label, confidence in zip(
    new_x_text, predicted_labels, actual_labels, confidences
):
    # Only the first 60 characters of each email are shown.
    print(f"text: {text[:60]}...")
    print(f"predicted: {predicted_label} ({confidence:.2f}% confidence), actual: {actual_label}\n")


predictions on hard test cases:
text: Please review the attached invoice and confirm payment detai...
predicted: spam (53.29% confidence), actual: spam

text: Hello, Your monthly report is attached. Also, you’ve been se...
predicted: ham (56.49% confidence), actual: spam

text: We detected unusual login activity on your account. Reset yo...
predicted: ham (54.53% confidence), actual: spam

text: Remote position available with flexible hours. Apply now....
predicted: ham (50.98% confidence), actual: spam

text: You are eligible for a tax refund of $1,245.34. Complete the...
predicted: ham (61.68% confidence), actual: spam

text: Thinking about getting Dad that $200 fishing rod for his bir...
predicted: ham (51.84% confidence), actual: ham

text: Can we push tomorrow’s meeting to 4 PM? Also, I’ve attached ...
predicted: ham (79.79% confidence), actual: ham

text: Please find attached the updated remote work guidelines, eff...
predicted: ham (79.88% confidence), actual: ham

text: Your h