<a href="https://colab.research.google.com/github/k-dinakaran/spam-email-detection-using-naive-bayes/blob/main/spam_email_detection_using_naive_bayes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import kagglehub
import os
import pandas as pd
import numpy as np
import re
import string

In [2]:
# Load dataset: download the Kaggle mail dataset and report where kagglehub placed it.
path = kagglehub.dataset_download("mohinurabdurahimova/maildataset")
print("Dataset downloaded at:", path)

# The CSV is expected to be named "mail_data.csv" inside the download location;
# every later cell works on the resulting DataFrame `df`.
dataset_path = os.path.join(path, "mail_data.csv")
df = pd.read_csv(dataset_path)


Dataset downloaded at: /root/.cache/kagglehub/datasets/mohinurabdurahimova/maildataset/versions/995


In [3]:
# Inspect the dataset structure (column names, first rows, dtypes/null counts)
# before any preprocessing is applied.
print("\nColumn Names in Dataset:", df.columns)
print("\nDataset Sample:\n", df.head())
print("\nDataset Info:\n")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- doing so appends a stray "None" line.
df.info()


Column Names in Dataset: Index(['Category', 'Message'], dtype='object')

Dataset Sample:
   Category                                            Message
0      ham  Go until jurong point, crazy.. Available only ...
1      ham                      Ok lar... Joking wif u oni...
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...
3      ham  U dun say so early hor... U c already then say...
4      ham  Nah I don't think he goes to usf, he lives aro...

Dataset Info:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB
None


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score


In [5]:
# Validate the schema, drop incomplete rows and encode the labels (spam -> 1, ham -> 0).
if 'Category' not in df.columns or 'Message' not in df.columns:
    raise ValueError("Error: Expected columns 'Category' and 'Message' not found in the dataset.")

# .copy() yields an independent frame, so the column assignment below never
# writes into a slice of the originally loaded data.
df = df.dropna(subset=['Message', 'Category']).copy()

# Only map while the labels are still textual. Without this guard, re-running
# the cell would push the already-encoded integers through the string mapping,
# turn every label into NaN, drop all rows and hit the "empty dataset" error.
if not pd.api.types.is_integer_dtype(df['Category']):
    df['Category'] = df['Category'].map({'spam': 1, 'ham': 0})

# Rows whose label was neither 'spam' nor 'ham' became NaN above; discard them
# and store the remaining labels as plain integers.
df = df.dropna(subset=['Category']).astype({'Category': int})

if df.empty:
    raise ValueError("Error: Dataset is empty after preprocessing. Check if the file is loaded correctly.")

In [6]:
# Preprocess the text data
def clean_text(text):
    """Normalise a raw message for vectorisation.

    The input is converted with ``str()`` and lower-cased, then every run of
    digits is deleted, every ASCII punctuation character is deleted (no space
    is inserted, so "don't" becomes "dont"), and finally whitespace runs are
    collapsed to a single space with leading/trailing whitespace removed.

    Args:
        text: Any object; non-strings are stringified first.

    Returns:
        The cleaned string (possibly empty).
    """
    punctuation_table = str.maketrans('', '', string.punctuation)
    stripped = re.sub(r'\d+', '', str(text).lower()).translate(punctuation_table)
    return re.sub(r'\s+', ' ', stripped).strip()

# Overwrite each message with its cleaned form; clean_text is called once per row.
df['Message'] = df['Message'].apply(clean_text)

In [7]:
# Hold out 20% of the messages for evaluation; the fixed random_state keeps
# the split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    df['Message'],
    df['Category'],
    test_size=0.2,
    random_state=42,
)

In [8]:
# Text-classification pipeline: raw text -> word counts -> TF-IDF weights -> Naive Bayes.
model = Pipeline(
    steps=[
        ('vectorizer', CountVectorizer()),   # tokenise and count words
        ('tfidf', TfidfTransformer()),       # re-weight the counts with TF-IDF
        ('classifier', MultinomialNB()),     # Multinomial Naive Bayes classifier
    ]
)


In [9]:
# Train the model: fit the whole pipeline on the training messages and their labels.
model.fit(X_train, y_train)


In [10]:
# Evaluate the model on the held-out messages.
y_pred = model.predict(X_test)

# Each scorer takes (true labels, predicted labels); precision and recall are
# computed with their default settings.
accuracy, precision, recall = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score)
)

# One print call with a newline separator emits the same report line by line.
print(
    "\nModel Evaluation:",
    "-----------------------",
    f"Accuracy: {accuracy * 100:.2f}%",
    f"Precision: {precision * 100:.2f}%",
    f"Recall: {recall * 100:.2f}%",
    sep="\n",
)


Model Evaluation:
-----------------------
Accuracy: 95.61%
Precision: 100.00%
Recall: 67.11%


In [11]:
# Interactive email classification.
# NOTE: this cell blocks on input(), so "Run All" pauses here until 'exit' is typed.
while True:
    print("\nEnter email details (or type 'exit' to quit):")
    subject = input("Subject: ").strip()
    if subject.lower() == 'exit':
        break
    body = input("Body: ").strip()
    if body.lower() == 'exit':
        break

    # Combine subject and body, then apply the same preprocessing used for training.
    email_text = clean_text(f"{subject} {body}")

    # Input made only of blanks, digits or punctuation is empty after cleaning;
    # a prediction on it would carry no information, so ask again instead.
    if not email_text:
        print("\nNo classifiable text entered; please try again.")
        continue

    # Predict the class (1 = spam, 0 = not spam) and report it.
    prediction = model.predict([email_text])[0]
    print("\nPrediction: ", "🚨 Spam" if prediction == 1 else "✅ Not Spam")


Enter email details (or type 'exit' to quit):
Subject: Win a Free Vacation Now!
Body: Congratulations! You have been selected to receive a free trip to the Bahamas. Click the link to claim your prize.

Prediction:  🚨 Spam

Enter email details (or type 'exit' to quit):
Subject: Meeting Agenda for Monday
Body: Dear team, please find attached the agenda for our Monday meeting. Let me know if you have any questions.

Prediction:  ✅ Not Spam

Enter email details (or type 'exit' to quit):
Subject: exit
