<a href="https://colab.research.google.com/github/k-dinakaran/spam-email-detection-using-naive-bayes/blob/main/spam_email_detection_using_naive_bayes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import kagglehub
import os
import pandas as pd
import numpy as np
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Step 1: Download the dataset from Kaggle
path = kagglehub.dataset_download("mohinurabdurahimova/maildataset")
print("Dataset downloaded at:", path)

# Step 2: Load the dataset (Update filename if needed)
dataset_path = os.path.join(path, "mail_data.csv")  # Adjust filename if necessary
df = pd.read_csv(dataset_path)

# Debugging: Print dataset structure
print("\nColumn Names in Dataset:", df.columns)
print("\nDataset Sample:\n", df.head())
print("\nDataset Info:\n")
print(df.info())

# Step 3: Ensure correct column names
if 'Category' not in df.columns or 'Message' not in df.columns:
    raise ValueError("Error: Expected columns 'Category' and 'Message' not found in the dataset.")

# Step 4: Handle Missing Values and Label Mapping
df = df.dropna(subset=['Message', 'Category'])  # Remove rows with missing values

# Convert 'Category' to binary labels
df['Category'] = df['Category'].map({'spam': 1, 'ham': 0})

# Drop rows where mapping failed (i.e., still NaN)
df = df.dropna(subset=['Category'])

# Convert 'Category' to integer type
df['Category'] = df['Category'].astype(int)

# Ensure dataset is not empty after preprocessing
if df.empty:
    raise ValueError("Error: Dataset is empty after preprocessing. Check if the file is loaded correctly.")

# Step 5: Preprocess the text data
def clean_text(text):
    text = str(text).lower()  # Convert to lowercase
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    return text

df['Message'] = df['Message'].apply(clean_text)

# Step 6: Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['Message'], df['Category'], test_size=0.2, random_state=42)

# Step 7: Create a text classification pipeline (Vectorization + Naïve Bayes)
model = Pipeline([
    ('vectorizer', CountVectorizer()),  # Convert text to word counts
    ('tfidf', TfidfTransformer()),  # Apply TF-IDF transformation
    ('classifier', MultinomialNB())  # Train Naïve Bayes model
])

# Step 8: Train the model
model.fit(X_train, y_train)

# Step 9: Evaluate the model
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

print("\nModel Evaluation:")
print("-----------------------")
print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Precision: {precision * 100:.2f}%")
print(f"Recall: {recall * 100:.2f}%")

# Step 10: User Input for Email Classification
while True:
    print("\nEnter email details (or type 'exit' to quit):")
    subject = input("Subject: ").strip()
    if subject.lower() == 'exit':
        break
    body = input("Body: ").strip()
    if body.lower() == 'exit':
        break

    # Combine subject and body for classification
    email_text = f"{subject} {body}"
    email_text = clean_text(email_text)  # Apply text preprocessing

    # Predict the class
    prediction = model.predict([email_text])[0]
    print("\nPrediction: ", "🚨 Spam" if prediction == 1 else "✅ Not Spam")


Dataset downloaded at: /root/.cache/kagglehub/datasets/mohinurabdurahimova/maildataset/versions/995

Column Names in Dataset: Index(['Category', 'Message'], dtype='object')

Dataset Sample:
   Category                                            Message
0      ham  Go until jurong point, crazy.. Available only ...
1      ham                      Ok lar... Joking wif u oni...
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...
3      ham  U dun say so early hor... U c already then say...
4      ham  Nah I don't think he goes to usf, he lives aro...

Dataset Info:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB
None

Model Evaluation:
-----------------------
Accuracy: 95.61%
Precision: 100.00%
Recall: 67.11%

Enter email details (o