In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import Pipeline
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
import warnings
# Suppress all Python warnings for the rest of the session.
warnings.filterwarnings("ignore")

# Fetch the NLTK resources used later by word_tokenize and the stop-word list.
nltk.download('punkt')
# NLTK 3.9+ loads the Punkt tokenizer models from 'punkt_tab' rather than
# 'punkt'; without it word_tokenize raises LookupError on a fresh install.
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     /home/ibrahimfadu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ibrahimfadu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# 1. Load Dataset
# The SMS Spam Collection is used as an example; point `url` at any
# tab-separated file with a label column followed by a text column to use a
# custom email dataset instead.
url = "https://raw.githubusercontent.com/justmarkham/pycon-2016-tutorial/master/data/sms.tsv"

# The file has no header row, so the column names are supplied explicitly.
data = pd.read_csv(
    url,
    names=['Label', 'Message'],
    header=None,
    sep='\t',
)

# Preview the first rows to check the structure
data.head()

Unnamed: 0,Label,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
data.shape

(5572, 2)

In [4]:
# Map labels to binary (Spam=1, Not Spam=0)
label_codes = {'spam': 1, 'ham': 0}

# Only encode while the column still holds the raw text labels. Mapping an
# already-encoded column would turn every value into NaN, so this guard keeps
# the cell safe to re-run.
if not pd.api.types.is_numeric_dtype(data['Label']):
    encoded = data['Label'].map(label_codes)
    # .map() yields NaN for any value missing from label_codes; fail loudly
    # here instead of handing NaN targets to the classifier later on.
    if encoded.isna().any():
        unexpected = sorted(data.loc[encoded.isna(), 'Label'].astype(str).unique())
        raise ValueError(f"Unexpected label values: {unexpected}")
    data['Label'] = encoded.astype('int64')

In [5]:
# Preprocess Text

# Translation table that deletes every character in string.punctuation.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    cached = getattr(_english_stopwords, "_cache", None)
    if cached is None:
        # stopwords.words() returns a list; a frozenset gives constant-time
        # membership tests and avoids re-reading the corpus for every token.
        cached = frozenset(stopwords.words('english'))
        _english_stopwords._cache = cached
    return cached


def preprocess_text(text):
    """Normalise a message before it is vectorised.

    Steps, in order:
      1. delete every character found in ``string.punctuation``;
      2. lower-case the text and tokenise it with NLTK's ``word_tokenize``;
      3. drop tokens that appear in NLTK's English stop-word list.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    str
        The remaining tokens joined by single spaces (an empty string when
        no token survives).
    """
    # NOTE(review): punctuation is stripped before the stop-word lookup, so
    # contractions lose their apostrophe first; apostrophe-containing entries
    # in the stop-word list presumably never match. Confirm this is intended.
    without_punctuation = text.translate(_PUNCTUATION_TABLE)
    tokens = word_tokenize(without_punctuation.lower())
    stop_words = _english_stopwords()
    return " ".join(token for token in tokens if token not in stop_words)



In [6]:
# Clean every message element-wise; the 'Message' column is overwritten with
# the preprocessed text.
data['Message'] = data['Message'].map(preprocess_text)

In [7]:
# Split Data into Training and Testing Sets
# X holds the message text (features), y the label column (target).
X, y = data['Message'], data['Label']

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [8]:
# Shapes of the training and test feature Series produced by the split.
X_train.shape,X_test.shape

((4457,), (1115,))

In [9]:
# Two-stage text classifier: bag-of-words counts feeding a multinomial Naive Bayes model
pipeline_steps = [
    ('vectorizer', CountVectorizer()),   # text -> token-count matrix
    ('classifier', MultinomialNB()),     # Naive Bayes over those counts
]
model = Pipeline(steps=pipeline_steps)

# Fit both stages on the training messages and their labels
model.fit(X_train, y_train)

In [10]:
# Evaluate the Model
# Predict labels for the held-out test messages.
y_pred = model.predict(X_test)

# Per-class precision / recall / F1, printed under a heading line.
report = classification_report(y_test, y_pred)
print("\nClassification Report:", report, sep="\n")


Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       966
           1       0.97      0.91      0.94       149

    accuracy                           0.98      1115
   macro avg       0.98      0.95      0.97      1115
weighted avg       0.98      0.98      0.98      1115



In [11]:
# Overall fraction of test messages classified correctly, shown to two decimals.
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.98


In [12]:
# Test with Custom Email
def classify_email(text):
    """Classify a single raw message with the trained pipeline.

    The text is cleaned with ``preprocess_text`` and then passed to
    ``model.predict`` as a one-element batch.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    str
        "Spam" when the predicted label equals 1, otherwise "Not Spam".
    """
    cleaned = preprocess_text(text)
    predicted_label = model.predict([cleaned])[0]
    if predicted_label == 1:
        return "Spam"
    return "Not Spam"

In [13]:
# Run one hand-written message through the classifier and print the verdict
example_email = "Congratulations! You've won a $1,000 gift card. Click here to claim now!"
print(f"\nEmail: {example_email}")
verdict = classify_email(example_email)
print(f"Classification: {verdict}")



Email: Congratulations! You've won a $1,000 gift card. Click here to claim now!
Classification: Spam
