In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
import re
import pickle

In [3]:
# Read the raw CSV; latin-1 is requested explicitly instead of pandas' default UTF-8 decoding
file_path = 'spam.csv'
data = pd.read_csv(filepath_or_buffer=file_path, encoding='latin-1')

In [5]:
# Keep only the two columns used below and give them descriptive names
data_cleaned = (
    data.loc[:, ['v1', 'v2']]
    .rename(columns={'v1': 'label', 'v2': 'message'})
)

In [7]:
# Encode labels: 'ham' -> 0, 'spam' -> 1
# Map from the untouched raw column (data['v1']) rather than from
# data_cleaned['label'] itself: Series.map turns values that are missing from
# the dictionary into NaN, so a self-referential version of this cell would
# replace the already-encoded 0/1 labels with NaN when it is run a second time.
data_cleaned['label'] = data['v1'].map({'ham': 0, 'spam': 1})

In [9]:
# Text preprocessing function
def preprocess_text(text):
    """Normalize a raw message into lowercase, space-separated word tokens.

    Steps, in order:
      1. lowercase the text;
      2. delete every run of digits;
      3. replace every run of non-word characters (punctuation, symbols,
         whitespace) with a single space;
      4. strip leading and trailing spaces.

    Underscores count as word characters for the regex and are therefore kept.

    Args:
        text: The message to clean. ``None`` and float NaN (e.g. a missing
            cell in a DataFrame) are treated as an empty message; any other
            non-string value is converted with ``str()`` first.

    Returns:
        str: The cleaned text; ``""`` when the input contains nothing but
        digits, punctuation or whitespace.
    """
    # NaN is the only float that is not equal to itself
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\d+', '', text)  # Remove numbers
    # Whitespace is also matched by \W, so this single pass both drops
    # punctuation and collapses every run of separators to one space;
    # no separate whitespace-collapsing pass is needed afterwards.
    text = re.sub(r'\W+', ' ', text)
    return text.strip()

In [11]:
# Clean every message by running it through preprocess_text
data_cleaned['message'] = data_cleaned['message'].map(preprocess_text)

In [13]:
# Hold out 20% of the messages for evaluation; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    data_cleaned['message'],
    data_cleaned['label'],
    random_state=42,
    test_size=0.2,
)

In [15]:
# Build TF-IDF features: English stop words are dropped and the vocabulary is
# limited to 3000 features. The vectorizer is fitted on the training messages
# only and then reused unchanged for the test messages.
tfidf_vectorizer = TfidfVectorizer(max_features=3000, stop_words='english')
X_train_tfidf = tfidf_vectorizer.fit_transform(raw_documents=X_train)
X_test_tfidf = tfidf_vectorizer.transform(raw_documents=X_test)

In [17]:
# Fit a multinomial Naive Bayes classifier (default settings) on the training features
nb_model = MultinomialNB()
nb_model.fit(X=X_train_tfidf, y=y_train)

In [19]:
# Predict a label for every held-out message
y_pred = nb_model.predict(X=X_test_tfidf)

In [21]:
# Report per-class precision/recall/F1 and the overall accuracy on the held-out set
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")

Classification Report:
              precision    recall  f1-score   support

           0       0.97      1.00      0.99       965
           1       1.00      0.83      0.91       150

    accuracy                           0.98      1115
   macro avg       0.99      0.91      0.95      1115
weighted avg       0.98      0.98      0.98      1115

Accuracy: 0.98


In [23]:
# Persist the trained classifier to disk so it can be reloaded without retraining.
# Pickle files should only ever be loaded back from a trusted location.
with open('spam_detector_model.pkl', mode='wb') as model_file:
    pickle.dump(nb_model, model_file)


In [25]:
# Persist the fitted TF-IDF vectorizer as well: new messages must be transformed
# with the same fitted vectorizer before they are passed to the classifier.
with open('tfidf_vectorizer.pkl', mode='wb') as vectorizer_file:
    pickle.dump(tfidf_vectorizer, vectorizer_file)

In [27]:
# Function to predict if a new email is spam or not
def predict_spam(message, model=None, vectorizer=None):
    """Classify a single message as "Spam" or "Ham".

    The message is cleaned with ``preprocess_text``, converted to features
    with the vectorizer, and passed to the classifier.

    Args:
        message: Raw text of the message to classify.
        model: Fitted classifier exposing ``predict``. Defaults to the
            notebook-level ``nb_model``.
        vectorizer: Fitted vectorizer exposing ``transform``. Defaults to the
            notebook-level ``tfidf_vectorizer``.

    Returns:
        str: ``"Spam"`` when the predicted label equals 1, otherwise ``"Ham"``.
    """
    # Fall back to the notebook globals so existing one-argument calls keep
    # working, while allowing e.g. objects reloaded from disk to be passed in.
    if model is None:
        model = nb_model
    if vectorizer is None:
        vectorizer = tfidf_vectorizer

    processed_message = preprocess_text(message)
    # transform() takes a collection of documents, so wrap the single message in a list
    message_tfidf = vectorizer.transform([processed_message])
    prediction = model.predict(message_tfidf)
    return "Spam" if prediction[0] == 1 else "Ham"

In [29]:
# Smoke-test predict_spam on a hand-written promotional message.
# NOTE(review): the recorded output for this cell is "Ham"; if this message is
# meant to be a spam example, that is a false negative worth investigating.
sample_message = (
    "Congratulations! You've won a free ticket. Call now!"
)
print(
    f"Message: '{sample_message}' => Prediction: {predict_spam(sample_message)}"
)

Message: 'Congratulations! You've won a free ticket. Call now!' => Prediction: Ham


In [33]:
# Smoke-test predict_spam on a second hand-written promotional message
sample_message = (
    "Congratulations! You've won a $1,000 gift card. "
    "Click here to claim your prize now!"
)
print(
    f"Message: '{sample_message}' => Prediction: {predict_spam(sample_message)}"
)

Message: 'Congratulations! You've won a $1,000 gift card. Click here to claim your prize now!' => Prediction: Spam
