In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

In [2]:
# Step 1: Load the Dataset
# The CSV must provide an 'email_text' column (the raw message body) and a
# 'label' column; both are read by the preprocessing and split cells below.
# The 'label' column contains 'spam' or 'ham' (non-spam) labels.
data = pd.read_csv('spam_dataset.csv')  # Replace with your dataset path

# Fail fast with a readable message if the file does not have the expected
# schema, instead of surfacing a bare KeyError in a later cell.
missing_columns = {'email_text', 'label'} - set(data.columns)
if missing_columns:
    raise KeyError(
        f"spam_dataset.csv is missing required column(s): {sorted(missing_columns)}"
    )

In [4]:
# Step 2: Preprocess the Data (normalise to lowercase text)
# Missing values are replaced with an empty string and every entry is coerced
# to `str` before lowercasing: `.str.lower()` on its own leaves NaN / non-string
# entries as NaN, and TfidfVectorizer refuses NaN documents when it is fitted.
# For a column that already holds only strings the result is unchanged.
data['email_text'] = data['email_text'].fillna('').astype(str).str.lower()

In [6]:
# Step 3: Split the Data into Training and Testing Sets
# X holds the email text (features), y holds the spam/ham labels.
X, y = data['email_text'], data['label']

# Hold out 20% of the rows for testing. A fixed random_state makes the split
# repeatable, and stratify=y asks for the label proportions to be preserved
# in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [7]:
# Step 4: Convert Text Data to Numerical Format (TF-IDF)
# English stop words are removed and max_df=0.85 is passed as the upper
# document-frequency bound for vocabulary terms.
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    max_df=0.85,
)

# Fit on the training split only, then reuse the fitted vectorizer for the
# test split so that nothing is learned from the held-out emails.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [8]:
# Step 5: Train Naive Bayes Model
# A MultinomialNB with default settings, fitted on the TF-IDF training matrix.
nb_model = MultinomialNB()
nb_model.fit(X=X_train_tfidf, y=y_train)

In [9]:
# Step 6: Train SVM Model
# Support vector classifier with a linear kernel and C=1.0, fitted on the
# same TF-IDF training matrix as the Naive Bayes model.
svm_model = SVC(
    kernel='linear',
    C=1.0,
)
svm_model.fit(X=X_train_tfidf, y=y_train)

In [10]:
# Step 7: Evaluate Naive Bayes Model
# Predict on the held-out TF-IDF matrix, then report overall accuracy and the
# per-class precision / recall / f1 table.
y_pred_nb = nb_model.predict(X_test_tfidf)

nb_accuracy = accuracy_score(y_test, y_pred_nb)
nb_report = classification_report(y_test, y_pred_nb)

print("Naive Bayes - Accuracy:", nb_accuracy)
print("\nNaive Bayes - Classification Report:\n", nb_report)

Naive Bayes - Accuracy: 0.8936170212765957

Naive Bayes - Classification Report:
               precision    recall  f1-score   support

         ham       1.00      0.35      0.52       100
        spam       0.89      1.00      0.94       511

    accuracy                           0.89       611
   macro avg       0.94      0.68      0.73       611
weighted avg       0.91      0.89      0.87       611



In [11]:
# Step 8: Evaluate SVM Model
# Same procedure as for Naive Bayes: predict on the held-out TF-IDF matrix,
# then print accuracy followed by the per-class report.
y_pred_svm = svm_model.predict(X_test_tfidf)

svm_accuracy = accuracy_score(y_test, y_pred_svm)
svm_report = classification_report(y_test, y_pred_svm)

print("\nSVM - Accuracy:", svm_accuracy)
print("\nSVM - Classification Report:\n", svm_report)


SVM - Accuracy: 0.9950900163666121

SVM - Classification Report:
               precision    recall  f1-score   support

         ham       0.98      0.99      0.99       100
        spam       1.00      1.00      1.00       511

    accuracy                           1.00       611
   macro avg       0.99      0.99      0.99       611
weighted avg       1.00      1.00      1.00       611



In [12]:
# Optional: Make Predictions with new data
# Two hand-written messages to try both trained models on.
sample_text = [
    "Win a free iPhone! Click here now!",
    "Please find the report attached.",
]

# The samples must go through the vectorizer that was fitted on the training
# split (transform only, no re-fitting) so the feature columns line up with
# what the models were trained on.
sample_tfidf = tfidf_vectorizer.transform(sample_text)

sample_pred_nb = nb_model.predict(sample_tfidf)
sample_pred_svm = svm_model.predict(sample_tfidf)

In [13]:
# Show each sample message next to the Naive Bayes prediction.
# NOTE: the value printed after "Spam:" is the predicted class label
# ('spam' or 'ham'), not a yes/no flag.
print("\nSample Predictions (Naive Bayes):")
for message, predicted_label in zip(sample_text, sample_pred_nb):
    print(f"Text: {message} -> Spam: {predicted_label}")



Sample Predictions (Naive Bayes):
Text: Win a free iPhone! Click here now! -> Spam: spam
Text: Please find the report attached. -> Spam: spam


In [14]:
# Show each sample message next to the SVM prediction.
# NOTE: the value printed after "Spam:" is the predicted class label
# ('spam' or 'ham'), not a yes/no flag.
print("\nSample Predictions (SVM):")
for message, predicted_label in zip(sample_text, sample_pred_svm):
    print(f"Text: {message} -> Spam: {predicted_label}")


Sample Predictions (SVM):
Text: Win a free iPhone! Click here now! -> Spam: ham
Text: Please find the report attached. -> Spam: ham
