In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

In [6]:

# Step 1: Load and preprocess the data
# The CSV is decoded as latin-1, the three "Unnamed" columns are discarded,
# v1/v2 become label/text, and the label strings are encoded ham -> 0, spam -> 1.
data = (
    pd.read_csv("spam.csv", encoding="latin-1")
    .drop(columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"])
    .rename(columns={"v1": "label", "v2": "text"})
)
data["label"] = data["label"].map({"ham": 0, "spam": 1})

# Step 2: Split the data into training and testing sets
# 20% of the rows are held out for testing; random_state is fixed at 42.
X_train, X_test, y_train, y_test = train_test_split(
    data["text"],
    data["label"],
    test_size=0.2,
    random_state=42,
)

# Step 3: Feature extraction and model training
# The vectorizer is fitted on the training text only and then reused,
# unchanged, to transform the test text.
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# All three classifiers use default hyper-parameters and the same training
# matrix; fit() returns the estimator, so construction and training are chained.
nb_classifier = MultinomialNB().fit(X_train_tfidf, y_train)  # Naive Bayes
lr_classifier = LogisticRegression().fit(X_train_tfidf, y_train)  # Logistic Regression
svm_classifier = SVC().fit(X_train_tfidf, y_train)  # Support Vector Machine

# Step 4: Model evaluation
def evaluate_model(model, X_test, y_test):
    """Score a fitted classifier against held-out data.

    Parameters
    ----------
    model
        Object exposing ``predict``; it is called once with ``X_test``.
    X_test
        Features handed to ``model.predict``.
    y_test
        True labels the predictions are compared with.

    Returns
    -------
    tuple
        ``(accuracy, report)`` where ``accuracy`` is the result of
        ``accuracy_score`` and ``report`` is the result of
        ``classification_report``, both computed from the same predictions.
    """
    predictions = model.predict(X_test)
    return (
        accuracy_score(y_test, predictions),
        classification_report(y_test, predictions),
    )

# Evaluate the three fitted classifiers, in order, on the same TF-IDF test
# matrix and bind each (accuracy, report) pair to its own names.
(nb_accuracy, nb_report), (lr_accuracy, lr_report), (svm_accuracy, svm_report) = (
    evaluate_model(clf, X_test_tfidf, y_test)
    for clf in (nb_classifier, lr_classifier, svm_classifier)
)

# Print each model's accuracy line followed by its classification report.
for heading, accuracy, report in (
    ("Naive Bayes Accuracy:", nb_accuracy, nb_report),
    ("Logistic Regression Accuracy:", lr_accuracy, lr_report),
    ("Support Vector Machine Accuracy:", svm_accuracy, svm_report),
):
    print(heading, accuracy)
    print(report)


Naive Bayes Accuracy: 0.9623318385650225
              precision    recall  f1-score   support

           0       0.96      1.00      0.98       965
           1       1.00      0.72      0.84       150

    accuracy                           0.96      1115
   macro avg       0.98      0.86      0.91      1115
weighted avg       0.96      0.96      0.96      1115

Logistic Regression Accuracy: 0.9659192825112107
              precision    recall  f1-score   support

           0       0.96      1.00      0.98       965
           1       0.99      0.75      0.86       150

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.92      1115
weighted avg       0.97      0.97      0.96      1115

Support Vector Machine Accuracy: 0.9820627802690582
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       965
           1       1.00      0.87      0.93       150

    accuracy                           0