In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import matplotlib.pyplot as plt

In [2]:
data = pd.read_csv("/content/spam.csv",encoding="latin")

X = data["v2"]  # Extract text messages
y = data["v1"]  # Extract labels (spam, non-spam)

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [5]:
vectorizer = TfidfVectorizer(max_features=2000)  # Adjust max_features as needed
X_train_features = vectorizer.fit_transform(X_train)
X_test_features = vectorizer.transform(X_test)


In [6]:
models = {
    "Multinomial Naive Bayes": MultinomialNB(),
    "Logistic Regression": LogisticRegression(solver="lbfgs"),
    "Support Vector Machine": SVC(kernel="linear"),
}

for name, model in models.items():
    model.fit(X_train_features, y_train)
    y_pred = model.predict(X_test_features)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred,pos_label="ham")
    recall = recall_score(y_test, y_pred,pos_label="ham")
    f1 = f1_score(y_test, y_pred,pos_label="ham")

    print(f"\nModel: {name}")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-score: {f1:.4f}")



Model: Multinomial Naive Bayes
Accuracy: 0.9755
Precision: 0.9732
Recall: 0.9993
F1-score: 0.9861

Model: Logistic Regression
Accuracy: 0.9689
Precision: 0.9661
Recall: 0.9993
F1-score: 0.9824

Model: Support Vector Machine
Accuracy: 0.9815
Precision: 0.9811
Recall: 0.9979
F1-score: 0.9894


In [7]:
# Example parameter tuning for LogisticRegression
from sklearn.model_selection import GridSearchCV
lr = LogisticRegression(solver="lbfgs")
param_grid = {"C": [0.001, 0.01, 0.1, 1, 10]}
grid_search = GridSearchCV(lr, param_grid, cv=5, scoring="accuracy")
grid_search.fit(X_train_features, y_train)
best_model = grid_search.best_estimator_


In [8]:
import joblib

# Example saving the best LogisticRegression model as "spam_classifier.pkl"
joblib.dump(best_model, "spam_classifier.pkl")
joblib.dump(vectorizer, "vectorizer.pkl")


['vectorizer.pkl']

In [9]:
# Load the saved model
model = joblib.load("spam_classifier.pkl")
vectorizer = joblib.load("vectorizer.pkl")  # Load the trained vectorizer

import re

def preprocess_text(text):
    """Preprocesses text for better model performance."""
    text = text.lower()  # Convert to lowercase
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)  # Remove special characters
    text = re.sub(r"\s+", " ", text)  # Remove extra whitespace
    return text

def predict_spam_percentage(text):
    """Predicts the spam percentage for a given text input."""
    processed_text = preprocess_text(text)  # Apply any necessary preprocessing
    text_features = vectorizer.transform([processed_text])
    prediction = model.predict_proba(text_features)[0][1]  # Get probability of spam
    spam_percentage = prediction * 100
    return spam_percentage

# Example usage
input_text = "Congratulations! You've won a free prize! Click here to claim: bit.ly/freeprize"
spam_percentage = predict_spam_percentage(input_text)
print("Spam Percentage:", spam_percentage)


Spam Percentage: 96.70311839515882
