In [None]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import joblib


In [5]:
# Load dataset from the training CSV.
# NOTE(review): this is an absolute, machine-specific Windows path, so the cell
# only runs on a machine where that exact file exists.
DATA_PATH = r"D:\projects\SpamClassifier\training\emails.csv"
df = pd.read_csv(DATA_PATH)

# Clean text
def clean_text(text):
    """Normalise one raw message before vectorisation.

    Every period, comma and forward slash is deleted (not replaced by a
    space, so ``"a.b"`` becomes ``"ab"``) and the remainder is lower-cased.

    Args:
        text: The raw value from the text column.

    Returns:
        The cleaned, lower-cased string, or ``""`` when ``text`` is not a
        ``str`` (for example ``None`` or a float).
    """
    # Guard clause: anything that is not a string collapses to empty text.
    if not isinstance(text, str):
        return ""
    without_punct = re.sub(r"[.,/]", "", text)
    return without_punct.lower()

# Clean every message in place. clean_text already returns a str for any
# input, so the trailing cast is only a safeguard for a uniform string column.
df["text"] = df["text"].apply(clean_text).astype(str)

# Model inputs (cleaned text) and targets (label column).
X, Y = df["text"], df["label"]

In [None]:
# Split data: 30% of the rows are held out for testing; the fixed
# random_state makes the shuffle, and therefore the split, repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=42,
)

# TF-IDF Vectorization: the vectorizer is fitted on the training texts only,
# and that same fitted vectorizer is then applied to the test texts.
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Save vectorizer. The path is relative, so the file is written to the
# current working directory. Kept as the last expression so the cell still
# echoes the list of written files.
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')

['tfidf_vectorizer.pkl']

In [7]:
# Train Logistic Regression model on the training TF-IDF matrix (solver capped
# at 1000 iterations), then predict labels for the held-out test matrix.
# fit() returns the estimator itself, so construction and fitting are chained.
LRmodel = LogisticRegression(max_iter=1000).fit(X_train_tfidf, Y_train)
Y_pred = LRmodel.predict(X_test_tfidf)

In [8]:
# Evaluate: compute both metrics on the held-out predictions first, then print.
accuracy = accuracy_score(Y_test, Y_pred)
report = classification_report(Y_test, Y_pred)

print("Logistic Regression Results:")
print(f"Accuracy: {accuracy * 100:.2f}%")
print(report)

# Save model. The path is relative, so the file is written to the current
# working directory. Kept as the last expression so the cell still echoes
# the list of written files.
joblib.dump(LRmodel, 'spam_classifierLR.pkl')


Logistic Regression Results:
Accuracy: 98.46%
              precision    recall  f1-score   support

           0       0.99      0.98      0.98     11818
           1       0.98      0.99      0.99     13217

    accuracy                           0.98     25035
   macro avg       0.99      0.98      0.98     25035
weighted avg       0.98      0.98      0.98     25035



['spam_classifierLR.pkl']