In [1]:
import pandas as pd
import re

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC


In [2]:
data = pd.read_csv("spam.csv", encoding='latin-1')
data = data[['v1', 'v2']]
data.columns = ['label', 'message']

In [3]:
data['label'] = data['label'].map({'ham': 0, 'spam': 1})

In [4]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r'\W+', ' ', text)
    return text

data['message'] = data['message'].apply(clean_text)

In [5]:
X_train, X_test, y_train, y_test = train_test_split(
    data['message'], data['label'], test_size=0.25, random_state=42)

In [6]:
tfidf = TfidfVectorizer(stop_words='english', max_features=3000)

X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [7]:
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train_tfidf, y_train)

y_pred_lr = lr.predict(X_test_tfidf)

print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_lr))
print(classification_report(y_test, y_pred_lr))


Logistic Regression Accuracy: 0.9562096195262024
              precision    recall  f1-score   support

           0       0.95      1.00      0.98      1202
           1       0.98      0.70      0.81       191

    accuracy                           0.96      1393
   macro avg       0.97      0.85      0.89      1393
weighted avg       0.96      0.96      0.95      1393



In [15]:
new_sms = ["Congratulations! You won a free lottery ticket"]
new_sms_clean = [clean_text(new_sms[0])]
new_sms_tfidf = tfidf.transform(new_sms_clean)

prediction = lr.predict(new_sms_tfidf)

print("Spam" if prediction[0] == 1 else "Ham")


Ham
