In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report

In [14]:
data = pd.read_csv("SPAM.csv", encoding='latin-1')
data = data[['v1', 'v2']]
data.columns = ['label', 'message']

print(data.head())

  label                                            message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [15]:
data['label'] = data['label'].map({'ham': 0, 'spam': 1})

In [16]:
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(data['message'])
y = data['label']

In [17]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [18]:
lr = LogisticRegression()
lr.fit(X_train, y_train)

y_pred_lr = lr.predict(X_test)
y_prob_lr = lr.predict_proba(X_test)[:,1]

In [19]:
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

y_pred_rf = rf.predict(X_test)
y_prob_rf = rf.predict_proba(X_test)[:,1]

In [20]:
print("Logistic Regression Results")
print("Accuracy:", accuracy_score(y_test, y_pred_lr))
print("Precision:", precision_score(y_test, y_pred_lr))
print("Recall:", recall_score(y_test, y_pred_lr))
print("F1 Score:", f1_score(y_test, y_pred_lr))
print("ROC-AUC:", roc_auc_score(y_test, y_prob_lr))

Logistic Regression Results
Accuracy: 0.9443946188340807
Precision: 0.9680851063829787
Recall: 0.6066666666666667
F1 Score: 0.7459016393442623
ROC-AUC: 0.9884145077720207


In [21]:
print("\nRandom Forest Results")
print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Precision:", precision_score(y_test, y_pred_rf))
print("Recall:", recall_score(y_test, y_pred_rf))
print("F1 Score:", f1_score(y_test, y_pred_rf))
print("ROC-AUC:", roc_auc_score(y_test, y_prob_rf))


Random Forest Results
Accuracy: 0.9775784753363229
Precision: 1.0
Recall: 0.8333333333333334
F1 Score: 0.9090909090909091
ROC-AUC: 0.9840449050086355


In [22]:
cv_lr = cross_val_score(lr, X, y, cv=5)
cv_rf = cross_val_score(rf, X, y, cv=5)

print("Logistic Regression CV Accuracy:", cv_lr.mean())
print("Random Forest CV Accuracy:", cv_rf.mean())

Logistic Regression CV Accuracy: 0.9540552769078424
Random Forest CV Accuracy: 0.9741547850029386


In [25]:
def spam_checker():
    user_msg = input("Enter a message to check spam or not spam: ")

    msg_vector = vectorizer.transform([user_msg])
    prediction = lr.predict(msg_vector)[0]

    print("\nEntered Message:", user_msg)

    if prediction == 1:
        print("Final Output: SPAM ARE FOUND IN THIS MESSAGE")
    else:
        print("Final Output:SPAM ARE NOT SPAM IN THIS MESSAGE")

In [26]:
spam_checker()

Enter a message to check spam or not spam:  Congratulations! YOU WON A FREE IPHONE FOR YOUR LOTTERY



Entered Message: Congratulations! YOU WON A FREE IPHONE FOR YOUR LOTTERY
Final Output:  NOT SPAM
