# **Spam Email Detection**

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier

**Step 1: Load the dataset**

In [4]:
# Replace with actual dataset link or local path
data = pd.read_csv('spam.csv')
data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


**Step 2: Data Cleaning**

In [3]:
# Check for missing values
print(data.isnull().sum())  # Verify missing values

Category    0
Message     0
dtype: int64


In [5]:
# Drop any missing or unnecessary columns
# Assuming the dataset has irrelevant columns (drop them if needed)
data = data.dropna()  # Drop rows with missing values (if any)

**Step 3: Data Preprocessing**

In [6]:
# Convert the 'Category' column to numerical values (spam = 1, ham = 0)
label_encoder = LabelEncoder()
data['Category'] = label_encoder.fit_transform(data['Category'])  # 1 = spam, 0 = ham


In [7]:
data.head()

Unnamed: 0,Category,Message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [8]:
# Vectorize the 'Message' column using TF-IDF
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)  # Limit to 5000 most important features
X = vectorizer.fit_transform(data['Message']).toarray()
# Separate features (X) and target (y)
y = data['Category']

In [9]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


**Step 4: Model Building**

In [10]:
# 1. Logistic Regression
log_reg = LogisticRegression(random_state=42)
log_reg.fit(X_train, y_train)
y_pred_log_reg = log_reg.predict(X_test)


In [12]:
# 2. Random Forest Classifier
rfc = RandomForestClassifier(n_estimators=100, random_state=42)
rfc.fit(X_train, y_train)
y_pred_rfc = rfc.predict(X_test)

In [13]:
# 3. AdaBoost Classifier
ada = AdaBoostClassifier(n_estimators=100, random_state=42)
ada.fit(X_train, y_train)
y_pred_ada = ada.predict(X_test)



In [14]:
# 4. K-Nearest Neighbors (KNN)
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)

**Step 5: Model Evaluation**

In [15]:
# Function to evaluate model performance
def evaluate_model(y_true, y_pred, model_name):
    accuracy = accuracy_score(y_true, y_pred)
    print(f"{model_name} Accuracy: {accuracy:.2f}")
    print(f"Classification Report for {model_name}:\n")
    print(classification_report(y_true, y_pred))

In [16]:
# Evaluate each model
evaluate_model(y_test, y_pred_log_reg, "Logistic Regression")
evaluate_model(y_test, y_pred_rfc, "Random Forest Classifier")
evaluate_model(y_test, y_pred_ada, "AdaBoost Classifier")
evaluate_model(y_test, y_pred_knn, "K-Nearest Neighbors")

Logistic Regression Accuracy: 0.96
Classification Report for Logistic Regression:

              precision    recall  f1-score   support

           0       0.96      1.00      0.98       966
           1       1.00      0.72      0.84       149

    accuracy                           0.96      1115
   macro avg       0.98      0.86      0.91      1115
weighted avg       0.96      0.96      0.96      1115

Random Forest Classifier Accuracy: 0.98
Classification Report for Random Forest Classifier:

              precision    recall  f1-score   support

           0       0.98      1.00      0.99       966
           1       1.00      0.87      0.93       149

    accuracy                           0.98      1115
   macro avg       0.99      0.94      0.96      1115
weighted avg       0.98      0.98      0.98      1115

AdaBoost Classifier Accuracy: 0.98
Classification Report for AdaBoost Classifier:

              precision    recall  f1-score   support

           0       0.98      0.9

**Step 6: Compare and Conclusion**

In [17]:
models = ['Logistic Regression', 'Random Forest', 'AdaBoost', 'KNN']
accuracies = [
    accuracy_score(y_test, y_pred_log_reg),
    accuracy_score(y_test, y_pred_rfc),
    accuracy_score(y_test, y_pred_ada),
    accuracy_score(y_test, y_pred_knn)
]

In [18]:
# Print model accuracies
for model, accuracy in zip(models, accuracies):
    print(f"{model} Accuracy: {accuracy:.2f}")

Logistic Regression Accuracy: 0.96
Random Forest Accuracy: 0.98
AdaBoost Accuracy: 0.98
KNN Accuracy: 0.92


**Step 7: Function to collect new user input for spam prediction**

In [20]:
def get_user_input():
    print("Please enter the email message you want to classify (spam/ham):")
    email_message = input("Email Message: ")
    return email_message

# Get new user input
user_input = get_user_input()

# Vectorize the user input using the same TF-IDF vectorizer as the training data
user_input_vectorized = vectorizer.transform([user_input]).toarray()

user_pred_log_reg = log_reg.predict(user_input_vectorized)
user_pred_rfc = rfc.predict(user_input_vectorized)
user_pred_ada = ada.predict(user_input_vectorized)
user_pred_knn = knn.predict(user_input_vectorized)

# Convert the numerical prediction to "Spam" or "Ham"
def interpret_prediction(pred):
    return "Spam" if pred == 1 else "Ham"

# Display the predictions for the user input
print(f"Logistic Regression Prediction: {interpret_prediction(user_pred_log_reg[0])}")
print(f"Random Forest Classifier Prediction: {interpret_prediction(user_pred_rfc[0])}")
print(f"AdaBoost Classifier Prediction: {interpret_prediction(user_pred_ada[0])}")
print(f"K-Nearest Neighbors Prediction: {interpret_prediction(user_pred_knn[0])}")


Please enter the email message you want to classify (spam/ham):
Email Message: As a valued customer, I am pleased to advise you that following recent review of your Mob No. you are awarded with a Â£1500 Bonus Prize, call 09066364589
Logistic Regression Prediction: Spam
Random Forest Classifier Prediction: Spam
AdaBoost Classifier Prediction: Spam
K-Nearest Neighbors Prediction: Ham
