# **Spam Email Detection**

In [23]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [24]:
df = pd.read_csv('/content/spamEmailDetection.csv')

In [25]:
df.head()

Unnamed: 0,label,text
0,1,ounce feather bowl hummingbird opec moment ala...
1,1,wulvob get your medircations online qnb ikud v...
2,0,computer connection from cnn com wednesday es...
3,1,university degree obtain a prosperous future m...
4,0,thanks for all your answers guys i know i shou...


In [26]:
df.shape

(83448, 2)

In [27]:
df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
1,43910
0,39538


# **Handle Text Data**

## Text cleaning

Clean the 'text' column by removing punctuation, converting to lowercase, and removing special characters and extra whitespace.


In [28]:
import re

# 1. Convert all text to lowercase
df['text'] = df['text'].str.lower()

# 2. Remove any characters that are not letters, numbers, or whitespace
df['text'] = df['text'].apply(lambda x: re.sub(r'[^a-zA-Z0-9\s]', '', x))

# 3. Remove any leading or trailing whitespace
df['text'] = df['text'].str.strip()

# 4. Remove any extra whitespace within the text
df['text'] = df['text'].apply(lambda x: re.sub(r'\s+', ' ', x))

df.head()

Unnamed: 0,label,text
0,1,ounce feather bowl hummingbird opec moment ala...
1,1,wulvob get your medircations online qnb ikud v...
2,0,computer connection from cnn com wednesday esc...
3,1,university degree obtain a prosperous future m...
4,0,thanks for all your answers guys i know i shou...


## Tokenization and stop word removal

Tokenize the cleaned text and remove common stop words.


**Reasoning**:
The traceback indicates that `punkt_tab` is also required by `word_tokenize`. The corrected code will explicitly download `punkt_tab` along with `punkt` and `stopwords` before proceeding with tokenization and stop word removal.



In [29]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Download the 'punkt', 'punkt_tab' and 'stopwords' corpora
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt', quiet=True)

try:
    nltk.data.find('tokenizers/punkt_tab')
except LookupError:
    nltk.download('punkt_tab', quiet=True)

try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords', quiet=True)


# Define a function to tokenize and remove stop words
def tokenize_and_remove_stopwords(text):
    """Tokenizes text and removes English stop words."""
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word not in stop_words]
    return filtered_tokens

# Apply the function to the 'text' column
df['tokenized_text'] = df['text'].apply(tokenize_and_remove_stopwords)

# Display the head of the DataFrame to inspect the new column
display(df.head())

Unnamed: 0,label,text,tokenized_text
0,1,ounce feather bowl hummingbird opec moment ala...,"[ounce, feather, bowl, hummingbird, opec, mome..."
1,1,wulvob get your medircations online qnb ikud v...,"[wulvob, get, medircations, online, qnb, ikud,..."
2,0,computer connection from cnn com wednesday esc...,"[computer, connection, cnn, com, wednesday, es..."
3,1,university degree obtain a prosperous future m...,"[university, degree, obtain, prosperous, futur..."
4,0,thanks for all your answers guys i know i shou...,"[thanks, answers, guys, know, checked, rsync, ..."


## Feature extraction (tf-idf)

Convert the processed text into numerical features using TF-IDF.


In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Convert the list of tokens back to strings
df['text_for_vectorization'] = df['tokenized_text'].apply(lambda x: ' '.join(x))

**Reasoning**:
Instantiate and fit the TfidfVectorizer to the processed text data and transform it into a TF-IDF matrix.



In [31]:
# Instantiate TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the processed text
tfidf_matrix = tfidf_vectorizer.fit_transform(df['text_for_vectorization'])

# Display the shape of the resulting TF-IDF matrix
print(tfidf_matrix.shape)

(83448, 295301)


## Data splitting

Split the data into training and testing sets.


In [32]:
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(tfidf_matrix, df['label'], test_size=0.2, random_state=42)

# Display the shapes of the resulting sets
print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)

Shape of X_train: (66758, 295301)
Shape of X_test: (16690, 295301)
Shape of y_train: (66758,)
Shape of y_test: (16690,)


## Model training

Train a Logistic Regression model, a Decision Tree model, and a Random Forest model on the training data.


In [33]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Instantiate and train Logistic Regression model
log_reg_model = LogisticRegression(max_iter=1000)
log_reg_model.fit(X_train, y_train)

# Instantiate and train Decision Tree model
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)

# Instantiate and train Random Forest model
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

print("Models trained successfully.")

Models trained successfully.


## Prediction

Make predictions on the testing data using each trained model.


In [34]:
# Make predictions using Logistic Regression
log_reg_pred = log_reg_model.predict(X_test)

# Make predictions using Decision Tree
dt_pred = dt_model.predict(X_test)

# Make predictions using Random Forest
rf_pred = rf_model.predict(X_test)

print("Predictions made successfully.")

Predictions made successfully.


## Model evaluation

Calculate the accuracy and other relevant metrics for each model.


In [35]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Evaluate Logistic Regression model
log_reg_accuracy = accuracy_score(y_test, log_reg_pred)
log_reg_precision = precision_score(y_test, log_reg_pred)
log_reg_recall = recall_score(y_test, log_reg_pred)
log_reg_f1 = f1_score(y_test, log_reg_pred)

print("Logistic Regression Metrics:")
print(f"  Accuracy: {log_reg_accuracy:.4f}")
print(f"  Precision: {log_reg_precision:.4f}")
print(f"  Recall: {log_reg_recall:.4f}")
print(f"  F1-score: {log_reg_f1:.4f}")

# Evaluate Decision Tree model
dt_accuracy = accuracy_score(y_test, dt_pred)
dt_precision = precision_score(y_test, dt_pred)
dt_recall = recall_score(y_test, dt_pred)
dt_f1 = f1_score(y_test, dt_pred)

print("\nDecision Tree Metrics:")
print(f"  Accuracy: {dt_accuracy:.4f}")
print(f"  Precision: {dt_precision:.4f}")
print(f"  Recall: {dt_recall:.4f}")
print(f"  F1-score: {dt_f1:.4f}")

# Evaluate Random Forest model
rf_accuracy = accuracy_score(y_test, rf_pred)
rf_precision = precision_score(y_test, rf_pred)
rf_recall = recall_score(y_test, rf_pred)
rf_f1 = f1_score(y_test, rf_pred)

print("\nRandom Forest Metrics:")
print(f"  Accuracy: {rf_accuracy:.4f}")
print(f"  Precision: {rf_precision:.4f}")
print(f"  Recall: {rf_recall:.4f}")
print(f"  F1-score: {rf_f1:.4f}")

Logistic Regression Metrics:
  Accuracy: 0.9848
  Precision: 0.9778
  Recall: 0.9936
  F1-score: 0.9857

Decision Tree Metrics:
  Accuracy: 0.9684
  Precision: 0.9679
  Recall: 0.9720
  F1-score: 0.9700

Random Forest Metrics:
  Accuracy: 0.9848
  Precision: 0.9882
  Recall: 0.9829
  F1-score: 0.9855


## Comparison
Compare the performance of the three models based on the evaluation metrics.


In [36]:
# Print a summary of the evaluation metrics for each model
print("Model Performance Comparison:")
print(f"  Logistic Regression: Accuracy={log_reg_accuracy:.4f}, Precision={log_reg_precision:.4f}, Recall={log_reg_recall:.4f}, F1-score={log_reg_f1:.4f}")
print(f"  Decision Tree      : Accuracy={dt_accuracy:.4f}, Precision={dt_precision:.4f}, Recall={dt_recall:.4f}, F1-score={dt_f1:.4f}")
print(f"  Random Forest      : Accuracy={rf_accuracy:.4f}, Precision={rf_precision:.4f}, Recall={rf_recall:.4f}, F1-score={rf_f1:.4f}")

# Determine the best model based on F1-score (a balanced metric)
best_model = max([(log_reg_f1, 'Logistic Regression'),
                  (dt_f1, 'Decision Tree'),
                  (rf_f1, 'Random Forest')],
                 key=lambda item: item[0])

print(f"\nBased on F1-score, the best performing model is: {best_model[1]} with an F1-score of {best_model[0]:.4f}")

# Further considerations for selecting the best model:
print("\nConsiderations for selecting the best model:")
print("- Accuracy: Overall correctness of the model.")
print("- Precision: Ability of the model to identify only relevant instances (minimize false positives - marking non-spam as spam).")
print("- Recall: Ability of the model to find all relevant instances (minimize false negatives - marking spam as non-spam).")
print("- F1-score: Harmonic mean of precision and recall, providing a balance between the two.")
print("\nFor spam detection, minimizing false negatives (missing a spam email - low recall) is often more critical than minimizing false positives (marking a non-spam as spam - low precision).")
print("A high recall ensures that most spam emails are caught, even if it means occasionally flagging a legitimate email as spam.")
print(f"\nBased on Recall, the best performing model is: {max([(log_reg_recall, 'Logistic Regression'), (dt_recall, 'Decision Tree'), (rf_recall, 'Random Forest')], key=lambda item: item[0])[1]} with a Recall of {max([(log_reg_recall, 'Logistic Regression'), (dt_recall, 'Decision Tree'), (rf_recall, 'Random Forest')], key=lambda item: item[0])[0]:.4f}")

Model Performance Comparison:
  Logistic Regression: Accuracy=0.9848, Precision=0.9778, Recall=0.9936, F1-score=0.9857
  Decision Tree      : Accuracy=0.9684, Precision=0.9679, Recall=0.9720, F1-score=0.9700
  Random Forest      : Accuracy=0.9848, Precision=0.9882, Recall=0.9829, F1-score=0.9855

Based on F1-score, the best performing model is: Logistic Regression with an F1-score of 0.9857

Considerations for selecting the best model:
- Accuracy: Overall correctness of the model.
- Precision: Ability of the model to identify only relevant instances (minimize false positives - marking non-spam as spam).
- Recall: Ability of the model to find all relevant instances (minimize false negatives - marking spam as non-spam).
- F1-score: Harmonic mean of precision and recall, providing a balance between the two.

For spam detection, minimizing false negatives (missing a spam email - low recall) is often more critical than minimizing false positives (marking a non-spam as spam - low precision).