# Email Spam Detection with Machine Learning

## Necessary Libraries

In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.naive_bayes import MultinomialNB


## Importing Data

In [1]:
# Colab-specific setup: mount Google Drive at /content/drive so files kept
# there (the dataset is read from a Drive folder below) become accessible.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Make the project folder on Drive the working directory so that relative
# paths such as 'spam.csv' resolve inside it.
%cd /content/drive/MyDrive/Oasis/Detection/

/content/drive/MyDrive/Oasis/Detection


In [6]:
# Load the raw dataset. The encoding is passed explicitly as ISO-8859-1
# (presumably because the file is not valid UTF-8 — confirm if the source changes).
df = pd.read_csv('spam.csv', encoding = "ISO-8859-1")
# Preview the first rows: 'v1' holds the label and 'v2' the message text; the
# 'Unnamed: 2..4' columns are empty in these rows.
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


## Preprocessing and cleaning

In [12]:
print("Number of Null Values: ",df.isnull().sum().sum())

# The raw CSV carries three extra columns ('Unnamed: 2'..'Unnamed: 4') that are
# empty in the previewed rows. They must be removed BEFORE dropna(): dropna()
# discards every row containing a NaN in any column, so leaving them in place
# would throw away those rows on a fresh kernel.
# errors='ignore' keeps the cell safe to re-run after the columns are gone.
# The chain rebinds `df` instead of mutating it in place, and assign() returns
# a fresh frame, so later column assignments do not act on a view.
df = (
    df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')
      .dropna()
      .assign(v2=lambda frame: frame['v2'].str.lower())
)
print(df.head())

print("Number of Null Values: ",df.isnull().sum().sum())


Number of Null Values:  0
     v1                                                 v2
0   ham  go until jurong point, crazy.. available only ...
1   ham                      ok lar... joking wif u oni...
2  spam  free entry in 2 a wkly comp to win fa cup fina...
3   ham  u dun say so early hor... u c already then say...
4   ham  nah i don't think he goes to usf, he lives aro...
Number of Null Values:  0


## Text Preprocessing

In [14]:
# Fetch the NLTK stop-word corpus, which stopwords.words() below reads from.
nltk.download('stopwords')
# Module-level helpers shared by preprocess_text: a Porter stemmer and the
# English stop-word list, stored as a set for fast membership tests.
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))
def preprocess_text(text: str) -> str:
    """Turn one message into a space-joined string of stemmed tokens.

    The text is stripped of every character that is not an ASCII letter or
    whitespace, lowercased, split on whitespace, filtered against the
    module-level ``stop_words`` set and stemmed with the module-level
    ``stemmer``.

    Args:
        text: The message text to normalise.

    Returns:
        The remaining stemmed tokens joined by single spaces; an empty string
        when no token survives.
    """
    # Remove punctuation, digits and special characters, then lowercase.
    # Lowercasing here (rather than relying on an earlier cell having done it)
    # matters because the stop-word test below is a case-sensitive set lookup.
    text = re.sub(r'[^a-zA-Z\s]', '', text).lower()
    # Tokenize on runs of whitespace
    tokens = text.split()
    # Remove stop words and apply stemming.
    # NOTE(review): apostrophes are stripped above, so contractions lose their
    # original spelling before this lookup (e.g. "don't" reaches it as "dont").
    processed_tokens = [stemmer.stem(word) for word in tokens if word not in stop_words]
    return ' '.join(processed_tokens)

# Run the text cleaner over every message and store the result back in 'v2'
cleaned_messages = df['v2'].apply(preprocess_text)
df['v2'] = cleaned_messages

# Show the first rows after cleaning
print(df.head())

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


     v1                                                 v2
0   ham  go jurong point crazi avail bugi n great world...
1   ham                              ok lar joke wif u oni
2  spam  free entri wkli comp win fa cup final tkt st m...
3   ham                u dun say earli hor u c alreadi say
4   ham          nah dont think goe usf live around though


## Feature Extraction

In [16]:
# Build the TF-IDF feature matrix from the cleaned message text.
# NOTE(review): the vectorizer is fitted on every row here, before the
# train/test split in the Model Training cell, so its vocabulary and IDF
# weights also reflect the held-out messages.
tfidf_vectorizer = TfidfVectorizer()
X = tfidf_vectorizer.fit_transform(df['v2'])

# Encode the string labels as integers: ham -> 0, spam -> 1
label_codes = {'ham': 0, 'spam': 1}
y = df['v1'].map(label_codes)

print("Feature matrix shape:", X.shape)

Feature matrix shape: (5572, 7054)


## Model Training

In [18]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the baseline classifier on the training portion
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Score the held-out rows
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy:.2f}", "Classification Report:", report, sep="\n")

Accuracy: 0.95
Classification Report:
              precision    recall  f1-score   support

           0       0.95      1.00      0.97       965
           1       0.96      0.63      0.76       150

    accuracy                           0.95      1115
   macro avg       0.95      0.81      0.87      1115
weighted avg       0.95      0.95      0.94      1115



## Model Improvement

In [21]:
# Train a multinomial Naive Bayes classifier on the same training split
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)

# Evaluate it on the same held-out rows used for the first model
y_nb_pred = nb_model.predict(X_test)
nb_accuracy = accuracy_score(y_test, y_nb_pred)
nb_report = classification_report(y_test, y_nb_pred)

print(
    f"Naive Bayes Accuracy: {nb_accuracy:.2f}",
    "Naive Bayes Classification Report:",
    nb_report,
    sep="\n",
)


Naive Bayes Accuracy: 0.97
Naive Bayes Classification Report:
              precision    recall  f1-score   support

           0       0.96      1.00      0.98       965
           1       1.00      0.75      0.85       150

    accuracy                           0.97      1115
   macro avg       0.98      0.87      0.92      1115
weighted avg       0.97      0.97      0.96      1115



## Conclusion

In this project, I successfully built an email spam detector using Python and machine learning techniques. The initial dataset contained a mix of spam and non-spam emails, which I preprocessed to remove noise and prepare the data for analysis.

After feature extraction using TF-IDF, I trained two machine learning models: Logistic Regression and Naive Bayes. The Logistic Regression model achieved an accuracy of 95%, but it struggled with recall for spam detection, catching only 63% of spam emails. By switching to the Naive Bayes model, I improved the accuracy to 97% and raised the recall for spam emails to 75%.

Overall, this project demonstrated my ability to apply machine learning techniques to real-world problems, specifically in identifying spam emails. I learned the importance of data preprocessing, model selection, and evaluation metrics in developing an effective spam detection system. The improvements in performance suggest that the Naive Bayes model is well-suited for this task, and further enhancements could be made through hyperparameter tuning and ensemble methods.


## Author  
### Ishan Rahul Surdi