# Step 1: Reading and Exploring Email Data

In [None]:
import pandas as pd

# Read the raw e-mail corpus from disk; ending the cell on the bare name
# makes the notebook render a preview of the frame.
df = pd.read_csv("emails.csv")
df

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1
...,...,...
5723,Subject: re : research and development charges...,0
5724,"Subject: re : receipts from visit jim , than...",0
5725,Subject: re : enron case study update wow ! a...,0
5726,"Subject: re : interest david , please , call...",0


In [None]:
# Preview the first five records
df.head(n=5)

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max). In the rendered
# output only the `spam` column is summarized; assuming that label is binary
# 0/1 (min 0, max 1 in the output), its mean is the share of spam e-mails.
df.describe()

Unnamed: 0,spam
count,5728.0
mean,0.238827
std,0.426404
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,1.0


# Step 2: Data Cleaning

In [None]:
# Discard rows that exactly repeat an earlier row, keeping the first occurrence
df = df.drop_duplicates(keep="first")
df

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1
...,...,...
5723,Subject: re : research and development charges...,0
5724,"Subject: re : receipts from visit jim , than...",0
5725,Subject: re : enron case study update wow ! a...,0
5726,"Subject: re : interest david , please , call...",0


In [None]:
# Drop rows that contain a missing value in any column, then renumber the
# index 0..n-1. Removing rows (here and when dropping duplicates) leaves gaps
# in the index labels, whereas the matrices produced later by the vectorizers
# are purely positional; a contiguous index keeps `df.loc[i]` and matrix row
# `i` pointing at the same e-mail.
df = df.dropna().reset_index(drop=True)
df

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1
...,...,...
5723,Subject: re : research and development charges...,0
5724,"Subject: re : receipts from visit jim , than...",0
5725,Subject: re : enron case study update wow ! a...,0
5726,"Subject: re : interest david , please , call...",0


In [None]:
# Clean the text data by removing HTML tags, special characters, and extra whitespace.

import re
from bs4 import BeautifulSoup

def clean_text(text):
    """Strip HTML markup, special characters and redundant whitespace.

    Parameters
    ----------
    text : str
        Raw e-mail text, possibly containing HTML markup.

    Returns
    -------
    str
        The text reduced to ASCII letters and digits, with words separated
        by single spaces and no leading or trailing whitespace.
    """
    # Put a space between the text of adjacent tags. With the default (empty)
    # separator, "<p>free</p><p>offer</p>" would collapse into the single
    # token "freeoffer"; surplus spaces are squeezed out again below.
    text = BeautifulSoup(text, "html.parser").get_text(separator=" ")

    # Drop every character that is not an ASCII letter, digit or whitespace
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)

    # Collapse each run of whitespace into one space and trim both ends
    return ' '.join(text.split())


In [None]:
# Run clean_text over every raw e-mail and keep the result in a new column
df['cleaned_text'] = df['text'].map(clean_text)

  text = BeautifulSoup(text, "html.parser").get_text()


In [None]:
# Show raw and cleaned text side by side for the first five e-mails
print(df[['text', 'cleaned_text']].iloc[:5])

                                                text  \
0  Subject: naturally irresistible your corporate...   
1  Subject: the stock trading gunslinger  fanny i...   
2  Subject: unbelievable new homes made easy  im ...   
3  Subject: 4 color printing special  request add...   
4  Subject: do not have money , get software cds ...   

                                        cleaned_text  
0  Subject naturally irresistible your corporate ...  
1  Subject the stock trading gunslinger fanny is ...  
2  Subject unbelievable new homes made easy im wa...  
3  Subject 4 color printing special request addit...  
4  Subject do not have money get software cds fro...  


# Step 3: Emoji Removal

In [None]:
import re

# Code-point ranges treated as emoji. Every U+1Fxxx range below is a block
# inside the Supplementary Multilingual Plane (plane 1); each comment names
# the Unicode block the range corresponds to.
_EMOJI_RANGES = (
    "\U0001F600-\U0001F64F"  # Emoticons
    "\U0001F300-\U0001F5FF"  # Miscellaneous Symbols and Pictographs (incl. skin tones)
    "\U0001F680-\U0001F6FF"  # Transport and Map Symbols
    "\U0001F700-\U0001F77F"  # Alchemical Symbols (kept from the original pattern)
    "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
    "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
    "\U0001F1E6-\U0001F1FF"  # Regional indicator symbols (flag pairs)
    "\u2600-\u26FF"          # Miscellaneous Symbols
    "\u2700-\u27BF"          # Dingbats
)

# Compiled once at definition time instead of on every call. An emoji may be
# followed by further emoji, variation selector-16 (U+FE0F) or zero-width
# joiners (U+200D); those are only removed when they trail an emoji, so
# joiners that belong to ordinary text are left alone.
_EMOJI_PATTERN = re.compile(
    "[" + _EMOJI_RANGES + "]" + "[" + _EMOJI_RANGES + "\uFE0F\u200D]*"
)


def remove_emojis(text):
    """Return ``text`` with emoji characters removed.

    Parameters
    ----------
    text : str
        Input text.

    Returns
    -------
    str
        ``text`` without any character from the emoji ranges above, and
        without the variation selectors / zero-width joiners that directly
        follow such a character. All other characters, including the
        whitespace around a removed emoji, are left untouched.
    """
    return _EMOJI_PATTERN.sub('', text)

In [None]:
# Strip emojis from the already-cleaned text. Reading from df['text'] here
# would overwrite `cleaned_text` with the raw e-mails and discard the HTML /
# special-character cleaning done by clean_text, so every later step
# (stemming, stop words, tokenization, TF-IDF, BoW) would run on uncleaned text.
df['cleaned_text'] = df['cleaned_text'].apply(remove_emojis)
print(df['cleaned_text'].head(20))

0     Subject: naturally irresistible your corporate...
1     Subject: the stock trading gunslinger  fanny i...
2     Subject: unbelievable new homes made easy  im ...
3     Subject: 4 color printing special  request add...
4     Subject: do not have money , get software cds ...
5     Subject: great nnews  hello , welcome to medzo...
6     Subject: here ' s a hot play in motion  homela...
7     Subject: save your money buy getting this thin...
8     Subject: undeliverable : home based business f...
9     Subject: save your money buy getting this thin...
10    Subject: las vegas high rise boom  las vegas i...
11    Subject: save your money buy getting this thin...
12    Subject: brighten those teeth  get your  teeth...
13    Subject: wall street phenomenon reaps rewards ...
14    Subject: fpa notice : ebay misrepresentation o...
15    Subject: search engine position  be the very f...
16    Subject: only our software is guaranteed 100 %...
17    Subject: localized software , all language

# Step 4: Stemming

In [None]:
import nltk
from nltk.stem import PorterStemmer

# Shared Porter stemmer instance used by apply_stemming
stemmer = PorterStemmer()


def apply_stemming(text):
    """Return ``text`` with each whitespace-separated word replaced by its stem."""
    return ' '.join(stemmer.stem(token) for token in text.split())


# Derive a stemmed version of every cleaned e-mail
df['stemmed_email'] = df['cleaned_text'].map(apply_stemming)

In [None]:
# Preview the first 20 stemmed e-mails
print(df['stemmed_email'].iloc[:20])

0     subject: natur irresist your corpor ident lt i...
1     subject: the stock trade gunsling fanni is mer...
2     subject: unbeliev new home made easi im want t...
3     subject: 4 color print special request addit i...
4     subject: do not have money , get softwar cd fr...
5     subject: great nnew hello , welcom to medzonli...
6     subject: here ' s a hot play in motion homelan...
7     subject: save your money buy get thi thing her...
8     subject: undeliver : home base busi for grownu...
9     subject: save your money buy get thi thing her...
10    subject: la vega high rise boom la vega is fas...
11    subject: save your money buy get thi thing her...
12    subject: brighten those teeth get your teeth b...
13    subject: wall street phenomenon reap reward sm...
14    subject: fpa notic : ebay misrepresent of iden...
15    subject: search engin posit be the veri first ...
16    subject: onli our softwar is guarante 100 % le...
17    subject: local softwar , all languag avail

# Step 5: Stop Word Removal

In [None]:
import nltk
from nltk.corpus import stopwords

# Make sure the NLTK stop-word corpus is available, then keep the English
# list as a set for fast membership checks
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))


def remove_stop_words(text):
    """Return ``text`` without the words whose lowercase form is in ``stop_words``."""
    kept = (token for token in text.split() if token.lower() not in stop_words)
    return ' '.join(kept)


# Build a stop-word-free copy of every cleaned e-mail and preview the first 20
df['no_stop_words_email'] = df['cleaned_text'].map(remove_stop_words)
print(df['no_stop_words_email'].iloc[:20])

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


0     Subject: naturally irresistible corporate iden...
1     Subject: stock trading gunslinger fanny merril...
2     Subject: unbelievable new homes made easy im w...
3     Subject: 4 color printing special request addi...
4     Subject: money , get software cds ! software c...
5     Subject: great nnews hello , welcome medzonlin...
6     Subject: ' hot play motion homeland security i...
7     Subject: save money buy getting thing tried ci...
8     Subject: undeliverable : home based business g...
9     Subject: save money buy getting thing tried ci...
10    Subject: las vegas high rise boom las vegas fa...
11    Subject: save money buy getting thing tried ci...
12    Subject: brighten teeth get teeth bright white...
13    Subject: wall street phenomenon reaps rewards ...
14    Subject: fpa notice : ebay misrepresentation i...
15    Subject: search engine position first listing ...
16    Subject: software guaranteed 100 % legal . nam...
17    Subject: localized software , languages av

# Step 6: Tokenization

In [None]:
def tokenize_text(text):
    """Split ``text`` on runs of whitespace and return the pieces as a list."""
    return text.split()

# Tokenize every cleaned e-mail and show the resulting column
df['tokenized_email'] = df['cleaned_text'].map(tokenize_text)
print(df['tokenized_email'])


0       [Subject:, naturally, irresistible, your, corp...
1       [Subject:, the, stock, trading, gunslinger, fa...
2       [Subject:, unbelievable, new, homes, made, eas...
3       [Subject:, 4, color, printing, special, reques...
4       [Subject:, do, not, have, money, ,, get, softw...
                              ...                        
5723    [Subject:, re, :, research, and, development, ...
5724    [Subject:, re, :, receipts, from, visit, jim, ...
5725    [Subject:, re, :, enron, case, study, update, ...
5726    [Subject:, re, :, interest, david, ,, please, ...
5727    [Subject:, news, :, aurora, 5, ., 2, update, a...
Name: tokenized_email, Length: 5695, dtype: object


# Step 7: Feature Extraction using TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the TF-IDF vectorizer with its default settings
tfidf_vectorizer = TfidfVectorizer()

# Learn the vocabulary and weights from the `cleaned_text` column and encode
# every e-mail in the same call (one matrix row per e-mail).
tfidf_vectors = tfidf_vectorizer.fit_transform(df['cleaned_text'])
# Printing the matrix lists its stored entries as "(row, column)  weight"
print(tfidf_vectors)

  (0, 18540)	0.05061905307329464
  (0, 26414)	0.0732828530764711
  (0, 24800)	0.03169462982738551
  (0, 5567)	0.02717198414590654
  (0, 21007)	0.04486113847092893
  (0, 16757)	0.024010680912168382
  (0, 9073)	0.08787828034734055
  (0, 33557)	0.024368324506786252
  (0, 28911)	0.07079175806813673
  (0, 21096)	0.07781443410647693
  (0, 32650)	0.10268410801494346
  (0, 15009)	0.020096415785977668
  (0, 14514)	0.08615403143497394
  (0, 14182)	0.08051620392146595
  (0, 23796)	0.043312214446288994
  (0, 36487)	0.0245598455741449
  (0, 8335)	0.06222935561846597
  (0, 4756)	0.06935169919390856
  (0, 34849)	0.08662786112530435
  (0, 27257)	0.053061604746478926
  (0, 16380)	0.07851997763285898
  (0, 29892)	0.08592231759892231
  (0, 343)	0.0651128477204767
  (0, 7397)	0.08206173793612623
  (0, 15569)	0.0976295734937339
  :	:
  (5694, 36487)	0.011457264629925912
  (5694, 33616)	0.023590812505506417
  (5694, 37011)	0.009265609638958809
  (5694, 18942)	0.02522861000446417
  (5694, 8332)	0.02893438810

# Step 8: Feature Extraction using Bag of Words (BoW)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Initialize the CountVectorizer with its default settings
count_vectorizer = CountVectorizer()

# Learn the vocabulary from the `cleaned_text` column and count term
# occurrences for every e-mail in the same call (one matrix row per e-mail).
bow_vectors = count_vectorizer.fit_transform(df['cleaned_text'])
# Printing the matrix lists its stored entries as "(row, column)  count"
print(bow_vectors)

  (0, 32371)	1
  (0, 23381)	1
  (0, 18838)	1
  (0, 37024)	7
  (0, 10045)	1
  (0, 17688)	1
  (0, 21148)	1
  (0, 18848)	3
  (0, 28018)	1
  (0, 16655)	1
  (0, 33798)	4
  (0, 28142)	1
  (0, 9281)	3
  (0, 33450)	5
  (0, 21665)	2
  (0, 15415)	1
  (0, 24274)	4
  (0, 32635)	1
  (0, 4836)	5
  (0, 18234)	1
  (0, 18885)	1
  (0, 7546)	2
  (0, 16072)	2
  (0, 8043)	1
  (0, 20960)	3
  :	:
  (5694, 24851)	2
  (5694, 21635)	1
  (5694, 5715)	9
  (5694, 30968)	1
  (5694, 2807)	3
  (5694, 13324)	1
  (5694, 13113)	1
  (5694, 17380)	1
  (5694, 14115)	1
  (5694, 20276)	1
  (5694, 31860)	1
  (5694, 13114)	1
  (5694, 20470)	1
  (5694, 35337)	1
  (5694, 8614)	1
  (5694, 30116)	1
  (5694, 13507)	5
  (5694, 36243)	1
  (5694, 943)	2
  (5694, 2776)	1
  (5694, 30316)	1
  (5694, 17582)	1
  (5694, 33970)	1
  (5694, 10354)	1
  (5694, 11368)	1
