In [1]:
import os
import pandas as pd

# Path to the dataset folder
dataset_folder = 'dataset'

# Initialize an empty list to store the data
data = []

# Iterate through each subdirectory (spam, hard_ham, easy_ham)
for folder_name in os.listdir(dataset_folder):
    folder_path = os.path.join(dataset_folder, folder_name)
    
    # Determine the label based on the folder name
    if folder_name == 'spam':
        label = 'SPAM'
    else:
        label = 'HAM'
    
    # Iterate through each file in the subdirectory
    for filename in os.listdir(folder_path):
        file_path = os.path.join(folder_path, filename)
        
        # Read the content of the file
        with open(file_path, 'r', encoding='latin-1') as file:
            content = file.read()
        
        # Append the label and content to the data list
        data.append({'Label': label, 'Content': content})


# Create a DataFrame from the collected data
df_unshuffled = pd.DataFrame(data)
# Display the rows
print("Dataframe:\n\n",df_unshuffled)

df = df_unshuffled.sample(frac=1).reset_index(drop=True)

print("\n Data Schuffling :\n\n",df)

Dataframe:

      Label                                            Content
0      HAM  From exmh-workers-admin@redhat.com  Thu Aug 22...
1      HAM  From Steve_Burt@cursor-system.com  Thu Aug 22 ...
2      HAM  From timc@2ubh.com  Thu Aug 22 13:52:59 2002\n...
3      HAM  From irregulars-admin@tb.tf  Thu Aug 22 14:23:...
4      HAM  From exmh-users-admin@redhat.com  Thu Aug 22 1...
...    ...                                                ...
4193  SPAM  From tba@insiq.us  Wed Dec  4 11:46:34 2002\nR...
4194  SPAM  Return-Path: <raye@yahoo.lv>\nReceived: from u...
4195  SPAM  From cweqx@dialix.oz.au  Tue Aug  6 11:03:54 2...
4196  SPAM  From ilug-admin@linux.ie  Wed Dec  4 11:52:36 ...
4197  SPAM  mv 00001.317e78fa8ee2f54cd4890fdc09ba8176 0000...

[4198 rows x 2 columns]

 Data Schuffling :

      Label                                            Content
0      HAM  From rssfeeds@jmason.org  Tue Sep 24 10:47:25 ...
1     SPAM  From reply-56446664-9@william.monsterjoke.com ...
2      HAM

### Removal of unnecessary things 

In [2]:
import pandas as pd 
import email 
from email import policy
from email.parser import BytesParser

In [3]:
import re
# Regex patterns to extract metadata
metadata_patterns = {
    'From': r"^From: (.*)$",
    'To': r"^To: (.*)$",
    'Subject': r"^Subject: (.*)$",
    'Date': r"^Date: (.*)$",
}

def parse_email(raw_email):
    # Parse email metadata
    metadata = {}
    for key, pattern in metadata_patterns.items():
        match = re.search(pattern, raw_email, re.MULTILINE)
        if match:
            metadata[key] = match.group(1)

    # Extract email body
    email_body_start = raw_email.find('\n\n') + 2
    body = raw_email[email_body_start:]

    return metadata, body


# Replace 'Content' with the appropriate column name if it's different

# Apply the parse_email function to each row in the DataFrame
parsed_data = df['Content'].apply(parse_email)

# Create a new DataFrame from the parsed email data
parsed_df = pd.DataFrame(parsed_data.tolist(), columns=['Metadata', 'Body'])
parsed_df.head()

Unnamed: 0,Metadata,Body
0,"{'From': 'aaronsw <rssfeeds@example.com>', 'To...",URL: http://www.aaronsw.com/weblog/000611\nDat...
1,"{'From': '""Joke-of-the-Day!"" <reply-56446664-9...",<html><body><center><a href=http://www.vitafac...
2,{'From': 'Theo Van Dinter <felicity@kluge.net>...,\n--qcHopEYAB45HaUaB\nContent-Type: text/plain...
3,"{'From': '""4everyoung@telekbird.com.cn"" <4ever...",<=21-- saved from url=3D(0022)http://internet....
4,{'From': 'Julian Bond <julian_bond@voidstar.co...,"""Bort, Paul"" <pbort@tmwsystems.com> wrote:\n>I..."


In [4]:
# Remove the date from the body column
parsed_df['Body'] = parsed_df['Body'].apply(lambda x: re.sub(r"^Date: .*", "", x, flags=re.MULTILINE))
parsed_df.head()

Unnamed: 0,Metadata,Body
0,"{'From': 'aaronsw <rssfeeds@example.com>', 'To...",URL: http://www.aaronsw.com/weblog/000611\n\n\...
1,"{'From': '""Joke-of-the-Day!"" <reply-56446664-9...",<html><body><center><a href=http://www.vitafac...
2,{'From': 'Theo Van Dinter <felicity@kluge.net>...,\n--qcHopEYAB45HaUaB\nContent-Type: text/plain...
3,"{'From': '""4everyoung@telekbird.com.cn"" <4ever...",<=21-- saved from url=3D(0022)http://internet....
4,{'From': 'Julian Bond <julian_bond@voidstar.co...,"""Bort, Paul"" <pbort@tmwsystems.com> wrote:\n>I..."


In [13]:
def parse_email_body(body):
    # Remove HTML tags
    body = re.sub(r'<[^>]+>', '', body)
    
    # Remove URLs
    body = re.sub(r'http\S+', '', body)
    
    # Remove special characters and escape sequences
    body = re.sub(r'=22', '"', body)
    body = re.sub(r'=3D', '=', body)
    body = re.sub(r'<=21', '!', body)
    body = re.sub(r'=\n', '', body)
    body = re.sub(r'\n', ' ', body)
    
    return body.strip()

# Apply the parse_email_body function to the 'Body' column of parsed_df
parsed_df['Body'] = parsed_df['Body'].apply(parse_email_body)
parsed_df.head()

Unnamed: 0,Metadata,Body
0,"{'From': 'aaronsw <rssfeeds@example.com>', 'To...","URL: Finally finished up (most of) TRAMP, m..."
1,"{'From': '""Joke-of-the-Day!"" <reply-56446664-9...",Unsubscribe:Please send a blank mail to:unsub-...
2,{'From': 'Theo Van Dinter <felicity@kluge.net>...,--qcHopEYAB45HaUaB Content-Type: text/plain; c...
3,"{'From': '""4everyoung@telekbird.com.cn"" <4ever...",Business EVER YOUNG Nutritionals GH 2000 -...
4,{'From': 'Julian Bond <julian_bond@voidstar.co...,"""Bort, Paul"" wrote: >If your sendmail has bee..."


In [15]:
# Define a function to extract the main body of the email
def extract_main_body(body):
    # Split the body by lines
    lines = body.split('\n')
    main_body_lines = []
    # Iterate over the lines and exclude lines that indicate metadata or signatures
    for line in lines:
        if not re.match(r'^[-]+\s*$', line) and not re.match(r'^------------------', line) and not re.match(r'^\s*Yahoo! Groups Sponsor', line) and not re.match(r'^Your use of Yahoo! Groups is subject to', line):
            main_body_lines.append(line)
    # Join the remaining lines to form the main body
    main_body = '\n'.join(main_body_lines)
    return main_body.strip()

# Apply the extract_main_body function to the 'Body' column of parsed_df
parsed_df['Body'] = parsed_df['Body'].apply(extract_main_body)
parsed_df.head()

Unnamed: 0,Metadata,Body
0,"{'From': 'aaronsw <rssfeeds@example.com>', 'To...","URL: Finally finished up (most of) TRAMP, m..."
1,"{'From': '""Joke-of-the-Day!"" <reply-56446664-9...",Unsubscribe:Please send a blank mail to:unsub-...
2,{'From': 'Theo Van Dinter <felicity@kluge.net>...,--qcHopEYAB45HaUaB Content-Type: text/plain; c...
3,"{'From': '""4everyoung@telekbird.com.cn"" <4ever...",Business EVER YOUNG Nutritionals GH 2000 -...
4,{'From': 'Julian Bond <julian_bond@voidstar.co...,"""Bort, Paul"" wrote: >If your sendmail has bee..."


### Concatenate my df with my parced_df and keep only the Label column and Body column 

In [17]:
# Concatenate the two DataFrames along the columns axis
merged_df = pd.concat([df, parsed_df], axis=1)
merged_df.head()

Unnamed: 0,Label,Content,Metadata,Body
0,HAM,From rssfeeds@jmason.org Tue Sep 24 10:47:25 ...,"{'From': 'aaronsw <rssfeeds@example.com>', 'To...","URL: Finally finished up (most of) TRAMP, m..."
1,SPAM,From reply-56446664-9@william.monsterjoke.com ...,"{'From': '""Joke-of-the-Day!"" <reply-56446664-9...",Unsubscribe:Please send a blank mail to:unsub-...
2,HAM,From felicity@kluge.net Sun Sep 22 21:55:57 2...,{'From': 'Theo Van Dinter <felicity@kluge.net>...,--qcHopEYAB45HaUaB Content-Type: text/plain; c...
3,SPAM,Received: from hq.pro-ns.net (localhost [127.0...,"{'From': '""4everyoung@telekbird.com.cn"" <4ever...",Business EVER YOUNG Nutritionals GH 2000 -...
4,HAM,From razor-users-admin@lists.sourceforge.net ...,{'From': 'Julian Bond <julian_bond@voidstar.co...,"""Bort, Paul"" wrote: >If your sendmail has bee..."


In [18]:
# Drop the 'Metadata' and 'Content' columns from the DataFrame
merged_df = merged_df.drop(columns=['Metadata', 'Content'])
merged_df.head()

Unnamed: 0,Label,Body
0,HAM,"URL: Finally finished up (most of) TRAMP, m..."
1,SPAM,Unsubscribe:Please send a blank mail to:unsub-...
2,HAM,--qcHopEYAB45HaUaB Content-Type: text/plain; c...
3,SPAM,Business EVER YOUNG Nutritionals GH 2000 -...
4,HAM,"""Bort, Paul"" wrote: >If your sendmail has bee..."


In [19]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4198 entries, 0 to 4197
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Label   4198 non-null   object
 1   Body    4198 non-null   object
dtypes: object(2)
memory usage: 65.7+ KB


### Assign the category to spam and ham email

In [20]:
# Convert 'SPAM' label to 0 and 'HAM' label to 1
merged_df.loc[merged_df['Label'] == 'SPAM', 'Label'] = 0
merged_df.loc[merged_df['Label'] == 'HAM', 'Label'] = 1
merged_df.head()

Unnamed: 0,Label,Body
0,1,"URL: Finally finished up (most of) TRAMP, m..."
1,0,Unsubscribe:Please send a blank mail to:unsub-...
2,1,--qcHopEYAB45HaUaB Content-Type: text/plain; c...
3,0,Business EVER YOUNG Nutritionals GH 2000 -...
4,1,"""Bort, Paul"" wrote: >If your sendmail has bee..."


### Split the data set into training and testing

In [21]:
x = merged_df['Body']
y = merged_df['Label']

In [22]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2, random_state=42)

### Transform the text data to feature vectors that can be used as input to the logistic regression 

In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer

feature_extraction = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)
# Fit the vectorizer on the training data and transform both training and testing data
x_train_features = feature_extraction.fit_transform(x_train)
x_test_features = feature_extraction.transform(x_test)

In [24]:
# convert y_train and y_test values into integers

y_train = y_train.astype('int')
y_test = y_test.astype('int')

In [25]:
print(x_train)

964     --wLAMOaPNJ0fu1fTG Content-Type: text/plain; c...
239     What have you been up to? I don't know about y...
2387    The USAF had a program in the 1959-1960 in whi...
990     FIND OUT WHO THEY ARE CHATTING/E-MAILING WITH ...
108     Hello   You may have seen this business before...
                              ...                        
3444    On Tue, 10 Sep 2002 15:43:32 BST, \tJames Gibb...
466     &nbsp;                                        ...
3772    URL:    Eli the Bearded sez:       Ever wanted...
860     URL:    *Business:* Abbey National has receive...
Name: Body, Length: 3358, dtype: object


In [26]:
print(x_train_features)

  (0, 23531)	0.04625512601925466
  (0, 15071)	0.12198431999577275
  (0, 30434)	0.12198431999577275
  (0, 24674)	0.12198431999577275
  (0, 51504)	0.12198431999577275
  (0, 29855)	0.12198431999577275
  (0, 30247)	0.12198431999577275
  (0, 31911)	0.055983213682106694
  (0, 17736)	0.06520708570225747
  (0, 35563)	0.0490658645368861
  (0, 27526)	0.06736718893727206
  (0, 54822)	0.06564360221220951
  (0, 27532)	0.06703815884811398
  (0, 55229)	0.0496901639557801
  (0, 13599)	0.05854703177769256
  (0, 48947)	0.17758397977530405
  (0, 42389)	0.19152103486007543
  (0, 11601)	0.054202507402555264
  (0, 44275)	0.1002102416351794
  (0, 46592)	0.06410356494545678
  (0, 43741)	0.10191535139444276
  (0, 30767)	0.07935855805176022
  (0, 49349)	0.07825853281610132
  (0, 57149)	0.0584568339735166
  (0, 10043)	0.0860110700638136
  :	:
  (3356, 35888)	0.11713685679241877
  (3356, 20031)	0.15372779865810204
  (3356, 56125)	0.10729928238918222
  (3356, 35891)	0.08617365955894042
  (3356, 54611)	0.0658439727

### Train the model (Logistic Regression)

In [27]:
from sklearn.linear_model import LogisticRegression

In [28]:
mailModel = LogisticRegression()

In [29]:
mailModel.fit(x_train_features, y_train)

### The evaluating of the training model and the prediction of the training data

In [30]:
from sklearn.metrics import accuracy_score

In [32]:
prediction_training_data = mailModel.predict(x_train_features)
accuracy_training_data = accuracy_score(y_train, prediction_training_data)
print(accuracy_training_data)

0.9755807027992853


In [34]:
prediction_test_data = mailModel.predict(x_test_features)
accuracy_training_data = accuracy_score(y_test, prediction_test_data)
print(accuracy_training_data)

0.9607142857142857


In [39]:
input_your_email = ["urgent action required dear valued customer urgent action required update account information click link verify details thank cooperation"]
input_data_features = feature_extraction.transform(input_your_email)
prediction = mailModel.predict(input_data_features)
print(prediction)
if(prediction[0]==1):
    print('Ham mail')
else:
    print('Spam mail') 

[0]
Spam mail
