In [1]:
import os
import pandas as pd

# Path to the dataset folder
dataset_folder = 'dataset'

# Initialize an empty list to store the data
data = []

# Iterate through each subdirectory (spam, hard_ham, easy_ham)
for folder_name in os.listdir(dataset_folder):
    folder_path = os.path.join(dataset_folder, folder_name)
    
    # Determine the label based on the folder name
    if folder_name == 'spam':
        label = 'SPAM'
    else:
        label = 'HAM'
    
    # Iterate through each file in the subdirectory
    for filename in os.listdir(folder_path):
        file_path = os.path.join(folder_path, filename)
        
        # Read the content of the file
        with open(file_path, 'r', encoding='latin-1') as file:
            content = file.read()
        
        # Append the label and content to the data list
        data.append({'Label': label, 'Content': content})


# Create a DataFrame from the collected data
df_unshuffled = pd.DataFrame(data)
# Display the rows
print("Dataframe:\n\n",df_unshuffled)

df = df_unshuffled.sample(frac=1).reset_index(drop=True)

print("\n Data Schuffling :\n\n",df)

Dataframe:

      Label                                            Content
0      HAM  From exmh-workers-admin@redhat.com  Thu Aug 22...
1      HAM  From Steve_Burt@cursor-system.com  Thu Aug 22 ...
2      HAM  From timc@2ubh.com  Thu Aug 22 13:52:59 2002\n...
3      HAM  From irregulars-admin@tb.tf  Thu Aug 22 14:23:...
4      HAM  From exmh-users-admin@redhat.com  Thu Aug 22 1...
...    ...                                                ...
4193  SPAM  From tba@insiq.us  Wed Dec  4 11:46:34 2002\nR...
4194  SPAM  Return-Path: <raye@yahoo.lv>\nReceived: from u...
4195  SPAM  From cweqx@dialix.oz.au  Tue Aug  6 11:03:54 2...
4196  SPAM  From ilug-admin@linux.ie  Wed Dec  4 11:52:36 ...
4197  SPAM  mv 00001.317e78fa8ee2f54cd4890fdc09ba8176 0000...

[4198 rows x 2 columns]

 Data Schuffling :

      Label                                            Content
0      HAM  From tony@svanstrom.com  Wed Aug 28 11:02:33 2...
1     SPAM  From dave650@altavista.com  Mon Jun 24 17:49:4...
2     SPAM

### Removal of unnecessary things 

In [2]:
import pandas as pd 
import email 
from email import policy
from email.parser import BytesParser

In [3]:
import re
# Regex patterns to extract metadata
metadata_patterns = {
    'From': r"^From: (.*)$",
    'To': r"^To: (.*)$",
    'Subject': r"^Subject: (.*)$",
    'Date': r"^Date: (.*)$",
}

def parse_email(raw_email):
    # Parse email metadata
    metadata = {}
    for key, pattern in metadata_patterns.items():
        match = re.search(pattern, raw_email, re.MULTILINE)
        if match:
            metadata[key] = match.group(1)

    # Extract email body
    email_body_start = raw_email.find('\n\n') + 2
    body = raw_email[email_body_start:]

    return metadata, body


# Replace 'Content' with the appropriate column name if it's different

# Apply the parse_email function to each row in the DataFrame
parsed_data = df['Content'].apply(parse_email)

# Create a new DataFrame from the parsed email data
parsed_df = pd.DataFrame(parsed_data.tolist(), columns=['Metadata', 'Body'])
parsed_df.head()

Unnamed: 0,Metadata,Body
0,"{'From': '""Tony L. Svanstrom"" <tony@svanstrom....","On Tue, 27 Aug 2002 the voices made Robin Lynn..."
1,"{'From': '<dave650@altavista.com>', 'To': '<we...",<html>\n<head>\n<title>Have a BLAST in bed</ti...
2,"{'From': 'Webmaster_r4623@yahoo.com', 'To': 'w...","<html>\n<body bgcolor=""#7CB5F0"">\n<a href=""htt..."
3,"{'From': 'boingboing <rssfeeds@example.com>', ...",URL: http://boingboing.net/#85512062\nDate: No...
4,"{'From': 'tim.one@comcast.net (Tim Peters)', '...",[Tim]\n>> I'd prefer to strip HTML tags from e...


In [4]:
# Remove the date from the body column
parsed_df['Body'] = parsed_df['Body'].apply(lambda x: re.sub(r"^Date: .*", "", x, flags=re.MULTILINE))
parsed_df.head()

Unnamed: 0,Metadata,Body
0,"{'From': '""Tony L. Svanstrom"" <tony@svanstrom....","On Tue, 27 Aug 2002 the voices made Robin Lynn..."
1,"{'From': '<dave650@altavista.com>', 'To': '<we...",<html>\n<head>\n<title>Have a BLAST in bed</ti...
2,"{'From': 'Webmaster_r4623@yahoo.com', 'To': 'w...","<html>\n<body bgcolor=""#7CB5F0"">\n<a href=""htt..."
3,"{'From': 'boingboing <rssfeeds@example.com>', ...",URL: http://boingboing.net/#85512062\n\n\nHypo...
4,"{'From': 'tim.one@comcast.net (Tim Peters)', '...",[Tim]\n>> I'd prefer to strip HTML tags from e...


In [5]:
def parse_email_body(body):
    # Remove HTML tags
    body = re.sub(r'<[^>]+>', '', body)
    
    # Remove URLs
    body = re.sub(r'http\S+', '', body)
    
    # Remove special characters and escape sequences
    body = re.sub(r'=22', '"', body)
    body = re.sub(r'=3D', '=', body)
    body = re.sub(r'<=21', '!', body)
    body = re.sub(r'=\n', '', body)
    body = re.sub(r'\n', ' ', body)
    
    return body.strip()

# Apply the parse_email_body function to the 'Body' column of parsed_df
parsed_df['Body'] = parsed_df['Body'].apply(parse_email_body)
parsed_df.head()

Unnamed: 0,Metadata,Body
0,"{'From': '""Tony L. Svanstrom"" <tony@svanstrom....","On Tue, 27 Aug 2002 the voices made Robin Lynn..."
1,"{'From': '<dave650@altavista.com>', 'To': '<we...","Have a BLAST in bed Have a BLAST in bed, GUA..."
2,"{'From': 'Webmaster_r4623@yahoo.com', 'To': 'w...",4623
3,"{'From': 'boingboing <rssfeeds@example.com>', ...",URL: Hypocrites in the recording industry h...
4,"{'From': 'tim.one@comcast.net (Tim Peters)', '...",[Tim] >> I'd prefer to strip HTML tags from ev...


In [6]:
# Define a function to extract the main body of the email
def extract_main_body(body):
    # Split the body by lines
    lines = body.split('\n')
    main_body_lines = []
    # Iterate over the lines and exclude lines that indicate metadata or signatures
    for line in lines:
        if not re.match(r'^[-]+\s*$', line) and not re.match(r'^------------------', line) and not re.match(r'^\s*Yahoo! Groups Sponsor', line) and not re.match(r'^Your use of Yahoo! Groups is subject to', line):
            main_body_lines.append(line)
    # Join the remaining lines to form the main body
    main_body = '\n'.join(main_body_lines)
    return main_body.strip()

# Apply the extract_main_body function to the 'Body' column of parsed_df
parsed_df['Body'] = parsed_df['Body'].apply(extract_main_body)
parsed_df.head()

Unnamed: 0,Metadata,Body
0,"{'From': '""Tony L. Svanstrom"" <tony@svanstrom....","On Tue, 27 Aug 2002 the voices made Robin Lynn..."
1,"{'From': '<dave650@altavista.com>', 'To': '<we...","Have a BLAST in bed Have a BLAST in bed, GUA..."
2,"{'From': 'Webmaster_r4623@yahoo.com', 'To': 'w...",4623
3,"{'From': 'boingboing <rssfeeds@example.com>', ...",URL: Hypocrites in the recording industry h...
4,"{'From': 'tim.one@comcast.net (Tim Peters)', '...",[Tim] >> I'd prefer to strip HTML tags from ev...


### Concatenate my df with my parced_df and keep only the Label column and Body column 

In [7]:
# Concatenate the two DataFrames along the columns axis
merged_df = pd.concat([df, parsed_df], axis=1)
merged_df.head()

Unnamed: 0,Label,Content,Metadata,Body
0,HAM,From tony@svanstrom.com Wed Aug 28 11:02:33 2...,"{'From': '""Tony L. Svanstrom"" <tony@svanstrom....","On Tue, 27 Aug 2002 the voices made Robin Lynn..."
1,SPAM,From dave650@altavista.com Mon Jun 24 17:49:4...,"{'From': '<dave650@altavista.com>', 'To': '<we...","Have a BLAST in bed Have a BLAST in bed, GUA..."
2,SPAM,From Webmaster_r@icqmail.com Wed Jul 3 12:08...,"{'From': 'Webmaster_r4623@yahoo.com', 'To': 'w...",4623
3,HAM,From rssfeeds@jmason.org Wed Oct 2 11:44:52 ...,"{'From': 'boingboing <rssfeeds@example.com>', ...",URL: Hypocrites in the recording industry h...
4,HAM,Return-Path: tim.one@comcast.net\nDelivery-Dat...,"{'From': 'tim.one@comcast.net (Tim Peters)', '...",[Tim] >> I'd prefer to strip HTML tags from ev...


In [8]:
# Drop the 'Metadata' and 'Content' columns from the DataFrame
merged_df = merged_df.drop(columns=['Metadata', 'Content'])
merged_df.head()

Unnamed: 0,Label,Body
0,HAM,"On Tue, 27 Aug 2002 the voices made Robin Lynn..."
1,SPAM,"Have a BLAST in bed Have a BLAST in bed, GUA..."
2,SPAM,4623
3,HAM,URL: Hypocrites in the recording industry h...
4,HAM,[Tim] >> I'd prefer to strip HTML tags from ev...


In [9]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4198 entries, 0 to 4197
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Label   4198 non-null   object
 1   Body    4198 non-null   object
dtypes: object(2)
memory usage: 65.7+ KB


### Assign the category to spam and ham email

In [10]:
# Convert 'SPAM' label to 0 and 'HAM' label to 1
merged_df.loc[merged_df['Label'] == 'SPAM', 'Label'] = 0
merged_df.loc[merged_df['Label'] == 'HAM', 'Label'] = 1
merged_df.head()

Unnamed: 0,Label,Body
0,1,"On Tue, 27 Aug 2002 the voices made Robin Lynn..."
1,0,"Have a BLAST in bed Have a BLAST in bed, GUA..."
2,0,4623
3,1,URL: Hypocrites in the recording industry h...
4,1,[Tim] >> I'd prefer to strip HTML tags from ev...


### Split the data set into training and testing

In [11]:
x = merged_df['Body']
y = merged_df['Label']

In [12]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2, random_state=42)

### Transform the text data to feature vectors that can be used as input to the logistic regression 

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

feature_extraction = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)
# Fit the vectorizer on the training data and transform both training and testing data
x_train_features = feature_extraction.fit_transform(x_train)
x_test_features = feature_extraction.transform(x_test)

In [14]:
# convert y_train and y_test values into integers

y_train = y_train.astype('int')
y_test = y_test.astype('int')

In [15]:
print(x_train)

964     Definitional nit to pick:  Robert Harley write...
239     URL:    Japanese toilet technology has develop...
2387    --===_SecAtt_000_1fuklemuttfusq Content-Type: ...
990     NEW! -&gt; Vigoral Herbal Love Enhancers &lt;-...
108     --eJnRUKwClWJh1Khz Content-Type: text/plain; c...
                              ...                        
3444    [Jeremy Hylton] > I think one step towards dep...
466     Hello,  Premium Phone Qualified  Business Oppo...
3092    This is a multi-part message in MIME format.  ...
3772    URL:    I am thinking about getting a new Mac....
860     On Thursday, August 22, 2002, at 10:24  AM, Ju...
Name: Body, Length: 3358, dtype: object


In [16]:
print(x_train_features)

  (0, 23992)	0.06759202437700694
  (0, 50206)	0.09535654439965532
  (0, 16696)	0.06798185201716103
  (0, 49880)	0.054278016664376025
  (0, 39871)	0.07341210551766524
  (0, 15009)	0.08928350312346656
  (0, 25204)	0.05790860071582236
  (0, 33892)	0.11908947388409351
  (0, 33138)	0.17056658619663329
  (0, 4049)	0.06684873999406919
  (0, 36815)	0.19948148319734568
  (0, 37326)	0.06014429311429126
  (0, 44773)	0.04785963681383048
  (0, 33258)	0.054403823953132806
  (0, 52390)	0.10781678138134491
  (0, 47459)	0.1001769884475507
  (0, 44260)	0.09535654439965532
  (0, 20308)	0.06614920424502743
  (0, 37328)	0.2746040222003029
  (0, 6939)	0.28253893524816037
  (0, 35737)	0.07866161759276477
  (0, 46783)	0.039619033284625405
  (0, 44258)	0.2271019547709221
  (0, 24848)	0.044847553138721914
  (0, 37999)	0.04892728638910912
  :	:
  (3357, 32542)	0.04115714227405116
  (3357, 1570)	0.06107566950443514
  (3357, 9416)	0.060388158773411436
  (3357, 13310)	0.05005059385750853
  (3357, 51521)	0.042004147

### Train the model (Logistic Regression)

In [17]:
from sklearn.linear_model import LogisticRegression

In [18]:
mailModel = LogisticRegression()

In [19]:
mailModel.fit(x_train_features, y_train)

### The evaluating of the training model and the prediction of the training data

In [20]:
from sklearn.metrics import accuracy_score

In [22]:
prediction_training_data = mailModel.predict(x_train_features)
accuracy_training_data = accuracy_score(y_train, prediction_training_data)
print(accuracy_training_data)

0.9714115544967242


In [23]:
prediction_test_data = mailModel.predict(x_test_features)
accuracy_testing_data = accuracy_score(y_test, prediction_test_data)
print(accuracy_testing_data)

0.969047619047619


In [24]:
input_your_email = ["urgent action required dear valued customer urgent action required update account information click link verify details thank cooperation"]
input_data_features = feature_extraction.transform(input_your_email)
prediction = mailModel.predict(input_data_features)
print(prediction)
if(prediction[0]==1):
    print('Ham mail')
else:
    print('Spam mail') 

[0]
Spam mail
