Source: https://colab.research.google.com/drive/1FMS93E029-Z6-1dEt1Gm5QJqSlVxdbTs

In [1]:
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
import random

In [2]:
# Define parameters

# Seed both random number generators used in this notebook (the stdlib `random`
# module and NumPy's global generator) so the random draws repeat on every
# Restart & Run All. The date columns remain relative to the current clock.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

# Number of rows to generate for each class.
num_samples_bad_emails = 1000
num_samples_good_emails = 1000
file_size_range = (1000, 10000000)

# Attachment extensions drawn for each class.
attachment_types_bad = [".exe", ".dll", ".zip", ".js"]
attachment_types_good = [".doc", ".pdf", ".txt", ".jpg", ".png"]

# Phrases used as the subject of bad emails.
keywords_bad_subject = [
    "urgent",
    "action required",
    "account suspension",
    "security alert",
    "click here",
    "verify your account",
    "password reset",
    "your account has been compromised",
    "important notice",
    "confirm your identity",
    "unauthorized access",
    "payment verification required",
    "immediate action needed",
    "account locked",
    "suspicious activity detected",
]
# Bad-email bodies use the same phrases as the subjects. A copy is taken (rather
# than an alias) so either list can later be edited without affecting the other.
keywords_bad_body = list(keywords_bad_subject)

# Phrases used as the subject of good emails.
keywords_good_subject = [
    "meeting",
    "invoice",
    "report",
    "thank you",
    "attached file",
    "please find attached",
    "details attached",
    "feedback requested",
    "confirmation",
    "subscription update",
    "delivery confirmation",
    "account update",
    "newsletter",
    "invitation",
    "announcement",
    "welcome",
    "feedback request",
    "survey response required",
]
# Good-email bodies use the same phrases as the subjects (independent copy).
keywords_good_body = list(keywords_good_subject)

spam_phrases = ["you have won", "limited time offer", "free gift", "click now"]

# Read the clock once: two separate datetime.now() calls would make the window
# slightly longer than 365 days.
_now = datetime.now()
start_date = _now - timedelta(days=365)
end_date = _now

In [3]:
def generate_email_length(min_length=50, max_length=1000):
    """Generate a random email body length in characters.

    Args:
        min_length: Smallest length that can be returned (inclusive).
        max_length: Largest length that can be returned (inclusive).

    Returns:
        A random integer n with min_length <= n <= max_length.

    Raises:
        ValueError: If max_length is smaller than min_length.
    """
    # np.random.randint excludes its upper bound, so add 1 to make the
    # documented maximum reachable.
    return np.random.randint(min_length, max_length + 1)


def generate_num_attachments(min_attachments=0, max_attachments=5):
    """Generate a random number of attachments.

    Args:
        min_attachments: Smallest count that can be returned (inclusive).
        max_attachments: Largest count that can be returned (inclusive).

    Returns:
        A random integer n with min_attachments <= n <= max_attachments.

    Raises:
        ValueError: If max_attachments is smaller than min_attachments.
    """
    # np.random.randint excludes its upper bound, so add 1 to make the
    # documented maximum reachable.
    return np.random.randint(min_attachments, max_attachments + 1)


# Function to generate presence of links
def generate_presence_of_links():
    """Return 1 when the email has links and 0 otherwise (50% chance each)."""
    has_links = random.random() < 0.5
    return int(has_links)


def generate_time_since_last_email(min_days=1, max_days=30):
    """Generate a random number of days since the last email.

    Args:
        min_days: Smallest value that can be returned (inclusive).
        max_days: Largest value that can be returned (inclusive).

    Returns:
        A random integer n with min_days <= n <= max_days.

    Raises:
        ValueError: If max_days is smaller than min_days.
    """
    # np.random.randint excludes its upper bound, so add 1 to make the
    # documented maximum reachable.
    return np.random.randint(min_days, max_days + 1)


def generate_frequency_of_email(min_per_week=1, max_per_week=10):
    """Generate a random email frequency in emails per week.

    Args:
        min_per_week: Smallest frequency that can be returned (inclusive).
        max_per_week: Largest frequency that can be returned (inclusive).

    Returns:
        A random integer n with min_per_week <= n <= max_per_week.

    Raises:
        ValueError: If max_per_week is smaller than min_per_week.
    """
    # np.random.randint excludes its upper bound, so add 1 to make the
    # documented maximum reachable.
    return np.random.randint(min_per_week, max_per_week + 1)


# Function to generate subjectivity score
def generate_subjectivity_score():
    """Draw a subjectivity score from a uniform distribution over 0 to 1."""
    lower_bound, upper_bound = 0, 1
    return np.random.uniform(lower_bound, upper_bound)


def generate_num_recipients(min_recipients=1, max_recipients=10):
    """Generate a random number of recipients.

    Args:
        min_recipients: Smallest count that can be returned (inclusive).
        max_recipients: Largest count that can be returned (inclusive).

    Returns:
        A random integer n with min_recipients <= n <= max_recipients.

    Raises:
        ValueError: If max_recipients is smaller than min_recipients.
    """
    # np.random.randint excludes its upper bound, so add 1 to make the
    # documented maximum reachable.
    return np.random.randint(min_recipients, max_recipients + 1)


# Function to generate time of day
def generate_time_of_day():
    """Pick one of "morning", "afternoon" or "evening" at random."""
    day_parts = ["morning", "afternoon", "evening"]
    return random.choice(day_parts)

In [4]:
# gen synth data for bad emails

# Subjects and bodies are drawn from the bad keywords PLUS the spam phrases.
# With the keyword lists defined in this notebook, no bad keyword contains a
# spam phrase, so drawing from the keywords alone would leave the
# "Spam/Phishing Indicators" column at 0 for every row.
bad_subject_pool = keywords_bad_subject + spam_phrases
bad_body_pool = keywords_bad_body + spam_phrases

bad_email_data = []
for _ in range(num_samples_bad_emails):
    sender_reputation = random.uniform(0, 1)
    email_subject = random.choice(bad_subject_pool)
    email_body = random.choice(bad_body_pool)
    attachment_type = random.choice(attachment_types_bad)
    links_in_email = generate_presence_of_links()
    email_header_info = random.uniform(0, 1)
    time_of_arrival = random.uniform(0, 1)
    email_recipients = generate_num_recipients()

    # 1 when any spam phrase occurs as a substring of the subject or the body.
    spam_phishing_indicators = int(
        any(
            phrase in email_subject or phrase in email_body
            for phrase in spam_phrases
        )
    )

    # Creation date is uniform over [start_date, end_date]; the modification
    # date is 1-364 days later (np.random.randint excludes its upper bound).
    # NOTE(review): the modification date can therefore fall after end_date.
    metadata_creation_date = start_date + (end_date - start_date) * np.random.rand()
    metadata_modification_date = metadata_creation_date + timedelta(
        days=np.random.randint(1, 365)
    )

    email_length = generate_email_length()
    num_attachments = generate_num_attachments()
    time_since_last_email = generate_time_since_last_email()
    frequency_of_email = generate_frequency_of_email()
    subjectivity_score = generate_subjectivity_score()
    # NOTE(review): drawn independently of `email_recipients`, so the
    # "Email Recipients" and "Number of Recipients" columns are unrelated.
    num_recipients = generate_num_recipients()
    time_of_day = generate_time_of_day()

    # Field order must match the DataFrame column order; the trailing 1 is the
    # label for a bad email.
    bad_email_data.append(
        [
            sender_reputation,
            email_subject,
            email_body,
            attachment_type,
            links_in_email,
            email_header_info,
            time_of_arrival,
            email_recipients,
            spam_phishing_indicators,
            metadata_creation_date,
            metadata_modification_date,
            email_length,
            num_attachments,
            time_since_last_email,
            frequency_of_email,
            subjectivity_score,
            num_recipients,
            time_of_day,
            1,
        ]
    )

In [5]:
# gen synth data for good emails
def _build_good_email_row():
    """Draw one synthetic good-email record, in DataFrame column order.

    Random draws happen in the same order as the fields are listed, and the
    trailing 0 is the label for a good email.
    """
    record = [
        random.uniform(0, 1),  # sender reputation
        random.choice(keywords_good_subject),  # email subject
        random.choice(keywords_good_body),  # email body
        random.choice(attachment_types_good),  # attachment type
        generate_presence_of_links(),  # links in email
        random.uniform(0, 1),  # email header info
        random.uniform(0, 1),  # time of arrival
        generate_num_recipients(),  # email recipients
        0,  # spam/phishing indicators: always 0 for good emails
    ]

    # Creation date lies between start_date and end_date; the modification
    # date is the creation date shifted forward by a random number of days.
    created_at = start_date + (end_date - start_date) * np.random.rand()
    modified_at = created_at + timedelta(days=np.random.randint(1, 365))

    record += [
        created_at,
        modified_at,
        generate_email_length(),
        generate_num_attachments(),
        generate_time_since_last_email(),
        generate_frequency_of_email(),
        generate_subjectivity_score(),
        generate_num_recipients(),
        generate_time_of_day(),
        0,  # label
    ]
    return record


good_email_data = [_build_good_email_row() for _ in range(num_samples_good_emails)]

In [6]:
# Build one list of rows: every bad-email row first, then every good-email row
data = [*bad_email_data, *good_email_data]

In [7]:
# Convert to DataFrame for easier handling.
# Column names are grouped by theme; their concatenated order must match the
# order in which each generated row lists its fields.
_content_columns = [
    "Sender Reputation",
    "Email Subject",
    "Email Body",
    "Attachment Type",
    "Links in Email",
    "Email Header Info",
    "Time of Arrival",
    "Email Recipients",
    "Spam/Phishing Indicators",
]
_metadata_columns = [
    "Metadata Creation Date",
    "Metadata Modification Date",
]
_behaviour_columns = [
    "Email Length",
    "Number of Attachments",
    "Time Since Last Email",
    "Frequency of Email",
    "Subjectivity Score",
    "Number of Recipients",
    "Time of Day",
]
columns = _content_columns + _metadata_columns + _behaviour_columns + ["Label"]
df = pd.DataFrame(data=data, columns=columns)

In [None]:
# Preview the first five rows. This runs before the shuffle cell, and the rows
# were assembled as bad emails followed by good emails, so on a fresh
# top-to-bottom run this preview shows only Label == 1 rows.
df.head(n=5)

Unnamed: 0,Sender Reputation,Email Subject,Email Body,Attachment Type,Links in Email,Email Header Info,Time of Arrival,Email Recipients,Spam/Phishing Indicators,Metadata Creation Date,Metadata Modification Date,Email Length,Number of Attachments,Time Since Last Email,Frequency of Email,Subjectivity Score,Number of Recipients,Time of Day,Label
0,0.732123,invitation,feedback request,.png,0,0.1416,0.323311,8,0,2023-07-05 06:58:32.271078,2024-05-08 06:58:32.271078,680,4,8,1,0.303372,1,evening,0
1,0.858427,confirm your identity,unauthorized access,.js,1,0.676196,0.848633,7,0,2024-01-24 09:43:52.098709,2024-07-09 09:43:52.098709,163,0,6,6,0.845329,4,afternoon,1
2,0.421975,report,subscription update,.pdf,0,0.894849,0.219638,5,0,2023-06-30 21:27:33.951214,2023-11-22 21:27:33.951214,845,2,27,8,0.035958,3,afternoon,0
3,0.109848,delivery confirmation,confirmation,.png,1,0.136166,0.952526,6,0,2023-08-04 05:16:25.445592,2024-01-16 05:16:25.445592,617,4,1,6,0.884829,8,afternoon,0
4,0.652415,newsletter,meeting,.doc,1,0.490723,0.453885,4,0,2024-02-10 02:26:35.868473,2024-03-23 02:26:35.868473,250,1,19,9,0.921329,6,afternoon,0


In [8]:
# Randomly reorder all rows, then renumber the index from 0
df = (
    df.sample(frac=1)
    .reset_index(drop=True)
)

# Write the shuffled dataset to CSV, leaving out the index column
df.to_csv(path_or_buf="synthetic_email_dataset_with_features.csv", index=False)