In [28]:
import pandas as pd
import re

from email.utils import parseaddr

# Columns that the feature extractor reads as text.
TEXT_COLUMNS = ["sender", "receiver", "subject", "body", "date"]

# Load the raw CSV, then replace missing values in the text columns with ""
# and cast them to str so later string operations never see NaN.
df = pd.read_csv("../dataset/raw/CEAS_08.csv")
df[TEXT_COLUMNS] = df[TEXT_COLUMNS].fillna("").astype(str)


In [None]:
SPAM_WORDS = [
    # Financial
    "free", "win", "winner", "prize", "cash", "money",
    "bonus", "reward", "lottery", "gift",

    # Urgency
    "urgent", "immediately", "action required", "final notice",
    "important", "alert", "warning",

    # Credential theft
    "verify", "confirm", "password", "account", "login",
    "unauthorized", "security", "reset", "access",

    # Phishing actions
    "click", "open", "review", "download", "invoice", "statement",

    # Commercial
    "offer", "promo", "sale", "discount", "limited time"
]

# Single alternation over every SPAM_WORDS entry, anchored on word boundaries.
# A plain substring test would flag "window" for "win", "wholesale" for "sale",
# "freedom" for "free", etc. Inflected forms ("clicking", "passwords") are NOT
# matched; add them to SPAM_WORDS explicitly if they should count.
_SPAM_WORD_RE = re.compile(
    r"\b(?:" + "|".join(re.escape(word) for word in SPAM_WORDS) + r")\b"
)

# A run of two or more identical characters drawn from ! ? . ,
_REPEATED_PUNCT_RE = re.compile(r"([!?.,])\1{1,}")

# Start of an http/https URL (applied to lower-cased text).
_LINK_RE = re.compile(r"https?://")


def _address_domain(address):
    """Return the lower-cased domain of an address header value, or "" if it has no "@"."""
    _, email = parseaddr(address)
    return email.split("@")[-1].lower() if "@" in email else ""


def _text_features(text, prefix, count_links=False):
    """Compute the surface features shared by the subject and the body.

    Args:
        text: The raw text to describe.
        prefix: Prepended (with "_") to every feature name, e.g. "subject".
        count_links: When True, also emit "<prefix>_num_links" right after
            "<prefix>_length".

    Returns:
        dict of feature name -> value, in a fixed insertion order.
    """
    lowered = text.lower()

    features = {
        prefix + "_text": text,
        prefix + "_length": len(text),
    }
    if count_links:
        features[prefix + "_num_links"] = len(_LINK_RE.findall(lowered))

    features[prefix + "_num_exclamations"] = text.count("!")
    features[prefix + "_num_question"] = text.count("?")
    # Length of the longest run of a repeated punctuation mark; stays at 1 when
    # no run of two or more exists (including text with no punctuation at all).
    features[prefix + "_max_repeated_punct"] = max(
        (len(m.group(0)) for m in _REPEATED_PUNCT_RE.finditer(text)),
        default=1,
    )
    features[prefix + "_num_caps"] = sum(1 for c in text if c.isupper())
    features[prefix + "_has_spam_words"] = int(
        _SPAM_WORD_RE.search(lowered) is not None
    )
    return features


def extract_features(row):
    """Turn one e-mail record into a flat dict of features.

    Args:
        row: Mapping (e.g. a DataFrame row or a dict) with the keys "sender",
            "receiver", "subject", "body" and "date". Every value is passed
            through str() before use.

    Returns:
        dict with, in order: sender_has_numbers, sender_domain_length,
        domain_mismatch, the subject_* features, the body_* features
        (including body_num_links), email_hour and email_day_of_week.
        email_hour / email_day_of_week are -1 when the date cannot be parsed.
    """
    features = {}

    # Sender format: Full Name <Local@Domain>
    sender = str(row["sender"])
    sender_domain = _address_domain(sender)
    # Digits anywhere in the raw header count, display name included.
    features["sender_has_numbers"] = int(bool(re.search(r"\d", sender)))
    features["sender_domain_length"] = len(sender_domain)

    # Receiver format: Local@Domain
    receiver_domain = _address_domain(str(row["receiver"]))
    # Note: two missing domains compare equal, so this is 0 when both are "".
    features["domain_mismatch"] = int(sender_domain != receiver_domain)

    features.update(_text_features(str(row["subject"]), "subject"))
    features.update(_text_features(str(row["body"]), "body", count_links=True))

    # Date: unparseable values become NaT and are encoded as -1.
    dt = pd.to_datetime(str(row["date"]), errors="coerce")
    has_date = not pd.isna(dt)
    features["email_hour"] = dt.hour if has_date else -1
    features["email_day_of_week"] = dt.dayofweek if has_date else -1

    return features

In [30]:

# Run the per-e-mail extractor over every row; each call returns one dict.
feature_records = df.apply(extract_features, axis=1)

# Expand the dicts into columns. Reusing df's index keeps the label assignment
# below aligned row-for-row even if df is filtered or re-indexed upstream
# (a fresh RangeIndex would silently misalign or produce NaN labels).
features_df = pd.DataFrame(feature_records.tolist(), index=df.index)
features_df["label"] = df["label"]

# Persist the feature table and print a column/dtype summary as a sanity check.
features_df.to_csv("../dataset/processed/CEAS08_features.csv", index=False)
features_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39154 entries, 0 to 39153
Data columns (total 16 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   sender_has_numbers        39154 non-null  int64 
 1   sender_domain_length      39154 non-null  int64 
 2   domain_mismatch           39154 non-null  int64 
 3   subject_text              39154 non-null  object
 4   subject_length            39154 non-null  int64 
 5   subject_num_exclamations  39154 non-null  int64 
 6   subject_num_caps          39154 non-null  int64 
 7   subject_has_spam_words    39154 non-null  int64 
 8   body_text                 39154 non-null  object
 9   body_length               39154 non-null  int64 
 10  body_num_links            39154 non-null  int64 
 11  body_num_exclamations     39154 non-null  int64 
 12  body_has_spam_words       39154 non-null  int64 
 13  email_hour                39154 non-null  int64 
 14  email_day_of_week     