In [None]:
import email.parser

%load_ext autoreload
%autoreload 2

import os
from email import policy
from email.parser import BytesParser

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import polars as pl

from src.spam_classifier.constants import PROJECT_ROOT

# Folder holding the datasets, resolved relative to PROJECT_ROOT.
DATA_DIR = PROJECT_ROOT / "data"

# One folder per class; every entry inside is parsed as a single email later on.
# presumably the SpamAssassin public corpus ("easy_ham" / "spam") — TODO confirm
EASY_HAM = DATA_DIR / "easy_ham_1"
SPAM = DATA_DIR / "spam_1"

In [None]:
def parse_email(filepath):
    """Parse one raw email file and extract its body and top-level type.

    Parameters
    ----------
    filepath : str or os.PathLike
        Location of a single raw message on disk. It is opened in binary
        mode and parsed with ``policy.default``.

    Returns
    -------
    dict
        ``{"body": ..., "type": ...}`` where ``type`` is the content type of
        the top-level message and ``body`` is:

        * for a multipart message, the content of the first ``text/plain``
          part that carries no Content-Disposition (``""`` if there is none);
        * otherwise, whatever ``get_content()`` returns for the message.

        If extracting the body raises, the error is printed and ``body`` is
        left as ``""`` so that one bad message does not abort a bulk load.
    """
    with open(filepath, "rb") as f:
        msg = BytesParser(policy=policy.default).parse(f)

    # Default used when no suitable part exists or extraction fails.
    body = ""
    try:
        if msg.is_multipart():
            for part in msg.walk():
                # Skip parts with any Content-Disposition (attachment or inline).
                if part.get_content_type() == "text/plain" and not part.get_content_disposition():
                    body = part.get_content()
                    break
        else:
            body = msg.get_content()
    except Exception as e:
        # Deliberately best-effort: report and fall through with the default.
        # Build the display name without assuming `filepath` is a Path object,
        # so plain string paths do not raise AttributeError in this handler.
        name = os.path.basename(os.fspath(filepath))
        print(f"Error processing {name}: {e}")

    return {"body": body, "type": msg.get_content_type()}

In [None]:
# iterdir() yields entries in arbitrary order and may include sub-directories.
# Sort the paths and keep regular files only, so list positions are stable
# across runs and machines and open() is never handed a directory.
ham_emails = [parse_email(f) for f in sorted(EASY_HAM.iterdir()) if f.is_file()]
spam_emails = [parse_email(f) for f in sorted(SPAM.iterdir()) if f.is_file()]

In [None]:
# Top-level content type of every parsed ham message.
types_of_ham_emails = [parsed["type"] for parsed in ham_emails]
# Despite the `_df` suffix this is a polars Series, not a DataFrame.
types_of_ham_emails_df = pl.Series(types_of_ham_emails)

# Frequency table, most common type first, with raw counts replaced by their
# share of the total.
(
    types_of_ham_emails_df.value_counts()
    .sort("count", descending=True)
    .with_columns((pl.col("count") / pl.col("count").sum()).alias("count"))
)

In [None]:
# Top-level content type of every parsed spam message.
types_of_spam_emails = [parsed["type"] for parsed in spam_emails]
# Despite the `_df` suffix this is a polars Series, not a DataFrame.
types_of_spam_emails_df = pl.Series(types_of_spam_emails)

# Frequency table, most common type first, with raw counts replaced by their
# share of the total.
(
    types_of_spam_emails_df.value_counts()
    .sort("count", descending=True)
    .with_columns((pl.col("count") / pl.col("count").sum()).alias("count"))
)

In [None]:
from src.spam_classifier.mail_class import Mail

# Run the Mail transform on the first spam message whose top-level type is
# text/html, then stop.
for mail in spam_emails:
    if mail["type"] != "text/html":
        continue
    mail_inst = Mail(mail["body"], mail["type"])
    mail_inst.transform_mail()
    break

In [None]:
from src.spam_classifier.mail_class import Mail

# Run the Mail transform on the first ham message whose top-level type is
# multipart/related.
# NOTE(review): the [1:] slice skips the first parsed message; the reason is
# not visible in this notebook — confirm it is still wanted.
for mail in ham_emails[1:]:
    if mail["type"] == "multipart/related":
        print(mail)
        mail_inst = Mail(mail["body"], mail["type"])
        mail_inst.transform_mail()
        # Stop only once a matching message has been processed. The break used
        # to sit at loop level, which ended the search after a single message
        # whether or not it matched.
        break