In [None]:
from nltk.lm import Vocabulary

%load_ext autoreload
%autoreload 2

import os
from email import policy
from email.parser import BytesParser

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import polars as pl

from src.spam_classifier.constants import PROJECT_ROOT

# Data folder located directly under the project root.
DATA_DIR = PROJECT_ROOT / "data"

# Corpus folders whose entries are parsed as e-mails further down
# (one for ham messages, one for spam messages).
EASY_HAM = DATA_DIR / "easy_ham_1"
SPAM = DATA_DIR / "spam_1"

In [None]:
def parse_email(filepath):
    """Parse a raw e-mail file into its text body and top-level content type.

    For multipart messages the first ``text/plain`` part that carries no
    content disposition (i.e. is not an attachment) is used, followed by a
    single trailing space. For single-part messages the decoded content is
    used when it is text.

    Args:
        filepath: Location of the raw message; anything ``open`` accepts
            (``str`` or path-like).

    Returns:
        dict: ``{"body": str, "type": str}``. ``body`` is always a string and
        is empty when no usable text was found or when decoding failed;
        ``type`` is the content type of the top-level message.
    """
    with open(filepath, "rb") as f:
        msg = BytesParser(policy=policy.default).parse(f)

    # Initialised before the try block so the fallback value is explicit.
    body = ""
    try:
        if msg.is_multipart():
            for part in msg.walk():
                if part.get_content_type() == "text/plain" and not part.get_content_disposition():
                    body = f"{part.get_content()} "
                    break
        else:
            content = msg.get_content()
            # Non-text payloads come back as bytes (or a message object);
            # keep the body column homogeneous by only accepting text.
            if isinstance(content, str):
                body = content
    except Exception as e:
        # Best effort: report the failing file and fall back to an empty body.
        # os.path.basename works for str and path-like inputs alike, whereas
        # ``filepath.name`` raised AttributeError for plain strings.
        print(f"Error processing {os.path.basename(filepath)}: {e}")

    return {"body": body, "type": msg.get_content_type()}

In [None]:
# Path.iterdir() yields entries in arbitrary, filesystem-dependent order and
# also returns sub-directories. Sorting keeps the row order reproducible
# across runs/machines, and the is_file() filter avoids open() failing on a
# directory.
ham_emails = [parse_email(f) for f in sorted(p for p in EASY_HAM.iterdir() if p.is_file())]
ham_emails_df = pl.from_dicts(ham_emails)

spam_emails = [parse_email(f) for f in sorted(p for p in SPAM.iterdir() if p.is_file())]
spam_emails_df = pl.from_dicts(spam_emails)

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

from src.spam_classifier.mail_class import Mail


class MailTransformer(BaseEstimator, TransformerMixin):
    """Stateless transformer that runs every (body, type) row through ``Mail``."""

    def __init__(self):
        """Nothing to configure."""

    def fit(self, X, y=None):
        """Learn nothing and hand back the transformer itself."""
        return self

    def transform(self, X):
        """Convert each row of ``X`` with ``Mail(...).transform_mail()``.

        Args:
            X: Object exposing ``rows()`` whose rows unpack into exactly two
                values, the mail body and its content type (in that order).

        Returns:
            list: One ``transform_mail()`` result per row, in row order.
        """
        return [
            Mail(text, content_type).transform_mail()
            for text, content_type in X.rows()
        ]

In [None]:
# Apply MailTransformer to the spam frame; its fit() learns nothing, so the
# result is simply the list produced by transform().
transformed_spam = MailTransformer().fit_transform(spam_emails_df)

In [None]:
class MailVocabulary(BaseEstimator, TransformerMixin):
    """Learn the most frequent words and encode word-count dicts as count vectors.

    Each sample is expected to be a mapping ``word -> count``. After fitting,
    ``vocabulary_`` maps the reserved token ``"unknown"`` to index 0 and the
    ``vocab_size`` most frequent words to indices ``1..vocab_size``.

    Args:
        vocab_size: Maximum number of distinct words kept in the vocabulary
            (the reserved index 0 is not counted).
    """

    # Reserved entry (index 0) that collects every out-of-vocabulary word.
    _UNKNOWN_TOKEN = "unknown"

    def __init__(self, vocab_size: int = 1000):
        self.vocab_size = vocab_size

    def fit(self, X, y=None):
        """Count words over all samples and keep the most frequent ones.

        Args:
            X: Iterable of ``word -> count`` mappings.
            y: Ignored; present for scikit-learn API compatibility.

        Returns:
            MailVocabulary: ``self``, with ``vocabulary_`` set.
        """
        word_counter = {}
        for word_dict in X:
            for word, count in word_dict.items():
                word_counter[word] = word_counter.get(word, 0) + count

        # The literal word "unknown" must not also claim a regular slot:
        # it would overwrite the reserved entry and leave index 0 unused.
        # Occurrences of that word are therefore folded into index 0.
        word_counter.pop(self._UNKNOWN_TOKEN, None)

        # sorted() is stable, so ties keep first-seen order.
        ranked = sorted(word_counter, key=word_counter.get, reverse=True)
        most_common = ranked[: self.vocab_size]
        self.vocabulary_ = {
            word: i for i, word in enumerate([self._UNKNOWN_TOKEN] + most_common)
        }
        return self

    def transform(self, X):
        """Encode each sample as a vector of word counts.

        Replaces the earlier debugging stub, which printed the first sample
        and returned ``None``.

        Args:
            X: Iterable of ``word -> count`` mappings (counts are assumed to
                be integers — TODO confirm against ``Mail.transform_mail``).

        Returns:
            numpy.ndarray: Integer array of shape
            ``(n_samples, len(vocabulary_))``. Column ``j`` holds the count of
            the word with index ``j``; column 0 accumulates the counts of all
            words that are not in the vocabulary.
        """
        word_dicts = list(X)  # materialise so generators are supported too
        vectors = np.zeros((len(word_dicts), len(self.vocabulary_)), dtype=np.int64)
        for row, word_dict in enumerate(word_dicts):
            for word, count in word_dict.items():
                vectors[row, self.vocabulary_.get(word, 0)] += count
        return vectors

In [None]:
# Run the spam frame through a fresh MailTransformer.
# NOTE(review): this repeats the computation already stored in
# `transformed_spam` by an earlier cell — consider reusing that result.
transformed_spam_emails = MailTransformer().fit_transform(spam_emails_df)

In [None]:
# Vocabulary transformer with its default vocab_size.
mail_vocab = MailVocabulary()

# Fit on the transformed spam mails and transform the same data; as the last
# expression of the cell, the return value is what the notebook displays.
mail_vocab.fit_transform(transformed_spam_emails)