In [None]:
# Load IPython's autoreload extension; mode 2 re-imports all modules before
# each cell executes, so edits to project code are picked up without a restart.
%load_ext autoreload
%autoreload 2

In [None]:
import email
import re
from email.message import Message
from email.parser import BytesParser, Parser
from pathlib import Path
from typing import List, Union

import magic
from email_reply_parser import EmailReplyParser
from IPython.core.interactiveshell import InteractiveShell
from langchain.document_loaders import UnstructuredEmailLoader
from unstructured.chunking.title import chunk_by_title
from unstructured.documents.elements import NarrativeText, Title
from unstructured.partition.auto import partition
from unstructured.partition.email import partition_email

from redbox.models.file import Chunk, File
from redbox.parsing.chunkers import email_chunker

# Display the value of every top-level expression in a cell, not only the last
# one; several cells below rely on this to show multiple results at once.
InteractiveShell.ast_node_interactivity = "all"

x[0]

# Email chunking tests

See [unstructured's `partition_email` documentation](https://unstructured-io.github.io/unstructured/bricks/partition.html#partition-email) and the [source code](https://github.com/Unstructured-IO/unstructured/blob/7fdddfbc1e98fb4d7a57ebb11aef4eca07b076e6/unstructured/partition/email.py#L4).

In [None]:
# NOTE(review): hard-coded absolute path into one user's Downloads folder, so
# this cell only finds emails on that machine. Point it at a configurable
# fixtures directory before sharing the notebook.
emails = Path("/Users/willlangdale/Downloads/mikel mail master spec-fixtures_emails")

# Recursively collect every .eml file beneath the fixtures directory.
email_list = list(emails.glob("**/*.eml"))

# Wrap each path in a File record. The enumerate() index was never used, so the
# list is built directly instead of appending inside a counted loop.
email_files = [
    File(path=eml_path.as_posix(), type=eml_path.suffix, name=eml_path.stem)
    for eml_path in email_list
]

In [None]:
# Find the fixture emails whose path mentions "DDaT", paired with their
# position in email_list so they can be indexed in later cells.
[
    (position, eml_path)
    for position, eml_path in enumerate(email_list)
    if "DDaT" in eml_path.as_posix()
]

In [None]:
# Inspect the second File record built from the fixture emails.
email_files[1]

In [None]:
# Scratch version of the chunker: split one fixture email into messages and
# collect the text of every leaf body part together with its header metadata.
file = email_files[1]

# NOTE(review): no explicit encoding is given, so decoding uses the platform
# default. This also rebinds the name `email`, hiding the `email` module
# imported at the top of the notebook.
with open(file.path, "r") as f:
    email = f.read()

email_parser = BytesParser()
# Zero-width split: every piece after the first begins at a line starting with
# "From:", so quoted replies inside the thread become separate messages.
messages = re.split(r"(?=^From:)", email, flags=re.M)
messages = list(filter(None, messages))

raw_chunks = []
for message in messages:
    message = email_parser.parsebytes(message.encode("utf-8"))

    # dict() keeps only the last value of any header that appears more than once.
    metadata = dict(message.items())
    metadata["parent_doc_uuid"] = file.uuid
    metadata["file_directory"] = file.path
    metadata["filename"] = file.name
    metadata["filetype"] = "message/rfc822"
    metadata["last_modified"] = message.get("Date") or ""

    # walk() yields the message itself and every nested part depth-first.
    # Skipping the multipart containers leaves only leaf parts, so "text" is
    # never a list of Message objects even when multiparts are nested.
    for part in message.walk():
        if part.is_multipart():
            continue
        # Each chunk gets its own metadata dict so later edits to one chunk's
        # metadata cannot leak into its siblings.
        raw_chunks.append({"metadata": dict(metadata), "text": part.get_payload()})

In [None]:
# Ask libmagic for the character encoding of the raw email bytes.
# Path.read_bytes() opens, reads and closes the file in one call; the previous
# bare open(...).read() left the file handle to be closed by the garbage collector.
blob = Path(file.path).read_bytes()
m = magic.open(magic.MAGIC_MIME_ENCODING)
m.load()
encoding = m.buffer(blob)
encoding

In [None]:
def email_chunker(file: File) -> List[Chunk]:
    """Split an email file into one Chunk per leaf body part.

    The file is read as text and split at every line that starts with
    "From:"; each non-empty piece is parsed as a separate message. For every
    message, each non-multipart part (at any nesting depth) becomes a chunk
    whose metadata holds the message headers plus file-level fields.

    Args:
        file: File to chunk. Its ``path`` is opened for reading, and its
            ``uuid``, ``path`` and ``name`` are copied into chunk metadata.

    Returns:
        Chunks indexed from 0 in the order the parts were encountered.
    """
    # NOTE(review): no explicit encoding is given, so decoding uses the
    # platform default.
    with open(file.path, "r") as f:
        email_text = f.read()

    email_parser = BytesParser()
    # Zero-width split so that each "From:" line stays attached to the
    # message it introduces.
    messages = re.split(r"(?=^From:)", email_text, flags=re.M)
    messages = list(filter(None, messages))

    raw_chunks = []
    for raw_message in messages:
        message = email_parser.parsebytes(raw_message.encode("utf-8"))

        # dict() keeps only the last value of any repeated header.
        metadata = dict(message.items())
        metadata["parent_doc_uuid"] = file.uuid
        metadata["file_directory"] = file.path
        metadata["filename"] = file.name
        metadata["filetype"] = "message/rfc822"
        metadata["last_modified"] = message.get("Date") or ""

        # walk() yields the message and all nested parts depth-first. Skipping
        # the multipart containers leaves only leaf parts, so the chunk text is
        # never a list of Message objects when multiparts are nested.
        for part in message.walk():
            if part.is_multipart():
                continue
            # Copy the metadata so chunks of one message do not share a dict.
            raw_chunks.append({"metadata": dict(metadata), "text": part.get_payload()})

    return [
        Chunk(
            parent_file=file,
            index=i,
            text=raw_chunk["text"],
            metadata=raw_chunk["metadata"],
        )
        for i, raw_chunk in enumerate(raw_chunks)
    ]

In [None]:
def email_chunker(file: File) -> List[Chunk]:
    """Chunk an email using unstructured's ``partition_email``.

    Every element returned by ``partition_email`` for ``file.path`` is
    converted to a dict, tagged with the parent file's uuid under the
    ``parent_doc_uuid`` metadata key, and wrapped in a Chunk.

    Args:
        file: File whose ``path`` is partitioned and whose ``uuid`` is
            recorded in each chunk's metadata.

    Returns:
        One Chunk per element, indexed from 0 in element order.
    """
    result = []
    for position, element in enumerate(partition_email(filename=file.path)):
        as_dict = element.to_dict()
        element_metadata = as_dict["metadata"]
        element_metadata["parent_doc_uuid"] = file.uuid
        result.append(
            Chunk(
                parent_file=file,
                index=position,
                text=as_dict["text"],
                metadata=element_metadata,
            )
        )
    return result

In [None]:
# Run whichever email_chunker definition was executed last on the tenth
# fixture file and display the resulting chunks.
email_chunker(email_files[9])

In [None]:
def get_payload(email: Union[Message, List[Message]]) -> List[str]:
    """Flatten the payloads of a message, or a list of messages, into one list.

    Multipart messages are descended into recursively, so the result holds
    only the payloads of leaf (non-multipart) parts, in depth-first order.

    Args:
        email: A single Message, or a (possibly nested) list of Messages.

    Returns:
        The leaf payloads in the order they are encountered.
    """
    if isinstance(email, Message):
        content = email.get_payload()
        # A multipart message's payload is a list of sub-messages; recurse
        # into it instead of returning the list itself.
        if isinstance(content, list):
            return get_payload(content)
        return [content]

    # Otherwise treat the argument as a list of messages. The type check must
    # look at each item: iterating a Message itself yields header names, not parts.
    flattened_list = []
    for em in email:
        flattened_list.extend(get_payload(em))

    return flattened_list


def flatten_and_lowercase(animal_list):
    """Recursively flatten nested lists of strings, lower-casing each string.

    Reference example of the recursive-flatten pattern that get_payload is
    modelled on.
    """
    flattened_list = []
    for item in animal_list:
        if isinstance(item, str):
            flattened_list.append(item.lower())
        else:
            flattened_list.extend(flatten_and_lowercase(item))
    return flattened_list


animals = [
    "DOG",
    ["CAT", "RABBIT"],
    "COW",
    ["CHICKEN", ["NEWT", ["BEAR"], "OWL"], "FISH"],
    "EMU",
]
# The demo previously called flatten_and_get_characters, which is not defined
# anywhere, so the cell raised NameError.
result = flatten_and_lowercase(animals)
print(result)

In [None]:
# pass an object
# if it's an email, return payload
# if it's a list, iterate over and return payload


def get_payload(email: Union[Message, List[Message]]) -> List[str]:
    """Collect the leaf payloads of a message or a list of messages.

    Args:
        email: A single Message, or a (possibly nested) list of Messages.

    Returns:
        A flat list with the payload of every non-multipart part, in
        depth-first order. Any other input type yields an empty list.
    """
    out = []
    if isinstance(email, list):
        for em in email:
            # Recurse on the item; recursing on the list itself never terminates.
            out.extend(get_payload(em))
    elif isinstance(email, Message):
        content = email.get_payload()
        if isinstance(content, list):
            # Multipart: the payload is a list of sub-messages, so flatten it
            # rather than appending the list as a single element.
            out.extend(get_payload(content))
        else:
            out.append(content)

    return out

In [None]:
# Parse the second split piece of the email text into a Message; parsebytes
# needs bytes, hence the utf-8 encode of the str piece.
msg = email_parser.parsebytes(messages[1].encode("utf-8"))

In [None]:
# Try the payload-flattening helper on the parsed message.
get_payload(msg)

In [None]:
# Manually drill into the first sub-part's payload; indexing with [0] only
# works when msg is multipart (its payload is then a list of parts).
msg.get_payload()[0].get_payload()

In [None]:
# messages holds str pieces from re.split, but BytesParser.parsebytes needs a
# bytes-like object, so encode the piece before parsing.
email_parser.parsebytes(messages[1].encode("utf-8"))

In [None]:
# Display the raw chunk data collected so far.
raw_chunks

In [None]:
# Turn every raw chunk dict into a Chunk attached to the current file,
# numbering the chunks by their position in raw_chunks.
chunks = [
    Chunk(
        parent_file=file,
        index=position,
        text=entry["text"],
        metadata=entry["metadata"],
    )
    for position, entry in enumerate(raw_chunks)
]

In [None]:
# Run whichever email_chunker definition was executed last on the second
# fixture file and display the resulting chunks.
email_chunker(email_files[1])

In [None]:
# Show which fixture file is being partitioned.
email_list[0]

# Analyse document structure to extract elements
# (content_source="text/plain" is passed through to unstructured's partition).
elements = partition(filename=email_list[0], content_source="text/plain")

# Link elements into chunks
# NOTE: this rebinds raw_chunks, which an earlier cell used for a list of
# {"metadata", "text"} dicts.
raw_chunks = chunk_by_title(elements=elements)

In [None]:
# Display the partitioned elements once. The cell previously repeated the same
# identity comprehension twice, which only duplicated the output.
list(elements)

In [None]:
# Show the text of the elements classified as Title, then of those classified
# as NarrativeText, to compare how the email was segmented.
[str(el) for el in elements if isinstance(el, Title)]
[str(el) for el in elements if isinstance(el, NarrativeText)]

In [None]:
# Print the text of every element, separated by blank lines.
print("\n\n".join(map(str, elements)))

In [None]:
# Print the text of every chunk, with a dashed rule between consecutive chunks.
print("\n\n--------------------\n\n".join(map(str, raw_chunks)))

In [None]:
# Load the first fixture email through LangChain's UnstructuredEmailLoader for
# comparison with the hand-rolled chunkers.
email_loader = UnstructuredEmailLoader(
    file_path=email_list[0],
    # mode='elements',
    process_attachments=False,
)

# NOTE: rebinds `email`, hiding the `email` module imported at the top of the
# notebook; later cells reuse this name for other values as well.
email = email_loader.load()
email

In [None]:
# Parse the first fixture email with the standard-library text Parser.
# NOTE(review): no explicit encoding is given, so decoding uses the platform default.
with open(email_list[0], "r") as f:
    parser = Parser()
    email = parser.parse(f)

# Show the multipart boundary string declared in the Content-Type header, if any.
email.get_boundary()

In [None]:
# Look up the From header; this only works while `email` is the parsed Message
# from the Parser cell, since other cells rebind the same name.
# email.keys()
email["From"]

In [None]:
def split_email_reply(email_text):
    """Split an email thread into separately parsed messages.

    The text is cut at every line that begins with "From:". Whatever comes
    before the first such line (for example headers placed ahead of From:) is
    glued back onto the first message, so no header-only fragment is produced.
    Empty pieces are dropped.

    Args:
        email_text: The full email text as a str.

    Returns:
        A list of parsed messages, one per piece, in their original order.
    """
    pieces = re.split(r"(?=^From:)", email_text, flags=re.M)

    # Fold the leading fragment into the first real message.
    merged = ["".join(pieces[:2])] + pieces[2:]

    parser = BytesParser()
    return [parser.parsebytes(piece.encode("utf-8")) for piece in merged if piece]

In [None]:
# NOTE(review): `x` is only assigned (from split_email_reply) in a cell further
# down this notebook, so this cell fails on a fresh top-to-bottom run.
x[0].get("Date")

In [None]:
# Read the first fixture email as text and split it into individual messages.
# NOTE(review): no explicit encoding is given, so decoding uses the platform default.
with open(email_list[0], "r") as f:
    email = f.read()

x = split_email_reply(email)

# Compare the key headers across the split messages; a missing header shows as None.
[m["To"] for m in x]
[m["From"] for m in x]
[m["Subject"] for m in x]
[m["Received"] for m in x]
[m["Date"] for m in x]
[m["Content-Type"] for m in x]
# decode=True returns bytes decoded per Content-Transfer-Encoding, or None for
# multipart messages.
[m.get_payload(decode=True) for m in x]

In [None]:
# List the attributes available on the first split message, then show its
# headers as a dict (repeated headers collapse to their last value).
dir(x[0])
dict(x[0].items())

In [None]:
# Template for extracting a message body.
# NOTE(review): `email_message` is not assigned in any visible cell of this
# notebook — confirm where it should come from before running.
if email_message.is_multipart():
    for part in email_message.get_payload():
        # Each iteration overwrites `body`, so only the last part's payload survives.
        body = part.get_payload()
        # more processing?
else:
    body = email_message.get_payload()

In [None]:
# Re-run the "From:" split on the current `email` text and inspect the raw
# pieces, including any empty ones (the filter step is deliberately commented out).
email_objects = []
email_parser = BytesParser()
messages = re.split(r"(?=^From:)", email, flags=re.M)
messages
# messages = list(filter(None, messages))

In [None]:
# An empty slice: for the list returned by re.split this always evaluates to [].
messages[0:0]

In [None]:
# NOTE(review): `x` is assigned from split_email_reply elsewhere in this
# notebook, so x[3] would be a parsed email message rather than a list of
# unstructured elements — confirm that chunk_by_title accepts it.
# [chunk_by_title(n) for n  in x[0]]
y = chunk_by_title(elements=x[3])
# Show just the text of each resulting chunk.
[a.to_dict()["text"] for a in y]

In [None]:
# Print whatever `email` currently holds; several cells rebind this name, so
# the output depends on which of them ran last.
print(email)

In [None]:
# Split the current `email` text at each line starting with "From:", discard
# empty pieces, and parse every remaining piece into a message object.
email_parser = BytesParser()
messages = [piece for piece in re.split(r"(?=^From:)", email, flags=re.M) if piece]
email_objects = [email_parser.parsebytes(piece.encode("utf-8")) for piece in messages]

In [None]:
# Inspect the raw text of the second split piece.
messages[1]

In [None]:
# Split the `email` text at every line that starts with "From:" (zero-width
# lookahead, so the "From:" line stays at the start of each piece). Empty
# pieces are not filtered out here.
messages = re.split(r"(?=^From:)", email, flags=re.M)

In [None]:
# Display the list of split pieces.
messages

In [None]:
# Try the email_reply_parser library on the first fixture email.
# NOTE(review): parse_reply presumably returns only the most recent reply
# text — confirm against the library's documentation.
with open(email_list[0], "r") as f:
    email = EmailReplyParser.parse_reply(f.read())

print(email)