In [1]:
import os 
import getpass 
import re
import mailparser 
from llama_index.core import Settings
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import VectorStoreIndex
from llama_index.vector_stores.chroma import ChromaVectorStore

from llama_index.core.node_parser import SentenceSplitter, SentenceWindowNodeParser, MetadataAwareTextSplitter, SentenceSplitter
from llama_index.core import Document
import uuid

In [2]:
# Models used throughout the notebook: one chat model and one embedding model.
llm = OpenAI(model="gpt-4o-mini")
embed_model = OpenAIEmbedding(model="text-embedding-3-large")

# Register both as the llama_index-wide defaults.
Settings.embed_model, Settings.llm = embed_model, llm

# Email class

class Email():
    """
    Lightweight view of a parsed e-mail.

    Copies the headers of interest and the body from ``mail`` onto plain
    attributes; any header that is missing is stored as the string 'N/A'.

    Args:
        mail: Parsed message object exposing ``headers`` (a mapping of header
            name to value) and ``body``.

    Attributes:
        date, from_, to, cc, subject: Header values, or 'N/A' when absent.
        body: Message body, or 'N/A' when empty.
        attachments: Always an empty list (attachments are not processed yet).
    """
    def __init__(self, mail):
        # Header field names are case-insensitive, and real messages spell
        # them 'Cc', 'CC', 'cc', ... so look them up by lower-cased name
        # instead of relying on one exact spelling.
        headers = {str(name).lower(): value for name, value in mail.headers.items()}

        def header(name):
            return headers.get(name.lower(), 'N/A')

        self.date = header('Date')
        self.from_ = header('From')
        self.to = header('To')
        self.cc = header('Cc')
        self.subject = header('Subject')
        self.body = mail.body if mail.body else 'N/A'
        # Attachments are intentionally not read from ``mail`` for now.
        self.attachments = []

def preprocess_email_body(body):
    """
    Strip the HTML part and recurring signature boilerplate from an email body.

    Everything from the first "--- mail_boundary ---" marker onwards is
    dropped, then a fixed list of boilerplate patterns is removed one after
    another, and finally runs of blank lines are collapsed.

    Args:
        body (str): The raw email body text

    Returns:
        str: Cleaned email body text, without leading/trailing whitespace
    """
    boundary = "--- mail_boundary ---"
    if boundary in body:
        # Keep only the text that precedes the boundary marker.
        body = body.partition(boundary)[0].strip()

    # Boilerplate to delete, applied in this exact order.
    boilerplate = (
        # Contact block: from "Tel.:" up to and including the address line
        r"Tel\.:.*?[\r\n].*?Fritz-Erler-Str\. 25 - 76133 Karlsruhe",
        # Same contact block, introduced by "Telefon:"
        r"Telefon:.*?[\r\n].*?Fritz-Erler-Str\. 25 - 76133 Karlsruhe",
        # Website URL
        r"www\.ingenieurgruppe-bauen\.de",
        # Company information
        r"BERATENDE INGENIEURE VBI *PRÜFINGENIEURE VPI.*?KARLSRUHE *\| *MANNHEIM *\| *BERLIN *\| *FREIBURG",
        # Confidentiality notice
        r"Diese E-Mail einschl\. Anlage\(n\) enthält vertrauliche und/oder rechtlich.*?stattet\.",
        # Blank lines / whitespace at the very end
        r"\n\s*\n\s*$",
    )

    cleaned = body
    for expr in boilerplate:
        # DOTALL lets ".*?" run across line breaks inside a boilerplate block.
        cleaned = re.compile(expr, re.DOTALL).sub("", cleaned)

    # Collapse three or more consecutive newlines into a single blank line.
    cleaned = re.sub(r"\n{3,}", "\n\n", cleaned)
    return cleaned.strip()

def parse_email(mail):
    """
    Split one parsed e-mail into metadata-tagged text nodes.

    The body is cleaned with ``preprocess_email_body``, chunked with a
    ``SentenceSplitter`` (chunk_size=150, chunk_overlap=20), and every
    resulting node is annotated with the mail's header fields.

    Args:
        mail: Parsed message object; it is wrapped in ``Email``, which reads
            ``mail.headers`` and ``mail.body``.

    Returns:
        list: The nodes produced by the splitter, each carrying the metadata
        keys 'uuid', 'date', 'from', 'to', 'subject', 'attachments' and 'num'.
    """
    email = Email(mail)

    node_parser = SentenceSplitter(chunk_size=150, chunk_overlap=20)
    nodes = node_parser.get_nodes_from_documents(
        [Document(text=preprocess_email_body(email.body))]
    )

    # One random id shared by all chunks that come from the same mail.
    mail_uuid = str(uuid.uuid4())
    for i, node in enumerate(nodes, start=1):
        node.metadata.update({
            'uuid': mail_uuid,
            'date': str(email.date),
            'from': str(email.from_),
            # Read from the Email wrapper like every other field, so a
            # missing header yields the same 'N/A' default as the others.
            'to': str(email.to),
            'subject': str(email.subject),
            'attachments': None,
            # 1-based position of the chunk within its mail.
            'num': i,
        })

    return nodes

# On-disk location of the Chroma collection that backs the index.
persist_dir = "storage_chroma"

# Open (or create) the "text_nodes" collection and wrap it in an index
# that embeds with the notebook's embedding model.
vector_store = ChromaVectorStore.from_params(
    persist_dir=persist_dir,
    collection_name="text_nodes",
)
index = VectorStoreIndex.from_vector_store(
    vector_store,
    embed_model=embed_model,
)


In [3]:
# Iterate through all e-mails in a folder and parse them into text nodes.
directory_path = 'mails/'

# os.listdir returns entries in arbitrary order; sort them so that the order
# in which nodes are collected is the same on every run.
file_list = sorted(
    f for f in os.listdir(directory_path)
    if os.path.isfile(os.path.join(directory_path, f))
)

all_text_nodes = []

for file in file_list:
    file_path = os.path.join(directory_path, file)
    # Decode explicitly instead of relying on the platform default encoding;
    # undecodable bytes are replaced rather than aborting the whole run.
    with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
        mail = mailparser.parse_from_file_obj(f)

    nodes = parse_email(mail)
    all_text_nodes.extend(nodes)

all_text_nodes

[TextNode(id_='41bc5535-0264-454f-868e-f8a0de3afb08', embedding=None, metadata={'uuid': '2d48b172-9efe-414e-abc3-3da1b6477cd2', 'date': 'Thu, 04 Feb 2016 08:27:00 +0100', 'from': "[('Martin Rudolf', 'martin.rudolf@ingenieurgruppe-bauen.de')]", 'to': "[('', 'Falko.Arend@mobil.hessen.de')]", 'subject': 'Fuldabrücke Bergshausen FTP-Server', 'attachments': None, 'num': 1}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='46fb4dc3-8c4f-4a68-94a4-4fb60cb33ecf', node_type=<ObjectType.DOCUMENT: '4'>, metadata={}, hash='bc3c1f6ae4b70e68afd6e270299416cf97a1f5bb5f9618b5c65ab601ac079326'), <NodeRelationship.NEXT: '3'>: RelatedNodeInfo(node_id='2abddc70-de94-4733-880f-d30cfc54dc43', node_type=<ObjectType.TEXT: '1'>, metadata={}, hash='f65f99ebc3a92c2bf46048763335040520329e47b3b4faff8b705e003ed37603')}, metadata_template='{key}: {value}', metadata_separator='\n', text='Sehr geehrter Herr Arend,\n\nwie vereinbart s

In [4]:
# Add all parsed nodes to the index.
# NOTE(review): parse_email assigns a fresh uuid4 per mail on every run and the
# Chroma collection is persisted under persist_dir, so re-running this cell
# presumably appends duplicate nodes rather than updating existing ones - confirm.
index.insert_nodes(all_text_nodes)


In [13]:
# Retrieve the four chunks most similar to the query string.
retriever = index.as_retriever(similarity_top_k=4)
# The name is spelled as it appears in the indexed 'from' metadata.
result_nodes = retriever.retrieve("Patrick Höhl")

In [14]:
# Print the embedding attribute of every retrieved node, one per line.
for hit in result_nodes:
    print(hit.node.embedding)

None
None
None
None


In [15]:
# Display the full repr of the retrieved NodeWithScore objects for inspection.
result_nodes

[NodeWithScore(node=TextNode(id_='ab50dc87-76cc-41b1-9573-1c4da6d73e79', embedding=None, metadata={'uuid': '3dd40b04-b8a4-4547-ab82-a1cedccf4ddf', 'date': 'Mon, 15 Feb 2016 10:18:00 +0100', 'from': "[('Patrick Höhl', 'patrick.hoehl@ingenieurgruppe-bauen.de')]", 'to': "[('', 'Falko.Arend@mobil.hessen.de')]", 'subject': 'A44 Ertüchtigung Fuldatalbrücke Bergshausen - Konzept zur Probenentnahme und Vermessung ohne Untersichtgerät', 'attachments': None, 'num': 4}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='91c6cde9-3a46-4bee-b435-7e5064984a51', node_type='4', metadata={}, hash='ce3452e30f9326794830bfb788473f34e55df156772a932ca324817ca2bb90a7'), <NodeRelationship.PREVIOUS: '2'>: RelatedNodeInfo(node_id='257b740a-0ab2-4f67-b190-54e6a4ca0ae0', node_type='1', metadata={}, hash='b4df1ea5b093934842049c1d39589dfb3f9473abf7e875221e80da5528dfde03')}, metadata_template='{key}: {value}', metadata_separator='\n