In [None]:
%pip install mail-parser llama-index llama-index-vector-stores-chroma


Collecting mail-parser
  Using cached mail_parser-4.1.2-py3-none-any.whl.metadata (10 kB)
Collecting llama-index
  Downloading llama_index-0.12.19-py3-none-any.whl.metadata (12 kB)
Collecting ipaddress (from mail-parser)
  Using cached ipaddress-1.0.23-py2.py3-none-any.whl.metadata (923 bytes)
Collecting llama-index-agent-openai<0.5.0,>=0.4.0 (from llama-index)
  Downloading llama_index_agent_openai-0.4.6-py3-none-any.whl.metadata (727 bytes)
Collecting llama-index-cli<0.5.0,>=0.4.0 (from llama-index)
  Downloading llama_index_cli-0.4.0-py3-none-any.whl.metadata (1.5 kB)
Collecting llama-index-core<0.13.0,>=0.12.19 (from llama-index)
  Downloading llama_index_core-0.12.19-py3-none-any.whl.metadata (2.5 kB)
Collecting llama-index-embeddings-openai<0.4.0,>=0.3.0 (from llama-index)
  Downloading llama_index_embeddings_openai-0.3.1-py3-none-any.whl.metadata (684 bytes)
Collecting llama-index-indices-managed-llama-cloud>=0.4.0 (from llama-index)
  Downloading llama_index_indices_managed_lla

In [1]:
import mailparser
import os
import getpass

#with open('mails/message-11-970808.eml', 'r') as f:
#    mail = mailparser.parse_from_file_obj(f)

In [2]:
# Prompt for the key only when it is not already set, so re-running this cell
# (or starting the kernel with the variable exported) does not ask again or
# overwrite a valid key. getpass keeps the key out of the notebook source/output.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API Key:")

In [2]:
from llama_index.core import Settings
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding

# Models used throughout the notebook: gpt-4o-mini as the LLM and
# text-embedding-3-large for embeddings.
llm = OpenAI(model="gpt-4o-mini")
embed_model = OpenAIEmbedding(model="text-embedding-3-large")

# Register both on the global LlamaIndex Settings object.
Settings.llm = llm
Settings.embed_model = embed_model

In [3]:
# Email class

class Email():
    """Flat view of the fields of a parsed mail that the notebook works with.

    Args:
        mail: Parsed message object exposing ``headers`` (a dict-like mapping
            of header name to value) and ``body``.

    Attributes:
        date, from_, to, cc, subject: Header values, or ``'N/A'`` when the
            header is absent.
        body: The message body, or ``'N/A'`` when it is empty/None.
        attachments: Always an empty list; attachments are not copied from
            the message.
    """

    def __init__(self, mail):
        headers = mail.headers
        # Header names are case-insensitive in e-mail, but the mapping keeps
        # whatever spelling the message used ("Cc", "CC", "cc", ...). Build a
        # lower-cased view as a fallback for lookups that miss on exact case.
        lowered = {str(name).lower(): value for name, value in headers.items()}

        def header(name):
            # An exact match wins, so existing behaviour is unchanged.
            if name in headers:
                return headers[name]
            return lowered.get(name.lower(), 'N/A')

        self.date = header('Date')
        self.from_ = header('From')
        self.to = header('To')
        self.cc = header('cc')
        self.subject = header('Subject')
        self.body = mail.body or 'N/A'
        self.attachments = []
  


In [4]:
from llama_index.core.node_parser import SentenceSplitter, SentenceWindowNodeParser, MetadataAwareTextSplitter, SentenceSplitter
from llama_index.core import Document
import uuid

In [5]:
# function that cleans up the email body

import re

# Marker after which the HTML rendition of the message follows in the body.
_MAIL_BOUNDARY = "--- mail_boundary ---"

# Recurring boilerplate, compiled once instead of on every call.
# re.DOTALL lets `.` cross line breaks so multi-line blocks match as one unit.
_BOILERPLATE_PATTERNS = tuple(
    re.compile(pattern, flags=re.DOTALL)
    for pattern in (
        # Contact block: from "Tel.:" / "Telefon:" up to and including the address line
        r"Tel\.:.*?[\r\n].*?Fritz-Erler-Str\. 25 - 76133 Karlsruhe",
        r"Telefon:.*?[\r\n].*?Fritz-Erler-Str\. 25 - 76133 Karlsruhe",
        # Website URL
        r"www\.ingenieurgruppe-bauen\.de",
        # Company information
        r"BERATENDE INGENIEURE VBI *PRÜFINGENIEURE VPI.*?KARLSRUHE *\| *MANNHEIM *\| *BERLIN *\| *FREIBURG",
        # Confidentiality notice
        r"Diese E-Mail einschl\. Anlage\(n\) enthält vertrauliche und/oder rechtlich.*?stattet\.",
        # Extra whitespace and blank lines at the end
        r"\n\s*\n\s*$",
    )
)


def preprocess_email_body(body):
    """
    Preprocess an email body by removing HTML content and recurring boilerplate text.

    Args:
        body (str | None): The raw email body text.

    Returns:
        str: Cleaned email body text; an empty string when `body` is None or empty.
    """
    # Nothing to clean; also avoids a TypeError on `in`/re.sub when body is None.
    if not body:
        return ""

    # Drop the HTML content (everything from the mail_boundary marker onwards).
    head, marker, _ = body.partition(_MAIL_BOUNDARY)
    if marker:
        body = head.strip()

    # Remove the boilerplate blocks, in the order they are listed.
    for pattern in _BOILERPLATE_PATTERNS:
        body = pattern.sub("", body)

    # Collapse runs of three or more newlines into a single blank line.
    body = re.sub(r"\n{3,}", "\n\n", body)

    return body.strip()
    

In [None]:
# Exploratory check of the cleaning + chunking steps on a single message.
# NOTE(review): no earlier cell defines `mail` on a fresh kernel (the loader in
# the import cell is commented out) — a message has to be parsed first.
out = preprocess_email_body(mail.body)

node_parser = SentenceSplitter(chunk_size=150, chunk_overlap=20)
nodes = node_parser.get_nodes_from_documents(
    [Document(text=out)]
)

# Metadata key -> header name. Headers can be absent, so fall back to 'N/A'
# (as the Email class does) instead of raising KeyError.
header_fields = {'date': 'Date', 'from': 'From', 'to': 'To', 'subject': 'Subject'}

for i, node in enumerate(nodes, start=1):
    for key, header_name in header_fields.items():
        node.metadata[key] = mail.headers.get(header_name, 'N/A')
    node.metadata['attachments'] = mail.attachments
    node.metadata['num'] = i  # 1-based position of the chunk within the mail




In [39]:
# Dump every chunk followed by its metadata and a separator line.
# NOTE: this prints message text verbatim; the recorded output of this cell
# contains FTP credentials — clear outputs before sharing the notebook.
for node in nodes:
    print(node.text, node.metadata, "----", sep="\n")

Sehr geehrter Herr Arend,

wie vereinbart schicke ich Ihnen anbei die Zugangsdaten für den
FTP-Zugang, den wir für das Projekt "Fuldabrücke Bergshausen"
nutzen.

Servername:                     ftp.ingenieurgruppe-bauen.de
Benutzername:               igbf2
Passwort:                           Koh_cM9k

Ich schlage vor, dass wir alle Abgabeunterlagen bzw. Unterlagen zur Information
dort ablegen. Im Moment sind dies das Besprechungsprotokoll des Startgesprächs
und die Bilder des Ortstermins am 26.01.2016.
{}
----
Viele Grüße
 
i.A. Martin Rudolf.
                                                                                                     
 
Dipl.-Ing. Martin Rudolf
Gruppenleiter
 
                                           
                                                
­­­­­­­­­­­­­­­­­­­­­­­­
{}
----


In [6]:
# Parse one email into LlamaIndex text nodes:
# extract metadata and body, filter out recurring boilerplate from the body,
# split the body into chunks and create one text node per chunk.

def parse_email(mail, chunk_size=150, chunk_overlap=20):
    """Turn a parsed mail into a list of metadata-tagged text nodes.

    Args:
        mail: Parsed message object accepted by `Email` (needs `headers` and `body`).
        chunk_size (int): Chunk size passed to `SentenceSplitter`. Defaults to 150.
        chunk_overlap (int): Chunk overlap passed to `SentenceSplitter`. Defaults to 20.

    Returns:
        list: The nodes produced by the splitter. Each node's metadata carries
        `uuid` (one random id shared by all chunks of this mail), `date`,
        `from`, `to`, `subject`, `attachments` (always None) and `num`
        (1-based position of the chunk within the mail).
    """
    email = Email(mail)

    splitter = SentenceSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    nodes = splitter.get_nodes_from_documents(
        [Document(text=preprocess_email_body(email.body))]
    )

    # Metadata shared by every chunk of this mail. All fields go through the
    # Email wrapper so a missing header consistently becomes 'N/A'
    # (the recipient used to be read from `mail.to` directly).
    shared_metadata = {
        'uuid': str(uuid.uuid4()),
        'date': str(email.date),
        'from': str(email.from_),
        'to': str(email.to),
        'subject': str(email.subject),
        'attachments': None,
    }

    for position, node in enumerate(nodes, start=1):
        node.metadata.update(shared_metadata)
        node.metadata['num'] = position

    return nodes


In [7]:
# Iterate through all emails in a folder and parse them into text nodes.
directory_path = 'mails/'

# sorted(): os.listdir returns entries in arbitrary order; sorting makes the
# node order reproducible between runs.
file_list = sorted(
    f for f in os.listdir(directory_path)
    if os.path.isfile(os.path.join(directory_path, f))
)


all_text_nodes = []

for file in file_list:
    file_path = os.path.join(directory_path, file)
    # Read raw bytes and let the mail parser decode each part with its declared
    # charset; opening in text mode depends on the platform's default encoding
    # and can raise UnicodeDecodeError on 8-bit messages.
    with open(file_path, 'rb') as f:
        mail = mailparser.parse_from_bytes(f.read())

    nodes = parse_email(mail)
    all_text_nodes.extend(nodes)

all_text_nodes

[TextNode(id_='4108a6b0-56e3-4ae8-8d99-01162f55a47a', embedding=None, metadata={'uuid': 'db1d5a16-07cd-4f14-9300-1c1f1a4f8b41', 'date': 'Thu, 04 Feb 2016 08:27:00 +0100', 'from': "[('Martin Rudolf', 'martin.rudolf@ingenieurgruppe-bauen.de')]", 'to': "[('', 'Falko.Arend@mobil.hessen.de')]", 'subject': 'Fuldabrücke Bergshausen FTP-Server', 'attachments': None, 'num': 1}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='cb970cfe-45be-42ef-961a-2c936221286b', node_type=<ObjectType.DOCUMENT: '4'>, metadata={}, hash='bc3c1f6ae4b70e68afd6e270299416cf97a1f5bb5f9618b5c65ab601ac079326'), <NodeRelationship.NEXT: '3'>: RelatedNodeInfo(node_id='25f89541-2c86-47b7-a106-f551ee763429', node_type=<ObjectType.TEXT: '1'>, metadata={}, hash='f65f99ebc3a92c2bf46048763335040520329e47b3b4faff8b705e003ed37603')}, metadata_template='{key}: {value}', metadata_separator='\n', text='Sehr geehrter Herr Arend,\n\nwie vereinbart s

In [None]:
# SAVE
import pickle

# Use a context manager so the file is flushed and closed even if dumping fails;
# the bare open(...) previously left the handle to the garbage collector.
with open("igb_text_nodes.pkl", "wb") as f:
    pickle.dump(all_text_nodes, f)

In [8]:
# build index from textnodes
from llama_index.core import VectorStoreIndex
from llama_index.vector_stores.chroma import ChromaVectorStore


In [None]:
# Directory passed to Chroma as its persistence location.
persist_dir = "storage_chroma"

# Chroma-backed vector store using the "text_nodes" collection under persist_dir.
vector_store = ChromaVectorStore.from_params(
    collection_name="text_nodes", persist_dir=persist_dir
)
# Index built on top of that store; embeddings use the notebook's embed_model.
index = VectorStoreIndex.from_vector_store(vector_store, embed_model=embed_model)

In [None]:
# Add every parsed chunk to the index.
# NOTE(review): presumably each run embeds and appends the nodes again, so
# re-running this cell against the persisted collection may create duplicates —
# confirm before re-executing.
index.insert_nodes(all_text_nodes)

In [17]:
# Retrieve the 4 most similar chunks for the query "Passwort".
retriever = index.as_retriever(similarity_top_k=4)
result_nodes = retriever.retrieve("Passwort")

In [18]:
# Print the embedding attached to each retrieved node.
# The recorded output is `None` for all four hits, i.e. the nodes returned by
# the retriever do not carry their vectors here.
for node_with_score in result_nodes:
    node = node_with_score.node
    embedding = node.embedding
    print(embedding)

None
None
None
None


In [19]:
# Inspect the first retrieved node (text, metadata and relationships).
result_nodes[0].node

TextNode(id_='4108a6b0-56e3-4ae8-8d99-01162f55a47a', embedding=None, metadata={'uuid': 'db1d5a16-07cd-4f14-9300-1c1f1a4f8b41', 'date': 'Thu, 04 Feb 2016 08:27:00 +0100', 'from': "[('Martin Rudolf', 'martin.rudolf@ingenieurgruppe-bauen.de')]", 'to': "[('', 'Falko.Arend@mobil.hessen.de')]", 'subject': 'Fuldabrücke Bergshausen FTP-Server', 'attachments': None, 'num': 1}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='cb970cfe-45be-42ef-961a-2c936221286b', node_type='4', metadata={}, hash='bc3c1f6ae4b70e68afd6e270299416cf97a1f5bb5f9618b5c65ab601ac079326'), <NodeRelationship.NEXT: '3'>: RelatedNodeInfo(node_id='25f89541-2c86-47b7-a106-f551ee763429', node_type='1', metadata={}, hash='f65f99ebc3a92c2bf46048763335040520329e47b3b4faff8b705e003ed37603')}, metadata_template='{key}: {value}', metadata_separator='\n', text='Sehr geehrter Herr Arend,\n\nwie vereinbart schicke ich Ihnen anbei die Zugangsdaten für

In [None]:
import chromadb
import chromadb.utils.embedding_functions as embedding_functions

# Embedding function for text queries; it uses the same model as embed_model
# (text-embedding-3-large) so query vectors are comparable to the stored ones.
# The key is read from the environment instead of being passed as an empty
# string or hardcoded.
openai_ef = embedding_functions.OpenAIEmbeddingFunction(
                api_key=os.environ["OPENAI_API_KEY"],
                model_name="text-embedding-3-large")

# Open the persisted collection directly through the Chroma client and attach
# the embedding function; without it Chroma would embed `query_texts` with its
# default model instead.
client = chromadb.PersistentClient(path="storage_chroma/")
collection = client.get_collection("text_nodes", embedding_function=openai_ef)

In [None]:
# Query the collection directly through the Chroma client.
# NOTE(review): the index was built with embed_model (text-embedding-3-large);
# the collection has to embed `query_texts` with the same model, otherwise the
# query vectors presumably will not match the stored ones — verify how
# `collection` was obtained.
results = collection.query(
    query_texts=["Passwort"], # raw text; Chroma embeds it before searching
    n_results=2 # how many results to return
)
print(results)