In [1]:
import re
import json

def parse_email(text):
    # Define the headers we want to extract in order
    headers = [
        "Message-ID", "Date", "From", "To", "Subject", "Mime-Version",
        "Content-Type", "Content-Transfer-Encoding", "X-From", "X-To",
        "X-cc", "X-bcc", "X-Folder", "X-Origin", "X-FileName"
    ]
    
    # Prepare a dictionary with empty defaults
    email_data = {h: "" for h in headers}
    email_data["Body"] = ""
    
    lines = text.splitlines()
    
    # We will parse header lines until we find the Body
    # A simple state machine: reading headers until X-FileName is done, then reading body.
    current_header = None
    reading_body = False
    
    for line in lines:
        # If we are reading the body, just accumulate
        if reading_body:
            # Add line to body
            email_data["Body"] += line + "\n"
            continue
        
        # Check if line starts a new header
        # A header line typically looks like "Header-Name: value"
        # We can match using a regex or string operations
        header_match = re.match(r"^([A-Za-z0-9\-]+):\s*(.*)", line)
        
        if header_match:
            # We found a new header line
            key = header_match.group(1)
            value = header_match.group(2).strip()
            
            # Normalize the key to the exact case used in our headers list if possible
            # Headers in emails are case-insensitive, but we rely on a known set
            # We'll attempt a case-insensitive match to map the found key to a standard form
            found_key = None
            for h in headers:
                if h.lower() == key.lower():
                    found_key = h
                    break
            
            if found_key:
                # Assign or overwrite current_header
                email_data[found_key] = value
                current_header = found_key
            else:
                # This is a header we don't care about; just ignore it (or keep track if desired)
                current_header = None
            
            # If this header is X-FileName, after this we'll switch to body mode
            if found_key == "X-FileName":
                # The next lines will be the body
                # But note that body starts after this line is fully parsed
                # The Body reading will start from the next iteration
                pass
                
        else:
            # This line does not start a new header
            # If it starts with whitespace and we have a current header, it's a continuation
            if current_header and line.startswith((" ", "\t")):
                # Continuation line
                # Append the trimmed line to the current header's value (with a space separator)
                cont_line = line.strip()
                if cont_line:
                    email_data[current_header] += " " + cont_line
            else:
                # If no current header or this line doesn't start with whitespace,
                # then possibly we've reached the body (if we already got X-FileName)
                if email_data["X-FileName"] != "":
                    # We assume everything after X-FileName is the body
                    reading_body = True
                    email_data["Body"] += line + "\n"
                # If we haven't encountered X-FileName yet, it's probably a line we don't care about.
    
    # Trim trailing newlines in Body if desired
    email_data["Body"] = email_data["Body"].rstrip("\n")
    
    return email_data

In [3]:
import pandas as pd

df = pd.read_csv("raw_emails.csv")
df

Unnamed: 0,file,message
0,allen-p/_sent_mail/1.,Message-ID: <18782981.1075855378110.JavaMail.e...
1,allen-p/_sent_mail/10.,Message-ID: <15464986.1075855378456.JavaMail.e...
2,allen-p/_sent_mail/100.,Message-ID: <24216240.1075855687451.JavaMail.e...
3,allen-p/_sent_mail/1000.,Message-ID: <13505866.1075863688222.JavaMail.e...
4,allen-p/_sent_mail/1001.,Message-ID: <30922949.1075863688243.JavaMail.e...
...,...,...
517396,zufferli-j/sent_items/95.,Message-ID: <26807948.1075842029936.JavaMail.e...
517397,zufferli-j/sent_items/96.,Message-ID: <25835861.1075842029959.JavaMail.e...
517398,zufferli-j/sent_items/97.,Message-ID: <28979867.1075842029988.JavaMail.e...
517399,zufferli-j/sent_items/98.,Message-ID: <22052556.1075842030013.JavaMail.e...


In [5]:
# Apply the function to the DataFrame and create a new column with the parsed email data
df['parsed_email'] = df['message'].apply(parse_email)

# Optionally, you can expand the parsed email data into separate columns
parsed_df = pd.json_normalize(df['parsed_email'])

# Combine the original DataFrame with the parsed data
final_df = pd.concat([df, parsed_df], axis=1)

# Save the resulting DataFrame to a CSV file
final_df.to_csv("parsed_emails.csv", index=False)

# Display the first few rows of the resulting DataFrame
print(final_df.head())

                       file  \
0     allen-p/_sent_mail/1.   
1    allen-p/_sent_mail/10.   
2   allen-p/_sent_mail/100.   
3  allen-p/_sent_mail/1000.   
4  allen-p/_sent_mail/1001.   

                                             message  \
0  Message-ID: <18782981.1075855378110.JavaMail.e...   
1  Message-ID: <15464986.1075855378456.JavaMail.e...   
2  Message-ID: <24216240.1075855687451.JavaMail.e...   
3  Message-ID: <13505866.1075863688222.JavaMail.e...   
4  Message-ID: <30922949.1075863688243.JavaMail.e...   

                                        parsed_email  \
0  {'Message-ID': '<18782981.1075855378110.JavaMa...   
1  {'Message-ID': '<15464986.1075855378456.JavaMa...   
2  {'Message-ID': '<24216240.1075855687451.JavaMa...   
3  {'Message-ID': '<13505866.1075863688222.JavaMa...   
4  {'Message-ID': '<30922949.1075863688243.JavaMa...   

                                      Message-ID  \
0  <18782981.1075855378110.JavaMail.evans@thyme>   
1  <15464986.1075855378456.JavaMail

In [4]:
parsed_df = pd.read_csv('parsed_emails.csv')


In [5]:
parsed_df.head()

Unnamed: 0,file,message,parsed_email,Message-ID,Date,From,To,Subject,Mime-Version,Content-Type,Content-Transfer-Encoding,X-From,X-To,X-cc,X-bcc,X-Folder,X-Origin,X-FileName,Body
0,allen-p/_sent_mail/1.,Message-ID: <18782981.1075855378110.JavaMail.e...,{'Message-ID': '<18782981.1075855378110.JavaMa...,<18782981.1075855378110.JavaMail.evans@thyme>,"Mon, 14 May 2001 16:39:00 -0700 (PDT)",phillip.allen@enron.com,tim.belden@enron.com,,1.0,text/plain; charset=us-ascii,7bit,Phillip K Allen,Tim Belden <Tim Belden/Enron@EnronXGate>,,,"\Phillip_Allen_Jan2002_1\Allen, Phillip K.\'Se...",Allen-P,pallen (Non-Privileged).pst,\nHere is our forecast\n\n
1,allen-p/_sent_mail/10.,Message-ID: <15464986.1075855378456.JavaMail.e...,{'Message-ID': '<15464986.1075855378456.JavaMa...,<15464986.1075855378456.JavaMail.evans@thyme>,"Fri, 4 May 2001 13:51:00 -0700 (PDT)",phillip.allen@enron.com,john.lavorato@enron.com,Re:,1.0,text/plain; charset=us-ascii,7bit,Phillip K Allen,John J Lavorato <John J Lavorato/ENRON@enronXg...,,,"\Phillip_Allen_Jan2002_1\Allen, Phillip K.\'Se...",Allen-P,pallen (Non-Privileged).pst,\nTraveling to have a business meeting takes t...
2,allen-p/_sent_mail/100.,Message-ID: <24216240.1075855687451.JavaMail.e...,{'Message-ID': '<24216240.1075855687451.JavaMa...,<24216240.1075855687451.JavaMail.evans@thyme>,"Wed, 18 Oct 2000 03:00:00 -0700 (PDT)",phillip.allen@enron.com,leah.arsdall@enron.com,Re: test,1.0,text/plain; charset=us-ascii,7bit,Phillip K Allen,Leah Van Arsdall,,,\Phillip_Allen_Dec2000\Notes Folders\'sent mail,Allen-P,pallen.nsf,\ntest successful. way to go!!!
3,allen-p/_sent_mail/1000.,Message-ID: <13505866.1075863688222.JavaMail.e...,{'Message-ID': '<13505866.1075863688222.JavaMa...,<13505866.1075863688222.JavaMail.evans@thyme>,"Mon, 23 Oct 2000 06:13:00 -0700 (PDT)",phillip.allen@enron.com,randall.gay@enron.com,,1.0,text/plain; charset=us-ascii,7bit,Phillip K Allen,Randall L Gay,,,\Phillip_Allen_Dec2000\Notes Folders\'sent mail,Allen-P,pallen.nsf,"\nRandy,\n\n Can you send me a schedule of the..."
4,allen-p/_sent_mail/1001.,Message-ID: <30922949.1075863688243.JavaMail.e...,{'Message-ID': '<30922949.1075863688243.JavaMa...,<30922949.1075863688243.JavaMail.evans@thyme>,"Thu, 31 Aug 2000 05:07:00 -0700 (PDT)",phillip.allen@enron.com,greg.piper@enron.com,Re: Hello,1.0,text/plain; charset=us-ascii,7bit,Phillip K Allen,Greg Piper,,,\Phillip_Allen_Dec2000\Notes Folders\'sent mail,Allen-P,pallen.nsf,\nLet's shoot for Tuesday at 11:45.
