## Imports

In [1]:
import mailbox
import pandas as pd
import email
import os
from email.header import decode_header
import datetime
from bs4 import BeautifulSoup
import re
from tqdm import tqdm
import sqlite3
import html
import sys
import argparse


In [2]:
### Importing classes

In [3]:
from models.models import EmailAddress, MailingList, Organisation, Position, Entity, Attachment, ReceiverEmail, SenderEmail

In [2]:
## Attempt #1

In [4]:
def decode_str(s):
    """Decode encoded email header strings"""
    if s is None:
        return ""
    decoded_parts = decode_header(s)
    return ''.join([
        part.decode(encoding or 'utf-8', errors='replace') if isinstance(part, bytes) else part
        for part, encoding in decoded_parts
    ])

def mbox_to_dataframe(mbox_path):
    """Convert an mbox file to a pandas DataFrame"""
    # Open the mbox file
    mbox = mailbox.mbox(mbox_path)

    # Extract folder name from path
    folder_name = os.path.basename(mbox_path).replace('.mbox', '')

    # Create a list to store email data
    emails = []

    # Process each message
    for i, message in enumerate(mbox):
        # Extract basic headers
        subject = decode_str(message['subject'])
        from_addr = decode_str(message['from'])
        to_addr = decode_str(message['to'])
        date_str = message['date']

        # Parse date
        try:
            date = email.utils.parsedate_to_datetime(date_str)
        except:
            date = None

        # Get message body
        body = ""
        if message.is_multipart():
            for part in message.walk():
                content_type = part.get_content_type()
                content_disposition = str(part.get("Content-Disposition"))

                # Skip attachments
                if "attachment" in content_disposition:
                    continue

                # Get text content
                if content_type == "text/plain":
                    try:
                        body_part = part.get_payload(decode=True)
                        charset = part.get_content_charset() or 'utf-8'
                        body += body_part.decode(charset, errors='replace')
                    except:
                        body += "[Error decoding message body]"
        else:
            # Not multipart - get payload directly
            try:
                body = message.get_payload(decode=True).decode(message.get_content_charset() or 'utf-8', errors='replace')
            except:
                body = "[Error decoding message body]"

        # Add to emails list
        emails.append({
            'message_id': message['message-id'],
            'subject': subject,
            'from': from_addr,
            'to': to_addr,
            'date': date,
            'folder': folder_name,
            'body': body,
            'has_attachments': "attachment" in str(message).lower()
        })

    # Create DataFrame
    return pd.DataFrame(emails)

# Process all mbox files in a directory
def process_all_mbox_files(directory):
    all_emails = pd.DataFrame()

    for filename in os.listdir(directory):
        if filename.endswith('.mbox'):
            file_path = os.path.join(directory, filename)
            print(f"Processing {filename}...")
            df = mbox_to_dataframe(file_path)
            all_emails = pd.concat([all_emails, df], ignore_index=True)

    return all_emails

# Example usage:
# directory = "/path/to/your/mbox/files"
# emails_df = process_all_mbox_files(directory)

# Save to various formats:
# emails_df.to_csv('emails.csv', index=False)
# emails_df.to_pickle('emails.pkl')  # Pandas format
# emails_df.to_parquet('emails.parquet')  # Parquet format

# To save to SQLite database:
# import sqlite3
# conn = sqlite3.connect('emails.db')
# emails_df.to_sql('emails', conn, index=False, if_exists='replace')
# conn.close()

In [None]:
emails_df = process_all_mbox_files("data/processed/mailbox_cecile/")

Processing AG.mbox...
Processing Archive.mbox...
Processing Archives calssifiees.mbox...
Processing Ateliers.mbox...
Processing Boîte de réception.mbox...
Processing Brouillons.mbox...
Processing Conflit.mbox...
Processing Courrier indésirable.mbox...
Processing Formation à distance.mbox...
Processing Gazette.mbox...
Processing gestioncrise.mbox...
Processing Idees.mbox...
Processing Instances.mbox...
Processing Plaidoyer.mbox...
Processing RH.mbox...
Processing Éléments envoyés.mbox...
Processing Éléments supprimés.mbox...


In [None]:
emails_df.head(1)

# </v:shape><![endif]--><![if !vml]><img width=143 height=201 style='width:1.4895in;height:2.0937in' src="cid:image007.jpg@01D63F43.70FD3750" align=left hspace=12 v:shapes="Image_x0020_1"><![endif]><b><span style='font-size:10.0pt;font-family:"HelveticaNeueLT Com 55 Roman";color:#9DC107;mso-fareast-language:FR'><o:p></o:p></span></b></p><p class=MsoNormal><b><span style='font-size:10.0pt;font-family:"HelveticaNeueLT Com 55 Roman";color:#9DC107;mso-fareast-language:FR'>Anne Clerc<o:p></o:p></span></b></p><p class=MsoNormal><b><span style='font-size:10.0pt;font-family:"HelveticaNeueLT Com 55 Roman";color:#9DC107;mso-fareast-language:FR'>D�l�gu�e G�n�rale<o:p></o:p></span></b></p><p class=MsoNormal><span style='font-size:10.0pt;font-family:"HelveticaNeueLT Com 55 Roman";mso-fareast-language:FR'>Association des archivistes fran�ais<o:p></o:p></span></p><p class=MsoNormal><span style='font-size:10.0pt;font-family:"HelveticaNeueLT Com 55 Roman";mso-fareast-language:FR'>8 rue Jean-Marie J�go<o:p></o:p></span></p><p class=MsoNormal><span style='font-size:10.0pt;font-family:"HelveticaNeueLT Com 55 Roman";mso-fareast-language:FR'>75013 PARIS<o:p></o:p></span></p><p class=MsoNormal><span style='font-size:10.0pt;font-family:"HelveticaNeueLT Com 55 Roman";mso-fareast-language:FR'>T�l. 01 46 06 40 12<o:p></o:p></span></p><p class=MsoNormal><span style='font-size:10.0pt;font-family:"HelveticaNeueLT Com 55 Roman";mso-fareast-language:FR'>Portable 06 79 53 47 40<o:p></o:p></span></p><p class=MsoNormal><span style='mso-fareast-language:FR'><a href="mailto:vieassociative@archivistes.org"><span style='font-size:10.0pt;font-family:"HelveticaNeueLT Com 55 Roman";color:blue'>delegation_generale@archivistes.org</span></a></span><span style='font-size:10.0pt;font-family:"HelveticaNeueLT Com 55 Roman";mso-fareast-language:FR'><o:p></o:p></span></p><p class=MsoNormal><span style='mso-fareast-language:FR'><a href="https://www.archivistes.org/"><span style='font-size:10.0pt;font-family:"HelveticaNeueLT Com 55 Roman";color:blue'>https://www.archivistes.org</span></a></span><span style='font-size:10.0pt;font-family:"HelveticaNeueLT Com 55 Roman";mso-fareast-language:FR'><o:p></o:p></span></p><p class=MsoNormal><b><span style='font-size:10.0pt;font-family:"HelveticaNeueLT Com 55 Roman";color:#9DC107;mso-fareast-language:FR'><o:p>&nbsp;</o:p></span></b></p><p class=MsoNormal><o:p>&nbsp;</o:p></p><p class=MsoNormal><o:p>&nbsp;</o:p></p><p class=MsoNormal><o:p>&nbsp;</o:p></p><div><div style='border:none;border-top:solid #E1E1E1 1.0pt;padding:3.0pt 0cm 0cm 0cm'><p class=MsoNormal><b><span style='mso-fareast-language:FR'>De&nbsp;:</span></b><span style='mso-fareast-language:FR'> Daniel BUCHOUX &lt;d.buchoux@aca.nexia.fr&gt; <br><b>Envoy�&nbsp;:</b> mercredi 10 juin 2020 16:17<br><b>�&nbsp;:</b> AAF - Anne Clerc, d�l�gu�e g�n�rale &lt;delegation_generale@archivistes.org&gt;; Dolly RAZAFINANJA &lt;d.razafinanja@aca.nexia.fr&gt;; Simon DELEDICQ &lt;s.deledicq@aca.nexia.fr&gt;<br><b>Objet&nbsp;:</b> RE: Rappel : Assembl�e g�n�rale de l'AAF le 29 juin 2020 � partir de 17h en visioconf�rence<o:p></o:p></span></p></div></div><p class=MsoNormal><o:p>&nbsp;</o:p></p><p class=MsoNormal><span style='font-size:10.0pt;font-family:Effra;color:#575756'>Bonjour Anne,<o:p></o:p></span></p><p class=MsoNormal><span style='font-size:10.0pt;font-family:Effra;color:#575756'><o:p>&nbsp;</o:p></span></p><p class=MsoNormal><span style='font-size:10.0pt;font-family:Effra;color:#575756'>J&#8217;esp�re que vous allez bien.<o:p></o:p></span></p><p class=MsoNormal><span style='font-size:10.0pt;font-family:Effra;color:#575756'><o:p>&nbsp;</o:p></span></p><p class=MsoNormal><span style='font-size:10.0pt;font-family:Effra;color:#575756'>Je serai pr�sent.<o:p></o:p></span></p><p class=MsoNormal><span style='font-size:10.0pt;font-family:Effra;color:#575756'><o:p>&nbsp;</o:p></span></p><p class=MsoNormal><span style='font-size:10.0pt;font-family:Effra;color:#575756'>Bonne journ�e.<o:p></o:p></span></p><p class=MsoNormal><span style='font-size:10.0pt;font-family:Effra;color:#575756'><o:p>&nbsp;</o:p></span></p><p class=MsoNormal><b><span style='font-size:10.0pt;font-family:Effra;color:#00B2A9;mso-fareast-language:FR'>Daniel Buchoux </span></b><span style='font-size:9.0pt;font-family:Effra;color:#575756;mso-fareast-language:FR'><br><br>Aca Nexia </span><b><span style='font-size:9.0pt;font-family:Effra;color:#00B2A9;mso-fareast-language:FR'>|</span></b><span style='font-size:9.0pt;font-family:Effra;color:#575756;mso-fareast-language:FR'> 31, rue Henri Rochefort, 75017 Paris, France <br></span><b><span style='font-size:9.0pt;font-family:Effra;color:#00B2A9;mso-fareast-language:FR'>T</span></b><span style='font-size:9.0pt;font-family:Effra;color:#575756;mso-fareast-language:FR'> +33 (0)1 47 66 77 88 </span><b><span style='font-size:9.0pt;font-family:Effra;color:#00B2A9;mso-fareast-language:FR'>| M</span></b><span style='font-size:9.0pt;font-family:Effra;color:#575756;mso-fareast-language:FR'> +33 (0)6 07 76 25 49 <br></span><b><span style='font-size:9.0pt;font-family:Effra;color:#00B2A9;mso-fareast-language:FR'>E</span></b><span style='font-size:9.0pt;font-family:Effra;color:#575756;mso-fareast-language:FR'> <a href="mailto:d.buchoux@aca.nexia.fr"><span style='color:#575756'>d.buchoux@aca.nexia.fr</span></a> <br><a href="http://www.aca.nexia.fr"><span style='color:#575756'>aca.nexia.fr</span></a> <br><br><img border=0 width=100 height=39 style='width:1.0416in;height:.4062in' id="Image_x0020_9" src="cid:image008.png@01D63F43.70FD3750" alt="Aca, Independant member of Nexia International">&nbsp; </span><span style='font-size:10.0pt;font-family:Effra;color:#575756'><o:p></o:p></span></p><div><div style='border:none;border-top:solid #E1E1E1 1.0pt;padding:3.0pt 0cm 0cm 0cm'><p class=MsoNormal><b><span style='mso-fareast-language:FR'>De&nbsp;:</span></b><span style='mso-fareast-language:FR'> AAF - Anne Clerc, d�l�gu�e g�n�rale &lt;delegation_generale@archivistes.org&gt; <br><b>Envoy�&nbsp;:</b> mercredi 10 juin 2020 15:53<br><b>�&nbsp;:</b> Dolly RAZAFINANJA &lt;d.razafinanja@aca.nexia.fr&gt;; Daniel BUCHOUX &lt;d.buchoux@aca.nexia.fr&gt;; Simon DELEDICQ &lt;s.deledicq@aca.nexia.fr&gt;<br><b>Objet&nbsp;:</b> TR: Rappel : Assembl�e g�n�rale de l'AAF le 29 juin 2020 � partir de 17h en visioconf�rence<o:p></o:p></span></p></div></div><p class=MsoNormal><o:p>&nbsp;</o:p></p><p class=MsoNormal>Bonjour, <o:p></o:p></p><p class=MsoNormal>&nbsp;<o:p></o:p></p><p class=MsoNormal>Pour information, je vous transmets l&#8217;invitation pour notre AG. En comptant sur votre pr�sence.<o:p></o:p></p><p class=MsoNormal><br>Cordialement<o:p></o:p></p><p class=MsoNormal>&nbsp;<o:p></o:p></p><p class=MsoNormal>&nbsp;<o:p></o:p></p><p class=MsoNormal><!--[if gte vml 1]><v:shape id="Image_x0020_2" o:spid="_x0000_s1026" type="#_x0000_t75" style='position:absolute;margin-left:0;margin-top:0;width:107.25pt;height:150.75pt;z-index:251658240;visibility:visible;mso-wrap-style:square;mso-width-percent:0;mso-height-percent:0;mso-wrap-distance-left:4.5pt;mso-wrap-distance-top:0;mso-wrap-distance-right:4.5pt;mso-wrap-distance-bottom:0;mso-position-horizontal:left;mso-position-horizontal-relative:text;mso-position-vertical:absolute;mso-position-vertical-relative:line;mso-width-percent:0;mso-height-percent:0;mso-width-relative:page;mso-height-relative:page' o:allowoverlap="f">

Unnamed: 0,message_id,subject,from,to,date,folder,body,has_attachments
0,<0e8d36fce37f4240b8b7f0965b794923@archivistes....,TR: Rappel : Assemblée générale de l'AAF le 29...,"AAF - Anne Clerc, déléguée générale<delegation...",AAF - Céline Guyon <celine.guyon@archivistes.org>,2020-06-10 16:22:01+02:00,AG,,True


In [None]:
emails_df[emails_df["body"] != ""]["folder"].value_counts()

folder
Boîte de réception      318
Éléments supprimés       12
Courrier indésirable      6
Archives calssifiees      4
Archive                   1
Éléments envoyés          1
Name: count, dtype: int64

In [None]:
# Bonjour et grand Merci Lucile
emails_df["body"][emails_df["body"].str.find("L'ordre du jour de l'assem") != -1]

Series([], Name: body, dtype: object)

In [None]:
emails_df["body"][emails_df["body"] == ""]

0         
1         
2         
3         
4         
        ..
19130     
19131     
19132     
19133     
19135     
Name: body, Length: 18794, dtype: object

In [None]:
emails_df["body"].nunique()

340

## Attempt 2

In [5]:


def decode_str(s):
    """Decode encoded email header strings"""
    if s is None:
        return ""
    try:
        decoded_parts = decode_header(s)
        return ''.join([
            part.decode(encoding or 'utf-8', errors='replace') if isinstance(part, bytes) else part
            for part, encoding in decoded_parts
        ])
    except:
        return str(s)

def extract_text_from_html(html_content):
    """Extract readable text from HTML content"""
    if not html_content:
        return ""

    try:
        # Parse HTML
        soup = BeautifulSoup(html_content, 'html.parser')

        # Remove script and style elements
        for script in soup(["script", "style"]):
            script.extract()

        # Get text
        text = soup.get_text(separator=' ', strip=True)

        # Clean up whitespace
        lines = (line.strip() for line in text.splitlines())
        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
        text = '\n'.join(chunk for chunk in chunks if chunk)

        return text
    except:
        return html_content  # Return original if parsing fails

def get_email_body(message):
    """Extract body text from email message, handling HTML correctly"""
    body_text = ""
    body_html = ""

    if message.is_multipart():
        for part in message.walk():
            content_type = part.get_content_type()
            content_disposition = str(part.get("Content-Disposition") or "")

            # Skip attachments
            if "attachment" in content_disposition:
                continue

            try:
                payload = part.get_payload(decode=True)
                if payload is None:
                    continue

                charset = part.get_content_charset() or 'utf-8'
                decoded_payload = payload.decode(charset, errors='replace')

                if content_type == "text/plain":
                    body_text += decoded_payload
                elif content_type == "text/html":
                    body_html += decoded_payload
            except:
                continue
    else:
        # Not multipart - get payload directly
        try:
            content_type = message.get_content_type()
            payload = message.get_payload(decode=True)
            if payload:
                charset = message.get_content_charset() or 'utf-8'
                decoded_payload = payload.decode(charset, errors='replace')

                if content_type == "text/plain":
                    body_text = decoded_payload
                elif content_type == "text/html":
                    body_html = decoded_payload
        except:
            pass

    # Prefer HTML content but fall back to plain text
    if body_html:
        return {
            "html": body_html,
            "text": extract_text_from_html(body_html),
            "has_html": True
        }
    else:
        return {
            "html": "",
            "text": body_text,
            "has_html": False
        }

def extract_attachments_info(message):
    """Extract information about attachments"""
    attachments = []

    if message.is_multipart():
        for part in message.walk():
            content_disposition = str(part.get("Content-Disposition") or "")

            if "attachment" in content_disposition:
                filename = part.get_filename()
                if filename:
                    try:
                        filename = decode_str(filename)
                    except:
                        pass

                content_type = part.get_content_type()
                attachments.append({
                    "filename": filename,
                    "content_type": content_type,
                    "size": len(part.get_payload(decode=True) or b'')
                })

    return attachments

def extract_recipients(message):
    """Extract all recipients (To, CC, BCC)"""
    to = decode_str(message.get('to') or "")
    cc = decode_str(message.get('cc') or "")
    bcc = decode_str(message.get('bcc') or "")

    return {
        "to": to,
        "cc": cc,
        "bcc": bcc
    }

def extract_message_data(message, folder_name):
    """Extract comprehensive email data"""
    # Extract basic headers
    subject = decode_str(message.get('subject') or "")
    from_addr = decode_str(message.get('from') or "")
    date_str = message.get('date')
    message_id = decode_str(message.get('message-id') or "")

    # Parse date
    try:
        date = email.utils.parsedate_to_datetime(date_str)
    except:
        date = None

    # Get recipients
    recipients = extract_recipients(message)

    # Get body content
    body_content = get_email_body(message)

    # Get attachment info
    attachments = extract_attachments_info(message)

    # Thread information
    references = decode_str(message.get('references') or "")
    in_reply_to = decode_str(message.get('in-reply-to') or "")

    return {
        'message_id': message_id,
        'subject': subject,
        'from': from_addr,
        'to': recipients["to"],
        'cc': recipients["cc"],
        'bcc': recipients["bcc"],
        'date': date,
        'folder': folder_name,
        'body_text': body_content["text"],
        'body_html': body_content["html"],
        'has_html': body_content["has_html"],
        'attachments': attachments,
        'attachment_count': len(attachments),
        'references': references,
        'in_reply_to': in_reply_to
    }

def process_mbox_to_sqlite(mbox_path, conn, batch_size=100):
    """Process mbox file directly to SQLite in batches with progress bar"""
    cursor = conn.cursor()
    mbox = mailbox.mbox(mbox_path)
    folder_name = os.path.basename(mbox_path).replace('.mbox', '')

    # Count messages for progress bar
    total_messages = len(mbox)
    print(f"Processing {folder_name} ({total_messages} messages)")

    # Process in batches
    batch = []
    for i, message in enumerate(tqdm(mbox, total=total_messages, desc=folder_name)):
        email_data = extract_message_data(message, folder_name)

        # Convert attachment data to string
        if email_data['attachments']:
            email_data['attachments_json'] = str(email_data['attachments'])
        else:
            email_data['attachments_json'] = ""

        del email_data['attachments']  # Remove original list

        batch.append(email_data)

        # Process batch
        if len(batch) >= batch_size or i == total_messages - 1:
            if batch:
                # Convert to DataFrame for easy SQL insertion
                df = pd.DataFrame(batch)

                # Write to database
                df.to_sql('emails', conn, if_exists='append', index=False)

                # Clear batch
                batch = []

                # Commit to save progress
                conn.commit()

def setup_database(db_path):
    """Set up the SQLite database schema with proper types and indexes"""
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()

    # Drop table if exists
    cursor.execute("DROP TABLE IF EXISTS emails")

    # Create table with proper columns and types
    cursor.execute('''
    CREATE TABLE emails (
        message_id TEXT,
        subject TEXT,
        "from" TEXT,
        "to" TEXT,
        cc TEXT,
        bcc TEXT,
        date TIMESTAMP,
        folder TEXT,
        body_text TEXT,
        body_html TEXT,
        has_html BOOLEAN,
        attachments_json TEXT,
        attachment_count INTEGER,
        references TEXT,
        in_reply_to TEXT
    )
    ''')

    # Create indexes for common queries
    cursor.execute('CREATE INDEX idx_date ON emails(date)')
    cursor.execute('CREATE INDEX idx_folder ON emails(folder)')
    cursor.execute('CREATE INDEX idx_from ON emails("from")')
    cursor.execute('CREATE INDEX idx_to ON emails("to")')
    cursor.execute('CREATE INDEX idx_subject ON emails(subject)')

    conn.commit()
    return conn

def process_all_mbox_files(directory, db_path='emails.db'):
    """Process all mbox files in directory to a SQLite database"""
    # Setup database
    conn = setup_database(db_path)

    # Get list of mbox files
    mbox_files = [f for f in os.listdir(directory) if f.endswith('.mbox')]
    print(f"Found {len(mbox_files)} mbox files")

    # Process each file
    for filename in mbox_files:
        file_path = os.path.join(directory, filename)
        process_mbox_to_sqlite(file_path, conn)

    # Create full-text search index for body
    cursor = conn.cursor()
    try:
        cursor.execute("CREATE VIRTUAL TABLE IF NOT EXISTS email_fts USING fts5(body_text, content='emails', content_rowid='rowid')")
        cursor.execute("INSERT INTO email_fts(rowid, body_text) SELECT rowid, body_text FROM emails")
    except:
        print("Warning: Full-text search index creation failed. SQLite may not have FTS5 support.")

    conn.commit()
    conn.close()

    print(f"Processing complete. Database saved to {db_path}")

# Example usage:
directory = "data/processed/mailbox_cecile/"
# process_all_mbox_files(directory)

In [None]:
## attempt 3

In [None]:


def decode_str(s):
    """Decode encoded email header strings"""
    if s is None:
        return ""
    try:
        decoded_parts = decode_header(s)
        return ''.join([
            part.decode(encoding or 'utf-8', errors='replace') if isinstance(part, bytes) else part
            for part, encoding in decoded_parts
        ])
    except:
        return str(s)

def extract_text_from_html(html_content):
    """Extract readable text from HTML content without BeautifulSoup"""
    if not html_content:
        return ""

    try:
        # Remove HTML tags with regex (simple approach)
        text = re.sub(r'<[^>]+>', ' ', html_content)

        # Decode HTML entities
        text = html.unescape(text)

        # Clean up whitespace
        text = re.sub(r'\s+', ' ', text).strip()

        return text
    except:
        return html_content  # Return original if parsing fails

def get_email_body(message):
    """Extract body text from email message, handling HTML correctly"""
    body_text = ""
    body_html = ""

    if message.is_multipart():
        for part in message.walk():
            content_type = part.get_content_type()
            content_disposition = str(part.get("Content-Disposition") or "")

            # Skip attachments
            if "attachment" in content_disposition:
                continue

            try:
                payload = part.get_payload(decode=True)
                if payload is None:
                    continue

                charset = part.get_content_charset() or 'utf-8'
                decoded_payload = payload.decode(charset, errors='replace')

                if content_type == "text/plain":
                    body_text += decoded_payload
                elif content_type == "text/html":
                    body_html += decoded_payload
            except:
                continue
    else:
        # Not multipart - get payload directly
        try:
            content_type = message.get_content_type()
            payload = message.get_payload(decode=True)
            if payload:
                charset = message.get_content_charset() or 'utf-8'
                decoded_payload = payload.decode(charset, errors='replace')

                if content_type == "text/plain":
                    body_text = decoded_payload
                elif content_type == "text/html":
                    body_html = decoded_payload
        except:
            pass

    # Prefer HTML content but fall back to plain text
    if body_html:
        return {
            "html": body_html,
            "text": extract_text_from_html(body_html),
            "has_html": True
        }
    else:
        return {
            "html": "",
            "text": body_text,
            "has_html": False
        }

def extract_attachments_info(message):
    """Extract information about attachments"""
    attachments = []

    if message.is_multipart():
        for part in message.walk():
            content_disposition = str(part.get("Content-Disposition") or "")

            if "attachment" in content_disposition:
                filename = part.get_filename()
                if filename:
                    try:
                        filename = decode_str(filename)
                    except:
                        pass

                content_type = part.get_content_type()
                attachments.append({
                    "filename": filename,
                    "content_type": content_type,
                    "size": len(part.get_payload(decode=True) or b'')
                })

    return attachments

def extract_recipients(message):
    """Extract all recipients (To, CC, BCC)"""
    to = decode_str(message.get('to') or "")
    cc = decode_str(message.get('cc') or "")
    bcc = decode_str(message.get('bcc') or "")

    return {
        "to": to,
        "cc": cc,
        "bcc": bcc
    }

def extract_message_data(message, folder_name):
    """Extract comprehensive email data"""
    # Extract basic headers
    subject = decode_str(message.get('subject') or "")
    from_addr = decode_str(message.get('from') or "")
    date_str = message.get('date')
    message_id = decode_str(message.get('message-id') or "")

    # Parse date
    try:
        date = email.utils.parsedate_to_datetime(date_str)
    except:
        date = None

    # Get recipients
    recipients = extract_recipients(message)

    # Get body content
    body_content = get_email_body(message)

    # Get attachment info
    attachments = extract_attachments_info(message)

    # Thread information
    references = decode_str(message.get('references') or "")
    in_reply_to = decode_str(message.get('in-reply-to') or "")

    return {
        'message_id': message_id,
        'subject': subject,
        'from': from_addr,
        'to': recipients["to"],
        'cc': recipients["cc"],
        'bcc': recipients["bcc"],
        'date': date,
        'folder': folder_name,
        'body_text': body_content["text"],
        'body_html': body_content["html"],
        'has_html': body_content["has_html"],
        'attachments': attachments,
        'attachment_count': len(attachments),
        'references': references,
        'in_reply_to': in_reply_to
    }

def process_mbox_to_sqlite(mbox_path, conn, batch_size=100):
    """Process mbox file directly to SQLite in batches with progress bar"""
    cursor = conn.cursor()
    mbox = mailbox.mbox(mbox_path)
    folder_name = os.path.basename(mbox_path).replace('.mbox', '')

    # Count messages for progress bar
    total_messages = len(mbox)
    print(f"Processing {folder_name} ({total_messages} messages)")

    # Process in batches
    batch = []
    for i, message in enumerate(tqdm(mbox, total=total_messages, desc=folder_name)):
        email_data = extract_message_data(message, folder_name)

        # Convert attachment data to string
        if email_data['attachments']:
            email_data['attachments_json'] = str(email_data['attachments'])
        else:
            email_data['attachments_json'] = ""

        del email_data['attachments']  # Remove original list

        batch.append(email_data)

        # Process batch
        if len(batch) >= batch_size or i == total_messages - 1:
            if batch:
                # Convert to DataFrame for easy SQL insertion
                df = pd.DataFrame(batch)

                # Write to database
                df.to_sql('emails', conn, if_exists='append', index=False)

                # Clear batch
                batch = []

                # Commit to save progress
                conn.commit()

def setup_database(db_path):
    """Set up the SQLite database schema with proper types and indexes"""
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()

    # Drop table if exists
    cursor.execute("DROP TABLE IF EXISTS emails")

    # Create table with proper columns and types
    # Note: Escaping the reserved keyword 'references' with quotes
    cursor.execute('''
    CREATE TABLE emails (
        message_id TEXT,
        subject TEXT,
        "from" TEXT,
        "to" TEXT,
        cc TEXT,
        bcc TEXT,
        date TIMESTAMP,
        folder TEXT,
        body_text TEXT,
        body_html TEXT,
        has_html BOOLEAN,
        attachments_json TEXT,
        attachment_count INTEGER,
        "references" TEXT,
        in_reply_to TEXT
    )
    ''')

    # Create indexes for common queries
    cursor.execute('CREATE INDEX idx_date ON emails(date)')
    cursor.execute('CREATE INDEX idx_folder ON emails(folder)')
    cursor.execute('CREATE INDEX idx_from ON emails("from")')
    cursor.execute('CREATE INDEX idx_to ON emails("to")')
    cursor.execute('CREATE INDEX idx_subject ON emails(subject)')

    conn.commit()
    return conn

def process_all_mbox_files(directory, db_path='emails.db'):
    """Process all mbox files in directory to a SQLite database"""
    # Setup database
    conn = setup_database(db_path)

    # Get list of mbox files
    mbox_files = [f for f in os.listdir(directory) if f.endswith('.mbox')]
    print(f"Found {len(mbox_files)} mbox files")

    # Process each file
    for filename in mbox_files:
        file_path = os.path.join(directory, filename)
        process_mbox_to_sqlite(file_path, conn)

    conn.commit()
    conn.close()

    print(f"Processing complete. Database saved to {db_path}")

# Example usage:
directory = "data/processed/mailbox_cecile/"
process_all_mbox_files(directory)

Found 17 mbox files
Processing AG (6 messages)


AG: 100%|██████████| 6/6 [00:00<00:00, 12.01it/s]


Processing Archive (10 messages)


Archive: 100%|██████████| 10/10 [00:00<00:00, 22.86it/s]


Processing Archives calssifiees (423 messages)


Archives calssifiees: 100%|██████████| 423/423 [00:20<00:00, 21.14it/s] 


Processing Ateliers (28 messages)


Ateliers: 100%|██████████| 28/28 [00:00<00:00, 66.75it/s]


Processing Boîte de réception (12499 messages)


Boîte de réception: 100%|██████████| 12499/12499 [02:42<00:00, 77.01it/s] 


Processing Brouillons (41 messages)


Brouillons: 100%|██████████| 41/41 [00:00<00:00, 146.12it/s]


Processing Conflit (6 messages)


Conflit: 100%|██████████| 6/6 [00:00<00:00, 52.64it/s]


Processing Courrier indésirable (45 messages)


Courrier indésirable: 100%|██████████| 45/45 [00:00<00:00, 73.51it/s] 


Processing Formation à distance (2 messages)


Formation à distance: 100%|██████████| 2/2 [00:00<00:00, 48.40it/s]


Processing Gazette (10 messages)


Gazette: 100%|██████████| 10/10 [00:00<00:00, 94.99it/s]


Processing gestioncrise (74 messages)


gestioncrise: 100%|██████████| 74/74 [00:00<00:00, 89.31it/s] 


Processing Idees (18 messages)


Idees: 100%|██████████| 18/18 [00:00<00:00, 120.81it/s]


Processing Instances (60 messages)


Instances: 100%|██████████| 60/60 [00:00<00:00, 85.39it/s] 


Processing Plaidoyer (38 messages)


Plaidoyer: 100%|██████████| 38/38 [00:00<00:00, 109.76it/s]


Processing RH (40 messages)


RH: 100%|██████████| 40/40 [00:00<00:00, 113.77it/s]


Processing Éléments envoyés (5559 messages)


Éléments envoyés: 100%|██████████| 5559/5559 [00:49<00:00, 111.57it/s]


Processing Éléments supprimés (277 messages)


Éléments supprimés: 100%|██████████| 277/277 [00:03<00:00, 92.19it/s] 

Processing complete. Database saved to emails.db





Test

In [None]:

# Connect to the database
db_path = "emails.db"  # Update this path if needed
conn = sqlite3.connect(db_path)

# Method 1: Get the entire table as a DataFrame
df = pd.read_sql_query("SELECT * FROM emails", conn)

# If the table is very large, you might want to limit it
# df = pd.read_sql_query("SELECT * FROM emails LIMIT 1000", conn)

# Method 2: Get specific columns only
# columns_df = pd.read_sql_query("SELECT message_id, subject, \"from\", \"to\", date, folder FROM emails", conn)

# # Method 3: With a specific condition
# filtered_df = pd.read_sql_query("SELECT * FROM emails WHERE folder = 'Inbox'", conn)

# Don't forget to close the connection when done
conn.close()

# Now you can work with the DataFrame(s)
print(df.shape)  # Shows number of rows and columns
print(df.columns)  # Shows all column names
df.head(2)  # Shows the first 5 rows

In [None]:
# import sqlite3
# import pandas as pd
# import re
# import html
# from tqdm import tqdm
# import sys
# import argparse

# def extract_clean_text_from_html(html_content):
#     """
#     Extract clean, readable text from HTML content.

#     Args:
#         html_content (str): HTML content to clean

#     Returns:
#         str: Clean text without HTML tags
#     """
#     if not html_content:
#         return ""

#     try:
#         # Remove scripts, styles, and other tags that contain content we don't want
#         html_content = re.sub(r'<(script|style|head).*?>.*?</\1>', ' ', html_content, flags=re.DOTALL)

#         # Replace common block elements with newlines to preserve structure
#         html_content = re.sub(r'</(p|div|h\d|tr|li)>', '\n', html_content)
#         html_content = re.sub(r'<br[^>]*>', '\n', html_content)

#         # Replace table cells with tab separation
#         html_content = re.sub(r'</td>', '\t', html_content)

#         # Remove all HTML tags
#         text = re.sub(r'<[^>]+>', ' ', html_content)

#         # Decode HTML entities (&nbsp;, &lt;, etc.)
#         text = html.unescape(text)

#         # Fix some common patterns in emails
#         text = re.sub(r'De :\s+.*?Envoyé :', '\n---\nDe : ', text)

#         # Clean up whitespace (multiple spaces, tabs, newlines)
#         text = re.sub(r'[ \t]+', ' ', text)
#         text = re.sub(r'\n{3,}', '\n\n', text)

#         # Final cleanup to remove leading/trailing whitespace
#         return text.strip()
#     except Exception as e:
#         print(f"Error processing HTML: {e}")
#         return f"Error processing HTML content: {str(e)}"

# def process_database(db_path, limit=None, batch_size=500, verbose=True):
#     """Process the entire database, adding clean text for all HTML emails"""
#     conn = sqlite3.connect(db_path)
#     cursor = conn.cursor()

#     # Add column if it doesn't exist
#     try:
#         cursor.execute("ALTER TABLE emails ADD COLUMN body_clean_text TEXT")
#         print("Added body_clean_text column to database")
#     except sqlite3.OperationalError:
#         if verbose:
#             print("The body_clean_text column already exists")

#     # Count HTML emails
#     cursor.execute("SELECT COUNT(*) FROM emails WHERE has_html = 1")
#     total_html = cursor.fetchone()[0]

#     if limit:
#         total_to_process = min(limit, total_html)
#     else:
#         total_to_process = total_html

#     if verbose:
#         print(f"Processing {total_to_process} of {total_html} HTML emails...")

#     # Process HTML emails in batches
#     processed = 0
#     for offset in range(0, total_to_process, batch_size):
#         if limit and offset >= limit:
#             break

#         current_batch = min(batch_size, total_to_process - offset)

#         if verbose:
#             print(f"Processing batch {offset//batch_size + 1} ({offset+1}-{offset+current_batch} of {total_to_process})")

#         # Get batch of emails
#         cursor.execute(f"""
#             SELECT rowid, message_id, body_html
#             FROM emails
#             WHERE has_html = 1
#             LIMIT {current_batch} OFFSET {offset}
#         """)

#         rows = cursor.fetchall()

#         # Process each email in the batch
#         for row in tqdm(rows, desc="Processing", disable=not verbose):
#             rowid, message_id, body_html = row

#             if body_html:
#                 clean_text = extract_clean_text_from_html(body_html)

#                 # Update database
#                 cursor.execute(
#                     "UPDATE emails SET body_clean_text = ? WHERE rowid = ?",
#                     (clean_text, rowid)
#                 )

#                 processed += 1

#         # Commit after each batch
#         conn.commit()

#     # Process non-HTML emails (copy body_text to body_clean_text)
#     cursor.execute("SELECT COUNT(*) FROM emails WHERE has_html = 0 AND body_clean_text IS NULL")
#     non_html_count = cursor.fetchone()[0]

#     if non_html_count > 0:
#         if verbose:
#             print(f"Copying text for {non_html_count} non-HTML emails...")

#         cursor.execute("""
#             UPDATE emails
#             SET body_clean_text = body_text
#             WHERE has_html = 0 AND body_clean_text IS NULL
#         """)
#         conn.commit()

#     #

In [None]:
df["body_html"][0]

'<html xmlns:v="urn:schemas-microsoft-com:vml" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:w="urn:schemas-microsoft-com:office:word" xmlns:m="http://schemas.microsoft.com/office/2004/12/omml" xmlns="http://www.w3.org/TR/REC-html40"><head><meta http-equiv=Content-Type content="text/html; charset=iso-8859-1"><meta name=Generator content="Microsoft Word 15 (filtered medium)"><!--[if !mso]><style>v\\:* {behavior:url(#default#VML);}\no\\:* {behavior:url(#default#VML);}\nw\\:* {behavior:url(#default#VML);}\n.shape {behavior:url(#default#VML);}\n</style><![endif]--><style><!--\n/* Font Definitions */\n@font-face\n\t{font-family:Helvetica;\n\tpanose-1:2 11 6 4 2 2 2 2 2 4;}\n@font-face\n\t{font-family:"Cambria Math";\n\tpanose-1:2 4 5 3 5 4 6 3 2 4;}\n@font-face\n\t{font-family:Calibri;\n\tpanose-1:2 15 5 2 2 2 4 3 2 4;}\n@font-face\n\t{font-family:"Century Gothic";\n\tpanose-1:2 11 5 2 2 2 2 2 2 4;}\n@font-face\n\t{font-family:"HelveticaNeueLT Com 55 Roman";}\n@font-face\n\t{font-

In [None]:
df["body_html"][df["body_html"].str.find("ordre du jour de l") != -1]

0        <html xmlns:v="urn:schemas-microsoft-com:vml" ...
1        <html xmlns:v="urn:schemas-microsoft-com:vml" ...
705      <html xmlns:v="urn:schemas-microsoft-com:vml" ...
725      <html xmlns:v="urn:schemas-microsoft-com:vml" ...
727      <html><head>\n<meta http-equiv="Content-Type" ...
1474     <html xmlns:v="urn:schemas-microsoft-com:vml" ...
1562     <html xmlns:v="urn:schemas-microsoft-com:vml" ...
1572     <html xmlns:v="urn:schemas-microsoft-com:vml" ...
5258     <meta http-equiv="Content-Type" content="text/...
6271     <html xmlns:v="urn:schemas-microsoft-com:vml" ...
6774     <html xmlns:v="urn:schemas-microsoft-com:vml" ...
8466     <html xmlns:v="urn:schemas-microsoft-com:vml" ...
8476     <html xmlns:v="urn:schemas-microsoft-com:vml" ...
9354     <html>\n<head>\n<meta http-equiv="Content-Type...
9397     <META HTTP-EQUIV="Content-Type" CONTENT="text/...
9410     <META HTTP-EQUIV="Content-Type" CONTENT="text/...
9411     <html xmlns:v="urn:schemas-microsoft-com:vml" .

In [None]:
df["in_reply_to"]

0        <DB8PR08MB505240FF56402FB9D696487BC2830@DB8PR0...
1                                                         
2             <5ecc35d6.1c69fb81.1333f.f055@mx.google.com>
3             <5ecc35d6.1c69fb81.1333f.f055@mx.google.com>
4        <1241336830.665580.1590440188542.JavaMail.zimb...
                               ...                        
19131                                                     
19132                                                     
19133                                                     
19134    <f722ad124b5c4274a6345dfcd950fe8a@archivistes....
19135                                                     
Name: in_reply_to, Length: 19136, dtype: object

In [None]:
def extract_clean_text_from_html(html_content):
    """
    Extract clean, readable text from HTML content.

    Args:
        html_content (str): HTML content to clean

    Returns:
        str: Clean text without HTML tags
    """
    if not html_content:
        return ""

    try:
        # Remove scripts, styles, and other tags that contain content we don't want
        html_content = re.sub(r'<(script|style|head).*?>.*?</\1>', ' ', html_content, flags=re.DOTALL)

        # Replace common block elements with newlines to preserve structure
        html_content = re.sub(r'</(p|div|h\d|tr|li)>', '\n', html_content)
        html_content = re.sub(r'<br[^>]*>', '\n', html_content)

        # Replace table cells with tab separation
        html_content = re.sub(r'</td>', '\t', html_content)

        # Remove all HTML tags
        text = re.sub(r'<[^>]+>', ' ', html_content)

        # Decode HTML entities (&nbsp;, &lt;, etc.)
        text = html.unescape(text)

        # Handle literal escape sequences that appear in the text
        # Replace literal "\xad" with empty string (remove soft hyphens)
        text = text.replace('\\xad', '')
        # Replace literal "\xa0" with a space (non-breaking spaces)
        text = text.replace('\\xa0', ' ')

        # Handle actual Unicode characters too
        # Remove soft hyphens (invisible hyphens used for word breaks)
        text = text.replace('\xad', '')
        # Replace non-breaking spaces with regular spaces
        text = text.replace('\xa0', ' ')
        # Remove other problematic control characters
        text = re.sub(r'[\x00-\x08\x0b-\x0c\x0e-\x1f\x7f]', '', text)

        # Clean up other escape sequences that might appear in text
        text = text.replace('\\\\', '\\')  # Double backslash to single
        text = text.replace("\\'", "'")    # Escaped single quote
        text = text.replace('\\"', '"')    # Escaped double quote
        text = text.replace('\\n', '\n')   # Literal \n to newline
        text = text.replace('\\t', '\t')   # Literal \t to tab

        # Remove remaining literal escape sequences like \x.. that weren't handled above
        text = re.sub(r'\\x[0-9a-fA-F]{2}', '', text)

        # Clean up whitespace (multiple spaces, tabs, newlines)
        text = re.sub(r'[ \t]+', ' ', text)
        text = re.sub(r'\n{3,}', '\n\n', text)

        # Final cleanup to remove leading/trailing whitespace
        return text.strip()
    except Exception as e:
        print(f"Error processing HTML: {e}")
        return f"Error processing HTML content: {str(e)}"


def process_database(db_path, limit=None, batch_size=500, verbose=True):
    """Process the entire database, adding clean text for all HTML emails"""
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()

    # Add column if it doesn't exist
    try:
        cursor.execute("ALTER TABLE emails ADD COLUMN body_clean_text TEXT")
        print("Added body_clean_text column to database")
    except sqlite3.OperationalError:
        if verbose:
            print("The body_clean_text column already exists")

    # Count HTML emails
    cursor.execute("SELECT COUNT(*) FROM emails WHERE has_html = 1")
    total_html = cursor.fetchone()[0]

    if limit:
        total_to_process = min(limit, total_html)
    else:
        total_to_process = total_html

    if verbose:
        print(f"Processing {total_to_process} of {total_html} HTML emails...")

    # Process HTML emails in batches
    processed = 0
    for offset in range(0, total_to_process, batch_size):
        if limit and offset >= limit:
            break

        current_batch = min(batch_size, total_to_process - offset)

        if verbose:
            print(f"Processing batch {offset//batch_size + 1} ({offset+1}-{offset+current_batch} of {total_to_process})")

        # Get batch of emails
        cursor.execute(f"""
            SELECT rowid, message_id, body_html
            FROM emails
            WHERE has_html = 1
            LIMIT {current_batch} OFFSET {offset}
        """)

        rows = cursor.fetchall()

        # Process each email in the batch
        for row in tqdm(rows, desc="Processing", disable=not verbose):
            rowid, message_id, body_html = row

            if body_html:
                clean_text = extract_clean_text_from_html(body_html)

                # Update database
                cursor.execute(
                    "UPDATE emails SET body_clean_text = ? WHERE rowid = ?",
                    (clean_text, rowid)
                )

                processed += 1

        # Commit after each batch
        conn.commit()

    # Process non-HTML emails (copy body_text to body_clean_text)
    cursor.execute("SELECT COUNT(*) FROM emails WHERE has_html = 0 AND body_clean_text IS NULL")
    non_html_count = cursor.fetchone()[0]

    if non_html_count > 0:
        if verbose:
            print(f"Copying text for {non_html_count} non-HTML emails...")

        cursor.execute("""
            UPDATE emails
            SET body_clean_text = body_text
            WHERE has_html = 0 AND body_clean_text IS NULL
        """)
        conn.commit()

    if verbose:
        print(f"Processing complete. Total processed: {processed} HTML emails + {non_html_count} text emails")

    conn.close()

class EmailAnalyzer:
    """Class for analyzing the email database"""

    def __init__(self, db_path):
        self.db_path = db_path
        self.conn = None

    def connect(self):
        """Connect to the database"""
        if not self.conn:
            self.conn = sqlite3.connect(self.db_path)
            # Configure SQLite to return datetime objects for timestamps
            self.conn.row_factory = sqlite3.Row
        return self.conn

    def close(self):
        """Close the database connection"""
        if self.conn:
            self.conn.close()
            self.conn = None

    def get_email_summary(self):
        """Get a summary of emails in the database"""
        conn = self.connect()

        # Get basic statistics
        stats = {}
        cursor = conn.cursor()

        # Total emails
        cursor.execute("SELECT COUNT(*) FROM emails")
        stats['total_emails'] = cursor.fetchone()[0]

        # Emails by folder
        cursor.execute("SELECT folder, COUNT(*) as count FROM emails GROUP BY folder ORDER BY count DESC")
        stats['emails_by_folder'] = [dict(row) for row in cursor.fetchall()]

        # Emails by year
        cursor.execute("""
            SELECT strftime('%Y', date) AS year, COUNT(*) AS count
            FROM emails
            GROUP BY year
            ORDER BY year
        """)
        stats['emails_by_year'] = [dict(row) for row in cursor.fetchall()]

        # Top senders
        cursor.execute("""
            SELECT "from", COUNT(*) AS count
            FROM emails
            GROUP BY "from"
            ORDER BY count DESC
            LIMIT 10
        """)
        stats['top_senders'] = [dict(row) for row in cursor.fetchall()]

        # Emails with attachments
        cursor.execute("SELECT COUNT(*) FROM emails WHERE attachment_count > 0")
        stats['emails_with_attachments'] = cursor.fetchone()[0]

        return stats

    def search_emails(self, query, limit=100):
        """Search emails by text content"""
        conn = self.connect()

        # Check if body_clean_text exists
        cursor = conn.cursor()
        cursor.execute("PRAGMA table_info(emails)")
        columns = [row[1] for row in cursor.fetchall()]

        if 'body_clean_text' in columns:
            search_column = 'body_clean_text'
        else:
            # Fall back to body_text if clean text doesn't exist
            search_column = 'body_text'

        # Perform the search
        cursor.execute(f"""
            SELECT message_id, subject, "from", "to", date, folder
            FROM emails
            WHERE {search_column} LIKE ?
            ORDER BY date DESC
            LIMIT ?
        """, (f'%{query}%', limit))

        return [dict(row) for row in cursor.fetchall()]

    def get_email_content(self, message_id):
        """Get full content of a specific email by message_id"""
        conn = self.connect()
        cursor = conn.cursor()

        # Check if body_clean_text exists
        cursor.execute("PRAGMA table_info(emails)")
        columns = [row[1] for row in cursor.fetchall()]

        if 'body_clean_text' in columns:
            cursor.execute("""
                SELECT *, body_clean_text AS content
                FROM emails
                WHERE message_id = ?
            """, (message_id,))
        else:
            cursor.execute("""
                SELECT *,
                    CASE WHEN has_html = 1 THEN body_html ELSE body_text END AS content
                FROM emails
                WHERE message_id = ?
            """, (message_id,))

        result = cursor.fetchone()
        if result:
            return dict(result)
        return None

    def get_conversation_thread(self, message_id):
        """Get all emails in the same conversation thread"""
        conn = self.connect()
        cursor = conn.cursor()

        # First get the current email to find its references or in-reply-to
        cursor.execute("""
            SELECT message_id, "references", in_reply_to, subject
            FROM emails
            WHERE message_id = ?
        """, (message_id,))

        email = cursor.fetchone()
        if not email:
            return []

        # Find related messages by references, in-reply-to, or subject thread
        message_ids = set()

        # Add current message
        message_ids.add(email['message_id'])

        # Add messages this email is replying to
        if email['in_reply_to']:
            message_ids.add(email['in_reply_to'])

        # Add messages referenced
        if email['references']:
            ref_ids = re.findall(r'<([^>]+)>', email['references'])
            message_ids.update(ref_ids)

        # Find messages that reply to this one
        cursor.execute("""
            SELECT message_id
            FROM emails
            WHERE in_reply_to = ?
        """, (email['message_id'],))

        for row in cursor.fetchall():
            message_ids.add(row['message_id'])

        # Also find messages with the same subject (ignoring Re:, Fwd:, etc.)
        if email['subject']:
            clean_subject = re.sub(r'^(Re|Fwd|Fw|TR)(\[\d+\])?:\s*', '', email['subject'], flags=re.IGNORECASE)
            if clean_subject:
                cursor.execute("""
                    SELECT message_id
                    FROM emails
                    WHERE subject LIKE ? AND message_id != ?
                """, (f'%{clean_subject}%', email['message_id']))

                for row in cursor.fetchall():
                    message_ids.add(row['message_id'])

        # Now get all the emails in the thread
        placeholders = ','.join(['?'] * len(message_ids))
        cursor.execute(f"""
            SELECT message_id, subject, "from", date, "to", cc
            FROM emails
            WHERE message_id IN ({placeholders})
            ORDER BY date
        """, list(message_ids))

        return [dict(row) for row in cursor.fetchall()]


    def export_to_dataframe(self, query=None, limit=None):
        """Export emails to a pandas DataFrame for analysis"""
        conn = self.connect()

        if query:
            sql = f"{query} "
            if limit:
                sql += f"LIMIT {limit}"
            df = pd.read_sql_query(sql, conn)
        else:
            # Default query to get important fields
            sql = """
                SELECT message_id, subject, "from", "to", date, folder,
                       attachment_count, body_clean_text
                FROM emails
            """
            if limit:
                sql += f" LIMIT {limit}"
            df = pd.read_sql_query(sql, conn)

        # Convert date strings to datetime objects
        if 'date' in df.columns:
            df['date'] = pd.to_datetime(df['date'], errors='coerce')

        return df

def main():
    """Main function to run the script from the command line"""
    parser = argparse.ArgumentParser(description='Process emails in SQLite database to extract clean text')
    parser.add_argument('db_path', help='Path to the SQLite database')
    parser.add_argument('--limit', type=int, help='Limit the number of emails to process')
    parser.add_argument('--batch-size', type=int, default=500, help='Batch size for processing')
    parser.add_argument('--summary', action='store_true', help='Print a summary of the database')
    parser.add_argument('--search', help='Search for emails containing a specific text')
    parser.add_argument('--quiet', action='store_true', help='Reduce output verbosity')

    args = parser.parse_args()

    if args.summary:
        analyzer = EmailAnalyzer(args.db_path)
        summary = analyzer.get_email_summary()

        print(f"Email Database Summary")
        print(f"====================")
        print(f"Total emails: {summary['total_emails']}")
        print(f"Emails with attachments: {summary['emails_with_attachments']}")

        print("\nEmails by folder:")
        for folder in summary['emails_by_folder'][:10]:  # Show top 10
            print(f"  {folder['folder']}: {folder['count']}")

        print("\nEmails by year:")
        for year in summary['emails_by_year']:
            print(f"  {year['year']}: {year['count']}")

        print("\nTop senders:")
        for sender in summary['top_senders'][:5]:  # Show top 5
            print(f"  {sender['from']}: {sender['count']}")

        analyzer.close()
    elif args.search:
        analyzer = EmailAnalyzer(args.db_path)
        results = analyzer.search_emails(args.search)

        print(f"Search results for '{args.search}':")
        print(f"====================")
        for i, email in enumerate(results):
            date = email['date']
            if date:
                try:
                    date = datetime.datetime.fromisoformat(date)
                    date_str = date.strftime('%Y-%m-%d %H:%M')
                except (ValueError, TypeError):
                    date_str = str(date)
            else:
                date_str = 'Unknown'

            print(f"{i+1}. [{date_str}] {email['subject']}")
            print(f"   From: {email['from']}")
            print(f"   To: {email['to']}")
            print(f"   ID: {email['message_id']}")
            print()

        analyzer.close()
    else:
        # Process the database
        process_database(args.db_path, limit=args.limit, batch_size=args.batch_size, verbose=not args.quiet)

# if __name__ == "__main__":
#     main()
# main()
db_path = "emails.db"  # Update this path if needed
process_database(db_path, limit=None, batch_size=500, verbose=True)


The body_clean_text column already exists
Processing 18793 of 18793 HTML emails...
Processing batch 1 (1-500 of 18793)


Processing: 100%|██████████| 500/500 [00:03<00:00, 138.32it/s]


Processing batch 2 (501-1000 of 18793)


Processing: 100%|██████████| 500/500 [00:03<00:00, 142.55it/s]


Processing batch 3 (1001-1500 of 18793)


Processing: 100%|██████████| 500/500 [00:03<00:00, 141.16it/s]


Processing batch 4 (1501-2000 of 18793)


Processing: 100%|██████████| 500/500 [00:02<00:00, 188.70it/s]


Processing batch 5 (2001-2500 of 18793)


Processing: 100%|██████████| 500/500 [00:02<00:00, 169.06it/s]


Processing batch 6 (2501-3000 of 18793)


Processing: 100%|██████████| 500/500 [00:03<00:00, 156.18it/s]


Processing batch 7 (3001-3500 of 18793)


Processing: 100%|██████████| 500/500 [00:02<00:00, 208.76it/s]


Processing batch 8 (3501-4000 of 18793)


Processing: 100%|██████████| 500/500 [00:02<00:00, 199.01it/s]


Processing batch 9 (4001-4500 of 18793)


Processing: 100%|██████████| 500/500 [00:02<00:00, 167.38it/s]


Processing batch 10 (4501-5000 of 18793)


Processing: 100%|██████████| 500/500 [00:03<00:00, 153.29it/s]


Processing batch 11 (5001-5500 of 18793)


Processing: 100%|██████████| 500/500 [00:02<00:00, 172.15it/s]


Processing batch 12 (5501-6000 of 18793)


Processing: 100%|██████████| 500/500 [00:03<00:00, 156.60it/s]


Processing batch 13 (6001-6500 of 18793)


Processing: 100%|██████████| 500/500 [00:02<00:00, 188.61it/s]


Processing batch 14 (6501-7000 of 18793)


Processing: 100%|██████████| 500/500 [00:02<00:00, 191.74it/s]


Processing batch 15 (7001-7500 of 18793)


Processing: 100%|██████████| 500/500 [00:02<00:00, 218.29it/s]


Processing batch 16 (7501-8000 of 18793)


Processing: 100%|██████████| 500/500 [00:02<00:00, 185.42it/s]


Processing batch 17 (8001-8500 of 18793)


Processing: 100%|██████████| 500/500 [00:02<00:00, 197.74it/s]


Processing batch 18 (8501-9000 of 18793)


Processing: 100%|██████████| 500/500 [00:03<00:00, 158.26it/s]


Processing batch 19 (9001-9500 of 18793)


Processing: 100%|██████████| 500/500 [00:03<00:00, 163.40it/s]


Processing batch 20 (9501-10000 of 18793)


Processing: 100%|██████████| 500/500 [00:03<00:00, 140.82it/s]


Processing batch 21 (10001-10500 of 18793)


Processing: 100%|██████████| 500/500 [00:03<00:00, 161.25it/s]


Processing batch 22 (10501-11000 of 18793)


Processing: 100%|██████████| 500/500 [00:03<00:00, 158.80it/s]


Processing batch 23 (11001-11500 of 18793)


Processing: 100%|██████████| 500/500 [00:02<00:00, 170.02it/s]


Processing batch 24 (11501-12000 of 18793)


Processing: 100%|██████████| 500/500 [00:02<00:00, 217.91it/s]


Processing batch 25 (12001-12500 of 18793)


Processing: 100%|██████████| 500/500 [00:02<00:00, 209.56it/s]


Processing batch 26 (12501-13000 of 18793)


Processing: 100%|██████████| 500/500 [00:02<00:00, 186.86it/s]


Processing batch 27 (13001-13500 of 18793)


Processing: 100%|██████████| 500/500 [00:02<00:00, 220.86it/s]


Processing batch 28 (13501-14000 of 18793)


Processing: 100%|██████████| 500/500 [00:02<00:00, 216.14it/s]


Processing batch 29 (14001-14500 of 18793)


Processing: 100%|██████████| 500/500 [00:03<00:00, 164.24it/s]


Processing batch 30 (14501-15000 of 18793)


Processing: 100%|██████████| 500/500 [00:02<00:00, 214.23it/s]


Processing batch 31 (15001-15500 of 18793)


Processing: 100%|██████████| 500/500 [00:02<00:00, 214.17it/s]


Processing batch 32 (15501-16000 of 18793)


Processing: 100%|██████████| 500/500 [00:03<00:00, 130.29it/s]


Processing batch 33 (16001-16500 of 18793)


Processing: 100%|██████████| 500/500 [00:03<00:00, 140.20it/s]


Processing batch 34 (16501-17000 of 18793)


Processing: 100%|██████████| 500/500 [00:03<00:00, 154.25it/s]


Processing batch 35 (17001-17500 of 18793)


Processing: 100%|██████████| 500/500 [00:03<00:00, 142.37it/s]


Processing batch 36 (17501-18000 of 18793)


Processing: 100%|██████████| 500/500 [00:03<00:00, 165.70it/s]


Processing batch 37 (18001-18500 of 18793)


Processing: 100%|██████████| 500/500 [00:03<00:00, 149.40it/s]


Processing batch 38 (18501-18793 of 18793)


Processing: 100%|██████████| 293/293 [00:02<00:00, 102.54it/s]


Processing complete. Total processed: 18793 HTML emails + 0 text emails


In [None]:
analyzer =EmailAnalyzer(db_path)
analyzer.get_email_summary()


NameError: name 'EmailAnalyzer' is not defined

In [None]:
analyzer.get_conversation_thread(message_id="<0e8d36fce37f4240b8b7f0965b794923@archivistes.org>")

[{'message_id': '<0e8d36fce37f4240b8b7f0965b794923@archivistes.org>',
  'subject': "TR: Rappel : Assemblée générale de l'AAF le 29 juin 2020 à partir de 17h en visioconférence",
  'from': 'AAF - Anne Clerc, déléguée générale<delegation_generale@archivistes.org>',
  'date': '2020-06-10 16:22:01+02:00',
  'to': 'AAF - Céline Guyon <celine.guyon@archivistes.org>',
  'cc': 'Catherine Bernard <catherine.bernard.aaf@gmail.com>, "Ducol, Laurent"\n\t<Laurent.Ducol@saint-gobain.com>, AAF vie associative - Nicolas Didon\n\t<vieassociative@archivistes.org>'}]

In [None]:
df_db_cleaned = analyzer.export_to_dataframe()
df_db_cleaned["body_clean_text"][1]

  df['date'] = pd.to_datetime(df['date'], errors='coerce')


'Aux membres de l’AAF \n\n \n\n \n \n \n \n Cher.e membre,\n\nJ’ai l’honneur de vous informer que l’assemblée générale ordinaire annuelle de notre association se tiendra le :\n\n Lundi 29 juin 2020 de 17h à 19h en visioconférence\n \n\nInitialement prévue le 20 mars, l’assemblée générale a été reportée en raison du contexte sanitaire.\n\nL’ordre du jour de l’assemblée générale est le suivant :\n\n Approbation du procès-verbal de la précédente réunion d’avril 2019 ;\n\n Approbation du rapport moral de la présidente pour l’année 2019 ;\n\n Approbation du rapport financier de la trésorière, approbation des comptes\n de l’exercice clos le 31 décembre 2019 ;\n\n Présentation des comptes de l’exercice clos le 31 décembre 2019 pour la filiale\n Archivistes Français Formation ;\n\n Présentation et approbation du budget 2020 révisé pour l’AAF ;\n\n Présentation du budget 2020 révisé pour la filiale Archivistes Français Formation ;\n\n Fixation du montant des cotisations pour l’année 2021.\n\nJe

In [None]:
## attempt 4

In [3]:
import mailbox
import pandas as pd
import email
import os
from email.header import decode_header
import datetime
import re
from tqdm import tqdm
import sqlite3
import html
import json
import argparse

def decode_str(s):
    """Decode encoded email header strings"""
    if s is None:
        return ""
    try:
        decoded_parts = decode_header(s)
        return ''.join([
            part.decode(encoding or 'utf-8', errors='replace') if isinstance(part, bytes) else part
            for part, encoding in decoded_parts
        ])
    except:
        return str(s)

def extract_clean_text_from_html(html_content):
    """
    Extract clean, readable text from HTML content.

    Args:
        html_content (str): HTML content to clean

    Returns:
        str: Clean text without HTML tags
    """
    if not html_content:
        return ""

    try:
        # Remove scripts, styles, and other tags that contain content we don't want
        html_content = re.sub(r'<(script|style|head).*?>.*?</\1>', ' ', html_content, flags=re.DOTALL)

        # Replace common block elements with newlines to preserve structure
        html_content = re.sub(r'</(p|div|h\d|tr|li)>', '\n', html_content)
        html_content = re.sub(r'<br[^>]*>', '\n', html_content)

        # Replace table cells with tab separation
        html_content = re.sub(r'</td>', '\t', html_content)

        # Remove all HTML tags
        text = re.sub(r'<[^>]+>', ' ', html_content)

        # Decode HTML entities (&nbsp;, &lt;, etc.)
        text = html.unescape(text)

        # Handle literal escape sequences that appear in the text
        # Replace literal "\xad" with empty string (remove soft hyphens)
        text = text.replace('\\xad', '')
        # Replace literal "\xa0" with a space (non-breaking spaces)
        text = text.replace('\\xa0', ' ')

        # Handle actual Unicode characters too
        # Remove soft hyphens (invisible hyphens used for word breaks)
        text = text.replace('\xad', '')
        # Replace non-breaking spaces with regular spaces
        text = text.replace('\xa0', ' ')
        # Remove other problematic control characters
        text = re.sub(r'[\x00-\x08\x0b-\x0c\x0e-\x1f\x7f]', '', text)

        # Clean up other escape sequences that might appear in text
        text = text.replace('\\\\', '\\')  # Double backslash to single
        text = text.replace("\\'", "'")    # Escaped single quote
        text = text.replace('\\"', '"')    # Escaped double quote
        text = text.replace('\\n', '\n')   # Literal \n to newline
        text = text.replace('\\t', '\t')   # Literal \t to tab

        # Remove remaining literal escape sequences like \x.. that weren't handled above
        text = re.sub(r'\\x[0-9a-fA-F]{2}', '', text)

        # Clean up whitespace (multiple spaces, tabs, newlines)
        text = re.sub(r'[ \t]+', ' ', text)
        text = re.sub(r'\n{3,}', '\n\n', text)

        # Final cleanup to remove leading/trailing whitespace
        return text.strip()
    except Exception as e:
        print(f"Error processing HTML: {e}")
        return f"Error processing HTML content: {str(e)}"

def get_email_body(message):
    """Extract body text from email message, handling HTML correctly"""
    body_text = ""
    body_html = ""

    if message.is_multipart():
        for part in message.walk():
            content_type = part.get_content_type()
            content_disposition = str(part.get("Content-Disposition") or "")

            # Skip attachments
            if "attachment" in content_disposition:
                continue

            try:
                payload = part.get_payload(decode=True)
                if payload is None:
                    continue

                charset = part.get_content_charset() or 'utf-8'
                decoded_payload = payload.decode(charset, errors='replace')

                if content_type == "text/plain":
                    body_text += decoded_payload
                elif content_type == "text/html":
                    body_html += decoded_payload
            except:
                continue
    else:
        # Not multipart - get payload directly
        try:
            content_type = message.get_content_type()
            payload = message.get_payload(decode=True)
            if payload:
                charset = message.get_content_charset() or 'utf-8'
                decoded_payload = payload.decode(charset, errors='replace')

                if content_type == "text/plain":
                    body_text = decoded_payload
                elif content_type == "text/html":
                    body_html = decoded_payload
        except:
            pass

    # Prefer HTML content but fall back to plain text
    if body_html:
        return {
            "html": body_html,
            "text": extract_text_from_html(body_html),
            "has_html": True
        }
    else:
        return {
            "html": "",
            "text": body_text,
            "has_html": False
        }

def extract_attachments_info(message):
    """Extract information about attachments"""
    attachments = []

    if message.is_multipart():
        for part in message.walk():
            content_disposition = str(part.get("Content-Disposition") or "")

            if "attachment" in content_disposition:
                filename = part.get_filename()
                if filename:
                    try:
                        filename = decode_str(filename)
                    except:
                        pass

                content_type = part.get_content_type()
                attachments.append({
                    "filename": filename,
                    "content_type": content_type,
                    "size": len(part.get_payload(decode=True) or b'')
                })

    return attachments

def extract_recipients(message):
    """Extract all recipients (To, CC, BCC)"""
    to = decode_str(message.get('to') or "")
    cc = decode_str(message.get('cc') or "")
    bcc = decode_str(message.get('bcc') or "")

    return {
        "to": to,
        "cc": cc,
        "bcc": bcc
    }

def extract_message_data(message, folder_name):
    """Extract comprehensive email data"""
    # Extract basic headers
    subject = decode_str(message.get('subject') or "")
    from_addr = decode_str(message.get('from') or "")
    date_str = message.get('date')
    message_id = decode_str(message.get('message-id') or "")

    # Parse date
    try:
        date = email.utils.parsedate_to_datetime(date_str)
    except:
        date = None

    # Get recipients
    recipients = extract_recipients(message)

    # Get body content
    body_content = get_email_body(message)

    # Get attachment info
    attachments = extract_attachments_info(message)

    # Thread information
    references = decode_str(message.get('references') or "")
    in_reply_to = decode_str(message.get('in-reply-to') or "")

    return {
        'message_id': message_id,
        'subject': subject,
        'from': from_addr,
        'to': recipients["to"],
        'cc': recipients["cc"],
        'bcc': recipients["bcc"],
        'date': date,
        'folder': folder_name,
        'body_text': body_content["text"],
        'body_html': body_content["html"],
        'has_html': body_content["has_html"],
        'attachments': attachments,
        'attachment_count': len(attachments),
        'references': references,
        'in_reply_to': in_reply_to
    }

def process_mbox_to_sqlite(mbox_path, conn, batch_size=100):
    """Process mbox file directly to SQLite in batches with progress bar"""
    cursor = conn.cursor()
    mbox = mailbox.mbox(mbox_path)
    folder_name = os.path.basename(mbox_path).replace('.mbox', '')

    # Count messages for progress bar
    total_messages = len(mbox)
    print(f"Processing {folder_name} ({total_messages} messages)")

    # Process in batches
    batch = []
    for i, message in enumerate(tqdm(mbox, total=total_messages, desc=folder_name)):
        email_data = extract_message_data(message, folder_name)

        # Convert attachment data to string
        if email_data['attachments']:
            email_data['attachments_json'] = str(email_data['attachments'])
        else:
            email_data['attachments_json'] = ""

        del email_data['attachments']  # Remove original list

        batch.append(email_data)

        # Process batch
        if len(batch) >= batch_size or i == total_messages - 1:
            if batch:
                # Convert to DataFrame for easy SQL insertion
                df = pd.DataFrame(batch)

                # Write to database
                df.to_sql('emails', conn, if_exists='append', index=False)

                # Clear batch
                batch = []

                # Commit to save progress
                conn.commit()

def setup_database(db_path):
    """Set up the SQLite database schema with proper types and indexes"""
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()

    # Drop table if exists
    cursor.execute("DROP TABLE IF EXISTS emails")

    # Create table with proper columns and types
    # Note: Escaping the reserved keyword 'references' with quotes
    cursor.execute('''
    CREATE TABLE emails (
        message_id TEXT,
        subject TEXT,
        "from" TEXT,
        "to" TEXT,
        cc TEXT,
        bcc TEXT,
        date TIMESTAMP,
        folder TEXT,
        body_text TEXT,
        body_html TEXT,
        has_html BOOLEAN,
        attachments_json TEXT,
        attachment_count INTEGER,
        "references" TEXT,
        in_reply_to TEXT
    )
    ''')

    # Create indexes for common queries
    cursor.execute('CREATE INDEX idx_date ON emails(date)')
    cursor.execute('CREATE INDEX idx_folder ON emails(folder)')
    cursor.execute('CREATE INDEX idx_from ON emails("from")')
    cursor.execute('CREATE INDEX idx_to ON emails("to")')
    cursor.execute('CREATE INDEX idx_subject ON emails(subject)')

    conn.commit()
    return conn

def collect_email_data(directory, include_html=True, include_attachments=True):
    """Process all mbox files and return a list of email data"""
    all_emails = []

    # Get list of mbox files
    mbox_files = [f for f in os.listdir(directory) if f.endswith('.mbox')]
    print(f"Found {len(mbox_files)} mbox files")

    # Process each file
    for filename in mbox_files:
        file_path = os.path.join(directory, filename)
        folder_name = os.path.basename(file_path).replace('.mbox', '')
        mbox = mailbox.mbox(file_path)

        # Count messages for progress bar
        total_messages = len(mbox)
        print(f"Processing {folder_name} ({total_messages} messages)")

        for message in tqdm(mbox, total=total_messages, desc=folder_name):
            email_data = extract_message_data(message, folder_name)

            # Optionally exclude HTML content to reduce data size
            if not include_html:
                email_data.pop('body_html', None)

            # Optionally simplify attachment info to reduce data size
            if not include_attachments:
                email_data['attachment_count'] = len(email_data.get('attachments', []))
                email_data.pop('attachments', None)

            all_emails.append(email_data)

    return all_emails

def save_to_json(emails, filepath, indent=None):
    """Save email data to a JSON file"""
    # Convert data for JSON serialization
    serializable_emails = []

    for email in emails:
        email_copy = email.copy()

        # Convert datetime to string
        if email_copy.get('date') and isinstance(email_copy['date'], datetime.datetime):
            email_copy['date'] = email_copy['date'].isoformat()

        serializable_emails.append(email_copy)

    with open(filepath, 'w', encoding='utf-8') as f:
        json.dump(serializable_emails, f, ensure_ascii=False, indent=indent)

    print(f"Saved {len(serializable_emails)} emails to {filepath}")

def save_to_csv(emails, filepath):
    """Save email data to a CSV file"""
    # Convert to DataFrame
    df = pd.DataFrame(emails)

    # Convert datetime to string
    if 'date' in df.columns:
        df['date'] = df['date'].apply(lambda x: x.isoformat() if isinstance(x, datetime.datetime) else x)

    # Convert attachments to string if they exist
    if 'attachments' in df.columns:
        df['attachments'] = df['attachments'].apply(lambda x: str(x) if x else "")

    # Save to CSV
    df.to_csv(filepath, index=False, encoding='utf-8')

    print(f"Saved {len(df)} emails to {filepath}")

def process_mbox_files(directory, output_format='sqlite', output_path=None, include_html=True, include_attachments=True):
    """
    Process mbox files and save to the specified format

    Parameters:
    - directory: Directory containing .mbox files
    - output_format: 'sqlite', 'json', 'csv', or 'all'
    - output_path: Output file path (default: emails.<format>)
    - include_html: Whether to include HTML content
    - include_attachments: Whether to include attachment details
    """
    if output_format not in ['sqlite', 'json', 'csv', 'all']:
        raise ValueError("output_format must be one of 'sqlite', 'json', 'csv', or 'all'")

    # Set default output paths if not provided
    if output_path is None:
        output_path = 'emails'

    # SQLite processing
    if output_format in ['sqlite', 'all']:
        sqlite_path = f"{output_path}.db" if not output_path.endswith('.db') else output_path
        conn = setup_database(sqlite_path)

        # Get list of mbox files
        mbox_files = [f for f in os.listdir(directory) if f.endswith('.mbox')]

        # Process each file
        for filename in mbox_files:
            file_path = os.path.join(directory, filename)
            process_mbox_to_sqlite(file_path, conn)

        conn.commit()
        conn.close()
        print(f"SQLite database saved to {sqlite_path}")

    # JSON and/or CSV processing
    if output_format in ['json', 'csv', 'all']:
        # Collect email data
        emails = collect_email_data(directory, include_html, include_attachments)

        # Save to JSON
        if output_format in ['json', 'all']:
            json_path = f"{output_path}.json" if not output_path.endswith('.json') else output_path
            save_to_json(emails, json_path, indent=2)

        # Save to CSV
        if output_format in ['csv', 'all']:
            csv_path = f"{output_path}.csv" if not output_path.endswith('.csv') else output_path
            save_to_csv(emails, csv_path)

def main():
    parser = argparse.ArgumentParser(description='Process mbox files into various formats')
    parser.add_argument('directory', help='Directory containing .mbox files')
    parser.add_argument('--format', choices=['sqlite', 'json', 'csv', 'all'], default='sqlite',
                       help='Output format (default: sqlite)')
    parser.add_argument('--output', help='Output file path (without extension)')
    parser.add_argument('--no-html', action='store_false', dest='include_html',
                       help='Exclude HTML content to reduce file size')
    parser.add_argument('--no-attachments', action='store_false', dest='include_attachments',
                       help='Exclude attachment details to reduce file size')

    args = parser.parse_args()

    process_mbox_files(
        args.directory,
        args.format,
        args.output,
        args.include_html,
        args.include_attachments
    )

# if __name__ == "__main__":
#     main()

In [None]:
# Set the directory where your .mbox files are located
mbox_directory = "data/processed/mailbox_cecile/"  # Update this path to your actual location

# Call the function to process the files and save as JSON
process_mbox_files(
    directory=mbox_directory,
    output_format='csv',  # Specify 'json' as the output format
    output_path='emails_export',  # Optional: specify the output filename (without extension)
    include_html=True,  # Include HTML content (set to False to reduce file size)
    include_attachments=True  # Include attachment details (set to False to reduce file size)
)

Found 17 mbox files
Processing AG (6 messages)


AG: 100%|██████████| 6/6 [00:00<00:00, 75.16it/s]


Processing Archive (10 messages)


Archive: 100%|██████████| 10/10 [00:00<00:00, 19.84it/s]


KeyboardInterrupt: 

In [None]:
# Import the module

# Process to SQLite (default)
# process_mbox_files("path/to/mbox/files/")

# Process to JSON
process_mbox_files(mbox_directory, output_format="json")

# # Process to CSV with a custom filename
# process_mbox_files("path/to/mbox/files/", output_format="csv", output_path="my_emails")

# # Generate all formats
# process_mbox_files("path/to/mbox/files/", output_format="all")

# # Exclude HTML content for smaller files
# process_mbox_files("path/to/mbox/files/", output_format="json", include_html=False)

Found 17 mbox files
Processing AG (6 messages)


AG: 100%|██████████| 6/6 [00:00<00:00, 32.72it/s]


Processing Archive (10 messages)


Archive: 100%|██████████| 10/10 [00:00<00:00, 27.68it/s]


Processing Archives calssifiees (423 messages)


Archives calssifiees: 100%|██████████| 423/423 [00:11<00:00, 37.41it/s]


Processing Ateliers (28 messages)


Ateliers: 100%|██████████| 28/28 [00:00<00:00, 109.11it/s]


KeyboardInterrupt: 

SyntaxError: from __future__ imports must occur at the beginning of the file (models.py, line 4)

In [None]:
## Attempt 5 with class

In [None]:


# def decode_str(s):
#     """Decode encoded email header strings"""
#     if s is None:
#         return ""
#     try:
#         decoded_parts = decode_header(s)
#         return ''.join([
#             part.decode(encoding or 'utf-8', errors='replace') if isinstance(part, bytes) else part
#             for part, encoding in decoded_parts
#         ])
#     except:
#         return str(s)

# def extract_clean_text_from_html(html_content):
#     """
#     Extract clean, readable text from HTML content.

#     Args:
#         html_content (str): HTML content to clean

#     Returns:
#         str: Clean text without HTML tags
#     """
#     if not html_content:
#         return ""

#     try:
#         # Remove scripts, styles, and other tags that contain content we don't want
#         html_content = re.sub(r'<(script|style|head).*?>.*?</\1>', ' ', html_content, flags=re.DOTALL)

#         # Replace common block elements with newlines to preserve structure
#         html_content = re.sub(r'</(p|div|h\d|tr|li)>', '\n', html_content)
#         html_content = re.sub(r'<br[^>]*>', '\n', html_content)

#         # Replace table cells with tab separation
#         html_content = re.sub(r'</td>', '\t', html_content)

#         # Remove all HTML tags
#         text = re.sub(r'<[^>]+>', ' ', html_content)

#         # Decode HTML entities (&nbsp;, &lt;, etc.)
#         text = html.unescape(text)

#         # Handle literal escape sequences that appear in the text
#         # Replace literal "\xad" with empty string (remove soft hyphens)
#         text = text.replace('\\xad', '')
#         # Replace literal "\xa0" with a space (non-breaking spaces)
#         text = text.replace('\\xa0', ' ')

#         # Handle actual Unicode characters too
#         # Remove soft hyphens (invisible hyphens used for word breaks)
#         text = text.replace('\xad', '')
#         # Replace non-breaking spaces with regular spaces
#         text = text.replace('\xa0', ' ')
#         # Remove other problematic control characters
#         text = re.sub(r'[\x00-\x08\x0b-\x0c\x0e-\x1f\x7f]', '', text)

#         # Clean up other escape sequences that might appear in text
#         text = text.replace('\\\\', '\\')  # Double backslash to single
#         text = text.replace("\\'", "'")    # Escaped single quote
#         text = text.replace('\\"', '"')    # Escaped double quote
#         text = text.replace('\\n', '\n')   # Literal \n to newline
#         text = text.replace('\\t', '\t')   # Literal \t to tab

#         # Remove remaining literal escape sequences like \x.. that weren't handled above
#         text = re.sub(r'\\x[0-9a-fA-F]{2}', '', text)

#         # Clean up whitespace (multiple spaces, tabs, newlines)
#         text = re.sub(r'[ \t]+', ' ', text)
#         text = re.sub(r'\n{3,}', '\n\n', text)

#         # Final cleanup to remove leading/trailing whitespace
#         return text.strip()
#     except Exception as e:
#         print(f"Error processing HTML: {e}")
#         return f"Error processing HTML content: {str(e)}"

# def get_email_body(message):
#     """Extract body text from email message, handling HTML correctly"""
#     body_text = ""
#     body_html = ""

#     if message.is_multipart():
#         for part in message.walk():
#             content_type = part.get_content_type()
#             content_disposition = str(part.get("Content-Disposition") or "")

#             # Skip attachments
#             if "attachment" in content_disposition:
#                 continue

#             try:
#                 payload = part.get_payload(decode=True)
#                 if payload is None:
#                     continue

#                 charset = part.get_content_charset() or 'utf-8'
#                 decoded_payload = payload.decode(charset, errors='replace')

#                 if content_type == "text/plain":
#                     body_text += decoded_payload
#                 elif content_type == "text/html":
#                     body_html += decoded_payload
#             except:
#                 continue
#     else:
#         # Not multipart - get payload directly
#         try:
#             content_type = message.get_content_type()
#             payload = message.get_payload(decode=True)
#             if payload:
#                 charset = message.get_content_charset() or 'utf-8'
#                 decoded_payload = payload.decode(charset, errors='replace')

#                 if content_type == "text/plain":
#                     body_text = decoded_payload
#                 elif content_type == "text/html":
#                     body_html = decoded_payload
#         except:
#             pass

#     # Prefer HTML content but fall back to plain text
#     if body_html:
#         return {
#             "html": body_html,
#             "text": extract_clean_text_from_html(body_html),
#             "has_html": True
#         }
#     else:
#         return {
#             "html": "",
#             "text": body_text,
#             "has_html": False
#         }

# def extract_attachments_info(message):
#     """Extract information about attachments"""
#     attachments = []

#     if message.is_multipart():
#         for part in message.walk():
#             content_disposition = str(part.get("Content-Disposition") or "")

#             if "attachment" in content_disposition:
#                 filename = part.get_filename()
#                 if filename:
#                     try:
#                         filename = decode_str(filename)
#                     except:
#                         pass

#                 content_type = part.get_content_type()
#                 content = part.get_payload(decode=True) or b''

#                 attachments.append({
#                     "filename": filename,
#                     "content_type": content_type,
#                     "size": len(content),
#                     "content": content
#                 })

#     return attachments

# def parse_email_address(address_str):
#     """Parse a string containing email addresses into a list of Entity objects"""
#     if not address_str:
#         return []

#     entities = []
#     # Simple regex to extract name and email from patterns like "Name <email@example.com>"
#     email_pattern = re.compile(r'(.*?)\s*<([^>]+)>|([^,\s]+@[^,\s]+)')

#     for addr in address_str.split(','):
#         addr = addr.strip()
#         if not addr:
#             continue

#         match = email_pattern.search(addr)
#         if match:
#             if match.group(2):  # Format: "Name <email@example.com>"
#                 name = match.group(1).strip()
#                 email_addr = match.group(2).strip()
#             else:  # Format: "email@example.com"
#                 name = email_addr = match.group(3).strip()

#             # Create Entity with EmailAddress
#             email_obj = EmailAddress(email=email_addr)
#             entity = Entity(
#                 name=name,
#                 email=email_obj,
#                 is_physical_person=True  # Assuming default
#             )
#             entities.append(entity)

#     return entities

# def extract_recipients(message):
#     """Extract all recipients (To, CC, BCC) as Entity objects"""
#     to_str = decode_str(message.get('to') or "")
#     cc_str = decode_str(message.get('cc') or "")
#     bcc_str = decode_str(message.get('bcc') or "")
#     reply_to_str = decode_str(message.get('reply-to') or "")

#     to_entities = parse_email_address(to_str)
#     cc_entities = parse_email_address(cc_str)
#     bcc_entities = parse_email_address(bcc_str)
#     reply_to_entity = parse_email_address(reply_to_str)[0] if parse_email_address(reply_to_str) else None

#     return {
#         "to": to_entities,
#         "cc": cc_entities,
#         "bcc": bcc_entities,
#         "reply_to": reply_to_entity
#     }

# list_email_match = re.search(r'([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)', list_unsubscribe)
#         if list_email_match:
#             list_email = list_email_match.group(1)

#         mailing_list = MailingList(
#             id=str(uuid.uuid4()),
#             name=list_name,
#             description=f"Mailing list extracted from {list_id}",
#             email_address=EmailAddress(email=list_email)
#         )

#     # Create a SenderEmail object
#     sender_email_id = str(uuid.uuid4())
#     sender_email = SenderEmail(
#         id=sender_email_id,
#         sender=sender_entity,
#         body=body_content["text"],
#         timestamp=timestamp
#     )

#     # Create a ReceiverEmail object
#     receiver_email = ReceiverEmail(
#         id=email_id,
#         sender_email=sender_email,
#         sender=sender_entity,
#         to=recipients["to"] if recipients["to"] else None,
#         reply_to=recipients["reply_to"],
#         cc=recipients["cc"] if recipients["cc"] else None,
#         bcc=recipients["bcc"] if recipients["bcc"] else None,
#         timestamp=timestamp,
#         subject=subject,
#         body=body_content["text"],
#         attachments=attachments if attachments else None,
#         is_deleted=False,
#         folder=folder_name,
#         is_spam=False,
#         mailing_list=mailing_list,
#         importance_score=0,  # Default value
#         mother_email=None,  # Will be linked later based on in_reply_to
#         children_emails=None
#     )

#     # Create a data dictionary for our normalized database tables
#     result = {
#         'id': email_id,
#         'sender_email_id': sender_email.id,
#         'sender_id': None,  # Will be filled in by the processing function
#         'reply_to_id': None,  # Will be filled in by the processing function
#         'timestamp': timestamp,
#         'subject': subject,
#         'body': body_content["text"],
#         'body_html': body_content["html"] if body_content["has_html"] else None,
#         'has_html': body_content["has_html"],
#         'is_deleted': False,
#         'folder': folder_name,
#         'is_spam': False,
#         'mailing_list_id': mailing_list.id if mailing_list else None,
#         'importance_score': 0,
#         'mother_email_id': None,  # Will be updated later based on in_reply_to
#         'message_id': message_id,
#         'references': references,
#         'in_reply_to': in_reply_to
#     }

#     # Add additional data for cross-reference during insertion
#     result['attachments_data'] = attachments_data

#     return result, receiver_email

# def setup_database(db_path):
#     """Set up the DuckDB database schema with proper types and indexes"""
#     # Connect to DuckDB database
#     conn = duckdb.connect(db_path)

#     # Create tables for each Pydantic model

#     # Organizations table
#     conn.execute("""
#     CREATE TABLE IF NOT EXISTS organizations (
#         id VARCHAR PRIMARY KEY,
#         name VARCHAR,
#         description VARCHAR,
#         email_address VARCHAR
#     )
#     """)

#     # Positions table
#     conn.execute("""
#     CREATE TABLE IF NOT EXISTS positions (
#         id VARCHAR PRIMARY KEY,
#         name VARCHAR,
#         start_date TIMESTAMP,
#         end_date TIMESTAMP,
#         description VARCHAR,
#         organization_id VARCHAR,
#         FOREIGN KEY (organization_id) REFERENCES organizations(id)
#     )
#     """)

#     # Entities table (for senders and recipients)
#     conn.execute("""
#     CREATE TABLE IF NOT EXISTS entities (
#         id VARCHAR PRIMARY KEY,
#         name VARCHAR,
#         email VARCHAR,
#         alias_names JSON,
#         is_physical_person BOOLEAN
#     )
#     """)

#     # Entity alias emails table
#     conn.execute("""
#     CREATE TABLE IF NOT EXISTS entity_alias_emails (
#         id VARCHAR PRIMARY KEY,
#         entity_id VARCHAR,
#         email VARCHAR,
#         FOREIGN KEY (entity_id) REFERENCES entities(id)
#     )
#     """)

#     # Entity positions table
#     conn.execute("""
#     CREATE TABLE IF NOT EXISTS entity_positions (
#         entity_id VARCHAR,
#         position_id VARCHAR,
#         PRIMARY KEY (entity_id, position_id),
#         FOREIGN KEY (entity_id) REFERENCES entities(id),
#         FOREIGN KEY (position_id) REFERENCES positions(id)
#     )
#     """)

#     # Mailing lists table
#     conn.execute("""
#     CREATE TABLE IF NOT EXISTS mailing_lists (
#         id VARCHAR PRIMARY KEY,
#         name VARCHAR,
#         description VARCHAR,
#         email_address VARCHAR
#     )
#     """)

#     # Sender emails table
#     conn.execute("""
#     CREATE TABLE IF NOT EXISTS sender_emails (
#         id VARCHAR PRIMARY KEY,
#         sender_id VARCHAR,
#         body TEXT,
#         timestamp TIMESTAMP,
#         FOREIGN KEY (sender_id) REFERENCES entities(id)
#     )
#     """)

#     # Receiver emails table
#     conn.execute("""
#     CREATE TABLE IF NOT EXISTS receiver_emails (
#         id VARCHAR PRIMARY KEY,
#         sender_email_id VARCHAR,
#         sender_id VARCHAR,
#         reply_to_id VARCHAR,
#         timestamp TIMESTAMP,
#         subject VARCHAR,
#         body TEXT,
#         body_html TEXT,
#         has_html BOOLEAN,
#         is_deleted BOOLEAN DEFAULT FALSE,
#         folder VARCHAR DEFAULT 'inbox',
#         is_spam BOOLEAN DEFAULT FALSE,
#         mailing_list_id VARCHAR,
#         importance_score INTEGER DEFAULT 0,
#         mother_email_id VARCHAR,
#         message_id VARCHAR,
#         references TEXT,
#         FOREIGN KEY (sender_email_id) REFERENCES sender_emails(id),
#         FOREIGN KEY (sender_id) REFERENCES entities(id),
#         FOREIGN KEY (reply_to_id) REFERENCES entities(id),
#         FOREIGN KEY (mailing_list_id) REFERENCES mailing_lists(id),
#         FOREIGN KEY (mother_email_id) REFERENCES receiver_emails(id)
#     )
#     """)

#     # Email recipients tables (to, cc, bcc)
#     conn.execute("""
#     CREATE TABLE IF NOT EXISTS email_recipients_to (
#         email_id VARCHAR,
#         entity_id VARCHAR,
#         PRIMARY KEY (email_id, entity_id),
#         FOREIGN KEY (email_id) REFERENCES receiver_emails(id),
#         FOREIGN KEY (entity_id) REFERENCES entities(id)
#     )
#     """)

#     conn.execute("""
#     CREATE TABLE IF NOT EXISTS email_recipients_cc (
#         email_id VARCHAR,
#         entity_id VARCHAR,
#         PRIMARY KEY (email_id, entity_id),
#         FOREIGN KEY (email_id) REFERENCES receiver_emails(id),
#         FOREIGN KEY (entity_id) REFERENCES entities(id)
#     )
#     """)

#     conn.execute("""
#     CREATE TABLE IF NOT EXISTS email_recipients_bcc (
#         email_id VARCHAR,
#         entity_id VARCHAR,
#         PRIMARY KEY (email_id, entity_id),
#         FOREIGN KEY (email_id) REFERENCES receiver_emails(id),
#         FOREIGN KEY (entity_id) REFERENCES entities(id)
#     )
#     """)

#     # Attachments table
#     conn.execute("""
#     CREATE TABLE IF NOT EXISTS attachments (
#         id VARCHAR PRIMARY KEY,
#         email_id VARCHAR,
#         filename VARCHAR,
#         content BLOB,
#         content_type VARCHAR,
#         size INTEGER,
#         FOREIGN KEY (email_id) REFERENCES receiver_emails(id)
#     )
#     """)

#     # Create child email relationships table
#     conn.execute("""
#     CREATE TABLE IF NOT EXISTS email_children (
#         parent_id VARCHAR,
#         child_id VARCHAR,
#         PRIMARY KEY (parent_id, child_id),
#         FOREIGN KEY (parent_id) REFERENCES receiver_emails(id),
#         FOREIGN KEY (child_id) REFERENCES receiver_emails(id)
#     )
#     """)

#     # Create indexes
#     conn.execute('CREATE INDEX IF NOT EXISTS idx_receiver_emails_timestamp ON receiver_emails(timestamp)')
#     conn.execute('CREATE INDEX IF NOT EXISTS idx_receiver_emails_folder ON receiver_emails(folder)')
#     conn.execute('CREATE INDEX IF NOT EXISTS idx_receiver_emails_subject ON receiver_emails(subject)')
#     conn.execute('CREATE INDEX IF NOT EXISTS idx_receiver_emails_message_id ON receiver_emails(message_id)')
#     conn.execute('CREATE INDEX IF NOT EXISTS idx_entities_email ON entities(email)')
#     conn.execute('CREATE INDEX IF NOT EXISTS idx_attachments_email_id ON attachments(email_id)')

#     return conn

# def process_mbox_to_duckdb(mbox_path, conn, batch_size=100, entity_cache=None):
#     """Process mbox file directly to DuckDB in batches with progress bar"""
#     if entity_cache is None:
#         entity_cache = {}  # Cache to store entities we've already seen

#     mbox = mailbox.mbox(mbox_path)
#     folder_name = os.path.basename(mbox_path).replace('.mbox', '')

#     # Count messages for progress bar
#     total_messages = len(mbox)
#     print(f"Processing {folder_name} ({total_messages} messages)")

#     # Process in batches for each table
#     entity_batch = []
#     entity_alias_emails_batch = []
#     mailing_list_batch = []
#     sender_email_batch = []
#     receiver_email_batch = []
#     to_recipients_batch = []
#     cc_recipients_batch = []
#     bcc_recipients_batch = []
#     attachments_batch = []

#     for i, message in enumerate(tqdm(mbox, total=total_messages, desc=folder_name)):
#         email_data, receiver_email = extract_message_data(message, folder_name)

#         # Process sender entity
#         sender = receiver_email.sender
#         if sender.email.email not in entity_cache:
#             entity_id = str(uuid.uuid4())
#             entity_cache[sender.email.email] = entity_id

#             # Add to entities batch
#             entity_batch.append({
#                 'id': entity_id,
#                 'name': sender.name,
#                 'email': sender.email.email,
#                 'alias_names': json.dumps(sender.alias_names) if sender.alias_names else None,
#                 'is_physical_person': sender.is_physical_person
#             })

#             # Process alias emails if any
#             if sender.alias_emails:
#                 for alias_email in sender.alias_emails:
#                     entity_alias_emails_batch.append({
#                         'id': str(uuid.uuid4()),
#                         'entity_id': entity_id,
#                         'email': alias_email.email
#                     })
#         else:
#             # Get cached entity ID
#             entity_id = entity_cache[sender.email.email]

#         # Process sender email
#         sender_email = receiver_email.sender_email
#         sender_email_batch.append({
#             'id': sender_email.id,
#             'sender_id': entity_id,  # Use cached or new entity ID
#             'body': sender_email.body,
#             'timestamp': sender_email.timestamp
#         })

#         # Process receiver email
#         reply_to_id = None
#         if receiver_email.reply_to:
#             reply_to_email = receiver_email.reply_to.email.email
#             if reply_to_email not in entity_cache:
#                 reply_to_id = str(uuid.uuid4())
#                 entity_cache[reply_to_email] = reply_to_id

#                 entity_batch.append({
#                     'id': reply_to_id,
#                     'name': receiver_email.reply_to.name,
#                     'email': reply_to_email,
#                     'alias_names': None,
#                     'is_physical_person': True
#                 })
#             else:
#                 reply_to_id = entity_cache[reply_to_email]

#         # Add mailing list if present
#         mailing_list_id = None
#         if receiver_email.mailing_list:
#             mailing_list_id = receiver_email.mailing_list.id
#             mailing_list_batch.append({
#                 'id': mailing_list_id,
#                 'name': receiver_email.mailing_list.name,
#                 'description': receiver_email.mailing_list.description,
#                 'email_address': receiver_email.mailing_list.email_address.email
#             })

#         # Add receiver email
#         receiver_email_batch.append({
#             'id': receiver_email.id,
#             'sender_email_id': sender_email.id,
#             'sender_id': entity_id,
#             'reply_to_id': reply_to_id,
#             'timestamp': receiver_email.timestamp,
#             'subject': receiver_email.subject,
#             'body': receiver_email.body,
#             'body_html': email_data['body_html'] if 'body_html' in email_data else None,
#             'has_html': email_data['has_html'] if 'has_html' in email_data else False,
#             'is_deleted': receiver_email.is_deleted,
#             'folder': receiver_email.folder,
#             'is_spam': receiver_email.is_spam,
#             'mailing_list_id': mailing_list_id,
#             'importance_score': receiver_email.importance_score,
#             'mother_email_id': None,  # Will be updated later
#             'message_id': email_data['message_id'],
#             'references': email_data['references'] if 'references' in email_data else None
#         })

#         # Process recipients (to, cc, bcc)
#         if receiver_email.to:
#             for entity in receiver_email.to:
#                 if entity.email.email not in entity_cache:
#                     to_entity_id = str(uuid.uuid4())
#                     entity_cache[entity.email.email] = to_entity_id

#                     entity_batch.append({
#                         'id': to_entity_id,
#                         'name': entity.name,
#                         'email': entity.email.email,
#                         'alias_names': json.dumps(entity.alias_names) if entity.alias_names else None,
#                         'is_physical_person': entity.is_physical_person
#                     })

#                     # Process alias emails
#                     if entity.alias_emails:
#                         for alias_email in entity.alias_emails:
#                             entity_alias_emails_batch.append({
#                                 'id': str(uuid.uuid4()),
#                                 'entity_id': to_entity_id,
#                                 'email': alias_email.email
#                             })
#                 else:
#                     to_entity_id = entity_cache[entity.email.email]

#                 # Add to recipients relationship
#                 to_recipients_batch.append({
#                     'email_id': receiver_email.id,
#                     'entity_id': to_entity_id
#                 })

#         # Process CC recipients
#         if receiver_email.cc:
#             for entity in receiver_email.cc:
#                 if entity.email.email not in entity_cache:
#                     cc_entity_id = str(uuid.uuid4())
#                     entity_cache[entity.email.email] = cc_entity_id

#                     entity_batch.append({
#                         'id': cc_entity_id,
#                         'name': entity.name,
#                         'email': entity.email.email,
#                         'alias_names': json.dumps(entity.alias_names) if entity.alias_names else None,
#                         'is_physical_person': entity.is_physical_person
#                     })
#                 else:
#                     cc_entity_id = entity_cache[entity.email.email]

#                 # Add cc recipients relationship
#                 cc_recipients_batch.append({
#                     'email_id': receiver_email.id,
#                     'entity_id': cc_entity_id
#                 })

#         # Process BCC recipients
#         if receiver_email.bcc:
#             for entity in receiver_email.bcc:
#                 if entity.email.email not in entity_cache:
#                     bcc_entity_id = str(uuid.uuid4())
#                     entity_cache[entity.email.email] = bcc_entity_id

#                     entity_batch.append({
#                         'id': bcc_entity_id,
#                         'name': entity.name,
#                         'email': entity.email.email,
#                         'alias_names': json.dumps(entity.alias_names) if entity.alias_names else None,
#                         'is_physical_person': entity.is_physical_person
#                     })
#                 else:
#                     bcc_entity_id = entity_cache[entity.email.email]

#                 # Add bcc recipients relationship
#                 bcc_recipients_batch.append({
#                     'email_id': receiver_email.id,
#                     'entity_id': bcc_entity_id
#                 })

#         # Process attachments
#         if receiver_email.attachments:
#             for attachment in receiver_email.attachments:
#                 attachments_batch.append({
#                     'id': str(uuid.uuid4()),
#                     'email_id': receiver_email.id,
#                     'filename': attachment.filename,
#                     'content': attachment.content,
#                     'content_type': email_data.get('attachments_json', {}).get('content_type', 'application/octet-stream'),
#                     'size': len(attachment.content) if attachment.content else 0
#                 })

#         # Process batch when it reaches the batch size or on the last message
#         if len(receiver_email_batch) >= batch_size or i == total_messages - 1:
#             # Insert entities
#             if entity_batch:
#                 entities_df = pd.DataFrame(entity_batch)
#                 conn.execute("""
#                 INSERT OR IGNORE INTO entities
#                 SELECT * FROM entities_df
#                 """)
#                 entity_batch = []

#             # Insert entity alias emails
#             if entity_alias_emails_batch:
#                 alias_emails_df = pd.DataFrame(entity_alias_emails_batch)
#                 conn.execute("""
#                 INSERT OR IGNORE INTO entity_alias_emails
#                 SELECT * FROM alias_emails_df
#                 """)
#                 entity_alias_emails_batch = []

#             # Insert mailing lists
#             if mailing_list_batch:
#                 mailing_lists_df = pd.DataFrame(mailing_list_batch)
#                 conn.execute("""
#                 INSERT OR IGNORE INTO mailing_lists
#                 SELECT * FROM mailing_lists_df
#                 """)
#                 mailing_list_batch = []

#             # Insert sender emails
#             if sender_email_batch:
#                 sender_emails_df = pd.DataFrame(sender_email_batch)
#                 conn.execute("""
#                 INSERT OR IGNORE INTO sender_emails
#                 SELECT * FROM sender_emails_df
#                 """)
#                 sender_email_batch = []

#             # Insert receiver emails
#             if receiver_email_batch:
#                 receiver_emails_df = pd.DataFrame(receiver_email_batch)
#                 conn.execute("""
#                 INSERT OR IGNORE INTO receiver_emails
#                 SELECT * FROM receiver_emails_df
#                 """)
#                 receiver_email_batch = []

#             # Insert recipient relationships
#             if to_recipients_batch:
#                 to_recipients_df = pd.DataFrame(to_recipients_batch)
#                 conn.execute("""
#                 INSERT OR IGNORE INTO email_recipients_to
#                 SELECT * FROM to_recipients_df
#                 """)
#                 to_recipients_batch = []

#             if cc_recipients_batch:
#                 cc_recipients_df = pd.DataFrame(cc_recipients_batch)
#                 conn.execute("""
#                 INSERT OR IGNORE INTO email_recipients_cc
#                 SELECT * FROM cc_recipients_df
#                 """)
#                 cc_recipients_batch = []

#             if bcc_recipients_batch:
#                 bcc_recipients_df = pd.DataFrame(bcc_recipients_batch)
#                 conn.execute("""
#                 INSERT OR IGNORE INTO email_recipients_bcc
#                 SELECT * FROM bcc_recipients_df
#                 """)
#                 bcc_recipients_batch = []

#             # Insert attachments
#             if attachments_batch:
#                 attachments_df = pd.DataFrame(attachments_batch)
#                 conn.execute("""
#                 INSERT OR IGNORE INTO attachments
#                 SELECT * FROM attachments_df
#                 """)
#                 attachments_batch = []

#             # Commit to save progress
#             conn.commit()

#     return entity_cache

# def process_mbox_files(directory, output_path=None):
#     """
#     Process mbox files and save to DuckDB format with normalized tables

#     Parameters:
#     - directory: Directory containing .mbox files
#     - output_path: Output file path (default: emails.duckdb)
#     """
#     # Set default output path if not provided
#     if output_path is None:
#         output_path = 'emails.duckdb'
#     elif not output_path.endswith('.duckdb'):
#         output_path = f"{output_path}.duckdb"

#     # Setup database
#     conn = setup_database(output_path)

#     # Get list of mbox files
#     mbox_files = [f for f in os.listdir(directory) if f.endswith('.mbox')]
#     print(f"Found {len(mbox_files)} mbox files")

#     # Entity cache to avoid duplicates across files
#     entity_cache = {}

#     # Process each file
#     for filename in mbox_files:
#         file_path = os.path.join(directory, filename)
#         entity_cache = process_mbox_to_duckdb(file_path, conn, entity_cache=entity_cache)

#     # Create relationships between emails (mother/child relationships)
#     print("Creating email thread relationships...")
#     conn.execute("""
#     UPDATE receiver_emails
#     SET mother_email_id = (
#         SELECT r2.id
#         FROM receiver_emails r2
#         WHERE r2.message_id = receiver_emails.in_reply_to
#         LIMIT 1
#     )
#     WHERE in_reply_to IS NOT NULL
#     """)

#     # Populate the children relationships table
#     print("Populating child email relationships...")
#     conn.execute("""
#     INSERT INTO email_children (parent_id, child_id)
#     SELECT mother_email_id, id
#     FROM receiver_emails
#     WHERE mother_email_id IS NOT NULL
#     """)

#     # Final optimization and cleanup
#     print("Optimizing database...")
#     conn.execute("PRAGMA optimize")
#     conn.close()

#     print(f"DuckDB database saved to {output_path}")
#     print("""
# Database structure:
# - entities: Stores all senders and recipients
# - entity_alias_emails: Stores alias emails for entities
# - sender_emails: Stores email data from senders
# - receiver_emails: Stores received email data
# - email_recipients_to/cc/bcc: Links emails to recipient entities
# - attachments: Stores email attachments
# - email_children: Stores parent-child relationships between emails
# - mailing_lists: Stores mailing list information
# - organizations: Stores organization information
# - positions: Stores position information
# - entity_positions: Links entities to positions
# """)

# def main():
#     parser = argparse.ArgumentParser(description='Process mbox files into DuckDB with Pydantic models')
#     parser.add_argument('directory', help='Directory containing .mbox files')
#     parser.add_argument('--output', help='Output file path (without extension)')

#     args = parser.parse_args()

#     process_mbox_files(
#         args.directory,
#         args.output
#     )

# if __name__ == "__main__":
#     main()

## Eml Attempt

In [None]:
# from aspose.email.storage.mbox import MboxStorageReader, MboxLoadOptions

# mbox_load_options = MboxLoadOptions()
# mbox_load_options.leave_open = False
# mbox_load_options.preferred_text_encoding = 'utf-8'

# with MboxStorageReader.create_reader("data/processed/mailbox_cecile/AG.mbox", mbox_load_options) as mbox_reader:
#     for eml in mbox_reader.enumerate_messages():
#         eml.save(f"{eml.subject}.eml")

In [None]:
# import os
# import glob
# from aspose.email.storage.mbox import MboxStorageReader, MboxLoadOptions
# from pathlib import Path

# # Define the input and output directories
# input_dir = "data/processed/mailbox_cecile"
# output_dir = "data/processed/mailbox_cecile_mbox_to_eml"

# # Create the output directory if it doesn't exist
# os.makedirs(output_dir, exist_ok=True)

# # Configure MBox loading options
# mbox_load_options = MboxLoadOptions()
# mbox_load_options.leave_open = False
# mbox_load_options.preferred_text_encoding = 'utf-8'

# # Counter for saved files
# total_saved_count = 0

# # Get all .mbox files in the input directory
# mbox_files = glob.glob(os.path.join(input_dir, "*.mbox"))

# print(f"Found {len(mbox_files)} .mbox files to process")

# # Process each mbox file
# for mbox_file in mbox_files:
#     mbox_name = os.path.basename(mbox_file).replace('.mbox', '')
#     file_saved_count = 0

#     print(f"Processing {mbox_name}.mbox...")

#     with MboxStorageReader.create_reader(mbox_file, mbox_load_options) as mbox_reader:
#         for eml in mbox_reader.enumerate_messages():
#             # Create a safe filename from the subject
#             # If subject is empty, use a placeholder
#             subject = eml.subject if eml.subject else "no_subject"
#             safe_subject = "".join(c if c.isalnum() or c in [' ', '.', '_', '-'] else '_' for c in subject)

#             # Add mbox name and index to ensure unique filenames
#             filename = f"{mbox_name}_{file_saved_count+1:04d}_{safe_subject}.eml"
#             output_path = os.path.join(output_dir, filename)

#             # Save the email
#             eml.save(output_path)
#             file_saved_count += 1
#             total_saved_count += 1

#     print(f"  - Saved {file_saved_count} emails from {mbox_name}.mbox")

# # Verify the count matches the number of files in the directory
# actual_file_count = len([f for f in os.listdir(output_dir) if f.endswith('.eml')])

# # Print summary
# print(f"\nProcessing complete!")
# print(f"Total emails saved: {total_saved_count}")
# print(f"Number of .eml files in directory: {actual_file_count}")

Successfully saved 6 email files to data/processed/mailbox_cecile_mbox_to_eml
Number of .eml files in directory: 6


In [None]:
import os
import glob
import re
from aspose.email.storage.mbox import MboxStorageReader, MboxLoadOptions
from pathlib import Path

# Define the input and output directories
input_dir = "data/processed/mailbox_cecile"
output_dir = "data/processed/mailbox_cecile_mbox_to_eml"

# Create the output directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

# Configure MBox loading options
mbox_load_options = MboxLoadOptions()
mbox_load_options.leave_open = False
mbox_load_options.preferred_text_encoding = 'utf-8'

# Counter for saved files
total_saved_count = 0

# Get all .mbox files in the input directory
mbox_files = glob.glob(os.path.join(input_dir, "*.mbox"))

print(f"Found {len(mbox_files)} .mbox files to process")

# Function to clean folder/file names for database compatibility
def clean_name(name):
    # Replace spaces and special characters with underscores
    # Keep only alphanumeric characters, underscores, and dots
    cleaned = re.sub(r'[^\w\.]', '_', name)
    # Convert to lowercase for consistency
    return cleaned.lower()

# Process each mbox file
for mbox_file in mbox_files:
    mbox_name = os.path.basename(mbox_file).replace('.mbox', '')
    # cleaned_mbox_name = clean_name(mbox_name)
    file_saved_count = 0

    print(f"Processing {mbox_name}.mbox...")

    with MboxStorageReader.create_reader(mbox_file, mbox_load_options) as mbox_reader:
        for eml in mbox_reader.enumerate_messages():
            # Create a safe filename from the subject
            # If subject is empty, use a placeholder
            subject = eml.subject if eml.subject else "no_subject"
            safe_subject = clean_name(subject)

            # Format: 000000001_cleaned_folder_name_cleaned_subject.eml
            filename = f"{file_saved_count+1:10d}_{mbox_name}_{safe_subject}.eml"
            output_path = os.path.join(output_dir, filename)

            # Save the email
            eml.save(output_path)
            file_saved_count += 1
            total_saved_count += 1

    print(f"  - Saved {file_saved_count} emails from {mbox_name}.mbox")

# Verify the count matches the number of files in the directory
actual_file_count = len([f for f in os.listdir(output_dir) if f.endswith('.eml')])

# Print summary
print(f"\nProcessing complete!")
print(f"Total emails saved: {total_saved_count}")
print(f"Number of .eml files in directory: {actual_file_count}")

Found 18 .mbox files to process
Processing AG.mbox...
  - Saved 6 emails from AG.mbox
Processing AG.mbox...
  - Saved 0 emails from AG.mbox
Processing Archive.mbox...
  - Saved 10 emails from Archive.mbox
Processing Archives calssifiees.mbox...
  - Saved 50 emails from Archives calssifiees.mbox
Processing Ateliers.mbox...
  - Saved 28 emails from Ateliers.mbox
Processing Boîte de réception.mbox...
  - Saved 50 emails from Boîte de réception.mbox
Processing Brouillons.mbox...
  - Saved 41 emails from Brouillons.mbox
Processing Conflit.mbox...
  - Saved 6 emails from Conflit.mbox
Processing Courrier indésirable.mbox...
  - Saved 45 emails from Courrier indésirable.mbox
Processing Formation à distance.mbox...
  - Saved 2 emails from Formation à distance.mbox
Processing Gazette.mbox...
  - Saved 10 emails from Gazette.mbox
Processing gestioncrise.mbox...
  - Saved 50 emails from gestioncrise.mbox
Processing Idees.mbox...
  - Saved 18 emails from Idees.mbox
Processing Instances.mbox...
  - 