### Dependencies and logging

In [None]:
import logging
from pathlib import Path

import humanfriendly
import ipywidgets as widgets
import pandas as pd
from IPython.display import display
from tqdm import tqdm

from helpers.logging import OutputWidgetHandler
from libratom.lib.pff import PffArchive

In [None]:
# Handler that collects log records for display inside the notebook,
# formatted with the standard library's basic layout
handler = OutputWidgetHandler()
handler.setFormatter(logging.Formatter(logging.BASIC_FORMAT))

# Notebook-level logger: INFO and above, routed to the handler above
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
logger.addHandler(handler)

### Location of PST files

In [None]:
# Root directory holding the PST files; edit to match your local copy
CACHED_ENRON_DATA_DIR = Path("RevisedEDRMv1_Complete")

# Collect every PST file below the root up front, in sorted order, so the
# total count is known before processing starts
files = list(CACHED_ENRON_DATA_DIR.glob('**/*.pst'))
files.sort()

### Report widget

In [None]:
# Output widget that holds the report table; it is displayed by the
# processing cell and its contents are redrawn by `update`
output = widgets.Output()

In [None]:
def update(out, data):
    """Redraw the report table inside the output widget.

    Args:
        out: Output widget to draw into; its previous content is replaced.
        data: Mapping of column name to running total. Must contain a
            'Size' entry, which is shown in human-readable form.
    """

    # wait=True defers the clearing until new content is ready to be shown
    out.clear_output(wait=True)

    # One single-row column per report entry
    columns = {}
    for name, total in data.items():
        columns[name] = [total]

    # Replace the raw size with its human-readable form
    columns['Size'] = [humanfriendly.format_size(data['Size'])]

    with out:
        table = pd.DataFrame(columns, index=['Total'])
        display(table)

### Iterate over the PST files and do work

In [None]:
# Clear the log handler before starting a new run
handler.clear_logs()

# Overall report; 'Errors' counts every failure logged below
report = {'Files': 0, 'Messages': 0, 'Attachments': 0, 'Size': 0, 'Errors': 0}

# Start displaying results. Draw the zeroed totals right away so the widget
# is never blank or stale, even when no PST file was found.
display(output)
update(output, report)


# Iterate over files
with tqdm(total=len(files), desc="Files read", unit="files", leave=True) as file_bar:
    for pst_file in files:
        try:
            # Iterate over messages
            with PffArchive(pst_file) as archive:
                for message in archive.messages():
                    try:
                        # Do something with the message...
                        _ = archive.format_message(message)

                        # Update report
                        report['Messages'] += 1
                        report['Attachments'] += message.number_of_attachments

                        # Refresh report widget every 100 messages
                        if not report['Messages'] % 100:
                            update(output, report)

                    except Exception:
                        # Log error (with traceback and the file it came from)
                        # and move on to the next message
                        report['Errors'] += 1
                        logger.exception("Error while processing a message in %s", pst_file)

        except Exception:
            # Log error (with traceback and the file it came from)
            # and move on to the next file
            report['Errors'] += 1
            logger.exception("Error while reading %s", pst_file)

        # Update progress bar
        file_bar.update()

        # Update report
        report['Files'] += 1

        try:
            report['Size'] += pst_file.stat().st_size
        except OSError:
            # A file that vanished or became unreadable must not abort the
            # whole run; record the failure and keep going
            report['Errors'] += 1
            logger.exception("Could not determine the size of %s", pst_file)

        # Refresh report widget
        update(output, report)

In [None]:
# Show what the log handler captured during the run (the errors logged by
# the processing loop), if anything
handler.show_logs()