In [None]:
import sys
import logging
from pathlib import Path

import ipywidgets as widgets
from ipywidgets import Layout, Box
from IPython.display import display

import humanfriendly
import pandas as pd

from libratom.utils.pff import PffArchive

### Log settings

In [None]:
# https://ipywidgets.readthedocs.io/en/stable/examples/Output%20Widget.html#Integrating-output-widgets-with-the-logging-module
class OutputWidgetHandler(logging.Handler):
    """Logging handler that renders log records in an ipywidgets Output widget.

    Each record is prepended to the widget's outputs, so the most recent
    entry is shown first.
    """

    def __init__(self, *args, **kwargs):
        """Create the handler and the output widget it writes to.

        Positional and keyword arguments are forwarded unchanged to
        ``logging.Handler``.
        """
        super().__init__(*args, **kwargs)
        layout = {
            'display': 'flex',
            'border': '1px solid lightgray',
        }
        self.out = widgets.Output(layout=layout)

    def emit(self, record):
        """Format ``record`` and prepend it to the widget as a stdout stream entry.

        Overrides ``logging.Handler.emit``. Any failure while formatting or
        storing the record is passed to ``handleError`` instead of being
        raised into the code that issued the log call.
        """
        try:
            new_output = {
                'name': 'stdout',
                'output_type': 'stream',
                'text': f'{self.format(record)}\n'
            }
            # Prepend so that the newest record ends up first
            self.out.outputs = (new_output, ) + self.out.outputs
        except Exception:
            # Standard handler convention: a broken record must not crash the caller
            self.handleError(record)

    def show_logs(self):
        """Display the output widget holding the logs."""
        display(self.out)

    def clear_logs(self):
        """Clear the output currently held by the widget."""
        self.out.clear_output()

# Notebook-level logger whose records are rendered in an output widget
logger = logging.getLogger(__name__)

# `logging.getLogger` returns the same logger object on every call, so re-running
# this cell would otherwise stack one more widget handler on it each time.
# Detach the ones left by earlier runs. They are matched by class name because
# re-running the cell also re-creates the class, which defeats `isinstance`.
for stale_handler in list(logger.handlers):
    if type(stale_handler).__name__ == 'OutputWidgetHandler':
        logger.removeHandler(stale_handler)

handler = OutputWidgetHandler()
handler.setFormatter(logging.Formatter(logging.BASIC_FORMAT))
logger.addHandler(handler)
logger.setLevel(logging.INFO)

### Location of PST files

In [None]:
# Root directory that is searched recursively for .pst files.
# Edit as appropriate for your environment.
CACHED_ENRON_DATA_DIR = Path("/tmp/libratom/test_data/RevisedEDRMv1_Complete")

### Session variables

In [None]:
# Collect every PST file up front so the total count is known before processing
files = [*CACHED_ENRON_DATA_DIR.glob('**/*.pst')]

# Running totals for the overall report, all starting at zero
report = dict.fromkeys(('Files', 'Messages', 'Attachments', 'Size', 'Errors'), 0)

# Flex column layout used by the box holding the report and the progress bar
report_box_layout = Layout(
    border='1px solid lightblue',
    width='50%',
    display='flex',
    flex_flow='column',
    align_items='center',
    justify_content='center',
)

### Utility functions

In [None]:
def update(out, data):
    """Redraw the report table inside the given output widget.

    ``out`` is the output widget to draw into and ``data`` is the report
    mapping. Every entry becomes one column of a single-row table labelled
    'Total'; the 'Size' entry is shown in human-readable form.
    """

    # With wait=True the old table stays visible until the new one is displayed
    out.clear_output(wait=True)

    row = dict(data)
    row['Size'] = humanfriendly.format_size(data['Size'])
    table = pd.DataFrame(
        {column: [cell] for column, cell in row.items()},
        index=['Total'],
    )

    with out:
        display(table)

### Initialize progress and report widgets

In [None]:
# Progress bar: its maximum is the number of PST files found
progress = widgets.IntProgress(
    description='Completed:',
    orientation='horizontal',
    bar_style='',
    value=0,
    min=0,
    max=len(files),
    step=1,
)

# Output widget the report table is rendered into
output = widgets.Output()

### Iterate over the PST files and do work

In [None]:
# Drop log output left over from a previous run of this cell
handler.clear_logs()

# Start every run from zero. `report` and `progress` are created in earlier
# cells and mutated below, so without this reset re-running only this cell
# would pile a second pass on top of the previous totals.
for key in report:
    report[key] = 0
progress.value = 0

# Start displaying results
display(Box(children=[output, progress], layout=report_box_layout))

# Render the zeroed report right away so the widget never shows totals from
# a previous run, and is not left blank when there are no files to process
update(output, report)

# Iterate over files
for pst_file in files:
    try:
        # Iterate over messages
        with PffArchive(pst_file) as archive:
            for message in archive.messages():
                try:
                    # Do something with the message...
                    _ = archive.format_message(message)

                    # Update report
                    report['Messages'] += 1
                    report['Attachments'] += message.number_of_attachments

                    # Refresh report widget every 100 messages
                    if not report['Messages'] % 100:
                        update(output, report)

                except Exception as exc:
                    # Log error and move on to the next message
                    report['Errors'] += 1
                    logger.exception(exc)

    except Exception as exc:
        # Log error and move on to the next file
        report['Errors'] += 1
        logger.exception(exc)

    # Update progress bar
    progress.value += 1

    # Update report (the file is counted even when processing it failed)
    report['Files'] += 1
    report['Size'] += pst_file.stat().st_size

    # Refresh report widget
    update(output, report)

In [None]:
# Display the log widget; it holds the records captured by `handler`,
# including any errors logged while processing the PST files
handler.show_logs()