In [None]:
import sys
import logging
from pathlib import Path
from collections import defaultdict

import ipywidgets as widgets
from ipywidgets import Layout, Box
from IPython.display import display

import humanfriendly
import pandas as pd

from libratom.utils.pff import PffArchive

### Set up spaCy

In [None]:
import spacy
# Load the "en_core_web_sm" pipeline once; `nlp` is called on each formatted
# message further down to obtain its entities (`doc.ents`)
nlp = spacy.load("en_core_web_sm")

### Log settings

In [None]:
# https://ipywidgets.readthedocs.io/en/stable/examples/Output%20Widget.html#Integrating-output-widgets-with-the-logging-module
class OutputWidgetHandler(logging.Handler):
    """Logging handler that collects formatted records in an Output widget.

    The widget is stored on ``self.out``; every emitted record is added to it
    as a stdout stream entry, newest entry first.
    """

    def __init__(self, *args, **kwargs):
        """Initialize the base handler and create the backing Output widget."""
        super().__init__(*args, **kwargs)
        self.out = widgets.Output(
            layout={'display': 'flex', 'border': '1px solid lightgray'}
        )

    def emit(self, record):
        """Format ``record`` and put it at the front of the widget's outputs."""
        entry = dict(
            name='stdout',
            output_type='stream',
            text=f'{self.format(record)}\n',
        )
        # Prepending keeps the most recent record at the top of the widget
        self.out.outputs = (entry, ) + self.out.outputs

    def show_logs(self):
        """Display the widget holding the collected log entries."""
        display(self.out)

    def clear_logs(self):
        """Discard everything currently held by the widget."""
        self.out.clear_output()

# Handler that keeps the log records in an output widget, using the
# standard library's basic record format
handler = OutputWidgetHandler()
handler.setFormatter(logging.Formatter(fmt=logging.BASIC_FORMAT))

# Notebook logger: INFO and above, routed to the widget handler
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
logger.addHandler(handler)

### Location of PST files

In [None]:
# Directory that is searched recursively for .pst files in the next section.
# Edit as appropriate
CACHED_ENRON_DATA_DIR = Path("/tmp/libratom/test_data/RevisedEDRMv1_Complete/andy_zipper")

### Session variables

In [None]:
# Materialize the recursive glob into a list so the number of PST files is
# known up front (it is used as the progress bar maximum)
FILES = [*CACHED_ENRON_DATA_DIR.glob('**/*.pst')]

### Rendering

In [None]:
# Layouts

# Properties shared by both boxes
_half_width_bordered = dict(
    width='50%',
    margin='0px 0px 4px 0px',
    border='1px solid lightblue',
)

# Box around the report and progress bar: a centered flex column
report_box_layout = Layout(
    display='flex',
    flex_flow='column',
    justify_content='center',
    align_items='center',
    **_half_width_bordered
)

# Box around the entities sample
entities_box_layout = Layout(**_half_width_bordered)

### Utility functions

In [None]:
def update_report(out, data):
    """Redraw the report table inside the given output widget.

    ``data`` maps column names to values and is rendered as a one-row
    DataFrame labelled 'Total'. The 'Size' value is shown in human-readable
    form rather than as the raw number.
    """

    # With wait=True the old table is replaced only once the new one arrives
    out.clear_output(wait=True)

    # One single-element column per report entry
    columns = {}
    for name, value in data.items():
        columns[name] = [value]

    # Overwrite the raw size with its formatted representation
    columns['Size'] = [humanfriendly.format_size(data['Size'])]

    with out:
        display(pd.DataFrame(columns, index=['Total']))


def update_entities(out, data):
    """Redraw the entities sample inside the given output widget.

    ``data`` is an iterable of objects exposing ``text`` and ``label_``;
    each one is printed as ``<stripped text>: <label>`` below a short header.
    """

    # With wait=True the old listing is replaced only once new output arrives
    out.clear_output(wait=True)

    with out:
        print('Sample of entities found')
        print('------------------------')

        for entity in data:
            stripped_text = entity.text.strip()
            print(f'{stripped_text}: {entity.label_}')

### Initialize progress and report widgets

In [None]:
# Progress bar: one step per processed file, sized to the number of files found
progress = widgets.IntProgress(
    description='Completed:',
    orientation='horizontal',
    bar_style='',
    min=0,
    max=len(FILES),
    value=0,
    step=1
)

# Output area that receives the report table
report_out = widgets.Output()

# Output area that receives the entities sample
ents_out = widgets.Output()

### Iterate over the PST files and extract entities from each message

In [None]:
%%time

handler.clear_logs()

# Overall report
report = defaultdict(int)

# Start displaying results
display(Box(children=[report_out, progress], layout=report_box_layout))
display(Box(children=[ents_out], layout=entities_box_layout))

# Iterate over files
for pst_file in FILES:
    # Update report
    report['Files'] += 1    
    report['Size'] += pst_file.stat().st_size
    
    try:
        # Iterate over messages
        with PffArchive(pst_file) as archive:
            for message in archive.messages():
                # Update report
                report['Messages'] += 1
                
                try:
                    # Extract entities from the message
                    doc = nlp(archive.format_message(message))
                    entities = doc.ents
                    report['Entities'] += len(entities)

                    # Refresh report widget every 10 messages
                    if not report['Messages'] % 10:
                        update_report(report_out, report)
                        update_entities(ents_out, entities[:10]) # show up to 10 entities

                except Exception as exc:
                    # Log error and move on to the next message
                    report['Errors'] += 1
                    logger.exception(exc)

    except Exception as exc:
        # Log error and move on to the next file
        report['Errors'] += 1
        logger.exception(exc)
    
    # Update progress bar
    progress.value += 1
    
    # Refresh report widget
    update_report(report_out, report)

In [None]:
# Display the log widget; it contains the errors logged during processing, if any
handler.show_logs()