In [1]:
import sys
import logging
from pathlib import Path
from collections import defaultdict, deque

import ipywidgets as widgets
from ipywidgets import Layout, Box
from IPython.display import display

import humanfriendly
import pandas as pd

from libratom.utils.pff import PffArchive

### Set up spaCy

In [2]:
# Load the spaCy pipeline named "en_core_web_sm"; `nlp` is called on each
# formatted message further down to extract its named entities.
# NOTE: the model package has to be installed in the kernel's environment
# beforehand, otherwise spacy.load() fails.
import spacy
nlp = spacy.load("en_core_web_sm")

### Log settings

In [3]:
# https://ipywidgets.readthedocs.io/en/stable/examples/Output%20Widget.html#Integrating-output-widgets-with-the-logging-module
class OutputWidgetHandler(logging.Handler):
    """ Logging handler that collects formatted records in an output widget """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Flex container with a thin light-gray frame around the log area
        self.out = widgets.Output(
            layout=dict(display='flex', border='1px solid lightgray')
        )

    def emit(self, record):
        """ Overload of logging.Handler method: the formatted record is put first in the widget """
        entry = dict(
            name='stdout',
            output_type='stream',
            text='{}\n'.format(self.format(record)),
        )
        # Newest entry goes ahead of everything the widget already holds
        self.out.outputs = (entry,) + self.out.outputs

    def show_logs(self):
        """ Render the log widget """
        display(self.out)

    def clear_logs(self):
        """ Drop everything currently held by the log widget """
        self.out.clear_output()

logger = logging.getLogger(__name__)

# logging.getLogger() hands back the same logger object every time this cell
# runs, so without this cleanup each re-run would attach one more handler
# (and keep the previous, no longer displayed widget alive and receiving logs).
# Compared by class name because re-running the cell also re-creates the
# OutputWidgetHandler class, which makes isinstance() miss older instances.
for stale_handler in list(logger.handlers):
    if type(stale_handler).__name__ == OutputWidgetHandler.__name__:
        logger.removeHandler(stale_handler)

# Single widget-backed handler; INFO and above are recorded
handler = OutputWidgetHandler()
handler.setFormatter(logging.Formatter(logging.BASIC_FORMAT))
logger.addHandler(handler)
logger.setLevel(logging.INFO)

### Location of PST files

In [4]:
# Edit as appropriate: root directory that is searched recursively for
# *.pst files. This is an absolute, machine-specific path.
CACHED_ENRON_DATA_DIR = Path("/tmp/libratom/test_data/RevisedEDRMv1_Complete")

### Session variables

In [5]:
# Generate the list of files up front to know how many there are.
# Sorted because glob() yields paths in no guaranteed order; a fixed order
# keeps the processing sequence (and the entities sample) comparable across runs.
files = sorted(CACHED_ENRON_DATA_DIR.glob('**/*.pst'))

# Overall report: every counter starts at 0 the first time it is touched
report = defaultdict(int)

# Entities to display; the deque is bounded, so only the latest 10 are kept
ents_sample = deque(maxlen=10)

### Rendering

In [6]:
# Layouts

# Settings both boxes have in common
_shared_box_style = dict(
    width='50%',
    margin='0px 0px 4px 0px',
    border='1px solid lightblue',
)

# Report box: a centered flex column
report_box_layout = Layout(
    display='flex',
    flex_flow='column',
    justify_content='center',
    align_items='center',
    **_shared_box_style,
)

# Entities box: only the shared settings
entities_box_layout = Layout(**_shared_box_style)

### Utility functions

In [7]:
def update_report(out, data):
    """Refreshes the report output widget

    Clears ``out`` and renders ``data`` inside it as a one-row DataFrame
    indexed 'Total', with one column per key of ``data``. The 'Size' column
    shows the byte count formatted by ``humanfriendly.format_size``.

    Args:
        out: output widget; must provide ``clear_output(wait=...)`` and be
            usable as a context manager.
        data: mapping of counter name to value. 'Size' (a byte count) is
            optional and treated as 0 when absent; ``data`` is never modified.
    """

    # wait=True is passed so the previous table is replaced rather than
    # blanked first (see the ipywidgets Output documentation)
    out.clear_output(wait=True)

    # .get() instead of data['Size']: indexing raises KeyError on a plain dict
    # and silently inserts the key into a defaultdict
    total_bytes = data.get('Size', 0)

    df_data = {key: [value] for key, value in data.items()}
    df_data['Size'] = [humanfriendly.format_size(total_bytes)]

    with out:
        display(pd.DataFrame(df_data, index=['Total']))


def update_entities(out, data):
    """Refreshes the entities output widget

    Clears ``out`` and prints a two-line header followed by one
    ``<text>: <label>`` line per item of ``data``. Each item must expose
    ``text`` and ``label_`` attributes.
    """

    out.clear_output(wait=True)

    heading = 'Sample of entities found'

    with out:
        print(heading)
        print('------------------------')

        for entity in data:
            # Surrounding whitespace is trimmed from the entity text only
            name, kind = entity.text.strip(), entity.label_
            print(f'{name}: {kind}')

### Initialize progress and report widgets

In [8]:
# Progress bar: one step per file processed
progress = widgets.IntProgress(
    description='Completed:',
    value=0,
    min=0,
    max=len(files),
    step=1,
    orientation='horizontal',
    bar_style='',
)

# Containers for the report and for the entities sample
report_out, ents_out = widgets.Output(), widgets.Output()

### Iterate over the PST files and extract entities from each message

In [9]:
%%time

# Start from an empty log widget so only errors from this run are shown
handler.clear_logs()

# Start displaying results
display(Box(children=[report_out, progress], layout=report_box_layout))
display(Box(children=[ents_out], layout=entities_box_layout))

# Iterate over files
for pst_file in files:
    # Update report
    report['Files'] += 1

    try:
        # stat() sits inside the try so that a file that cannot be read is
        # logged and skipped like any other per-file failure instead of
        # aborting the whole run
        report['Size'] += pst_file.stat().st_size

        # Iterate over messages
        with PffArchive(pst_file) as archive:
            for message in archive.messages():
                # Update report
                report['Messages'] += 1

                try:
                    # Extract entities from the message
                    doc = nlp(archive.format_message(message))
                    entities = doc.ents
                    report['Entities'] += len(entities)

                    # ents_sample is a bounded deque: pushing on the left
                    # keeps only the most recently seen entities
                    ents_sample.extendleft(entities)

                    # Refresh the widgets every 10 messages
                    if not report['Messages'] % 10:
                        update_report(report_out, report)
                        update_entities(ents_out, ents_sample)

                except Exception:
                    # Best effort: log the error (with traceback and the file
                    # it came from) and move on to the next message
                    report['Errors'] += 1
                    logger.exception('Error while processing a message in %s', pst_file)

    except Exception:
        # Best effort: log the error and move on to the next file
        report['Errors'] += 1
        logger.exception('Error while processing file %s', pst_file)

    # Update progress bar
    progress.value += 1

    # Refresh both widgets so the display matches the state after this file,
    # including messages seen since the last every-10 refresh
    update_report(report_out, report)
    update_entities(ents_out, ents_sample)

Box(children=(Output(), IntProgress(value=0, description='Completed:', max=4)), layout=Layout(align_items='cen…

Box(children=(Output(),), layout=Layout(border='1px solid lightblue', margin='0px 0px 4px 0px', width='50%'))

CPU times: user 9min 51s, sys: 11.5 s, total: 10min 2s
Wall time: 9min 59s


In [None]:
# Display the log widget: it holds the errors recorded by `logger` during
# processing, if any (each new record is placed ahead of the earlier ones)
handler.show_logs()

In [11]:
# Total number of entities counted across all processed messages
report['Entities']

630919

In [17]:
# Exploration: list the attributes available on one sampled entity.
# Index 4 is an arbitrary pick and requires at least 5 entities in the sample.
dir(ents_sample[4])

['_',
 '__class__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__ne__',
 '__new__',
 '__pyx_vtable__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '_recalculate_indices',
 '_vector',
 '_vector_norm',
 'as_doc',
 'conjuncts',
 'doc',
 'end',
 'end_char',
 'ent_id',
 'ent_id_',
 'ents',
 'get_extension',
 'get_lca_matrix',
 'has_extension',
 'has_vector',
 'kb_id',
 'kb_id_',
 'label',
 'label_',
 'lefts',
 'lemma_',
 'lower_',
 'merge',
 'n_lefts',
 'n_rights',
 'noun_chunks',
 'orth_',
 'remove_extension',
 'rights',
 'root',
 'sent',
 'sentiment',
 'set_extension',
 'similarity',
 'start',
 'start_char',
 'string',
 'subtree',
 'text',
 'text_with_ws',
 'to_array',
 'upper_',
 'vector',
 'vector_norm',
 'vocab']

In [18]:
# Keep a handle on the sampled entity at index 4 for the inspections that follow
entity = ents_sample[4]

In [23]:
# Check the Python type of the entity's `ent_id` attribute
type(entity.ent_id)

int

In [22]:
# Text of the entity (the value update_entities prints before the colon)
entity.text

'EML'

In [24]:
# The document this entity belongs to; the entity keeps a reference to it
entity.doc

date: Fri, 18 May 2001 01:38:00 -0700 (PDT) Fri, 18 May 2001 01:38:00 -0700  (PDT)
Message-ID: <C1PTMZZVPOWNW3QLBFJM5JK3CXIHXZABA@zlsvr22>
MIME-Version: 1.0
Content-Type: text/plain; charset="us-ascii"
Content-Transfer-Encoding: 7bit
from: "Andrea Ring"
to: "Jared Kaiser" <Jared Kaiser/ENRON@enronXgate@ENRON>
subject: Re:
filename: aring (Non-Privileged).pst
folder: \Andrea_Ring_Jan2002_1\Ring, Andrea\'Sent Mail
Body-Type: plain-text

Hi!

                  Yes,  we were discussing your lending habits and everyone was calling me a fricken mouch!  Your like a legend around here.
We still need to get together and share some of your previous comments with Craig.  How are you?  I miss you and get regular updates about how you are doing from you know who.  Been to any exciting ports lately.   Any baby news?  Not much new around here - John Craig keeps us laughing.  Take care!

                                                                                                                   

In [27]:
# Check the Python type of the entity's `label_` attribute
type(entity.label_)

str

In [29]:
# Label of the entity (the value update_entities prints after the colon)
entity.label_

'ORG'