In [1]:
from pathlib import Path

import spacy

from libratom.lib.core import load_spacy_model
from libratom.lib.pff import PffArchive

### Set up spaCy

In [2]:
# Name of the spaCy pipeline used for entity extraction later in the notebook.
spacy_model_name = 'en_core_web_sm'
spacy_model = load_spacy_model(spacy_model_name)

# Check the model *before* touching any of its attributes: if the loader
# returned a falsy value, reading `.meta` first would fail with an unrelated
# AttributeError and this check would never run.
assert spacy_model, f'Could not load spaCy model: {spacy_model_name}'

# The model's metadata carries its version; `.get` tolerates a missing key.
spacy_model_version = spacy_model.meta.get("version")
print(f'Loaded spacy model: {spacy_model_name}, version: {spacy_model_version}')

Loaded spacy model: en_core_web_sm, version: 3.0.0


### Pick a PST file

In [3]:
# PST archive examined in the rest of the notebook. The path is relative, so it
# is resolved against the kernel's current working directory.
PST_FILE = Path("data/RevisedEDRMv1_Complete/andrea_ring/andrea_ring_000_1_1.pst")

### View the PST file's structure

In [4]:
# Open the archive as a context manager so it is closed again as soon as the
# folder/message tree has been printed.
with PffArchive(PST_FILE) as pst_archive:
    folder_tree = pst_archive.tree
    print(folder_tree)

root
├── SPAM Search Folder 2
├── Search Root
└── Top of Personal Folders
    ├── Deleted Items
    └── ring-a
        ├── ARING (Non-Privileged)
        │   ├── 'Sent Mail
        │   │   ├── Message ID: 2129060
        │   │   ├── Message ID: 2129092
        │   │   ├── Message ID: 2129124
        │   │   ├── Message ID: 2129156
        │   │   ├── Message ID: 2129188
        │   │   ├── Message ID: 2129220
        │   │   ├── Message ID: 2129252
        │   │   ├── Message ID: 2129284
        │   │   ├── Message ID: 2129316
        │   │   ├── Message ID: 2129348
        │   │   ├── Message ID: 2129380
        │   │   ├── Message ID: 2129412
        │   │   ├── Message ID: 2129444
        │   │   ├── Message ID: 2129476
        │   │   ├── Message ID: 2129508
        │   │   ├── Message ID: 2129540
        │   │   └── Message ID: 2129572
        │   ├── Drafts (Lotus Notes)
        │   │   ├── Message ID: 2128996
        │   │   └── Message ID: 2129028
        │   ├── Inbox
        

### Pick a message

In [5]:
# Identifier of the message to inspect; it is passed to
# `archive.get_message_by_id` in the next section.
MESSAGE_ID = 2128676

### Access the message object and its properties

In [6]:
# The archive is deliberately opened without `with` here: `archive` and
# `message` are used by the following cells, so the archive has to stay open
# across cells. It is closed explicitly in the last cell of the notebook.
archive = PffArchive(PST_FILE)
# Look up the single message selected by MESSAGE_ID.
message = archive.get_message_by_id(MESSAGE_ID)

##### Message headers

In [7]:
# Raw transport headers of the selected message.
headers = message.transport_headers

# A message without transport headers yields None here (per pypff), and
# None has no `.strip()`; fall back to an empty string so the cell prints
# nothing instead of raising.
print((headers or "").strip())

date: Wed, 17 Oct 2001 16:16:29 -0700 (PDT) Wed, 17 Oct 2001 16:16:29 -0500
Message-ID: <CLRG0H0JD3U4CPKHCRV5E5LZP51JFARKA@zlsvr22>
MIME-Version: 1.0
Content-Type: multipart/mixed; boundary="----=_Part_222847_7067165.1241825038247"
2.0: Microsoft Mail Internet Headers Version 2.0
Received: from NAOMA-MSMBX01V.corp.enron.com ([192.168.157.202]) by NAHOU-MSMBX03V.corp.enron.com with Microsoft SMTPSVC(5.0.2195.2966);
16: 16:31 -0500
X-MimeOLE: Produced By Microsoft Exchange V6.0.4712.0
content-class: urn:content-classes:message
Subject: FW: Things to do when the boss is out.
X-MS-Has-Attach: yes
X-MS-TNEF-Correlator: <05CB101D5C9559478D544B7386BBBCDC025D0A@NAOMA-MSMBX01V.corp.enron.com>
Thread-Topic: FW: Things to do when the boss is out.
Thread-Index: AcFWZ093VuF/8cJaEdWxIgBQi+MJ2QALiYaAAB4BjBAACOWUMAACiWdwAAM4b4AAAjOG4A==
From: "Winckowski  Michele" <Michele.Winckowski@ENRON.com>
Return-Path: Michele.Winckowski@ENRON.com
X-OriginalArrivalTime: 17 Oct 2001 21:16:31.0434 (

##### Body

In [8]:
# Take the first non-empty body representation, preferring plain text, then
# HTML, then RTF. The final "" keeps `body` a string when the message has no
# body at all, so `.strip()` below cannot fail on None.
body = message.plain_text_body or message.html_body or message.rtf_body or ""

# The body may be bytes rather than str; decode leniently so an invalid byte
# is replaced instead of raising UnicodeDecodeError.
if isinstance(body, bytes):
    body = str(body, encoding="utf-8", errors="replace")

print(body.strip())

- cubicle hurdles.mpeg 
 - Hallway races.mpeg 
 - Rowing.mpeg 

***********
EDRM Enron Email Data Set has been produced in EML, PST and NSF format by ZL Technologies, Inc. This Data Set is licensed under a Creative Commons Attribution 3.0 United States License <http://creativecommons.org/licenses/by/3.0/us/> . To provide attribution, please cite to "ZL Technologies, Inc. (http://www.zlti.com)."
***********


##### Attachments

In [9]:
# One line per attachment: its file name followed by its size in bytes.
for attached_file in message.attachments:
    file_name = attached_file.name
    file_size = attached_file.size
    print(f'{file_name}: {file_size} bytes')

cubicle hurdles.mpeg: 651268 bytes
Hallway races.mpeg: 328377 bytes
Rowing.mpeg: 758215 bytes


### Extract entities from the message (headers and body)

In [10]:
# Render the message to a single string with the archive's formatter, then run
# the spaCy pipeline over it.
message_text = archive.format_message(message)
document = spacy_model(message_text)

# Print every recognised entity as "<text>: <label>".
for ent in document.ents:
    ent_text = ent.text.strip()
    print(f'{ent_text}: {ent.label_}')

17: CARDINAL
2001: DATE
16:16:29: TIME
-0700: CARDINAL
17: CARDINAL
2001: DATE
16:16:29: TIME
1.0: MONEY
Content-Type: PERSON
2.0: CARDINAL
Microsoft Mail Internet Headers Version: ORG
Received: PERSON
Microsoft SMTPSVC(5.0.2195.2966: ORG
16:31: CARDINAL
Microsoft Exchange: ORG
X-MS-Has-Attach: PERSON
Thread-Topic: PERSON
Thread-Index: PERSON
Winckowski  Michele: WORK_OF_ART
Return-Path: PERSON
17: CARDINAL
2001 21:16:31.0434: DATE
Winckowski: PERSON
Michele: PERSON
NA: ORG
Non-Privileged).pst: ORG
Non-Privileged)\Deleted Items: ORG
Body-Type: PERSON
Enron: ORG
Data Set: ORG
EML: ORG
PST: ORG
NSF: ORG
ZL Technologies, Inc.: ORG
Data Set: ORG
ZL Technologies, Inc.: ORG


In [11]:
# `archive` was created without a `with` block so that it could be shared
# across cells, so nothing closes it automatically. When PffArchive is used as
# a context manager this cleanup normally happens on exit; here it is done by
# hand through the private `_data` attribute.
archive._data.close()