In [None]:
import logging
from datetime import datetime
from pathlib import Path
from tempfile import gettempdir

from sqlalchemy import create_engine, func
from sqlalchemy.orm import sessionmaker

from helpers.logging import OutputWidgetHandler
from libratom.cli.subcommands import entities, OUTPUT_FILENAME_TEMPLATE
from libratom.lib.database import db_init, db_session
from libratom.models import Entity, FileReport, Configuration, Attachment

In [None]:
# Build the handler first: records formatted with logging's basic format are
# collected by it and shown at the end of the notebook via handler.show_logs().
handler = OutputWidgetHandler()
basic_formatter = logging.Formatter(logging.BASIC_FORMAT)
handler.setFormatter(basic_formatter)

# Configure the root logger (no name given) at INFO level and attach the handler.
logger = logging.getLogger()
logger.setLevel(logging.INFO)
logger.addHandler(handler)

In [None]:
def print_orm_object(obj, exclude=None):
    """Print the columns of an ORM object, one ``name: value`` pair per line.

    Args:
        obj: Object exposing ``__table__.columns``; every column's ``name`` is
            looked up on ``obj`` with ``getattr``.
        exclude: Optional column name, or iterable of column names, to leave
            out. ``None`` or any other falsy value prints every column.
    """
    if not exclude:
        excluded = frozenset()
    elif isinstance(exclude, str):
        # A bare string is one column name, not a collection of characters.
        excluded = frozenset((exclude,))
    else:
        # Materialize once: a one-shot iterable (e.g. a generator) would be
        # exhausted by the first membership test and stop excluding anything.
        excluded = frozenset(exclude)

    for column in obj.__table__.columns:
        name = column.name
        if name not in excluded:
            print(f'{name}: {getattr(obj, name)}')

### Location of input files

Edit `src` in the next cell so that it points to the directory containing your PST or mbox files.

In [None]:
# Directory holding the input PST files.
src = Path("data") / "RevisedEDRMv1_Complete" / "albert_meyers"
# src = Path("data/httpd-users")  # for mbox files

### Location of output database file

In [None]:
# Compact local timestamp such as 20200102T030405 (ISO format without "-" and ":").
timestamp = datetime.now().strftime("%Y%m%dT%H%M%S")

# The file name combines the source directory name, the job kind and the timestamp.
db_file_name = OUTPUT_FILENAME_TEMPLATE.format(src.name, "entities", timestamp)

# The database file lives in a 'ratom' folder under the system temp directory.
output_dir = Path(gettempdir()) / 'ratom'
db_file_path = output_dir / db_file_name

### Input variables

In [None]:
# Name of the spaCy model handed to the entity extraction step.
spacy_model_name = "en_core_web_sm"

### Entity extraction

In [None]:
# Run entity extraction over `src` with 2 jobs and no progress display,
# targeting the database file at `db_file_path`; the return value is kept in `status`.
status = entities(
    src=src,
    out=db_file_path,
    spacy_model_name=spacy_model_name,
    jobs=2,
    progress=False,
)

### Post-extraction queries

In [None]:
# Open the SQLite database file produced above and start a session on it.
db_url = f"sqlite:///{db_file_path}"
engine = create_engine(db_url)

Session = sessionmaker(bind=engine)
session = Session()

##### Total entity count

In [None]:
# Number of Entity rows; left as the last expression so the notebook displays it.
entity_query = session.query(Entity)
entity_query.count()

##### View the first 100 entities

In [None]:
# Columns skipped by print_orm_object; file and message details are printed separately below.
hidden_columns = ['id', 'file_report_id', 'message_id', 'filepath']

# Fetch at most 100 entities and print each one followed by a separator line.
for entity in session.query(Entity).limit(100).all():
    print_orm_object(entity, exclude=hidden_columns)
    print(
        f'file: {entity.file_report.name}',
        f'message id in pst file: {entity.message.pff_identifier}',
        '---',
        sep='\n',
    )

##### Entity count by type

In [None]:
# One row per entity label, paired with how many entities carry that label.
label_count_query = session.query(
    Entity.label_,
    func.count(Entity.label_),
).group_by(Entity.label_)
results = label_count_query.all()

for entity_type, count in results:
    print(f'{entity_type}: {count}')

##### Attachment count by type

In [None]:
# One row per attachment MIME type, paired with how many attachments have that type.
mime_count_query = session.query(
    Attachment.mime_type,
    func.count(Attachment.mime_type),
).group_by(Attachment.mime_type)
results = mime_count_query.all()

for mime_type, count in results:
    print(f'{mime_type}: {count}')

##### Per file reports

In [None]:
# Load every file report, then print its columns followed by a short summary.
file_reports = session.query(FileReport).all()

for file_report in file_reports:
    print_orm_object(file_report)

    # Related-object counts and processing times, ending with a separator line.
    summary_lines = [
        f'number of messages: {len(file_report.messages)}',
        f'number of attachments: {len(file_report.attachments)}',
        f'number of entities: {len(file_report.entities)}',
        f'processing start time: {file_report.processing_start_time}',
        f'processing end time: {file_report.processing_end_time}',
        f'processing wall time: {file_report.processing_wall_time}',
        '---',
    ]
    print('\n'.join(summary_lines))

##### Configuration report

In [None]:
# Print every stored configuration entry as "name: value".
configuration_rows = session.query(Configuration).all()

for conf in configuration_rows:
    print(f'{conf.name}: {conf.value}')

In [None]:
# Done querying: close the session that was opened on the output database.
session.close()

### Log details

In [None]:
# Presumably displays the records collected by the OutputWidgetHandler attached
# to the root logger at the top of the notebook — confirm against helpers.logging.
handler.show_logs()