Skip to content

Commit

Permalink
Implement EML parser
Browse files Browse the repository at this point in the history
  • Loading branch information
helviojunior committed May 18, 2024
1 parent a26ece7 commit 006be59
Show file tree
Hide file tree
Showing 3 changed files with 104 additions and 26 deletions.
111 changes: 87 additions & 24 deletions filecrawler/libs/containerfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,37 +101,100 @@ def extract_eml(self) -> bool:
import glob
import email
from email import policy
from email.parser import HeaderParser

with open(str(self._file.path), "r") as f:
msg = email.message_from_file(f, policy=policy.default)
for attachment in msg.iter_attachments():
try:
output_filename = attachment.get_filename()
except AttributeError:
print("Got string instead of filename for %s. Skipping." % f.name)
continue

msg_data = None
try:
msg_data = attachment.get_payload(decode=True)
except TypeError:
print("Couldn't get payload for %s" % output_filename)
continue

# If no attachments are found, skip this file
if msg_data is not None:
if output_filename is None:
output_filename = Tools.random_generator(size=10) + Tools.guess_extensions(msg_data)

with open(os.path.join(str(self._temp_path), output_filename), "wb") as of:
msg_data = None
msg_epoch = Tools.to_epoch(Tools.get_email_date(msg))

full_name = os.path.join(str(self._temp_path), f"header.txt")
try:
parser = HeaderParser()
with open(full_name, "wb") as of:
try:
of.write(f"## E-mail: {str(self._file.path)}\n".encode("UTF-8"))
of.write(f"## Header\n\n".encode("UTF-8"))
of.write(parser.parsestr(msg.as_string(), headersonly=True).as_string().encode("UTF-8"))
except:
pass
except:
pass

# Try to update file time from e-mail time
try:
os.utime(full_name, (msg_epoch, msg_epoch))
except:
pass

if msg.is_multipart():
for t, ext in [('html', 'html'), ('plain', 'txt')]:
# Use txt instead of html, because html can be in exclusion list
full_name = os.path.join(str(self._temp_path), f"body_{ext}.txt")
try:
b_data = msg.get_body((t,))
if b_data is not None:
b_data = b_data.get_payload(decode=True)
if b_data is not None:
with open(full_name, "wb") as of:
of.write(b_data)

# Try to update file time from e-mail time
try:
os.utime(full_name, (msg_epoch, msg_epoch))
except:
pass
except Exception as e1:
#Tools.print_error(e1)
pass

for attachment in msg.iter_attachments():
msg_data = None
try:
output_filename = attachment.get_filename()
except AttributeError:
print("Got string instead of filename for %s. Skipping." % f.name)
continue

msg_data = None
try:
msg_data = attachment.get_payload(decode=True)
except TypeError:
print("Couldn't get payload for %s" % output_filename)
continue

# If no attachments are found, skip this file
if msg_data is not None:
if output_filename is None:
output_filename = Tools.random_generator(size=10) + Tools.guess_extensions(msg_data)

full_name = os.path.join(str(self._temp_path), output_filename)

with open(full_name, "wb") as of:
try:
of.write(msg_data)
except TypeError:
print("Couldn't get payload for %s" % output_filename)

# Try to update file time from e-mail time
try:
of.write(msg_data)
except TypeError:
print("Couldn't get payload for %s" % output_filename)
os.utime(full_name, (msg_epoch, msg_epoch))
except:
pass

# not multipart - i.e. plain text, no attachments, keeping fingers crossed
else:
txt_body = msg.get_payload(decode=True)
full_name = os.path.join(str(self._temp_path), 'body.txt')
with open(full_name, "wb") as of:
try:
of.write(txt_body)
except:
pass

return True
except Exception as e:
#Tools.print_error(e)
Tools.print_error(e)
return False

def extract_7z(self) -> bool:
Expand Down
2 changes: 1 addition & 1 deletion filecrawler/parsers/default.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
class DefaultParser(ParserBase):

def __init__(self):
    """Register this parser under the name 'Default'.

    The base initializer is called exactly once, with the parser name and a
    human-readable description.
    """
    # The earlier 'Parser for PDF files' description was a leftover; a
    # single call with the accurate description is all that is needed.
    super().__init__('Default', 'Default parser')

def parse(self, file: File) -> dict:
data = {'content': self.get_readable_data(file)}
Expand Down
17 changes: 16 additions & 1 deletion filecrawler/util/tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,11 @@
import platform
import string, random, sys, re
import subprocess
import time
from email.message import EmailMessage

import unicodedata
import email
from tabulate import tabulate
from filecrawler.libs.color import Color

Expand Down Expand Up @@ -185,12 +189,23 @@ def get_git_version():

@staticmethod
def to_datetime(epoch: [int, float]) -> datetime.datetime:
    """Convert a Unix timestamp to a naive datetime.

    The result is computed by adding ``epoch`` seconds to 1970-01-01 00:00:00,
    so it carries no tzinfo.

    :param epoch: seconds since 1970-01-01 00:00:00; fractional seconds are
        truncated to whole seconds.
    :return: the corresponding naive ``datetime.datetime``.
    """
    # Only one return statement: the untruncated variant that used to precede
    # this line made the int() coercion unreachable.
    return datetime.datetime(1970, 1, 1, 0, 0, 0) + datetime.timedelta(seconds=int(epoch))

@staticmethod
def to_epoch(date: datetime.datetime) -> int:
    """Return the whole seconds elapsed between 1970-01-01 00:00:00 and *date*.

    :param date: a naive ``datetime.datetime`` (subtracting the naive epoch
        from an aware datetime raises ``TypeError``).
    :return: the number of seconds since the epoch, truncated to an int.
    """
    delta = date - datetime.datetime(1970, 1, 1, 0, 0, 0)
    # timedelta.seconds is only the remainder within the last day (0..86399)
    # and silently drops the days component; total_seconds() covers the
    # whole span.
    return int(delta.total_seconds())

@staticmethod
def to_boolean(text: [str, bool]) -> bool:
    """Return the truthiness of *text* as a plain ``bool``.

    Any truthy value (including every non-empty string) yields ``True``;
    falsy values such as ``''``, ``None`` or ``False`` yield ``False``.
    """
    return True if text else False

@staticmethod
def get_email_date(msg: EmailMessage) -> datetime.datetime:
    """Return the message's ``Date`` header as a naive UTC datetime.

    The header's UTC offset is honoured, so the same instant is returned
    regardless of the time zone of the machine running the parser. When the
    header is missing or cannot be parsed, the current UTC time is returned
    instead (best effort; this function never raises for a bad header).

    :param msg: the parsed e-mail message.
    :return: a naive ``datetime.datetime`` expressed in UTC.
    """
    try:
        parsed = email.utils.parsedate_to_datetime(msg['date'])
        if parsed.tzinfo is not None:
            # Normalise to UTC, then drop tzinfo so the value can be mixed
            # with the naive datetimes used elsewhere.
            parsed = parsed.astimezone(datetime.timezone.utc).replace(tzinfo=None)
        # A naive result means the header carried "-0000" (no zone
        # information); it is taken as UTC unchanged.
        return parsed
    except (TypeError, ValueError, IndexError, OverflowError):
        # Missing or malformed Date header: fall back to "now" in UTC.
        return datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)

@staticmethod
def guess_extension(file_path: str) -> str:
try:
Expand Down

0 comments on commit 006be59

Please sign in to comment.