Implement EML parser

helviojunior · May 18, 2024 · 006be59 · 006be59
1 parent a26ece7
commit 006be59
Show file tree

Hide file tree

Showing 3 changed files with 104 additions and 26 deletions.
diff --git a/filecrawler/libs/containerfile.py b/filecrawler/libs/containerfile.py
@@ -101,37 +101,100 @@ def extract_eml(self) -> bool:
             import glob
             import email
             from email import policy
+            from email.parser import HeaderParser
 
             with open(str(self._file.path), "r") as f:
                 msg = email.message_from_file(f, policy=policy.default)
-                for attachment in msg.iter_attachments():
-                    try:
-                        output_filename = attachment.get_filename()
-                    except AttributeError:
-                        print("Got string instead of filename for %s. Skipping." % f.name)
-                        continue
-
-                    msg_data = None
-                    try:
-                        msg_data = attachment.get_payload(decode=True)
-                    except TypeError:
-                        print("Couldn't get payload for %s" % output_filename)
-                        continue
-
-                    # If no attachments are found, skip this file
-                    if msg_data is not None:
-                        if output_filename is None:
-                            output_filename = Tools.random_generator(size=10) + Tools.guess_extensions(msg_data)
-
-                        with open(os.path.join(str(self._temp_path), output_filename), "wb") as of:
+                msg_data = None
+                msg_epoch = Tools.to_epoch(Tools.get_email_date(msg))
+
+                full_name = os.path.join(str(self._temp_path), f"header.txt")
+                try:
+                    parser = HeaderParser()
+                    with open(full_name, "wb") as of:
+                        try:
+                            of.write(f"## E-mail: {str(self._file.path)}\n".encode("UTF-8"))
+                            of.write(f"## Header\n\n".encode("UTF-8"))
+                            of.write(parser.parsestr(msg.as_string(), headersonly=True).as_string().encode("UTF-8"))
+                        except:
+                            pass
+                except:
+                    pass
+
+                # Try to update file time from e-mail time
+                try:
+                    os.utime(full_name, (msg_epoch, msg_epoch))
+                except:
+                    pass
+
+                if msg.is_multipart():
+                    for t, ext in [('html', 'html'), ('plain', 'txt')]:
+                        # Use txt instead of html, because html can be in exclusion list
+                        full_name = os.path.join(str(self._temp_path), f"body_{ext}.txt")
+                        try:
+                            b_data = msg.get_body((t,))
+                            if b_data is not None:
+                                b_data = b_data.get_payload(decode=True)
+                            if b_data is not None:
+                                with open(full_name, "wb") as of:
+                                    of.write(b_data)
+
+                                # Try to update file time from e-mail time
+                                try:
+                                    os.utime(full_name, (msg_epoch, msg_epoch))
+                                except:
+                                    pass
+                        except Exception as e1:
+                            #Tools.print_error(e1)
+                            pass
+
+                    for attachment in msg.iter_attachments():
+                        msg_data = None
+                        try:
+                            output_filename = attachment.get_filename()
+                        except AttributeError:
+                            print("Got string instead of filename for %s. Skipping." % f.name)
+                            continue
+
+                        msg_data = None
+                        try:
+                            msg_data = attachment.get_payload(decode=True)
+                        except TypeError:
+                            print("Couldn't get payload for %s" % output_filename)
+                            continue
+
+                        # If no attachments are found, skip this file
+                        if msg_data is not None:
+                            if output_filename is None:
+                                output_filename = Tools.random_generator(size=10) + Tools.guess_extensions(msg_data)
+
+                            full_name = os.path.join(str(self._temp_path), output_filename)
+
+                            with open(full_name, "wb") as of:
+                                try:
+                                    of.write(msg_data)
+                                except TypeError:
+                                    print("Couldn't get payload for %s" % output_filename)
+
+                            # Try to update file time from e-mail time
                             try:
-                                of.write(msg_data)
-                            except TypeError:
-                                print("Couldn't get payload for %s" % output_filename)
+                                os.utime(full_name, (msg_epoch, msg_epoch))
+                            except:
+                                pass
+
+                # not multipart - i.e. plain text, no attachments, keeping fingers crossed
+                else:
+                    txt_body = msg.get_payload(decode=True)
+                    full_name = os.path.join(str(self._temp_path), 'body.txt')
+                    with open(full_name, "wb") as of:
+                        try:
+                            of.write(txt_body)
+                        except:
+                            pass
 
             return True
         except Exception as e:
-            #Tools.print_error(e)
+            Tools.print_error(e)
             return False
 
     def extract_7z(self) -> bool:

diff --git a/filecrawler/parsers/default.py b/filecrawler/parsers/default.py
@@ -5,7 +5,7 @@
 class DefaultParser(ParserBase):
 
     def __init__(self):
-        super().__init__('Default', 'Parser for PDF files')
+        super().__init__('Default', 'Default parser')
 
     def parse(self, file: File) -> dict:
         data = {'content': self.get_readable_data(file)}

diff --git a/filecrawler/util/tools.py b/filecrawler/util/tools.py
@@ -6,7 +6,11 @@
 import platform
 import string, random, sys, re
 import subprocess
+import time
+from email.message import EmailMessage
+
 import unicodedata
+import email
 from tabulate import tabulate
 from filecrawler.libs.color import Color
 
@@ -185,12 +189,23 @@ def get_git_version():
 
     @staticmethod
     def to_datetime(epoch: [int, float]) -> datetime.datetime:
-        return datetime.datetime(1970, 1, 1, 0, 0, 0) + datetime.timedelta(seconds=epoch)
+        return datetime.datetime(1970, 1, 1, 0, 0, 0) + datetime.timedelta(seconds=int(epoch))
+
+    @staticmethod
+    def to_epoch(date: datetime.datetime) -> int:
+        """Convert a datetime to whole seconds since 1970-01-01 00:00:00 UTC.
+
+        This is the inverse of ``to_datetime``: a naive *date* is taken to
+        already be expressed in UTC. A timezone-aware *date* is converted to
+        UTC first, so it can be subtracted from the naive epoch base.
+
+        :param date: The moment to convert.
+        :return: Seconds elapsed since the Unix epoch, truncated to an int
+            (negative for dates before 1970).
+        """
+        if date.tzinfo is not None:
+            date = date.astimezone(datetime.timezone.utc).replace(tzinfo=None)
+
+        delta = date - datetime.datetime(1970, 1, 1, 0, 0, 0)
+
+        # timedelta.seconds is only the 0..86399 remainder within the last day
+        # and ignores .days; total_seconds() covers the whole interval.
+        return int(delta.total_seconds())
 
     @staticmethod
     def to_boolean(text: [str, bool]) -> bool:
         return bool(text)
 
+    @staticmethod
+    def get_email_date(msg: EmailMessage) -> datetime.datetime:
+        """Return the message's ``Date`` header as a naive UTC datetime.
+
+        The header's own UTC offset is honoured. A header without a usable
+        offset (e.g. ``-0000``) is taken to be UTC already. If the header is
+        missing or cannot be parsed, the current UTC time is returned, so
+        callers always receive a usable timestamp.
+
+        :param msg: Parsed e-mail message to read the ``Date`` header from.
+        :return: Naive datetime expressed in UTC.
+        """
+        parsed = None
+        try:
+            raw_date = msg['date']
+            if raw_date is not None:
+                # Unlike parsedate() + time.mktime(), this keeps the sender's
+                # UTC offset instead of re-reading the fields as local time.
+                parsed = email.utils.parsedate_to_datetime(str(raw_date))
+        except Exception:
+            # Best effort: a malformed header must not abort extraction,
+            # so fall through to the current-time default below.
+            parsed = None
+
+        if parsed is None:
+            return datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
+
+        if parsed.tzinfo is not None:
+            parsed = parsed.astimezone(datetime.timezone.utc).replace(tzinfo=None)
+
+        return parsed
+
     @staticmethod
     def guess_extension(file_path: str) -> str:
         try: