Skip to content

Commit

Permalink
rfc822 header parsing
Browse files Browse the repository at this point in the history
  • Loading branch information
lavr committed Mar 21, 2015
1 parent 2dadfff commit 0d88ff8
Show file tree
Hide file tree
Showing 3 changed files with 75 additions and 10 deletions.
61 changes: 55 additions & 6 deletions emails/loader/__init__.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
# encoding: utf-8
import os
import os.path
from email.utils import formataddr

from .local_store import (FileSystemLoader, ZipLoader, MsgLoader, FileNotFound)
from .helpers import guess_charset
from ..compat import to_unicode
from ..compat import urlparse
from ..message import Message
from ..utils import fetch_url
from ..utils import fetch_url, decode_header, parse_addresses_header
from .local_store import (FileSystemLoader, ZipLoader, MsgLoader, FileNotFound)
from .helpers import guess_charset


class LoadError(Exception):
Expand Down Expand Up @@ -43,6 +43,7 @@ def from_html(html, text=None, base_url=None, message_params=None, local_loader=
raise InvalidHtmlFile("Error parsing '%s'" % source_filename)
message.transformer.load_and_transform(**kwargs)
message.transformer.save()
message._loader = local_loader
return message


Expand Down Expand Up @@ -113,11 +114,59 @@ def from_zip(zip_file, loader_cls=None, **kwargs):
return _from_filebased_source(store=loader_cls(file=zip_file), **kwargs)


def from_rfc822(msg, message_params=None):

# Lower-cased names of transport/MIME bookkeeping headers.
# NOTE(review): not referenced in the visible part of this module --
# presumably meant as a "do not copy" list for header parsing; confirm.
SKIP_HEADERS = {
    'delivered-to',
    'received',
    'authentication-results',
    'mime-version',
    'content-type',
    'received-spf',
}


def relaxed_parse_address_header(value, errors='strict'):
    """
    Yield one string per non-empty entry returned by parse_addresses_header.

    Entries whose first element is truthy, or whose second element contains
    '@', are rendered with email.utils.formataddr. Any other entry (no first
    element and no '@') has its second element passed through decode_header
    once more and the result is yielded instead.

    :param value: raw header value, handed to parse_addresses_header.
    :param errors: forwarded as `errors` to parse_addresses_header and
                   decode_header.
    """
    for pair in parse_addresses_header(value, errors=errors):
        if not pair:
            continue
        if pair[0] or '@' in pair[1]:
            yield formataddr(pair)
        else:
            # if no email in header, should decode again
            yield decode_header(pair[1], errors=errors)


def from_rfc822(msg, message_params=None, parse_headers=False, unicode_errors='strict'):
    """
    Build a Message from an RFC 822 source using MsgLoader.

    Warning: from_rfc822 is for demo purposes only.

    :param msg: message source, passed unchanged to MsgLoader.
    :param message_params: optional dict of extra keyword arguments for
                           the Message constructor.
    :param parse_headers: which headers to copy onto the message:
                          False - none (default);
                          True - 'subject' and everything in
                          Message.ADDRESS_HEADERS;
                          list/tuple/set - exactly these names, compared
                          against the lower-cased header name.
    :param unicode_errors: forwarded as `errors` to decode_header and
                           relaxed_parse_address_header.
    :return: the Message, with html, text and attachments taken from the
             loader and `_loader` set to the MsgLoader instance.
    :raises ValueError: if parse_headers is not a bool, list, tuple or set.
    """
    message_params = message_params or {}

    # Parse the source once; the same loader feeds body, attachments and headers.
    loader = MsgLoader(msg=msg)
    message = Message(html=loader.html, text=loader.text, **message_params)
    message._loader = loader

    for att in loader.attachments:
        message.attachments.add(att)

    if isinstance(parse_headers, bool):
        if parse_headers:
            def parse_header(h):
                return h == 'subject' or h in Message.ADDRESS_HEADERS
        else:
            def parse_header(h):
                return False
    elif isinstance(parse_headers, (list, tuple, set)):
        def parse_header(h):
            return h in parse_headers
    else:
        # Format the message explicitly; passing the type as a second
        # argument would leave the '%s' placeholder unexpanded.
        raise ValueError("Unknown type for 'parse_headers': %s" % (type(parse_headers),))

    if not parse_headers:
        return message

    for k, v in loader.msg.items():
        k = k.lower()
        if not parse_header(k):
            continue

        if k == 'subject':
            message.subject = decode_header(v, errors=unicode_errors)
        elif k in ('to', 'from'):
            # Only the first parsed address is used. A header that yields no
            # addresses is skipped instead of raising IndexError.
            addresses = list(relaxed_parse_address_header(v, errors=unicode_errors))
            if not addresses:
                continue
            if k == 'to':
                message.mail_to = addresses[0]
            else:
                message.mail_from = addresses[0]
        elif k in Message.ADDRESS_HEADERS:
            message._headers[k] = ",".join(relaxed_parse_address_header(v, errors=unicode_errors))
        else:
            message._headers[k] = decode_header(v, errors=unicode_errors)

    return message
1 change: 1 addition & 0 deletions emails/loader/local_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -227,6 +227,7 @@ def __init__(self, msg, base_path=None):
self._files = {}
self._content_ids = {}
self._parsed = False
self.headers = {}

def decode_text(self, text, charset=None):
if charset:
Expand Down
23 changes: 19 additions & 4 deletions emails/testsuite/loader/test_rfc822_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from emails.compat import to_native
import emails.loader
from emails.loader.local_store import MsgLoader
from emails.loader import relaxed_parse_address_header
#from emails.loader.helpers import guess_charset

ROOT = os.path.dirname(__file__)
Expand Down Expand Up @@ -64,7 +65,9 @@ def test_msgloader():
map_cid = "cid:%s" % source_message.attachments['Map.png'].content_id
assert loader.content(map_cid) == 'Y'

assert emails.loader.from_rfc822(msg=source_message.as_string()).as_string()
m2 = emails.loader.from_rfc822(msg=source_message.as_string(), parse_headers=True)
assert m2.subject == data['subject']
assert m2.as_string()
# TODO: more tests


Expand All @@ -75,14 +78,26 @@ def _try_decode(s):
except UnicodeDecodeError:
pass


def test_relaxed_header_parser():
    """A header value holding only a non-ASCII name comes back as that name."""
    from email.header import Header
    text = u'웃'
    # NOTE(review): on Python 3 str(Header) returns the decoded text rather
    # than the RFC 2047 encoded word, so the encoded path may only be
    # exercised on Python 2 -- confirm whether Header.encode() was intended.
    header_value = str(Header(text, 'utf-8'))
    parsed = list(relaxed_parse_address_header(header_value))
    assert parsed[0] == text


def test_mass_msgloader():
    """
    Smoke test: load every data/msg/*.eml file next to this module through
    emails.loader.from_rfc822 with header parsing enabled.

    Files that _try_decode cannot decode are reported and skipped. Any extra
    headers collected on the resulting message are printed for inspection.
    """
    import encodings
    encodings.aliases.aliases['win_1251'] = 'cp1251'  # data-specific
    ROOT = os.path.dirname(__file__)
    for filename in glob.glob(os.path.join(ROOT, "data/msg/*.eml")):
        # Close each file deterministically instead of leaking the handle.
        with open(filename, 'rb') as f:
            msg = _try_decode(f.read())
        if msg is None:
            print("can not read filename=", filename)
            continue
        message = emails.loader.from_rfc822(msg=msg, parse_headers=True, unicode_errors='replace')
        if message._headers:
            print(message._headers)

0 comments on commit 0d88ff8

Please sign in to comment.