-
-
Notifications
You must be signed in to change notification settings - Fork 38
/
mail.py
325 lines (271 loc) · 12.3 KB
/
mail.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
# Copyright Kevin Deldycke <kevin@deldycke.com> and contributors.
# All Rights Reserved.
#
# This program is Free Software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
import email
import hashlib
import inspect
import mailbox
import os
import re
import time
import arrow
from boltons.cacheutils import cachedproperty
from tabulate import tabulate
from . import CTIME, MINIMAL_HEADERS_COUNT, TooFewHeaders, logger
class DedupMail:
    """Message with deduplication-specific properties and utilities.

    Adds all data-cleaning primitives and heuristics to Python's standard
    library messages from: https://github.com/python/cpython/blob
    /e799aa8b92c195735f379940acd9925961ad04ec/Lib/mailbox.py#L1489

    This class should not be used directly but composed with
    ``mailbox.Message`` sub-classes.
    """

    def __init__(self, message=None):
        """Initialize a pre-parsed ``Message`` instance the same way the default
        factory in Python's ``mailbox`` module does.

        :param message: initial content, forwarded untouched to the
            initializer of the ``mailbox`` message class we are composed with.
        """
        # Hunt down in the MRO (skipping the concrete class itself) the first
        # class inheriting from mailbox.Message. That way we can get to the
        # original factory.
        orig_message_klass = None
        for klass in inspect.getmro(self.__class__)[1:]:
            if issubclass(klass, mailbox.Message):
                orig_message_klass = klass
                break
        assert orig_message_klass, "DedupMail must be composed with a mailbox.Message"

        # ``super(klass, self)`` dispatches to the class following ``klass`` in
        # the MRO: that is the initializer the message content is handed to.
        super(orig_message_klass, self).__init__(message)

        # Normalized path to the mailbox this message originates from.
        self.source_path = None

        # Mail ID used to uniquely refer to it in the context of its source.
        self.mail_id = None

        # Real filesystem location of the mail. Returns the individual mail's file
        # for folder-based box types (maildir & co.), but returns the whole box path
        # for file-based boxes (mbox & co.). Only used by regexp-based selection
        # strategies.
        self.path = None

        # Global config.
        self.conf = None

    def add_box_metadata(self, box, mail_id):
        """Post-instantiation utility to attach to mail some metadata derived from
        its parent box.

        Called right after the ``__init__()`` constructor.

        This allows the mail to carry its own information on its origin box and
        index.

        :param box: the box the mail was read from; its ``_path`` attribute and
            ``get_file()`` method are used.
        :param mail_id: the key of the mail within ``box``.
        """
        self.source_path = box._path
        self.mail_id = mail_id

        # Extract file name and close it right away to reclaim memory. The
        # ``finally`` guarantees the handle is released even if the name lookup
        # fails.
        mail_file = box.get_file(mail_id)
        try:
            self.path = mail_file._file.name
        finally:
            mail_file.close()

    def __repr__(self):
        """Return a short ``<ClassName source_path:mail_id>`` representation."""
        return f"<{self.__class__.__name__} {self.source_path}:{self.mail_id}>"

    @cachedproperty
    def uid(self):
        """Unique ID of the mail, as a ``(source_path, mail_id)`` tuple."""
        return self.source_path, self.mail_id

    @cachedproperty
    def timestamp(self):
        """Compute the normalized canonical timestamp of the mail.

        Sourced from the message's header by default. In the case of maildir,
        can be sourced from the email's file from the filesystem.

        Returns the timestamp as a number of seconds. If the ``Date`` header is
        missing or cannot be parsed, the raw header value (possibly ``None``)
        is returned instead of raising.
        """
        # XXX ctime does not refer to creation time on POSIX systems, but
        # rather the last time the inode data changed. Source:
        # https://userprimary.net/posts/2007/11/18
        # /ctime-in-unix-means-last-change-time-not-create-time/
        if self.conf.time_source == CTIME:
            return os.path.getctime(self.path)

        # Fetch from the date header.
        value = self.get("Date")
        if value is None:
            return value

        # ``parsedate_tz()`` signals an unparseable date by returning ``None``,
        # which ``mktime_tz()`` cannot digest: check for it explicitly. The
        # header is coerced to a string as it may be an ``email.header.Header``.
        parsed = email.utils.parsedate_tz(str(value))
        if parsed is None:
            return value
        try:
            return email.utils.mktime_tz(parsed)
        except (ValueError, OverflowError):
            # Out-of-range date components: fall back on the raw header.
            return value

        # XXX Also investigate what https://docs.python.org/2/library
        # /mailbox.html#mailbox.MaildirMessage.get_date does.

    @cachedproperty
    def size(self):
        """Returns canonical mail size.

        Size is computed as the length of the message body, i.e. the payload of
        the mail stripped of all its headers, not from the mail file
        persisting on the file-system.
        """
        return sum(len(line) for line in self.body_lines)

        # TODO: Allow customization of the way the size is computed, by getting
        # the file size instead for example.
        # size = os.path.getsize(mail_file)

    @cachedproperty
    def body_lines(self):
        """Return a normalized list of lines from message's body.

        Lines keep their end-of-line characters. The preamble and epilogue, if
        any, are included. Parts without any payload contribute no line.
        """
        body = []
        if self.preamble is not None:
            body.extend(self.preamble.splitlines(keepends=True))

        for part in self.walk():
            # Containers carry no content of their own: only leaves matter.
            if part.is_multipart():
                continue

            ctype = part.get_content_type()
            cte = part.get_params(header="Content-Transfer-Encoding")
            if (ctype is not None and not ctype.startswith("text")) or (
                cte is not None and cte[0][0].lower() == "8bit"
            ):
                # Non-textual or 8bit content is taken verbatim.
                part_body = part.get_payload(decode=False)
            else:
                charset = part.get_content_charset()
                charsets = [charset] if charset else ["ascii", "utf-8"]

                raw_payload = part.get_payload(decode=True)
                part_body = None
                if raw_payload is not None:
                    for enc in charsets:
                        try:
                            part_body = raw_payload.decode(enc)
                        except (UnicodeDecodeError, LookupError):
                            # Wrong or unknown charset: try the next candidate.
                            continue
                        break
                # No candidate charset worked: use the undecoded payload.
                if part_body is None:
                    part_body = part.get_payload(decode=False)

            # A part with no payload at all (e.g. an empty message) has no lines.
            if part_body is None:
                continue
            body.extend(part_body.splitlines(keepends=True))

        if self.epilogue is not None:
            body.extend(self.epilogue.splitlines(keepends=True))
        return body

    @cachedproperty
    def subject(self):
        """Normalized subject.

        Only used for debugging and human-friendly logging.
        """
        # The header may come back as an ``email.header.Header`` instance
        # instead of a plain string: coerce it before applying the regexp.
        subject = str(self.get("Subject", ""))
        return re.sub(r"\s+", " ", subject)

    @cachedproperty
    def hash_key(self):
        """Returns the canonical hash of a mail.

        This is the SHA-224 hexadecimal digest of ``serialized_headers``.
        """
        logger.debug(f"Serialized headers: {self.serialized_headers!r}")
        hash_value = hashlib.sha224(self.serialized_headers).hexdigest()
        logger.debug(f"Hash: {hash_value}")
        return hash_value

    @cachedproperty
    def canonical_headers(self):
        """Returns the full list of all canonical headers names and values in
        preparation for hashing.

        The result is a tuple of ``(header_id, canonical_value)`` pairs, in the
        order of ``self.conf.hash_headers``. Headers absent from the mail are
        skipped.
        """
        canonical_headers = []

        for header_id in self.conf.hash_headers:
            # Skip absent header.
            if header_id not in self:
                continue

            # Fetch all occurrences of the header, dropping those that are
            # blank once normalized.
            canonical_values = []
            for header_value in self.get_all(header_id):
                normalized_value = self.normalize_header_value(header_id, header_value)
                if re.search(r"\S", normalized_value):
                    canonical_values.append(normalized_value)

            canonical_headers.append((header_id, "\n".join(canonical_values)))

        # Cast to a tuple to prevent any modification.
        return tuple(canonical_headers)

    @cachedproperty
    def pretty_canonical_headers(self):
        """Renders into a table and in the same order, headers names and values
        used to produce mail's hash.

        Returns a string ready to be printed to user or for debugging.
        """
        table = [["Header ID", "Header value"]] + list(self.canonical_headers)
        return "\n" + tabulate(table, tablefmt="fancy_grid", headers="firstrow")

    @cachedproperty
    def serialized_headers(self):
        """Serialize the canonical headers into a single string ready to be hashed.

        Returns the UTF-8 encoded bytes of the ``header_id: value`` lines.

        Raises ``TooFewHeaders`` if less than ``MINIMAL_HEADERS_COUNT`` canonical
        headers are available.
        """
        # At this point we should have at an absolute minimum of headers.
        headers_count = len(self.canonical_headers)
        if headers_count < MINIMAL_HEADERS_COUNT:
            logger.warning(self.pretty_canonical_headers)
            raise TooFewHeaders(
                f"{headers_count} headers found out of {MINIMAL_HEADERS_COUNT}."
            )
        logger.debug(self.pretty_canonical_headers)

        return "\n".join(
            [f"{h_id}: {h_value}" for h_id, h_value in self.canonical_headers]
        ).encode("utf-8")

    @staticmethod
    def normalize_header_value(header_id, value):
        """Normalize and clean-up header value into its canonical form.

        :param header_id: name of the header; matched case-insensitively.
        :param value: raw header value, as ``str``, ``bytes`` or
            ``email.header.Header``.

        Always returns a unicode string.
        """
        # Header names are case-insensitive, and so are the lookups performed
        # on the message itself: compare on the lower-cased name so that
        # ``Subject`` gets the same treatment as ``subject``.
        header_id = header_id.lower()

        # Problematic when reading utf8 emails
        # this will ensure value is always string
        if isinstance(value, bytes):
            value = value.decode("utf-8", "replace")
        elif isinstance(value, email.header.Header):
            value = str(value)

        # Normalize white spaces.
        value = re.sub(r"\s+", " ", value).strip()

        # Trim Subject prefixes automatically added by mailing list software,
        # since the mail could have been cc'd to multiple lists, in which case
        # it will receive a different prefix for each, but this shouldn't be
        # treated as a real difference between duplicate mails.
        if header_id == "subject":
            subject = value
            # Keep stripping until no leading ``[list-tag]`` group is left.
            while True:
                matching = re.match(
                    r"([Rr]e: )*(\[\w[\w_-]+\w\] )+(.+)", subject, re.DOTALL
                )
                if not matching:
                    break
                subject = matching.group(3)
            return subject

        if header_id == "content-type":
            # Apparently list servers actually munge Content-Type
            # e.g. by stripping the quotes from charset="us-ascii".
            # Section 5.1 of RFC2045 says that either form is valid
            # (and they are equivalent).
            #
            # Additionally, with multipart/mixed, boundary delimiters can
            # vary by recipient. We need to allow for duplicates coming
            # from multiple recipients, since for example you could be
            # signed up to the same list twice with different addresses.
            # Or maybe someone bounces you a load of mail some of which is
            # from a mailing list you're both subscribed to - then it's
            # still useful to be able to eliminate duplicates.
            return re.sub(";.*", "", value)

        if header_id == "date":
            # Date timestamps can differ by seconds or hours for various
            # reasons, so let's only honour the date for now and normalize them
            # to UTC timezone.
            try:
                parsed = email.utils.parsedate_tz(value)
                if not parsed:
                    # Unparseable date: keep the cleaned-up raw value.
                    return value
                utc_timestamp = email.utils.mktime_tz(parsed)
                return arrow.get(utc_timestamp).format("YYYY-MM-DD")
            except (TypeError, ValueError, OverflowError):
                return value

        if header_id in ("to", "message-id"):
            # Sometimes email.parser strips the <> brackets from a To:
            # header which has a single address. I have seen this happen
            # for only one mail in a duplicate pair. I'm not sure why
            # (presumably the parser uses email.utils.unquote somewhere in
            # its code path which was only triggered by that mail and not
            # its sister mail), but to be safe, we should always strip the
            # <> brackets to avoid this difference preventing duplicate
            # detection.
            if re.match("^<[^<>,]+>$", value):
                return email.utils.unquote(value)

        return value