viewmailattachments

#! /usr/bin/env python

# Take an mbox HTML message (e.g. from mutt), split it
# and rewrite it so all of its attachments can be viewed in a browser
# (perhaps after being converted to HTML from DOC or whatever first).
#
# Can be run from within a mailer like mutt, or independently
# on a single message file.
#
# Grew out of a simpler script called viewhtmlmail.
#
# Copyright 2015 by Akkana Peck. Share and enjoy under the GPL v2 or later.
# Changes:
#   Holger Klawitter 2014: create a secure temp file and avoid temp mbox

# To use it from mutt, put the following lines in your .muttrc:
# macro  index  <F10>  "<pipe-message>~/bin/viewmailattachments\n" "View attachments in browser"
# macro  pager  <F10>  "<pipe-message>~/bin/viewmailattachments\n" "View attachments in browser"

import os, sys
import re
import time
import shutil
import email, mimetypes
import tempfile
import subprocess
from bs4 import BeautifulSoup

# Some prefs:
USE_WVHTML_FOR_DOC = False
CONVERT_PDF_TO_HTML = False

# Temporary for debugging:
class mysubprocess:
    '''Stand-in for the subprocess module, used as a single hook where
    every external command this script runs can be logged while debugging.
    '''
    @staticmethod
    def call(arr):
        '''Run the command given by the argument list arr and wait for it.

        Returns the command's exit status, exactly as subprocess.call()
        does, so callers can tell whether the command succeeded.
        '''
        # Uncomment to trace the commands being run:
        # print("\n\n=================================\n=== Calling: %s" % arr)
        return subprocess.call(arr)

# Subtypes of application/* that are converted before being shown.
_WORD_SUBTYPES = (
    "msword",
    "vnd.openxmlformats-officedocument.wordprocessingml.document",
)
_PRESENTATION_SUBTYPES = (
    "vnd.ms-powerpoint",
    "vnd.openxmlformats-officedocument.presentationml.presentation",
)


def _content_id(part):
    '''Return the Content-Id of a message part without its angle brackets.

    Returns None when the part has no Content-Id header. Header lookup
    on email message objects ignores case, so this finds Content-ID,
    Content-Id and any other capitalization.
    '''
    content_id = part.get('Content-Id')
    if content_id is None:
        return None

    content_id = content_id.strip()
    if content_id.startswith('<') and content_id.endswith('>'):
        content_id = content_id[1:-1]
    return content_id


def _write_index_page(outfile, text, timeout_secs, redirect_url):
    '''Write a small HTML page to outfile that displays text and then,
    after timeout_secs seconds, refreshes to redirect_url.

    If redirect_url is empty or None the page refreshes to itself,
    so that it picks up whatever is written to outfile later.
    '''
    if not redirect_url:
        redirect_url = "file://" + outfile
    with open(outfile, "w") as ofp:
        ofp.write('''<html><head>
<meta content="utf-8" http-equiv="encoding">
<meta http-equiv="content-type" content="text/html; charset=UTF-8">
<meta http-equiv="refresh" content="%d;URL=%s">
</head><body>
<br><br><br><br><br><br><big><big>%s</big></big>
</body></html>
''' % (timeout_secs, redirect_url, text))


def _convert_application_part(subtype, partfile):
    '''Convert a saved application/<subtype> attachment, if necessary,
    into a file that a browser can display.

    partfile is the path the attachment was saved to. Returns the path
    of the file to open in the browser, or None if this subtype is not
    one that gets displayed.
    '''
    basename = os.path.splitext(partfile)[0]
    htmlfilename = basename + ".html"

    if subtype == "msword" and USE_WVHTML_FOR_DOC:
        mysubprocess.call(["wvHtml", partfile, htmlfilename])
        return htmlfilename

    if subtype in _WORD_SUBTYPES:
        mysubprocess.call(["unoconv", "-f", "html", "-o",
                           htmlfilename, partfile])
        return htmlfilename

    # unoconv conversions from powerpoint to HTML drop all images.
    # Convert to PDF instead:
    if subtype in _PRESENTATION_SUBTYPES:
        pdffile = basename + ".pdf"
        mysubprocess.call(["unoconv", "-f", "pdf",
                           "-o", pdffile, partfile])
        return pdffile

    if subtype == "pdf":
        if not CONVERT_PDF_TO_HTML:
            return partfile
        mysubprocess.call(["pdftohtml", "-s", partfile])
        # pdftohtml chooses its own output filename
        # and won't let you override it:
        return basename + "-html.html"

    return None


def _rewrite_html(htmlsrc, subfiles):
    '''Return the HTML document htmlsrc, encoded as UTF-8 bytes, with its
    cid: image references pointing at the saved image files and with
    meta tags declaring the encoding added if it had none.

    subfiles is a list of dicts with the keys 'Content-Id' and
    'filename', one for each inline image that was saved.
    '''
    soup = BeautifulSoup(htmlsrc)

    # Substitute filenames for CIDs. Search the whole document rather
    # than soup.body, which is None when the HTML has no <body>.
    for tag in soup.find_all("img", src=True):
        src = tag['src']
        if not src.lower().startswith("cid:"):
            continue
        for sf in subfiles:
            if src[4:] == sf['Content-Id']:
                tag['src'] = "file://" + sf['filename']
                break

    # The document is written out as UTF-8 below, so that is the
    # encoding to declare if the document doesn't declare one itself.
    charset = "UTF-8"
    head = soup.find("head")
    if head is None:
        head = soup.new_tag("head")
        html = soup.find("html")
        if html is None:
            soup.insert(0, head)
        else:
            html.insert(0, head)

    if not head.find_all("meta", attrs={"http-equiv": "encoding"}) and \
       not head.find_all("meta", attrs={"http-equiv": "content-type"}):
        meta = soup.new_tag("meta")
        meta["content"] = charset
        meta["http-equiv"] = "encoding"
        head.insert(0, meta)
        meta = soup.new_tag("meta")
        meta["http-equiv"] = "content-type"
        meta["content"] = "text/html; charset=%s" % charset
        head.insert(0, meta)

    return soup.prettify().encode("utf-8", "xmlcharrefreplace")


def view_message_attachments(fp, tmpdir):
    '''View message attachments coming from the file-like object fp.

    The message read from fp is split into its parts. HTML parts,
    images and application/* attachments are saved into the directory
    tmpdir (office documents and, optionally, PDFs are converted first),
    and each resulting file is opened in a firefox private window.
    Images that an HTML part refers to by Content-Id are shown inside
    that HTML page rather than in tabs of their own.

    Parts whose filename contains '../' are reported and skipped.
    Nothing is returned, and tmpdir is not removed afterward.
    '''

    msg = email.message_from_string(fp.read())

    subfiles = []     # {'filename', 'Content-Id'} for each inline image
    subparts = []     # the message parts corresponding to subfiles
    htmlfiles = []    # files to show in the browser, in message order

    # Number for the next generated "part-NNN" filename. It lives in a
    # list so that the nested function can advance it in Python 2 too.
    next_part_number = [1]

    def tmp_file_name(part):
        '''Return a path inside tmpdir to save this part to,
        or None if the part's filename looks like an attack.
        '''
        partfile = part.get_filename()

        # An email message must not be able to overwrite important files.
        if partfile and '../' in partfile:
            print("Eek! Possible security problem in filename %s" % partfile)
            return None

        if partfile:
            # Keep only the last path component: os.path.join() would
            # otherwise discard tmpdir when given an absolute path.
            partfile = os.path.basename(partfile)
            if partfile in ('.', '..'):
                partfile = None

        # No usable name: make one up.
        if not partfile:
            ext = mimetypes.guess_extension(part.get_content_type())
            if not ext:
                # Use a generic bag-of-bits extension
                ext = '.bin'
            partfile = 'part-%03d%s' % (next_part_number[0], ext)
            # Advance for every generated name, so that two unnamed
            # parts of the same type don't overwrite each other.
            next_part_number[0] += 1

        return os.path.join(tmpdir, partfile)

    def save_tmp_file(part):
        '''Saves this part's payload to a tmp file, returning the new
        filename, or None if the part could not be saved.
        '''
        partfile = tmp_file_name(part)
        if partfile is None:
            return None

        payload = part.get_payload(decode=True)
        if payload is None:
            # get_payload(decode=True) gives None for multipart parts.
            return None

        # The decoded payload is raw bytes, so write in binary mode.
        with open(partfile, "wb") as tmpfile:
            tmpfile.write(payload)
        return partfile

    # Walk through the message a first, preliminary time
    # to separate out any images that might be referred to by
    # an HTML part. The top-level message is included, since it
    # can itself be the multipart/related container.
    for part in msg.walk():
        if part.get_content_type() != "multipart/related":
            continue

        # Walk the subtree looking for image children.
        for child in part.walk():
            # At least for now, only save images as parts of multipart.
            if child.get_content_maintype() != "image":
                continue

            # With nested multipart/related containers the same image
            # is reached more than once; save it only the first time.
            if child in subparts:
                continue

            # Images without a Content-Id can't be referred to from HTML.
            # Leave them for the main pass, which shows them on their own.
            content_id = _content_id(child)
            if content_id is None:
                continue

            filename = save_tmp_file(child)
            if filename is None:
                continue

            subfiles.append({ 'filename': filename,
                              'Content-Id': content_id })
            subparts.append(child)

    # Call up the firefox window right away,
    # so the user can see something is happening.
    # Firefox, alas, has no way from the commandline of calling up
    # a new private window with content, then replacing that content.
    # So we'll create a file that refreshes, so that when content is ready,
    # it can redirect to the first content page.
    # (A javascript: or data: URL with setTimeout() would be nicer,
    # but setTimeout() doesn't work at all in newly popped up
    # private windows.)
    redirect_timeout = 3
    pleasewait_file = os.path.join(tmpdir, "index.html")
    _write_index_page(pleasewait_file, "Please wait ...",
                      redirect_timeout, None)
    mysubprocess.call(["firefox", "-private-window",
                       "file://" + pleasewait_file])

    # Now walk through looking for the real parts:
    # HTML, doc and docx.
    for part in msg.walk():
        # multipart/* are just containers
        if part.is_multipart() or \
           part.get_content_type() == 'message/rfc822':
            continue

        maintype = part.get_content_maintype()

        if maintype == "application":
            partfile = save_tmp_file(part)
            if partfile is None:
                continue
            viewable = _convert_application_part(part.get_content_subtype(),
                                                 partfile)
            if viewable:
                htmlfiles.append(viewable)

        elif maintype == "text" and part.get_content_subtype() == 'html':
            htmlfile = tmp_file_name(part)
            if htmlfile is None:
                continue

            # Finish rewriting before opening the file, so that a
            # failure doesn't leave an empty file behind.
            rewritten = _rewrite_html(part.get_payload(decode=True),
                                      subfiles)
            with open(htmlfile, 'wb') as htmlfp:
                htmlfp.write(rewritten)
            htmlfiles.append(htmlfile)

        elif maintype == "image" and part not in subparts:
            partfile = save_tmp_file(part)
            if partfile is not None:
                htmlfiles.append(partfile)

    # Done processing attachments. Call firefox on everything.
    if htmlfiles:
        # For the first URL, just put a redirect in
        # the page the browser is already showing.
        first_url = "file://" + htmlfiles[0]
        _write_index_page(pleasewait_file,
                          "Redirecting to " + first_url,
                          0, first_url)

        # It may take the whole timeout before firefox actually redirects
        # to the new page, but the other tabs can probably start
        # loading right away, so there is no sleep(redirect_timeout) here.
        for f in htmlfiles[1:]:
            # If we don't wait for the new window to pop up before
            # calling new-tab, bad things will happen: the document
            # may load in a new tab in the old window and THEN pop up
            # an unwanted third window. Go firefox.
            time.sleep(1)
            mysubprocess.call(["firefox", "-private-window", "file://" + f])

    # tmpdir is deliberately left in place. Cleaning up would mean waiting
    # a while to make sure firefox has loaded the images first:
    # time.sleep(6)
    # shutil.rmtree(tmpdir)

if __name__ == '__main__':
    # Messages named on the command line are shown one after another;
    # with no arguments, a single message is read from standard input
    # (which is how a mailer's pipe-message command delivers it).
    if len(sys.argv) > 1:
        for f in sys.argv[1:]:
            # Give each message a temp dir of its own. Every message gets
            # an index.html written into its dir, along with its
            # attachments, so messages sharing one dir would overwrite
            # each other's files.
            # The with block closes the file even if processing fails.
            with open(f) as fp:
                view_message_attachments(fp, tempfile.mkdtemp())
    else:
        view_message_attachments(sys.stdin, tempfile.mkdtemp())