In [None]:
import imaplib, email, getpass
from email.utils import getaddresses

# Email settings
imap_server = 'imap.gmail.com'
imap_user = 'joseph_engelman@brown.edu'
# Prompt for the password interactively so it is never stored in the notebook.
imap_password = getpass.getpass()

# Connection
conn = imaplib.IMAP4_SSL(imap_server)

# IMAP4.login() signals rejected credentials by raising IMAP4.error; it never
# returns a status other than "OK". Testing the return code therefore cannot
# detect a failed login, so the failure is handled as an exception instead.
try:
    (retcode, capabilities) = conn.login(imap_user, imap_password)
except imaplib.IMAP4.error:
    print("Oh no! Could not connect to GMail.")
    # Re-raise so that "Run All" stops here instead of failing later on an
    # unauthenticated connection.
    raise
else:
    print("Logged in.")

In [None]:
# Open the mailbox read-only (modifications to the mailbox are not allowed).
result, data = conn.select("[Gmail]/All Mail", readonly=True)
if result != "OK":
    # select() reports a missing/inaccessible mailbox through its status,
    # not through an exception, so check it explicitly.
    raise RuntimeError("Could not select mailbox: %r" % (data,))

# Ask the server for the UID of every message in the mailbox.
result, data = conn.uid('search', None, 'ALL')
if result != "OK":
    raise RuntimeError("UID SEARCH failed: %r" % (data,))

# data[0] is a whitespace-separated string of UIDs; it can be empty or None
# when the mailbox contains no messages.
uids = data[0].split() if data and data[0] else []

if uids:
    # Download the complete messages (RFC822 = headers and body), not just
    # the headers.
    result, data = conn.uid('fetch', ','.join(uids), '(RFC822)')
    if result != "OK":
        raise RuntimeError("UID FETCH failed: %r" % (data,))
else:
    # Nothing to fetch: an empty message set is not a valid FETCH argument.
    data = []

print("Fetched %d email(s)." % len(uids))

In [None]:
def _tsv_safe(text):
    """Return `text` with tabs and line breaks replaced by single spaces.

    Tabs and newlines are the column and row separators of the TSV output,
    so they must not appear inside a field.
    """
    return text.replace('\t', ' ').replace('\r\n', ' ').replace('\n', ' ')


def _address_field(headers):
    """Return the e-mail addresses found in `headers` as one TSV field.

    `headers` is a list of raw header values (possibly empty). Each address
    is followed by a single space, i.e. the field is space-delimited with a
    trailing space when non-empty.
    """
    return ''.join(addr + " " for name, addr in getaddresses(headers))


# Where data will be stored. The `with` block guarantees the file is closed
# even if parsing one of the messages raises.
with open('raw-email-rec.tsv', 'w') as raw_file:

    # Header for TSV file
    raw_file.write("Message-ID\tDate\tSubject\tFrom\tTo\tCc\tIn-Reply-To\tBody\n")

    # Parse data and spit out info
    for item in data:

        # A fetched message is an (envelope, raw message) tuple; the plain
        # strings in between are only protocol filler and are skipped.
        if not (isinstance(item, tuple) and len(item) == 2):
            continue

        # Okay, it's a message. Parse it.
        msg = email.message_from_string(item[1])
        mids = msg.get_all('message-id', None)
        mdates = msg.get_all('date', None)
        subjects = msg.get_all('subject', None)
        senders = msg.get_all('from', [])
        receivers = msg.get_all('to', [])
        ccs = msg.get_all('cc', [])
        in_reply_to = msg.get_all('in-reply-to', [])

        # decode=True undoes the Content-Transfer-Encoding (base64 /
        # quoted-printable) so the readable text is stored rather than its
        # transport encoding. `or ''` guards against a missing payload.
        if msg.is_multipart():
            body = ' '.join([part.get_payload(decode=True) or ''
                             for part in msg.walk()
                             if part.get_content_type() == 'text/plain'])
        else:
            body = msg.get_payload(decode=True) or ''

        fields = [
            # Stripped so a folded header cannot make the row start with
            # whitespace instead of the '<' of the Message-ID.
            _tsv_safe(mids[0]).strip() if mids else "",
            _tsv_safe(mdates[0]).strip() if mdates else "",
            _tsv_safe(subjects[0]).encode('string_escape') if subjects else "",
            # Only one person sends an email, but just in case
            _address_field(senders),
            # Space-delimited list of those the email was addressed to
            _address_field(receivers),
            # Space-delimited list of those who were CC'd
            _address_field(ccs),
            # Space-delimited list of messages this message was in reply to
            ''.join(_tsv_safe(parent).strip() + " " for parent in in_reply_to),
            # Full body of the email (all text/plain parts if multipart)
            _tsv_safe(body).encode('string_escape'),
        ]

        # Just going to output tab-delimited, raw data.
        raw_file.write("\t".join(fields) + "\n")

print("Done.")

In [None]:
# Copy the well-formed rows of raw-email-rec.tsv into emails.tsv.
#
# A row is kept only when it starts with '<' AND the line after it also
# starts with '<' (or it is the last line of the file). Any line that does
# not start with '<' -- the column-header line, or a fragment of a row that
# was split by a stray line break -- is dropped together with the '<' row
# directly before it.
with open("emails.tsv", "w") as clean_file:
    with open('raw-email-rec.tsv', 'r') as f:
        # Candidate row waiting to see whether the next line invalidates it.
        previous = ''
        for line in f:
            if line.startswith('<'):
                if previous:
                    clean_file.write(previous)
                previous = line
            else:
                # Fragment or header: discard it and the pending row.
                previous = ''

        # Flush the final pending row; without this the last valid record of
        # the input would never be written.
        if previous:
            clean_file.write(previous)

print("Done.")