In [3]:
# Shell escape: install the Google API client library used by the cells below.
!pip install google-api-python-client



In [34]:
import argparse
import httplib2
import os

from apiclient import discovery, errors
import oauth2client
from oauth2client import client
from oauth2client import tools


# OAuth scopes requested from the user; only read-only Gmail access is asked for.
# If modifying these scopes, delete your previously saved credentials
# at ~/.credentials/gmail-python-quickstart.json
SCOPES = [
   'https://www.googleapis.com/auth/gmail.readonly',
]
# Path of the OAuth client-secrets file handed to flow_from_clientsecrets();
# it is a relative path, so it is resolved against the working directory.
CLIENT_SECRET_FILE = 'client_secret.json'
# Assigned to the OAuth flow's user_agent in get_credentials().
APPLICATION_NAME = 'Gmail API Python Quickstart'


def get_credentials(flags=None):
    """Get valid user credentials from storage, running OAuth2 if needed.

    Credentials are cached in ``~/.credentials/gmail-python-quickstart.json``.
    If nothing has been stored, or if the stored credentials are invalid,
    the OAuth2 flow is completed to obtain the new credentials.

    Args:
        flags: Parsed command-line flags forwarded to ``tools.run_flow``.
            When falsy, the legacy ``tools.run`` entry point is used instead.

    Returns:
        Credentials, the obtained credential.
    """
    # Importing the ``oauth2client`` package does not by itself load its
    # ``file`` submodule; import it explicitly instead of relying on some
    # other import having loaded it as a side effect.
    from oauth2client import file as oauth2_file

    credential_dir = os.path.join(os.path.expanduser('~'), '.credentials')
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(credential_dir, exist_ok=True)
    credential_path = os.path.join(credential_dir,
                                   'gmail-python-quickstart.json')

    store = oauth2_file.Storage(credential_path)
    credentials = store.get()
    if not credentials or credentials.invalid:
        flow = client.flow_from_clientsecrets(CLIENT_SECRET_FILE, SCOPES)
        flow.user_agent = APPLICATION_NAME
        if flags:
            credentials = tools.run_flow(flow, store, flags)
        else: # Needed only for compatibility with Python 2.6
            credentials = tools.run(flow, store)
        print('Storing credentials to ' + credential_path)
    return credentials


def build_service():
    """Build an authorized Gmail API (v1) service object.

    Parses a fixed ``--noauth_local_webserver`` flag with oauth2client's
    argument parser, obtains credentials through ``get_credentials`` and
    returns a discovery client bound to the authorized HTTP object.
    """
    flag_parser = argparse.ArgumentParser(parents=[tools.argparser])
    flags = flag_parser.parse_args(['--noauth_local_webserver'])
    authorized_http = get_credentials(flags).authorize(httplib2.Http())
    return discovery.build('gmail', 'v1', http=authorized_http)

In [None]:
# Run this cell, and hit Ctrl-Enter to save the verification code.
# build_service() passes --noauth_local_webserver to the OAuth tooling, and the
# resulting service object is reused by every later cell as `gmail`.
gmail = build_service()

In [40]:
# Gmail label names for the two classes of message collected below.
# get_label_ids() compares these against each label's 'name' field, so they
# must match the names in the Gmail account exactly.
POS_LABEL = 'job-offers/yes'
NEG_LABEL = 'job-offers-no'

In [42]:
# Convert our human-readable labels into Gmail's underlying label IDs
def get_label_ids(gmail, labels=None, user_id='me'):
    """Retrieve the label ID for each requested label name.

    Args:
        gmail: Gmail API service object.
        labels: Container of label names to look up. ``None`` (the default)
            selects nothing.
        user_id: Gmail user whose labels are listed; ``'me'`` by default.

    Returns:
        A dict mapping each requested label name that exists in the account
        to its label ID, or ``None`` if the API call fails with ``HttpError``.
    """
    # Avoid a shared mutable default; membership is still tested against the
    # caller's own container.
    wanted = labels if labels is not None else []
    try:
        response = gmail.users().labels().list(userId=user_id).execute()
    except errors.HttpError as e:
        # Report the failure the same way get_messages_by_label() does
        # instead of swallowing it silently.
        print("Error: ", e)
        return None
    # Tolerate a response without a 'labels' key.
    return {lbl['name']: lbl['id']
            for lbl in response.get('labels', [])
            if lbl['name'] in wanted}
    
# Resolve both class labels to Gmail label IDs; the result is None if the API
# call failed inside get_label_ids().
lbl_name_ids = get_label_ids(gmail, [POS_LABEL, NEG_LABEL])
print(lbl_name_ids)

{'job-offers/yes': 'Label_62', 'job-offers-no': 'Label_60'}


In [54]:
# Retrieve all messages under each label
def get_messages_by_label(gmail, labels=None, user_id='me'):
    """List every message carrying each of the given label IDs.

    Args:
        gmail: Gmail API service object.
        labels: Iterable of Gmail label IDs; ``None`` (the default) means no
            labels.
        user_id: Gmail user whose mailbox is queried; ``'me'`` by default.

    Returns:
        A dict mapping each label ID to the list of message entries returned
        by ``messages().list``, following ``nextPageToken`` until the last
        page. Returns ``None`` as soon as any request fails with ``HttpError``.
    """
    d = {}
    for label in labels or []:
        print("Retrieving all messages with label(s) '{}'".format(label))
        try:
            request_args = {'userId': user_id, 'labelIds': [label]}
            collected = []
            while True:
                response = gmail.users().messages().list(**request_args).execute()
                # Any page, not only the first, may come back without a
                # 'messages' key; treat that as an empty page.
                collected.extend(response.get('messages', []))
                page_token = response.get('nextPageToken')
                if not page_token:
                    break
                request_args['pageToken'] = page_token
            d[label] = collected
            print("Retrieved {:d} messages".format(len(d[label])))
        except errors.HttpError as e:
            print("Error: ", e)
            return None
    return d

# Look up the ID of each class label (a KeyError here means the label was not
# found in the account), then list the messages under each ID.
label_ids = [lbl_name_ids[lbl] for lbl in [POS_LABEL, NEG_LABEL]]
messages = get_messages_by_label(gmail, label_ids)

Retrieving all messages with label(s) 'Label_62'
Retrieved 39 messages
Retrieving all messages with label(s) 'Label_60'
Retrieved 112 messages


In [3]:
import pandas as pd

In [69]:
# Build one frame per label ID, tag its rows with that ID, and concatenate
# once. DataFrame.append was removed in pandas 2.0, and growing a frame
# inside a loop copies it on every iteration.
label_frames = [
    pd.DataFrame(msgs).assign(labelName=lbl)
    for lbl, msgs in messages.items()
]
df = pd.concat(label_frames, ignore_index=True)
df.describe()

Unnamed: 0,id,threadId,labelName
count,151,151,151
unique,151,146,2
top,153a9496e6a5653a,1548799c2c39ebf3,Label_60
freq,1,2,112


In [68]:
# Checkpoint our work: write the frame to emails.json in the working directory
df.to_json('emails.json')
# Aaaand prove it saved as we expect: read the file back and summarize it
tmpdf = pd.read_json('emails.json')
tmpdf.describe()

Unnamed: 0,id,labelName,threadId
count,151,151,151
unique,151,2,146
top,1543e010109aeaea,Label_60,154815291dd58008
freq,1,112,2


In [298]:
import base64
import email
from email.iterators import _structure


def get_email(gmail, msg_id, format='full', user_id='me'):
    """Download a single message through the Gmail API.

    Args:
        gmail: Gmail API service object.
        msg_id: ID of the message to fetch.
        format: Value passed as the ``format`` argument of ``messages().get``.
            With ``'raw'`` the response's ``raw`` field is decoded and parsed.
        user_id: Gmail user owning the message; ``'me'`` by default.

    Returns:
        An ``email.message.Message`` when ``format == 'raw'``, otherwise the
        API response as returned by ``execute()``.
    """
    msg = gmail.users().messages().get(
        userId=user_id, id=msg_id, format=format
    ).execute()
    # Convert raw emails into Python objects
    if format == 'raw':
        # Parse the bytes directly: decoding the whole message as UTF-8 first
        # raises UnicodeDecodeError for mail in any other 8-bit charset.
        raw_bytes = base64.urlsafe_b64decode(msg['raw'])
        return email.message_from_bytes(raw_bytes)
    return msg


def extract_by_mimetype(msg, mimetype='text/plain'):
    """Return the decoded payload of the first part whose type is `mimetype`.

    Parts are visited in ``msg.walk()`` order; ``None`` is returned when no
    part has the requested content type.
    """
    matching_parts = (
        candidate for candidate in msg.walk()
        if candidate.get_content_type() == mimetype
    )
    first_match = next(matching_parts, None)
    if first_match is None:
        return None
    return first_match.get_payload(decode=True)

In [146]:
def get_emails(row):
    """Fetch the message whose ID is stored in this row's ``id`` field."""
    message_id = row.id
    return get_email(gmail, message_id)

# Retrieve each missing email (get_emails issues one API request per row)
emails = df.apply(get_emails, axis=1)
# Add to our dataframe
df['email'] = emails
# Save our work (overwrites the earlier emails.json checkpoint)
df.to_json('emails.json')

In [296]:
def b64decode(msg):
    """Decode URL-safe base64 data and return it as text.

    Args:
        msg: URL-safe base64 data as ``str`` or bytes-like. Trailing ``=``
            padding may be missing; it is restored before decoding.

    Returns:
        The decoded bytes interpreted as UTF-8 text.

    Raises:
        UnicodeDecodeError: If the decoded bytes are not valid UTF-8.
    """
    raw = msg.encode('ascii') if isinstance(msg, str) else bytes(msg)
    # base64url data may legally omit its padding, which the decoder rejects
    # with "Incorrect padding"; pad back up to a multiple of four.
    raw += b'=' * (-len(raw) % 4)
    return base64.urlsafe_b64decode(raw).decode()


def extract_gmail_plaintext(payload):
    """Return the first text/plain body found in a Gmail message payload.

    The payload tree is searched depth-first in part order.

    Args:
        payload: A Gmail message payload dict (``mimeType``, ``body``, and
            optionally nested ``parts``).

    Returns:
        The decoded text of the first ``text/plain`` part that carries inline
        ``body.data``, or ``None`` when there is no such part.
    """
    if payload.get('mimeType') == 'text/plain':
        data = payload.get('body', {}).get('data')
        if data is not None:
            return b64decode(data)
    for part in payload.get('parts', []):
        found = extract_gmail_plaintext(part)
        # Keep looking through the remaining siblings: returning the first
        # part's result unconditionally would skip every later part.
        if found is not None:
            return found
    return None

In [186]:
def get_plaintext(row):
    """Extract the plain-text body from the Gmail payload stored in a row."""
    message_payload = row.email['payload']
    return extract_gmail_plaintext(message_payload)

# One result per row; rows where no text/plain part is found get None.
text = df.apply(get_plaintext, axis=1)

In [1]:
# Attach the extracted text and checkpoint again.
# NOTE(review): `text` comes from the previous cell; the recorded NameError
# shows this cell was last run in a kernel where that cell had not been run.
df['text'] = text
df.to_json('emails.json')

NameError: name 'text' is not defined

# Checkpoint!

In [7]:
# Reload the checkpoint written by the earlier cells, replacing `df`.
df = pd.read_json('emails.json')

In [34]:
def null_count(df, colname):
    """Print how many values of ``df[colname]`` are null and how many are not."""
    column = df[colname]
    counts = {
        'null': column.isnull().sum(),
        'notnull': column.notnull().sum(),
    }
    template = """{colname}
====
   null: {null:d}
notnull: {notnull:d}
"""
    print(template.format(colname=colname, **counts))
# Count the rows for which no plain text was extracted.
null_count(df, 'text')

text
====
   null: 79
notnull: 72



In [51]:
# NOTE(review): `null_text` is not defined in any visible cell; presumably it
# is one row of `df` whose 'text' is null. Confirm, and define it explicitly
# so this cell runs on a fresh kernel.
payload = null_text['email']['payload']

In [237]:
def walk(val, level=0, leaves=True):
    """Walk a structure made of dicts and lists, yielding ``(item, level)``.

    Dict keys are yielded at the current level in sorted item order, each
    followed by a walk of its value one level deeper. List elements are
    walked one level deeper without yielding anything for the list itself.
    Any other value is yielded as a leaf, but only when ``leaves`` is true.
    """
    deeper = level + 1
    if isinstance(val, dict):
        for key, child in sorted(val.items()):
            yield key, level
            yield from walk(child, deeper, leaves=leaves)
        return
    if isinstance(val, list):
        for element in val:
            yield from walk(element, deeper, leaves=leaves)
        return
    if leaves:
        yield val, level

In [241]:
def tree(struct, filtr=''):
    """Yield the item that follows each matching key while walking `struct`.

    Every ``(item, level)`` pair produced by ``walk(struct)`` is compared
    against ``filtr``; on a match the next pair from the walk is yielded.
    For a key with a scalar value that next pair is the value itself.

    Args:
        struct: Nested dict/list structure to walk.
        filtr: Container of keys to match. A single string is treated as one
            key rather than as a set of substrings; the empty default
            matches nothing.

    Yields:
        The ``(item, level)`` pair that directly follows each match.
    """
    if isinstance(filtr, str):
        # `key in some_string` is a substring test and raises TypeError for
        # non-string leaves, so wrap a lone string in a tuple.
        wanted = (filtr,) if filtr else ()
    else:
        wanted = filtr
    gen = walk(struct)
    for key, _lvl in gen:
        if key in wanted:
            # A bare next(gen) raising StopIteration inside a generator is
            # turned into RuntimeError (PEP 479); stop cleanly instead when
            # the match was the last item of the walk.
            following = next(gen, None)
            if following is None:
                return
            yield following
        
# Print each mimeType value found in the payload, indented two spaces per
# nesting level reported by walk().
for val, lvl in tree(payload, filtr=['mimeType']):
    print("{}{}".format(' ' * 2 * lvl, val))

  multipart/mixed
      text/html


In [279]:
def get_mime_types(row):
    """Return a comma-separated string of the MIME types in a row's payload."""
    mime_nodes = tree(row.email['payload'], filtr=['mimeType'])
    return ', '.join(value for value, _level in mime_nodes)

# Summarize the MIME structure of every message that has no extracted text.
mts = df[df['text'].isnull()].apply(get_mime_types, axis=1)

In [316]:
def _first_part_data(payload, mimetype):
    """Depth-first search for the first `mimetype` part with inline body data."""
    if payload.get('mimeType') == mimetype:
        data = payload.get('body', {}).get('data')
        if data is not None:
            return data
    for part in payload.get('parts', []):
        found = _first_part_data(part, mimetype)
        if found is not None:
            return found
    return None


def extract_gmail_text(payload):
    """Parse the *most likely* source of human-readable text from
    the email.

    "Most likely" uses the following search along the MIME types of
    message parts, in this order of preference:
        * text/plain, which is base64 decoded.
        * text/html, which is base64 decoded and then
          parsed for text elements.

    Args:
        payload: A Gmail message payload dict (``mimeType``, ``body``, and
            optionally nested ``parts``).

    Returns:
        The extracted text, or ``None`` when the message has neither a
        text/plain nor a text/html part with inline body data.
    """
    # Search the whole part tree for each type. Returning the result of the
    # first sub-part unconditionally would ignore every later sibling.
    plain = _first_part_data(payload, 'text/plain')
    if plain is not None:
        return b64decode(plain)
    html = _first_part_data(payload, 'text/html')
    if html is not None:
        return parse_html_email(html)
    return None
    

from bs4 import BeautifulSoup


def parse_html_email(data):
    """Decode a base64 text/html message part and return its visible text."""
    markup = b64decode(data)
    return BeautifulSoup(markup, 'html.parser').get_text()

In [318]:
def fill_in_text(row):
    """Extract readable text from the Gmail payload stored in a row."""
    message_payload = row.email['payload']
    return extract_gmail_text(message_payload)

# Retry extraction, now with the HTML fallback, only for rows still lacking text.
texts = df[df['text'].isnull()].apply(fill_in_text, axis=1)

In [319]:
# Display the recovered texts (the index is the row label in `df`).
texts

1      Hello,\n\r\nMy name is Naga.I am a Staffing Sp...
10     Hi Jeremy,\n\nMy Medford, MA client is conduct...
101    Greetings!\n\r\nMy name is Snigdha, I am a res...
102    Hi Jeremy,\nHope you are doing well,\nMy name ...
104    \n\nHello Jeremy\n\nMy name is Jake and I'm an...
105    \n\nHello Jeremy\n\nMy name is Gabrielle and I...
106    \n\nHello Jeremy\n\nMy name is Arthur and I'm ...
108    \n\nHello Jeremy\n\nMy name is Jeremy and I'm ...
109    Hi Jeremy,\n\nNet2Source, Inc. is one of the f...
11     Greetings Jeremy,\n\r\nI represent Pyramid Con...
110    \n\nHello Jeremy\n\nMy name is Bria and I'm an...
112    \n\nNow’s a great time for a career change\n\n...
115    Hi Jeremy,\n\r\nI came across your resume. I h...
122    My name is Fern Stasiuk and I am a Senior Sour...
124    Hello Jeremy,\n\nOur Merrimack,NH client is co...
127    Hi Jeremy,\n\nI came across your resume online...
129    \n\nHello Jeremy\n\nThere is immediate need fo...
139    Good Morning/Afternoon J