# HeadHunt

We're going to teach a program what kinds of jobs we like, using labeled Gmail messages as a training source.

The workflow looks like this:

1. Data Acquisition
    1. Authorize w/ Gmail
    1. Lookup IDs of labels
    1. Get list of message IDs by label ID
    1. Download all emails
    1. Parse text from email
1. Feature Extraction
    1. Tokenize
    1. TF-IDF
1. Learn & Predict
    1. Train
    1. Cross-Validate
    1. Visualize

# Get Data

In [4]:
POS_LABEL = 'job-offers/yes'
NEG_LABEL = 'job-offers-no'

In [5]:
!pip install google-api-python-client



In [9]:
import argparse
import base64
import email
import httplib2
import os

import oauth2client
import pandas as pd
from apiclient import discovery, errors
from oauth2client import client, tools

In [44]:
def download_emails(gmail=None, df=None, labels=[]):
    """Retrieves emails for the user.
    
    Args:
        df (pandas.DataFrame): Pre-existing data. Only non-existent emails
            will be retrieved.
    Returns:
        A new DataFrame w/ downloaded emails.
    """
    reqd_columns = ['id', 'labelName', 'email']
    if df is None:
        data = pd.DataFrame(columns=reqd_columns)
    else:
        assert reqd_columns in df.columns
        data = df
    if gmail is None:
        gmail = build_service()  # Ctrl-Enter to save verification code

In [45]:
df = pd.DataFrame(columns=['id', 'labelName', 'email'])
gmail = build_service()
labels = [POS_LABEL, NEG_LABEL]
# Get all labels, and build a map of label names => label IDs
lbl_name_ids = get_label_ids(gmail, labels)
label_ids = [lbl_name_ids[lbl] for lbl in labels]
# Lookup message IDs associated with labels: labelName => [{message IDs, threadID}]
messages = get_message_ids(gmail, label_ids)

Retrieving all messages with label(s) 'Label_62'
Retrieved 39 messages
Retrieving all messages with label(s) 'Label_60'
Retrieved 112 messages


In [46]:
# Switch to using a DataFrame
for lbl, msgs in messages.items():
    tmpdf = pd.DataFrame(msgs)
    tmpdf['labelName'] = lbl
    df = df.append(tmpdf, ignore_index=True)
df.head()

Unnamed: 0,email,id,labelName,threadId
0,,1552c3134e732bff,Label_60,1552c3134e732bff
1,,15507767e9b5725b,Label_60,15507767e9b5725b
2,,154deb710bc21e40,Label_60,154deb710bc21e40
3,,154d084eca46f99d,Label_60,154d084eca46f99d
4,,154ce64cec2bfc18,Label_60,154ce64cec2bfc18


In [93]:
# Download each missing email by ID
def get_emails(row):
    return get_email(gmail, row.id)

# Retrieve each missing email
df['emails'] = df.apply(get_emails, axis=1)
df.head()

Unnamed: 0,email,id,labelName,threadId,text,emails
0,"{'id': '1552c3134e732bff', 'payload': {'mimeTy...",1552c3134e732bff,Label_60,1552c3134e732bff,"Jeremy,\r\n\r\nI came across a copy of your re...","{'id': '1552c3134e732bff', 'payload': {'mimeTy..."
1,"{'id': '15507767e9b5725b', 'payload': {'mimeTy...",15507767e9b5725b,Label_60,15507767e9b5725b,"Hello,\n\r\nMy name is Naga.I am a Staffing Sp...","{'id': '15507767e9b5725b', 'payload': {'mimeTy..."
2,"{'id': '154deb710bc21e40', 'payload': {'mimeTy...",154deb710bc21e40,Label_60,154deb710bc21e40,"Hi Jeremy,\r\n\r\nHope you are doing well!!\r\...","{'id': '154deb710bc21e40', 'payload': {'mimeTy..."
3,"{'id': '154d084eca46f99d', 'payload': {'mimeTy...",154d084eca46f99d,Label_60,154d084eca46f99d,"Hello Jeremy, \r\nGreetings,This is Pankaj. I ...","{'id': '154d084eca46f99d', 'payload': {'mimeTy..."
4,"{'id': '154ce64cec2bfc18', 'payload': {'mimeTy...",154ce64cec2bfc18,Label_60,154ce64cec2bfc18,This is charan from Soft path System LLC. Soft...,"{'id': '154ce64cec2bfc18', 'payload': {'mimeTy..."


In [94]:
def fill_in_text(row):
    return extract_gmail_text(row.email['payload'])

# Extract text from the email
df['text'] = df.apply(fill_in_text, axis=1)
df.head()

Unnamed: 0,email,id,labelName,threadId,text,emails
0,"{'id': '1552c3134e732bff', 'payload': {'mimeTy...",1552c3134e732bff,Label_60,1552c3134e732bff,"Jeremy,\r\n\r\nI came across a copy of your re...","{'id': '1552c3134e732bff', 'payload': {'mimeTy..."
1,"{'id': '15507767e9b5725b', 'payload': {'mimeTy...",15507767e9b5725b,Label_60,15507767e9b5725b,"Hello,\n\r\nMy name is Naga.I am a Staffing Sp...","{'id': '15507767e9b5725b', 'payload': {'mimeTy..."
2,"{'id': '154deb710bc21e40', 'payload': {'mimeTy...",154deb710bc21e40,Label_60,154deb710bc21e40,"Hi Jeremy,\r\n\r\nHope you are doing well!!\r\...","{'id': '154deb710bc21e40', 'payload': {'mimeTy..."
3,"{'id': '154d084eca46f99d', 'payload': {'mimeTy...",154d084eca46f99d,Label_60,154d084eca46f99d,"Hello Jeremy, \r\nGreetings,This is Pankaj. I ...","{'id': '154d084eca46f99d', 'payload': {'mimeTy..."
4,"{'id': '154ce64cec2bfc18', 'payload': {'mimeTy...",154ce64cec2bfc18,Label_60,154ce64cec2bfc18,This is charan from Soft path System LLC. Soft...,"{'id': '154ce64cec2bfc18', 'payload': {'mimeTy..."


## Authenticate to Gmail

In [20]:
# If modifying these scopes, delete your previously saved credentials
# at ~/.credentials/gmail-python-quickstart.json
SCOPES = [
   'https://www.googleapis.com/auth/gmail.readonly',
]
CLIENT_SECRET_FILE = 'client_secret.json'
APPLICATION_NAME = 'HeadHunt'


def get_credentials(flags=None):
    """Gets valid user credentials from storage.

    If nothing has been stored, or if the stored credentials are invalid,
    the OAuth2 flow is completed to obtain the new credentials.

    Returns:
        Credentials, the obtained credential.
    """
    home_dir = os.path.expanduser('~')
    credential_dir = os.path.join(home_dir, '.credentials')
    if not os.path.exists(credential_dir):
        os.makedirs(credential_dir)
    credential_path = os.path.join(credential_dir,
                                   'gmail-python-quickstart.json')

    store = oauth2client.file.Storage(credential_path)
    credentials = store.get()
    if not credentials or credentials.invalid:
        flow = client.flow_from_clientsecrets(CLIENT_SECRET_FILE, SCOPES)
        flow.user_agent = APPLICATION_NAME
        if flags:
            credentials = tools.run_flow(flow, store, flags)
        else: # Needed only for compatibility with Python 2.6
            credentials = tools.run(flow, store)
        print('Storing credentials to ' + credential_path)
    return credentials


def build_service():
    """Shows basic usage of the Gmail API.

    Creates a Gmail API service object and outputs a list of label names
    of the user's Gmail account.
    """
    flags = argparse.ArgumentParser(parents=[tools.argparser]).parse_args([
        '--noauth_local_webserver'
    ])
    credentials = get_credentials(flags)
    http = credentials.authorize(httplib2.Http())
    service = discovery.build('gmail', 'v1', http=http)
    return service

## Download Email

In [96]:
# Convert our human-readable labels into Gmail's underlying label IDs
def get_label_ids(gmail, labels=[], user_id='me'):
    """Retrieve the label ID for each label."""
    try:
        response = gmail.users().labels().list(userId=user_id).execute()
        return {lbl['name']: lbl['id'] for lbl in response['labels'] if lbl['name'] in labels}
    except errors.HttpError as e:
        return None

    
# Retrieve all messages under each label
def get_message_ids(gmail, labels=[], user_id='me'):
    d = {}
    for label in labels:
        print("Retrieving all messages with label(s) '{}'".format(label))
        try:
            response = gmail.users().messages().list(userId=user_id,
                                                     labelIds=[label]).execute()
            d[label] = []
            if 'messages' in response:
                d[label].extend(response['messages'])

            while 'nextPageToken' in response:
                page_token = response['nextPageToken']
                response = gmail.users().messages().list(userId=user_id,
                                                         labelIds=[label],
                                                         pageToken=page_token).execute()
                d[label].extend(response['messages'])
            print("Retrieved {:d} messages".format(len(d[label])))
        except errors.HttpError as e:
            print("Error: ", e)
            return None
    return d


def get_email(gmail, msg_id, format='full', user_id='me'):
    """Download an email in Google's pre-hierarchied format.
    
    https://developers.google.com/gmail/api/v1/reference/users/messages/get
    
    Returns:
        https://developers.google.com/gmail/api/v1/reference/users/messages
    """
    msg = gmail.users().messages().get(
        userId=user_id, id=msg_id, format=format
    ).execute()
    # Convert raw emails into Python objects
    if format == 'raw':
        msg_str = base64.urlsafe_b64decode(msg['raw']).decode()  
        return email.message_from_string(msg_str)
    return msg


def extract_by_mimetype(msg, mimetype='text/plain'):
    """Pull the first text/plain message part out of the email."""
    for part in msg.walk():
        if part.get_content_type() == mimetype:
            return part.get_payload(decode=True)

## Process Text from Emails

In [62]:
from bs4 import BeautifulSoup

In [63]:
def walk(val, level=0, leaves=True):
    """Walk a structure made of a dicts/lists."""
    if isinstance(val, dict):
        for k, v in sorted(val.items()):
            yield k, level
            yield from walk(v, level + 1, leaves=leaves)
    elif isinstance(val, list):
        for item in val:

            yield from walk(item, level + 1, leaves=leaves)
    elif leaves:
        yield val, level

        
def tree(struct, filtr=''):
    gen = walk(struct)
    for key, lvl in gen:
        if key in filtr:
            yield next(gen) 

In [64]:
def extract_gmail_text(payload):
    """Parse the *most likely* source of human-readable text from 
    the email.
    
    "Most likely" uses the following search along the MIME types of 
    messages parts:
        * text/plain, which is base64 decoded.
        * text/html, which is base64 decoded and then
          parsed for text elements.
    """
    if payload['mimeType'] == 'text/plain':
        return base64.urlsafe_b64decode(payload['body']['data']).decode()
    if payload['mimeType'] == 'text/html':
        return parse_html_email(payload['body']['data'])
    # Else, recurse
    for part in payload.get('parts', []):
        return extract_gmail_text(part)


def parse_html_email(data):
    """Parse a text/html Email message part."""
    text = base64.urlsafe_b64decode(data).decode()
    soup = BeautifulSoup(text, 'html.parser')
    return soup.get_text()