In [44]:
from bs4 import BeautifulSoup as Soup
import json
import numpy as np
import pandas as pd
import requests
import seaborn as sns
from socket import timeout
import time
from time import sleep
import urllib
from urllib.request import urlopen

%pylab inline

# Raise the pandas display limits (columns, rows, column width) to 500 each.
for option_name in ('display.max_columns', 'display.max_rows', 'max_colwidth'):
    pd.set_option(option_name, 500)

Populating the interactive namespace from numpy and matplotlib


In [45]:
def get_links(url):
    """
    Collect the job keys linked from one Indeed.com search results page.

    This represents step 1 of searching, clicking through, and scraping job
    descriptions on Indeed.com.

    Arguments:
    url (string):  a url from an Indeed.com search.

    Returns:
    ids (list): a list of job key strings (the `jk` query parameter of every
    '/rc/' link inside the element with id 'resultsCol'). Empty when the page
    has no such element or no matching links.
    """
    # Local import keeps this cell self-contained; only urlopen is imported by name above.
    from urllib.parse import parse_qs, urlparse

    # The context manager closes the connection even if reading fails.
    with urlopen(url) as response:
        site = response.read()

    soup = Soup(site, 'html.parser')
    results = soup.find(id='resultsCol')
    if results is None:
        # Page did not contain a results column; nothing to click through to.
        return []

    ids = []
    for anchor in results.find_all('a'):
        href = anchor.get('href')
        if not href or '/rc/' not in href:
            continue
        # Parse the query string instead of slicing between 'jk=' and '&fccid=',
        # which breaks when 'fccid' is absent or not directly after 'jk'.
        job_keys = parse_qs(urlparse(href).query).get('jk')
        if job_keys:
            ids.append(job_keys[0])
    return ids

In [46]:
def get_page_dict(url):
    """
    Download one Indeed.com job page and extract its key fields.

    Arguments:
    url (string): a component page url from the results of an Indeed.com search.

    Returns:
    html_content_dict (dict): a dictionary of job info.

    The 'success' key indicates whether the page could be fetched and parsed;
    unsuccessful entries should be tossed or parsed differently later. The 'url'
    key holds the string of the site url. On success, 'title', 'company',
    'location' and 'description' are extracted from the job page html (fields
    whose markup is missing are returned as empty strings). When the page loads
    but has no description container, the dict instead carries 'urlresponse',
    'urltext' and 'urlsoup' for later inspection.

    Raises:
    urllib.error.URLError: for connection failures that are not timeouts.
    """
    # Seconds to wait between attempts after a timeout, and the attempt limit.
    backoff = 15
    max_tries = 3

    nulldict = {"success": False,
                'url': url,
                'description': "",
                'title': "",
                'company': "",
                'location': ""}

    html = None
    site = None
    for attempt in range(max_tries):
        html = None
        try:
            html = urlopen(url)
            # Read inside the try so a timeout while downloading is retried too.
            site = html.read()
        except urllib.error.HTTPError:
            # Server answered with an error status (e.g. 500): move on.
            # Must name the class, not call it, for the clause to match.
            return nulldict
        except urllib.error.URLError as err:
            # urlopen wraps connect-time timeouts in URLError; retry only those.
            if not isinstance(err.reason, timeout):
                raise
        except timeout:
            pass
        finally:
            if html is not None:
                html.close()
        if site is not None:
            break
        # No point sleeping after the final attempt.
        if attempt < max_tries - 1:
            sleep(backoff)

    if site is None:
        return nulldict

    soup = Soup(site, 'html.parser')

    # Strip non-content tags so they cannot leak into the extracted text.
    for tag_name in ("script", "style", "noscript", "meta"):
        for element in soup.find_all(tag_name):
            element.decompose()

    #for updates as the script runs, uncomment below
    #print(url)

    description_tag = soup.find("div", {"class": "jobsearch-JobComponent-description"})
    if description_tag is None:
        # Different page layout: keep the raw material so it can be examined later.
        return {"success": False, "url": url, 'urlresponse': html, "urltext" : site, "urlsoup" : soup}

    # Collapse the description to a single line of space-separated, non-empty lines.
    stripped_lines = (line.strip() for line in description_tag.get_text().splitlines())
    cleaned = ' '.join(line for line in stripped_lines if line != '')

    title_tag = soup.find('title')
    title = title_tag.get_text().split(sep = " -")[0] if title_tag is not None else ""

    # Company is the first nested div of the rating block, location the last.
    rating_block = soup.find('div', {'class': 'jobsearch-InlineCompanyRating'})
    rating_divs = rating_block.find_all('div') if rating_block is not None else []
    company = rating_divs[0].get_text() if rating_divs else ""
    location = rating_divs[-1].get_text() if rating_divs else ""

    html_content_dict = {"success": True,
                        'url': url,
                        'description': cleaned,
                        'title': title,
                        'company': company,
                        'location': location}

    return html_content_dict

In [47]:
def get_job_descriptions(search_url, total_job_count, offset):
    """
    Scrape the job postings behind an Indeed.com search.

    Walks the search result pages to gather job IDs with get_links, then
    fetches every posting with get_page_dict. Sleeps one second after each
    request and prints progress as it goes.

    Arguments:
    search_url (string): url for Indeed.com search.
    total_job_count (int): total number of jobs to be retrieved
    offset (int): to break task into smaller pieces, start later in search

    Returns:
    descriptions (list): list of dicts containing job content, one dict per
    job ID found (see get_page_dict for the dict layout).
    """
    # Result pages are requested in steps of ten via the 'start' query parameter.
    page_size = 10

    print("Getting job ids...")
    job_ids = []
    for page_start in range(0, total_job_count, page_size):
        page_url = search_url + '&start=' + str(page_start + offset)
        # extend() avoids rebuilding the whole list on every page.
        job_ids.extend(get_links(page_url))
        sleep(1)
    print("Finished pulling job ids. Getting descriptions...")

    descriptions = []
    for position, job_id in enumerate(job_ids):
        # Report progress every tenth job.
        if position % 10 == 0:
            print("Grabbing job " + str(position))
        job_url = "https://www.indeed.com/viewjob?jk=" + str(job_id)
        descriptions.append(get_page_dict(job_url))
        sleep(1)

    print("Finished grabbing jobs.")
    return descriptions

In [48]:
def run_data_collection(radius_int, fromage_int, sort_type, total_job_count, offset,
                        location="New York, NY"):
    """
    Format the Indeed.com search url and run the scraping task on it.

    Arguments:
    radius_int (int): radius of geographic search. Furthest reaches of NYC are 15-20 miles from midtown.
    fromage_int (int): "from age" a limit on days old the search results can be
    sort_type (string): Indeed.com allows search by recommended or date. Date preferred here.
    total_job_count (int): total number of jobs to be retrieved
    offset (int): offset for starting later in search results for smaller tasks
    location (string): place to search around; url-encoded before use.
        Defaults to "New York, NY".

    Returns:
    scraped_content (list): list of dicts where each dict contains info from one job posting.
    """
    from urllib.parse import quote_plus

    urlstring = "https://www.indeed.com/jobs?radius={radius}&l={location}&fromage={fromage}&sort={sort}"
    # quote_plus("New York, NY") yields "New+York%2C+NY", i.e. the previously hard-coded value.
    formatted_string = urlstring.format(radius=radius_int,
                                        location=quote_plus(location),
                                        fromage=fromage_int,
                                        sort=sort_type)

    scraped_content = get_job_descriptions(formatted_string, total_job_count, offset)

    return scraped_content

In [49]:
def clean_scraped_data(scraped_data_list):
    """
    Turn scraped job dicts into a dataframe, keeping only successful parses.

    Arguments:
    scraped_data_list (list): list of dicts, each of which contains a job and
    a 'success' entry.

    Return:
    job_df (pd dataframe): dataframe where each row is a job description
    whose 'success' entry was truthy.
    """
    successful_jobs = []
    for job in scraped_data_list:
        # Entries flagged unsuccessful are dropped.
        if job['success']:
            successful_jobs.append(job)

    return pd.DataFrame(successful_jobs)

In [56]:
def scrape_data(offset, goal_job_desc_count, step):
    """
    Scrape job descriptions in batches, saving each batch to its own csv.

    Arguments:
    offset (int): start scraping here. For breaking task into chunks
    goal_job_desc_count (int): how many total job descriptions required
    step (int): scraping batched in groups of this many.

    Because these scraping tasks can produce unexpected errors,
    scraping is batched and can be picked up based on last
    reported signpost quantity.

    Returns:
    None. Saves each batch to a labeled csv ("indeed_jobs_<start>_overnight.csv")
    in the working directory and prints a signpost after each batch.
    """
    for batch_start in range(offset, goal_job_desc_count, step):
        # Honor the requested batch size; the last batch is trimmed to the goal.
        batch_size = min(step, goal_job_desc_count - batch_start)
        scraped_data = run_data_collection(15, 15, "date", batch_size, batch_start)
        cleaned_data = clean_scraped_data(scraped_data)
        cleaned_data.to_csv("indeed_jobs_{}_overnight.csv".format(batch_start))
        print("finished " + str(batch_start + batch_size) + " jobs")

In [55]:
# Scrape 10,000 job descriptions in batches of 500 starting at record 0 from the search.
# Each batch is written to its own csv in the working directory by scrape_data.

scrape_data(0,10000,500)

4