This notebook uses Python to scrape Indeed for job information based on inputs given by the user, then exports the resulting dataframe as a CSV so that it can be further cleaned and analyzed in a notebook more tailored to the information contained within the dataset. It was coded in the early stages of my project, hence the flexibility of inputting the job position, the locations to search, the number of search-result pages to scrape, and the keyword(s) to flag job posts for.

In [1]:
# import necessary libraries for scraping, cleaning, storing data
from bs4 import BeautifulSoup as bs
import requests
import html5lib
from datetime import datetime
import time
import csv
from pathlib import Path
import numpy as np
import pandas as pd
from selenium import webdriver 
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options

In [2]:
# Initialize Selenium & Webdriver instances
# NOTE: replace this placeholder with the real location of the chromedriver
# executable before running the cell.
CHROMEDRIVER_PATH = 'PATH/TO/chromedriver.exe'
service = Service(CHROMEDRIVER_PATH)

# One browser per task: search_driver loads the search-result pages,
# keyword_driver loads individual job postings, and salary_driver loads the
# average-salary pages, so none of them has to navigate away from its page.
search_driver = webdriver.Chrome(service=service)
keyword_driver = webdriver.Chrome(service=service)
salary_driver = webdriver.Chrome(service=service)

In [3]:
# This cell sets all initial variables for gathering and recording data

# Two urls need to be set: a base url for the job searches and another for extracting average salaries
base_search_url = 'https://www.indeed.com/jobs?q='
base_salary_url = 'https://www.indeed.com/career/'

# Set/reset the list where job posting information will be stored.
job_postings = []

# Inputs: If more or different data is needed, these inputs make it easier to scrape data for a new dataset
position = input('Enter the job title to search for: ').strip()
num_locations = int(input('Enter the number of locations to search for the job position: ').strip())
print('Enter each location as city, state (abbreviate state)')
locations_list = []
# The counter is deliberately not called `location`: that name is the global
# the scrape loop sets and update_search_url reads.
for location_number in range(1, num_locations + 1):
    city_state = input('Enter location #{}: '.format(location_number)).strip()
    locations_list.append(city_state)
num_skills = int(input('Enter the number of skills or keywords: ').strip())
skills_list = []
for skill_number in range(num_skills):
    if skill_number == 0:
        prompt = 'Enter a skill, certification, or other keyword to flag in job postings: '
    else:
        prompt = 'Enter the next skill, certification, or other keyword to flag in job postings: '
    # Keywords are matched as lowercase substrings of the posting text, so
    # stray surrounding whitespace would stop an otherwise valid keyword from
    # ever matching; strip it like every other input.
    skill_keyword = input(prompt).strip().lower()
    skills_list.append(skill_keyword)
# Converted here so a non-numeric answer fails at input time rather than
# part-way through the scrape; the later int(pages) call still works on an int.
pages = int(input('Enter how many pages of search results should be scraped: ').strip())

Enter the job title to search for: data analyst
Enter the number of locations to search for the job position: 20
Enter each location as city, state (abbreviate state)
Enter location #1: austin, tx
Enter location #2: san jose, ca
Enter location #3: dallas, tx
Enter location #4: atlanta, ga
Enter location #5: huntsville, al
Enter location #6: charlotte, nc
Enter location #7: raleigh, nc
Enter location #8: san francisco, ca
Enter location #9: seattle, wa
Enter location #10: washington dc
Enter location #11: baltimore, md
Enter location #12: boulder, co
Enter location #13: colorado springs, co
Enter location #14: new york, ny
Enter location #15: los angeles, ca
Enter location #16: trenton, nj
Enter location #17: baltimore, md
Enter location #18: lansing, mi
Enter location #19: hartford, ct
Enter location #20: hartford, ct
Enter the number of skills or keywords: 2
Enter a skill, certification, or other keyword to flag in job postings: python
Enter the next skill, certification, or other key

In [4]:
# All functions to be called are laid out in this cell

# Function for updating the search url when proceeding to the next page of the search results
def update_search_url(page):
    """Build the search-results URL for one page of the current search.

    Reads the module-level ``base_search_url``, ``position`` and ``location``.

    Args:
        page: Zero-based page index (an int or anything ``int()`` accepts).
            The ``start`` offset in the URL is ``page * 10``.

    Returns:
        The search URL as a string, with the job title and location
        URL-encoded so characters such as spaces, ``,``, ``&`` or ``#`` cannot
        corrupt the query string.
    """
    from urllib.parse import quote_plus

    encoded_position = quote_plus(str(position))
    encoded_location = quote_plus(str(location))
    return f'{base_search_url}{encoded_position}&l={encoded_location}&start={int(page) * 10}'

# Function that parses a posting's location and grabs the position's average salary for that location
def parse_location(listing):
    """Return the posting's location and the average salary for that location.

    Reads the module-level ``base_salary_url``, ``position``, ``salary_driver``
    and ``bs``.

    Args:
        listing: A job-card element exposing ``find`` (a parsed search result).

    Returns:
        A ``(job_location, location_avg_salary)`` tuple.  ``job_location`` is
        the posting's location text cut just after the comma plus three
        characters, or ``'invalid or vague location'`` when the text has no
        comma.  ``location_avg_salary`` is the salary text with ``$`` and ``,``
        removed, or ``-1`` when the location is invalid or the salary page has
        no salary element.

    Raises:
        AttributeError: If the listing has no ``companyLocation`` div; the
            caller relies on this to skip ads and other non-job cards.
    """
    raw_location = listing.find('div', {'class': 'companyLocation'}).text
    comma_index = raw_location.find(',')
    if comma_index == -1:
        # If there's no comma, it's an invalid location; no salary lookup is made.
        return 'invalid or vague location', -1

    # Keep the comma plus the three characters after it (", ST" for a
    # "City, ST 12345" style text). This removes zipcodes, since Indeed
    # considers zipcodes invalid.
    job_location = raw_location[:comma_index + 4]

    salary_url = f'{base_salary_url}{position}/salaries/{job_location}'
    salary_driver.get(salary_url)
    salary_soup = bs(salary_driver.page_source, 'html5lib')
    salary_element = salary_soup.find('div', {'class': 'css-15psvrv eu4oa1w0'})
    if salary_element is None:
        # A missing salary element used to raise AttributeError, which the
        # caller treats as "not a job card" and so dropped the whole posting.
        # Report the same "unknown" sentinel used for invalid locations.
        return job_location, -1

    location_avg_salary = salary_element.text
    for char in (',', '$'):
        location_avg_salary = location_avg_salary.replace(char, '')
    # Return the location parsed from this posting, which is also the one the
    # salary was looked up for, rather than the global search location.
    return job_location, location_avg_salary

# Function for parsing pay ranges/exact rates, returning frequency of pay, minimum pay, and maximum pay.
# Exact rates will cause minimum pay and maximum pay to be the same.
def parse_pay(listing):
    """Parse the salary snippet of a job card.

    Args:
        listing: A job-card element exposing ``find`` (a parsed search result).

    Returns:
        A ``(pay_type, pay_min, pay_max)`` tuple.  ``pay_type`` is the last
        word of the snippet plus ``'ly'`` (``'yearly'``, ``'hourly'``, ...;
        ``'day'`` becomes ``'daily'``).  ``pay_min`` and ``pay_max`` are floats
        and are equal when the snippet holds a single rate.  When the snippet
        is missing or cannot be parsed, ``('unlisted', -1, -1)`` is returned.
    """
    try:
        pay_range = listing.find('div', {'class': 'metadata salary-snippet-container'}).text
        pay_range = pay_range.replace(',', '')

        # Pay period is the trailing word. The old fixed 5-character slice
        # turned "a day" into "a dayly".
        words = pay_range.split()
        period = words[-1] if words else ''
        pay_type = 'daily' if period == 'day' else period + 'ly'

        # The amounts end one character before the first 'a'
        # (assumes that 'a' is the article in "a year" / "an hour" -- TODO confirm).
        amounts_end = pay_range.find('a') - 1
        dash_index = pay_range.find('-')
        if dash_index == -1:
            # Exact rate: skip the leading currency symbol.
            pay_min = pay_max = float(pay_range[1:amounts_end])
        else:
            # Range "$min - $max ...": the minimum ends one character before
            # the dash; the maximum starts three characters after it (past the
            # dash, a space and the currency symbol).
            pay_min = float(pay_range[1:dash_index - 1])
            pay_max = float(pay_range[dash_index + 3:amounts_end])
    except (AttributeError, TypeError, ValueError):
        # AttributeError/TypeError: no salary snippet on this card.
        # ValueError: snippet text is not in a format this parser understands.
        # A bare ``except`` would also swallow KeyboardInterrupt and real bugs.
        return 'unlisted', -1, -1
    return pay_type, pay_min, pay_max

# Function for scanning for each skillset keyword in a post, returning 1 for present and 0 for missing
def skill_search(listing):
    """Open a job posting and flag which keywords appear in its page text.

    Reads the module-level ``keyword_driver``, ``bs`` and ``skills_list``.

    Args:
        listing: A job-card element whose ``h2 > a`` link points at the posting.

    Returns:
        A list with one entry per keyword in ``skills_list`` (same order):
        ``1`` if the keyword occurs anywhere in the lowercased page text,
        ``0`` otherwise.  Matching is a plain substring test.
    """
    from urllib.parse import urljoin

    # urljoin copes with root-relative hrefs (no "//" in the result) and with
    # absolute hrefs, which plain string concatenation would mangle.
    job_url = urljoin('https://indeed.com/', listing.find('h2').find('a')['href'])
    keyword_driver.get(job_url)
    # Fixed pause after navigation (presumably to let the posting finish
    # loading before its source is read -- confirm).
    time.sleep(2.5)
    job_soup = bs(keyword_driver.page_source, 'html5lib')

    # Lowercase the page text once instead of once per keyword.
    page_text = job_soup.text.lower()
    return [1 if skill in page_text else 0 for skill in skills_list]
                        
# Function for examining search results and recording all information of interest for each job posting.
def search_scrape(url):
    """Scrape one page of search results into ``job_postings``.

    Reads the module-level ``search_driver``, ``bs`` and ``job_postings`` and
    calls ``parse_location``, ``parse_pay`` and ``skill_search`` per listing.

    Args:
        url: The search-results URL to load.

    Returns:
        None.  For every job card found, one list is appended to
        ``job_postings``: title, company, location, average salary, pay type,
        pay min, pay max, followed by one 0/1 flag per keyword.
    """
    search_driver.get(url)
    # NOTE(review): implicitly_wait sets the timeout Selenium uses for element
    # lookups; it is not a pause for the page to load -- confirm intent.
    search_driver.implicitly_wait(60)
    soup = bs(search_driver.page_source, 'html5lib')

    listings_div = soup.find('div', attrs={'id': 'mosaic-provider-jobcards'})
    listings_ul = listings_div.find('ul') if listings_div is not None else None
    if listings_ul is None:
        # No job-card container on this page (e.g. no results or an unexpected
        # page): there is nothing to record, and one such page should not
        # abort the whole scrape.
        return

    for listing in listings_ul:
        try:
            # Named job_title so it does not shadow the global search term `position`.
            job_title = listing.find('h2').find('a').text
            company = listing.find('span', {'class': 'companyName'}).text

            # Call parsing function for job location, location's average pay for position
            job_location, avg_salary = parse_location(listing)

            # Call parsing function for pay type, min, max
            pay_type, pay_min, pay_max = parse_pay(listing)

            # Call skillset keyword search function to flag each keyword in the job posting's page
            skill_flags = skill_search(listing)

        # Job Search page contains ads/misc postings that use the same job-card format as the actual job postings.
        # Attempting to read the title for these will produce an AttributeError; skip them.
        except AttributeError:
            continue

        # Append all information gathered to job_postings as a list
        job_postings.append(
            [job_title, company, job_location, avg_salary, pay_type, pay_min, pay_max, *skill_flags]
        )

In [5]:
# Perform Scrape
# Every requested location is searched, and for each one the requested number
# of result pages is scraped in order.
for location in locations_list:
    # The loop variable must be the global `location`: update_search_url reads it.
    # map() is lazy, so each URL is built right before its page is scraped.
    for search_url in map(update_search_url, range(int(pages))):
        search_scrape(search_url)

In [6]:
# Create DataFrame
# Fixed columns first, then one "Mentions <keyword>" column per keyword, in the
# same order the flags were appended to each row of job_postings.
base_columns = ['Job Name', 'Company', 'Location', 'Avg Salary', 'Pay Type', 'Pay Min', 'Pay Max']
columns = base_columns + [f'Mentions {skill}' for skill in skills_list]
df = pd.DataFrame(job_postings, columns=columns)

In [7]:
# Export to csv for future use
# The file name carries the local date and time of the export so repeated runs
# do not overwrite each other.
timestamp_format = "%m-%d-%Y_%H-%M-%S"
now = datetime.now()
date_str = now.strftime(timestamp_format)
output_name = 'Indeed_Job_Scrape_Raw_' + date_str + '.csv'
df.to_csv(output_name, index=False)

This concludes the scraping script. Data cleaning and wrangling are performed in a separate notebook.