In [1]:
import time

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, WebDriverException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

In [2]:
# Default chromedriver location (kept from the original notebook); override it
# through the `chromedriver_path` argument of get_jobs on other machines.
_DEFAULT_CHROMEDRIVER_PATH = '/Users/tonyha/Documents/Projects/salary_prediction/chromedriver'

# Value stored for any field that could not be read from a listing.
_MISSING = "missing"

# Locators used on the Glassdoor results page. They are gathered here so that
# they can be updated in one place when the page markup changes.
_SHOW_MORE_XPATH = "//*[@id='left-column']/div[2]/div/button"
_POPUP_CLOSE_XPATH = "/html/body/div[11]/div[2]/div[2]/div[1]/div[1]/button"
_JOB_LISTING_XPATH = "//ul[@aria-label='Jobs List']/li[@data-test='jobListing']"
_TITLE_XPATH = "//h1[@class='heading_Heading__BqX5J heading_Level1__soLZs']"
_LOCATION_XPATH = "//div[@class='JobDetails_location__mSg5h']"
_DESCRIPTION_XPATH = "//*[@id='app-navigation']/div[4]/div[2]/div[2]/div/div[1]/section/div[2]/div[1]"
_COMPANY_NAME_XPATH = "//h4[@class='heading_Heading__BqX5J heading_Subhead__Ip1aW']"
_INDUSTRY_XPATH = "//div[@class='JobDetails_companyOverviewGrid__3t6b4']/div[5]/div"
_COMPANY_TYPE_XPATH = "//div[@class='JobDetails_companyOverviewGrid__3t6b4']/div[3]/div"
_COMPANY_SIZE_XPATH = "//div[@class='JobDetails_companyOverviewGrid__3t6b4']/div[1]/div"
_SALARY_XPATH = "//div[@class='SalaryEstimate_salaryRange__brHFy']"


def _text_or_missing(driver, xpath):
    '''Return the text of the first element matching `xpath`, or "missing" if it cannot be read.'''
    try:
        return driver.find_element(By.XPATH, xpath).text
    except WebDriverException:
        # Covers NoSuchElementException and the other Selenium lookup errors,
        # while still letting KeyboardInterrupt and programming errors through.
        return _MISSING


def _dismiss_login_popup(driver, sleep_time, retry_delay, max_retries):
    '''Trigger the login popup by clicking "Show more jobs" once, then try to close it.'''
    show_more = driver.find_elements(By.XPATH, _SHOW_MORE_XPATH)
    if not show_more:
        return

    show_more[0].click()
    time.sleep(sleep_time)

    # Bounded retry: the close button may need a moment to render, but we must
    # not spin forever if the popup never appears or its markup has changed.
    for _ in range(max_retries):
        try:
            driver.find_element(By.XPATH, _POPUP_CLOSE_XPATH).click()
            return
        except WebDriverException:
            time.sleep(retry_delay)
    print("Could not close the login popup after {} attempts; continuing anyway.".format(max_retries))


def _load_job_listings(driver, num_jobs, sleep_time):
    '''Click "Show more jobs" until `num_jobs` listings are loaded or the button is gone.'''
    while True:
        listings = driver.find_elements(By.XPATH, _JOB_LISTING_XPATH)
        if len(listings) >= num_jobs:
            # Enough listings are already on the page; loading more is wasted time.
            return listings
        show_more = driver.find_elements(By.XPATH, _SHOW_MORE_XPATH)
        if not show_more:
            return listings
        show_more[0].click()
        time.sleep(sleep_time)


def _read_required_fields(driver, retry_delay, max_retries):
    '''Return (job title, location, job description) of the currently opened listing.

    The three fields are read together and the read is retried up to
    `max_retries` times. If they still cannot all be read, each field is read
    on its own and any field that fails is reported as "missing".
    '''
    xpaths = (_TITLE_XPATH, _LOCATION_XPATH, _DESCRIPTION_XPATH)
    for _ in range(max_retries):
        try:
            return tuple([driver.find_element(By.XPATH, xpath).text for xpath in xpaths])
        except WebDriverException:
            time.sleep(retry_delay)
    print("Could not read title/location/description after {} attempts; "
          "recording unavailable fields as '{}'.".format(max_retries, _MISSING))
    return tuple([_text_or_missing(driver, xpath) for xpath in xpaths])


def get_jobs(keyword, num_jobs, verbose,
             chromedriver_path=_DEFAULT_CHROMEDRIVER_PATH,
             sleep_time=4, retry_delay=2, max_retries=10):
    '''Gathers jobs as a dataframe, scraped from Glassdoor.

    Parameters
    ----------
    keyword : str
        Search term; spaces are replaced by '-' to build the search URL.
    num_jobs : int
        Maximum number of listings to scrape. If fewer are found, a message is
        printed and all listings found are scraped.
    verbose : bool
        When true, print every collected field of each job (the description is
        truncated to 100 characters).
    chromedriver_path : str, optional
        Path of the chromedriver executable passed to Selenium's `Service`.
    sleep_time : int or float, optional
        Seconds to wait after each page load or click.
    retry_delay : int or float, optional
        Seconds to wait between retries of a failed element lookup.
    max_retries : int, optional
        Maximum attempts for closing the login popup and for reading the
        title/location/description of a listing.

    Returns
    -------
    pandas.DataFrame
        One row per scraped job with the columns "Job Title", "Company Name",
        "Location", "Work Model", "Employment Type", "Industry",
        "Company Type", "Company Size", "Job Description" and "Salary".
        Fields that could not be read hold "missing"; "Work Model" and
        "Employment Type" are not scraped and always hold "missing".

    Raises
    ------
    Any exception raised while driving the browser (for example when clicking
    a listing fails) propagates to the caller after the browser is closed.
    '''

    # Initializing the webdriver.
    options = webdriver.ChromeOptions()
    # options.add_argument('headless')  # Uncomment for browserless scraping.
    service = Service(executable_path=chromedriver_path)
    driver = webdriver.Chrome(service=service, options=options)

    try:
        driver.maximize_window()

        keyword = keyword.replace(" ", "-")  # Glassdoor requires '-' between the words in the url.
        # NOTE(review): "KO0,4" looks like a character range that happens to fit
        # the 4-letter keyword "data" -- confirm it is right for other keywords.
        url = "https://www.glassdoor.com.au/Job/" + keyword + "-jobs-SRCH_KO0,4.htm"
        driver.get(url)
        time.sleep(sleep_time)  # Give the results page time to load.

        _dismiss_login_popup(driver, sleep_time, retry_delay, max_retries)
        time.sleep(sleep_time)

        # "job_buttons" contains the job listing elements currently on the page.
        job_buttons = _load_job_listings(driver, num_jobs, sleep_time)

        # If fewer jobs are found than requested, tell the user. Otherwise keep the first "num_jobs".
        if len(job_buttons) < num_jobs:
            print("The requested number of jobs is {0} but the search found {1} jobs. The scraper will therefore return only {1} jobs.".format(num_jobs, len(job_buttons)))
        else:
            job_buttons = job_buttons[:num_jobs]

        jobs = []  # One dictionary per scraped listing.
        for job_button in job_buttons:
            # Clicking a listing opens its detail pane, which holds the fields read below.
            job_button.click()
            time.sleep(sleep_time)

            job_title, location, job_description = _read_required_fields(
                driver, retry_delay, max_retries)

            job = {"Job Title": job_title,
                   "Company Name": _text_or_missing(driver, _COMPANY_NAME_XPATH),
                   "Location": location,
                   "Work Model": _MISSING,
                   "Employment Type": _MISSING,
                   "Industry": _text_or_missing(driver, _INDUSTRY_XPATH),
                   "Company Type": _text_or_missing(driver, _COMPANY_TYPE_XPATH),
                   "Company Size": _text_or_missing(driver, _COMPANY_SIZE_XPATH),
                   "Job Description": job_description,
                   "Salary": _text_or_missing(driver, _SALARY_XPATH)}

            # Printing for debugging.
            if verbose:
                for field, value in job.items():
                    shown = value[:100] if field == "Job Description" else value
                    print("{}: {}".format(field, shown))

            jobs.append(job)

            # Print progress.
            print("Progress: {}/{}".format(len(jobs), len(job_buttons)))

        return pd.DataFrame(jobs)
    finally:
        # Always shut the browser down, even when scraping fails part-way,
        # so repeated runs do not leave Chrome/chromedriver processes behind.
        driver.quit()

In [None]:
# Run the scraper: search for "data", collect up to 2000 listings, and skip
# the per-job debug printing. The last line displays the resulting dataframe.
df = get_jobs(keyword="data", num_jobs=2000, verbose=False)
df

In [4]:
# df.to_csv("data_glassdoor.csv")