In [1]:
import time
from urllib.parse import quote_plus

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

In [2]:
# Placeholder stored for any field a listing does not provide.
_MISSING = "missing"

# Default chromedriver location; pass `driver_path` to get_jobs to use another one.
_DEFAULT_DRIVER_PATH = '/Users/tonyha/Documents/Projects/salary_prediction/chromedriver'

# Upper bound on the 2-second retries used while waiting for page elements, so a
# changed page layout cannot hang the notebook forever.
_MAX_RETRIES = 10


def _text_or_missing(driver, xpath, best_effort=False):
    '''Return the text of the first element matching `xpath`, or "missing" if there is none.

    With best_effort=True any other lookup error is also mapped to "missing";
    otherwise such errors propagate to the caller.
    '''
    try:
        return driver.find_element(By.XPATH, xpath).text
    except NoSuchElementException:
        return _MISSING
    except Exception:
        if best_effort:
            return _MISSING
        raise


def _dismiss_job_alert_popup(driver):
    '''Visit the second results page, close the job alert popup shown there, then go back.'''
    next_xpath = "//a[@data-testid='pagination-page-next']"
    if len(driver.find_elements(By.XPATH, next_xpath)) == 0:
        return # Single page of results: there is no second page to trigger the popup.

    driver.find_element(By.XPATH, next_xpath).click()
    time.sleep(4)

    for _ in range(_MAX_RETRIES):
        try:
            driver.find_element(By.XPATH, "//*[@id='mosaic-desktopserpjapopup']/div[1]/button").click()
            break
        except Exception: # Not a bare except, so Ctrl-C still interrupts the wait.
            time.sleep(2)
    else:
        print("Job alert popup was not found after {} attempts; continuing without closing it.".format(_MAX_RETRIES))

    driver.find_element(By.XPATH, "//a[@data-testid='pagination-page-prev']").click()
    time.sleep(4) # Let the first page load again before job cards are collected from it.


def _read_required_fields(driver):
    '''Return (job_title, location, job_description) for the open listing, or None if they never load.'''
    for _ in range(_MAX_RETRIES):
        try:
            job_title = driver.find_element(By.XPATH, "//h2[@class='jobsearch-JobInfoHeader-title css-1t78hkx e1tiznh50']").text.split('\n')[0]
            location = driver.find_element(By.XPATH, "//div[@data-testid='inlineHeader-companyLocation']/div").text
            job_description = driver.find_element(By.XPATH, "//div[@id='jobDescriptionText']").text
            return job_title, location, job_description
        except Exception: # The pane may still be rendering; wait and try again.
            time.sleep(2)
    return None


def _scrape_company_page(driver, wait):
    '''Open the company link in its new tab and return (industry, company_size), then close the tab.'''
    original_window = driver.current_window_handle # Stored so we can switch back after scraping the link.
    assert len(driver.window_handles) == 1 # Before clicking the link, the current window must be the only one.
    driver.find_element(By.XPATH, "//div[@data-testid='inlineHeader-companyName']").click()
    wait.until(EC.number_of_windows_to_be(2)) # Wait for the new window to open.

    # Pick the handle that is not ours instead of relying on the ordering of window_handles.
    new_window = next(handle for handle in driver.window_handles if handle != original_window)
    driver.switch_to.window(new_window)
    try:
        time.sleep(5)
        industry = _text_or_missing(driver, "//li[@data-testid='companyInfo-industry']/div[2]", best_effort=True)
        company_size = _text_or_missing(driver, "//li[@data-testid='companyInfo-employee']/div[2]", best_effort=True)
    finally:
        # Always close the extra tab and return to the results page, even on error,
        # so the single-window assertion holds for the next listing.
        driver.close()
        driver.switch_to.window(original_window)
    time.sleep(5)
    return industry, company_size


def get_jobs(keyword, num_jobs, verbose, driver_path=_DEFAULT_DRIVER_PATH):
    '''Gathers jobs as a dataframe, scraped from Indeed.

    Parameters:
        keyword: search phrase; it is URL-encoded before being placed in the query string.
        num_jobs: number of listings to collect. Zero or a negative number returns an
            empty dataframe without starting a browser.
        verbose: if truthy, print the fields of every collected job.
        driver_path: path to the chromedriver executable.

    Returns:
        A pandas DataFrame with one row per job and the columns "Job Title",
        "Company Name", "Location", "Work Model", "Employment Type", "Industry",
        "Company Type", "Company Size", "Job Description" and "Salary". Fields that
        are not found are stored as "missing". Fewer than num_jobs rows are returned
        when the last results page is reached first.

    The browser is always shut down before the function returns or raises.
    '''
    jobs = [] # We store our jobs here. It will be a list of dictionaries.
    if num_jobs <= 0:
        return pd.DataFrame(jobs)

    # Initializing the webdriver.
    options = webdriver.ChromeOptions()
    # options.add_argument('headless') # Uncomment the left code for browserless scraping.
    service = Service(executable_path=driver_path)
    driver = webdriver.Chrome(service=service, options=options)

    try:
        driver.maximize_window()

        # quote_plus turns spaces into '+' (as Indeed expects) and escapes characters
        # such as '&' or '#' that would otherwise break the query string.
        url = "https://au.indeed.com/jobs?q=" + quote_plus(keyword) + "&l=&from=searchOnHP&vjk=514cd2942c99e51d"
        driver.get(url)
        wait = WebDriverWait(driver, 10)

        time.sleep(4) # The waiting time (in seconds) between requests. Ensure it is high enough to load pages.

        # Before we scrape, we close the job alert popup that appears on the second page.
        _dismiss_job_alert_popup(driver)

        while len(jobs) < num_jobs: # If true, we should still be looking for new jobs.

            # 'job_buttons' contains all the jobs on the current page.
            job_buttons = driver.find_elements(By.XPATH, "//li[@class='css-5lfssm eu4oa1w0'][not(.//div[@class='mosaic mosaic-empty-zone nonJobContent-desktop'])]")

            for job_button in job_buttons: # Going through each job on the current page.

                if len(jobs) >= num_jobs: # Stop if we have collected enough jobs mid-way through the page.
                    break

                job_button.click() # Click the job listing to open its contents.
                time.sleep(5)

                # Title, location and description should always be present; retry while the
                # pane loads, and skip the listing if they never show up.
                required = _read_required_fields(driver)
                if required is None:
                    print("Skipping a listing whose title, location or description could not be read.")
                    continue
                job_title, location, job_description = required

                # Collect other useful information. If it is missing from the job listing we assign "missing".
                company_name = _text_or_missing(driver, "//span[@class='css-1saizt3 e1wnkr790']/a[@aria-label]")
                employment_type = _text_or_missing(driver, "//div[@id='salaryInfoAndJobType']/span[@class='css-k5flys eu4oa1w0']")
                salary = _text_or_missing(driver, "//div[@id='salaryInfoAndJobType']/span[@class='css-19j1a75 eu4oa1w0']")

                # Industry and company size live on the company page. Default them for every
                # job so a listing without a company link neither crashes nor inherits the
                # previous job's values.
                industry = _MISSING
                company_size = _MISSING
                if company_name != _MISSING: # The link is only available if there is a company name.
                    industry, company_size = _scrape_company_page(driver, wait)

                # Printing for debugging
                if verbose:
                    print("Job Title: {}".format(job_title))
                    print("Company Name: {}".format(company_name))
                    print("Location: {}".format(location))
                    print("Work Model: missing")
                    print("Employment Type: {}".format(employment_type))
                    print("Industry: {}".format(industry))
                    print("Company Type: missing")
                    print("Company Size: {}".format(company_size))
                    print("Job Description: {}".format(job_description[:100]))
                    print("Salary: {}".format(salary))

                # Add the job to 'jobs'.
                jobs.append({"Job Title": job_title,
                             "Company Name": company_name,
                             "Location": location,
                             "Work Model": "missing",
                             "Employment Type": employment_type,
                             "Industry": industry,
                             "Company Type": "missing",
                             "Company Size": company_size,
                             "Job Description": job_description,
                             "Salary": salary})

                # Print progress.
                print("Progress: {}/{}".format(len(jobs), num_jobs))

            # If more jobs need to be collected then go to the next page.
            if len(jobs) < num_jobs:
                try:
                    driver.find_element(By.XPATH, "//a[@data-testid='pagination-page-next']").click()
                    time.sleep(5)
                except NoSuchElementException: # This will happen when the current page is the last one.
                    print("Scraping terminated before reaching target number of jobs. Needed {}, got {}.".format(num_jobs, len(jobs)))
                    break
    finally:
        driver.quit() # Shut down Chrome and chromedriver even if scraping failed part-way.

    return pd.DataFrame(jobs)

In [None]:
# Kick off the scrape: ask for 2000 listings matching "data", with per-job debug printing off.
df = get_jobs(keyword="data", num_jobs=2000, verbose=False)
df

In [4]:
# df.to_csv("data_indeed.csv")