In [1]:
import time

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

In [2]:
# Pause lengths in seconds. Raise them if pages need longer to finish rendering.
_PAGE_LOAD_WAIT = 4  # After loading a results page or opening a listing.
_RETRY_WAIT = 2      # Between attempts to read a listing's detail pane.
_SCROLL_WAIT = 1     # After scrolling the results list back up.

# Matches every job card in the results list of the current page.
_JOB_CARDS_XPATH = "//*[@id='app']/div/div[3]/div/section/div[2]/div/div/div[1]/div/div/div[1]/div/div/div[1]/div[3]/div"


def _text_or_missing(driver, xpath):
    '''Returns the text of the first element matching `xpath`, or "missing" if there is none.'''
    try:
        return driver.find_element(By.XPATH, xpath).text
    except NoSuchElementException:
        return "missing"


def _read_required_fields(driver, max_retries):
    '''Reads the job title, location and description from the open detail pane.

    Returns a (job_title, location, job_description) tuple, or None when the
    fields could not be read after `max_retries` attempts (at least one attempt
    is always made).
    '''
    for _ in range(max(1, max_retries)):
        try:
            job_title = driver.find_element(By.XPATH, "//h1[@data-automation='job-detail-title']").text
            location = driver.find_element(By.XPATH, "//span[@data-automation='job-detail-location']").text
            job_description = driver.find_element(By.XPATH, "//div[@data-automation='jobAdDetails']").text
            return job_title, location, job_description
        except WebDriverException:
            # The pane may still be rendering; wait and try again. Only Selenium errors
            # are retried, so interrupting the kernel still stops the scrape.
            time.sleep(_RETRY_WAIT)
    return None


def _scrape_listing(driver, job_button, max_retries):
    '''Opens one job card and returns its fields as a dictionary, or None if it could not be opened or read.'''
    # Click the job listing to open its javascript components.
    try:
        job_button.find_element(By.XPATH, ".//a[@data-testid='job-list-item-link-overlay']").click()
    except WebDriverException:
        # No link inside this card, or the click was blocked / the card went stale.
        return None
    time.sleep(_PAGE_LOAD_WAIT)

    required = _read_required_fields(driver, max_retries)
    if required is None:
        return None
    job_title, location, job_description = required

    # Optional fields fall back to "missing" when the listing does not show them.
    return {"Job Title": job_title,
            "Company Name": _text_or_missing(driver, "//span[@data-automation='advertiser-name']"),
            "Location": location,
            "Work Model": "missing",
            "Employment Type": _text_or_missing(driver, "//span[@data-automation='job-detail-work-type']"),
            "Industry": _text_or_missing(driver, "//span[@data-automation='job-detail-classifications']"),
            "Company Type": "missing",
            "Company Size": "missing",
            "Job Description": job_description,
            "Salary": _text_or_missing(driver, "//span[@data-automation='job-detail-salary']")}


def get_jobs(keyword, num_jobs, verbose,
             driver_path='/Users/tonyha/Documents/Projects/salary_prediction/chromedriver',
             headless=False, max_retries=10):

    '''Gathers jobs as a dataframe, scraped from Seek.

    Parameters:
        keyword: search phrase; spaces are replaced with '-' to build the Seek url.
        num_jobs: maximum number of jobs to collect. Values <= 0 return an empty
            dataframe without starting a browser.
        verbose: if True, print the fields of every collected job.
        driver_path: path to the chromedriver executable.
        headless: if True, run Chrome without a visible window.
        max_retries: attempts made to read a listing's title, location and
            description before that listing is skipped.

    Returns:
        A pandas DataFrame with one row per collected job. It holds fewer than
        `num_jobs` rows when the last results page is reached first.

    The browser is always closed before returning, including when an error is raised.
    '''

    if num_jobs <= 0:
        return pd.DataFrame([])

    # Initializing the webdriver.
    options = webdriver.ChromeOptions()
    if headless:
        options.add_argument('--headless')
    service = Service(executable_path=driver_path)
    driver = webdriver.Chrome(service=service, options=options)

    jobs = []  # We store our jobs. It will be a list of dictionaries.
    try:
        driver.maximize_window()

        keyword = keyword.replace(" ", "-")  # Seek requires '-' between the words in the url.
        driver.get("https://www.seek.com.au/" + keyword + "-jobs")
        time.sleep(_PAGE_LOAD_WAIT)

        while len(jobs) < num_jobs:
            # Look the body up again on every page so the reference cannot go stale after pagination.
            body = driver.find_element(By.XPATH, "/html/body")
            job_buttons = driver.find_elements(By.XPATH, _JOB_CARDS_XPATH)

            for job_button in job_buttons:
                if len(jobs) >= num_jobs:  # Collected enough jobs mid-way through the page.
                    break

                job = _scrape_listing(driver, job_button, max_retries)
                if job is None:
                    print("Skipping a listing that could not be opened or read.")
                else:
                    jobs.append(job)

                    # Printing for debugging.
                    if verbose:
                        for field, value in job.items():
                            if field == "Job Description":
                                value = value[:100]
                            print("{}: {}".format(field, value))

                    # Print progress.
                    print("Progress: {}/{}".format(len(jobs), num_jobs))

                # There is a "Save this search" button which may intercept our clicking of jobs.
                # To prevent this we scroll up before clicking on new jobs.
                body.send_keys(Keys.PAGE_UP)
                time.sleep(_SCROLL_WAIT)

            # If more jobs need to be collected then go to the next page.
            if len(jobs) < num_jobs:
                try:
                    driver.find_element(By.XPATH, "//a[@aria-label='Next']").click()
                    time.sleep(_PAGE_LOAD_WAIT)
                except NoSuchElementException:  # This will happen when the current page is the last one.
                    print("Scraping terminated before reaching target number of jobs. Needed {}, got {}.".format(num_jobs, len(jobs)))
                    break
    finally:
        # Always shut down the browser and the chromedriver process.
        driver.quit()

    return pd.DataFrame(jobs)

In [None]:
# Run the scraper: collect up to 2000 Seek listings for the "data" keyword, with per-job debug printing off.
df = get_jobs(keyword="data", num_jobs=2000, verbose=False)
df

In [4]:
# Uncomment the line below to save the scraped jobs to a CSV file.
# df.to_csv("data_seek.csv")