In [1]:
import time
from urllib.parse import quote_plus

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

In [2]:
def get_jobs(keyword, num_jobs, verbose,
             driver_path='/Users/tonyha/Documents/Projects/salary_prediction/chromedriver',
             wait_seconds=4):
    '''Gathers jobs as a dataframe, scraped from Jora.

    Parameters
    ----------
    keyword : str
        Search phrase. It is URL-encoded before being placed in the query string.
    num_jobs : int
        Target number of job listings to collect. Fewer are returned if the
        last results page is reached first.
    verbose : bool
        If True, print the collected fields of every job (for debugging).
    driver_path : str, optional
        Path to the chromedriver executable. Defaults to the path this
        notebook was originally written against.
    wait_seconds : int or float, optional
        Pause (in seconds) after each page load / click so the page can
        finish loading. Ensure it is high enough to load pages.

    Returns
    -------
    pandas.DataFrame
        One row per job with the columns "Job Title", "Company Name",
        "Location", "Work Model", "Employment Type", "Industry",
        "Company Type", "Company Size", "Job Description" and "Salary".
        "Work Model", "Industry", "Company Type" and "Company Size" are
        always "missing"; the other optional fields are "missing" when
        they are not found on the page.

    Notes
    -----
    Selenium errors raised while starting the browser, clicking a job card or
    clicking the next-page button are not caught here and propagate to the
    caller. The browser is closed before this function returns or raises.
    '''

    # Initializing the webdriver.
    options = webdriver.ChromeOptions()
    # options.add_argument('headless') # Uncomment this line for browserless scraping.
    service = Service(executable_path=driver_path)
    driver = webdriver.Chrome(service=service, options=options)

    jobs = []  # We store our jobs here.

    # XPath matching every job card (organic and sponsored) on a results page.
    job_card_xpath = "//div[contains(@class, 'job-card result organic-job') or contains(@class, 'job-card result sponsored-job premium-job spon-top') or contains(@class, 'job-card result sponsored-job premium-job spon-bottom')]"
    # Both the employment type and the salary are rendered as badges; they are told apart by the '$' sign.
    badge_xpath = "//div[@id='job-info-container']/div[@class='badge -default-badge']/div[@class='content']"

    try:
        driver.maximize_window()

        # quote_plus turns spaces into '+' (as Jora expects) and also escapes characters such as
        # '&', '#' and '+' that would otherwise corrupt the query string.
        url = "https://au.jora.com/j?sp=search&trigger_source=serp&q=" + quote_plus(keyword) + "&l="
        driver.get(url)

        time.sleep(wait_seconds)  # Give the first results page time to load.

        while len(jobs) < num_jobs:  # If true, should be still looking for new jobs.

            # Close the job alert popup in case it appears upon entering a new page. This is best-effort:
            # any Selenium error (popup absent, not clickable, ...) is ignored, but KeyboardInterrupt
            # and other non-Selenium errors are no longer swallowed.
            try:
                driver.find_element(By.XPATH, "//div[@class='dismiss']").click()
                time.sleep(wait_seconds)
            except WebDriverException:
                pass

            # 'job_buttons' contains all the jobs on the page.
            job_buttons = driver.find_elements(By.XPATH, job_card_xpath)

            for job_button in job_buttons:  # Going through each job on the page.

                if len(jobs) >= num_jobs:  # If we have collected enough jobs mid-way through the page then stop.
                    break

                # Click the job listing to open its javascript components. Allows us to obtain valuable info
                # under this job. Note that if we are not careful where we click, then we might inadvertently
                # click on a link. This will result in an error (probably 'StaleElementReferenceException'
                # since the next job in 'job_buttons' will not appear in the new link). We avoid this by
                # clicking on a spot that we are certain will contain no links: the top container of the card.
                # Note the '.' in the path. This is necessary since we want to start searching from the
                # element in job_button. Without it, it will start searching from root, which is not what we want.
                job_button.find_element(By.XPATH, "./div[@class='top-container']").click()

                time.sleep(wait_seconds)

                # Collect information on the job title, location, and description. These should always be
                # available under each job, so we retry a few times before giving up on the listing.
                required_fields = _collect_required_fields(driver)
                if required_fields is None:
                    print("Skipping a job listing: title, location or description did not load.")
                    continue
                job_title, location, job_description = required_fields

                # Collect other useful information. If they are missing from the job listing we assign "missing".
                company_name = _text_or_missing(driver, "//span[@class='company']")
                employment_type = _text_or_missing(driver, badge_xpath + "[not(contains(text(), '$'))]")
                salary = _text_or_missing(driver, badge_xpath + "[contains(text(), '$')]")

                # Printing for debugging.
                if verbose:
                    print("Job Title: {}".format(job_title))
                    print("Company Name: {}".format(company_name))
                    print("Location: {}".format(location))
                    print("Work Model: missing")
                    print("Employment Type: {}".format(employment_type))
                    print("Industry: missing")
                    print("Company Type: missing")
                    print("Company Size: missing")
                    print("Job Description: {}".format(job_description[:100]))
                    print("Salary: {}".format(salary))

                # Add the job to 'jobs'.
                jobs.append({"Job Title": job_title,
                             "Company Name": company_name,
                             "Location": location,
                             "Work Model": "missing",
                             "Employment Type": employment_type,
                             "Industry": "missing",
                             "Company Type": "missing",
                             "Company Size": "missing",
                             "Job Description": job_description,
                             "Salary": salary})

                # Print progress.
                print("Progress: {}/{}".format(len(jobs), num_jobs))

            # If more jobs need to be collected then go to the next page.
            if len(jobs) < num_jobs:
                try:
                    driver.find_element(By.XPATH, "//a[@class='next-page-button']").click()
                    time.sleep(wait_seconds)
                except NoSuchElementException:  # This will happen when the current page is the last one.
                    print("Scraping terminated before reaching target number of jobs. Needed {}, got {}.".format(num_jobs, len(jobs)))
                    break
    finally:
        # Always shut the browser and the chromedriver process down, even if scraping failed part-way.
        driver.quit()

    return pd.DataFrame(jobs)


def _text_or_missing(driver, xpath):
    '''Return the text of the first element matching `xpath`, or "missing" if no element matches.'''
    try:
        return driver.find_element(By.XPATH, xpath).text
    except NoSuchElementException:
        return "missing"


def _collect_required_fields(driver, max_attempts=5, retry_delay=2):
    '''Return (job_title, location, job_description) of the opened listing, or None if they never load.

    Each failed attempt is followed by a pause of `retry_delay` seconds. The number of attempts is
    bounded so that a listing whose details never render cannot stall the scrape indefinitely.
    '''
    for _ in range(max_attempts):
        try:
            job_title = driver.find_element(By.XPATH, "//h3[@class='job-title heading -size-xxlarge -weight-700']").text
            location = driver.find_element(By.XPATH, "//span[@class='location']").text
            job_description = driver.find_element(By.XPATH, "//div[@class='job-description-container']").text
            return job_title, location, job_description
        except WebDriverException:
            # Covers an element that is not there yet as well as one that went stale while re-rendering.
            time.sleep(retry_delay)
    return None

In [None]:
# Run the scraper for the search term "data", aiming for 2000 listings with
# per-job debug printing turned off, then display the resulting dataframe.
df = get_jobs(keyword="data", num_jobs=2000, verbose=False)
df

In [4]:
# df.to_csv("data_jora.csv") 