In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd  
import time 



In [2]:
# Placeholder stored for any field that a listing does not provide.
_MISSING = "missing"
# Default chromedriver binary; pass `driver_path` to get_jobs to use a different one.
_DEFAULT_CHROMEDRIVER_PATH = '/Users/tonyha/Documents/Projects/salary_prediction/chromedriver'
# XPaths that are looked up in more than one place.
_SEE_MORE_XPATH = "//button[@aria-label='See more jobs']"
_LOCATION_XPATH = "//span[@class='topcard__flavor topcard__flavor--bullet']"
_COMPANY_LINK_XPATH = "//a[@class='topcard__org-name-link topcard__flavor--black-link']"
_ABOUT_US_XPATH = "//div[@class='core-section-container__content break-words']/dl/div[@data-test-id='about-us__{}']/dd"


def _text_or_missing(driver, xpath):
    '''Return the text of the first element matching `xpath`, or "missing" if no element matches.'''
    try:
        return driver.find_element(By.XPATH, xpath).text
    except NoSuchElementException:
        return _MISSING


def _scroll_to_page_end(driver):
    '''Keep scrolling to the bottom of the page until its height stops growing.'''
    last_height = driver.execute_script("return document.body.scrollHeight")
    body = driver.find_element(By.XPATH, "/html/body")
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
        time.sleep(2)
        body.send_keys(Keys.ARROW_UP) # Nudging the page up helps it load the next batch.
        time.sleep(4)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height: # Height unchanged: nothing more was loaded.
            break
        last_height = new_height


def _click_see_more_jobs(driver, fail_threshold=10):
    '''Click "See more jobs" until the button disappears or `fail_threshold` consecutive clicks load nothing.'''
    fail_count = 0
    more_jobs = len(driver.find_elements(By.XPATH, _SEE_MORE_XPATH)) > 0
    last_height = driver.execute_script("return document.body.scrollHeight")
    while more_jobs and fail_count < fail_threshold:
        driver.find_element(By.XPATH, _SEE_MORE_XPATH).click()
        time.sleep(4)
        new_height = driver.execute_script("return document.body.scrollHeight")
        # The page height tells us whether the click loaded anything.
        if new_height == last_height:
            fail_count += 1
        else:
            fail_count = 0
            last_height = new_height
            # Only re-check after a successful load; after a failed one the button is still there.
            more_jobs = len(driver.find_elements(By.XPATH, _SEE_MORE_XPATH)) > 0


def _job_details_loaded(driver):
    '''Return True when the job details pane shows a non-empty location text.'''
    matches = driver.find_elements(By.XPATH, _LOCATION_XPATH)
    return bool(matches) and len(matches[0].text) > 0


def _open_job(driver, job_buttons, index, max_retries=5):
    '''Click listing `index` and return True once its details are loaded, False if they never load.

    Sometimes clicking a job does not load it. The workaround is to click a neighbouring
    listing and then the wanted one again: the next listing when `index` is 0, the previous
    listing otherwise. At most `max_retries` such attempts are made.
    '''
    job_buttons[index].click()
    time.sleep(4)
    if _job_details_loaded(driver):
        return True
    if len(job_buttons) < 2: # No neighbouring listing to click.
        return False
    neighbour = index + 1 if index == 0 else index - 1
    for _ in range(max_retries):
        job_buttons[neighbour].click()
        time.sleep(2)
        job_buttons[index].click()
        time.sleep(4)
        if _job_details_loaded(driver):
            return True
    return False


def _scrape_company_page(driver, wait):
    '''Open the company link in a new tab and return (industry, company_type, company_size).

    The tab is always closed and focus returned to the original window, even on error.
    '''
    original_window = driver.current_window_handle
    handles_before = set(driver.window_handles)
    driver.find_element(By.XPATH, _COMPANY_LINK_XPATH).click()
    wait.until(EC.number_of_windows_to_be(len(handles_before) + 1)) # Wait for the new tab.
    # Pick the handle that was not there before instead of relying on the order of the handles.
    new_window = [handle for handle in driver.window_handles if handle not in handles_before][0]
    driver.switch_to.window(new_window)
    try:
        time.sleep(4)
        try: # Close the sign-in popup if it appears.
            driver.find_element(By.XPATH, "//icon[@class='contextual-sign-in-modal__modal-dismiss-icon lazy-loaded']").click()
            time.sleep(4)
        except Exception:
            # Best effort: the popup is often absent or not clickable. `Exception` (rather than
            # a bare except) still lets KeyboardInterrupt stop a long scrape.
            pass
        industry = _text_or_missing(driver, _ABOUT_US_XPATH.format("industry"))
        company_type = _text_or_missing(driver, _ABOUT_US_XPATH.format("organizationType"))
        company_size = _text_or_missing(driver, _ABOUT_US_XPATH.format("size"))
    finally:
        driver.close() # Close the company tab.
        driver.switch_to.window(original_window) # Switch back to the main window.
        time.sleep(4)
    return industry, company_type, company_size


def get_jobs(keyword, num_jobs, verbose, search_location="Australia", driver_path=_DEFAULT_CHROMEDRIVER_PATH, headless=False):

    '''Gathers jobs as a dataframe, scraped from LinkedIn.

    Parameters:
        keyword: text typed into the job search box.
        num_jobs: maximum number of listings to scrape; fewer are returned if the search finds fewer.
        verbose: if truthy, print the scraped fields of every job.
        search_location: text typed into the location search box.
        driver_path: path to the chromedriver executable.
        headless: if truthy, run Chrome without a visible window.

    Returns:
        A pandas DataFrame with one row per scraped job and the columns "Job Title",
        "Company Name", "Location", "Work Model", "Employment Type", "Industry",
        "Company Type", "Company Size", "Job Description" and "Salary". Fields that are
        not found are filled with "missing". Listings whose details never load are skipped.

    Raises:
        Selenium exceptions propagate when a required element (navigation, search boxes,
        job title, location, description) cannot be found. The browser is closed in all cases.
    '''

    # Initializing the webdriver.
    options = webdriver.ChromeOptions()
    if headless:
        options.add_argument('headless')
    service = Service(executable_path=driver_path)
    driver = webdriver.Chrome(service=service, options=options)

    try:
        driver.maximize_window()

        # We start at the homepage since the site may force us to go there anyways.
        url = "https://www.linkedin.com/?trk=guest_homepage-basic_nav-header-logo"
        driver.get(url)
        wait = WebDriverWait(driver, 10)
        jobs = [] # We store our job listings here. It will be a list of dictionaries.

        time.sleep(4) # The waiting time (in seconds) between requests. Ensure it is high enough to load pages.

        # Click on "Jobs" button (the fourth navigation icon).
        driver.find_elements(By.XPATH, "//icon[@class='top-nav-link__icon flex h-3 w-3 flex-shrink-0 justify-center lazy-loaded']")[3].click()
        time.sleep(4)

        # Type the keyword into the job search box.
        job_search_box = driver.find_element(By.XPATH, "//input[@aria-label='Search job titles or companies']")
        job_search_box.send_keys(keyword)
        time.sleep(3)

        # Clear the location search box, type the search location, then hit enter.
        location_search_box = driver.find_element(By.XPATH, "//input[@aria-label='Location']")
        location_search_box.clear()
        time.sleep(3)
        location_search_box.send_keys(search_location)
        time.sleep(3)
        location_search_box.send_keys(Keys.ENTER)
        time.sleep(4)

        # Load as many listings as the page will give us.
        _scroll_to_page_end(driver)
        _click_see_more_jobs(driver)
        time.sleep(4)

        # 'job_buttons' contains all the jobs on the page.
        job_buttons = driver.find_elements(By.XPATH, "//ul[@class='jobs-search__results-list']/*")

        # If fewer jobs are found than what is requested, tell the user. Otherwise take the first "num_jobs" from the page.
        if len(job_buttons) < num_jobs:
            print("The requested number of jobs is {0} but the search found {1} jobs. The scraper will therefore return only {1} jobs.".format(num_jobs, len(job_buttons)))
        else:
            job_buttons = job_buttons[:num_jobs]

        for job_counter in range(len(job_buttons)): # Going through each job on the page.

            # Click the job listing to open its contents; skip it if the contents never load.
            if not _open_job(driver, job_buttons, job_counter):
                print("Skipping job {}: its details did not load.".format(job_counter + 1))
                continue

            # Collect information on the job title, location, and description. These should always be available under each job.
            job_title = driver.find_element(By.XPATH, "//h2[@class='top-card-layout__title font-sans text-lg papabear:text-xl font-bold leading-open text-color-text mb-0 topcard__title']").text
            job_location = driver.find_element(By.XPATH, _LOCATION_XPATH).text
            job_description = driver.find_element(By.XPATH, "//div[@class='description__text description__text--rich']/section/div").text

            # Collect information on the company name, employment type, and job salary. If they are missing from the job listing we assign "missing".
            company_name = _text_or_missing(driver, _COMPANY_LINK_XPATH)
            employment_type = _text_or_missing(driver, "//ul[@class='description__job-criteria-list']/li[2]/span[@class='description__job-criteria-text description__job-criteria-text--criteria']")
            salary = _text_or_missing(driver, "//div[@class='salary compensation__salary']")

            # Industry, company type, and company size live on the company page. Default them
            # for every job so a listing without a company link neither fails nor inherits the
            # previous job's values.
            industry = company_type = company_size = _MISSING
            if company_name != _MISSING: # The link is only available if there is a company name.
                industry, company_type, company_size = _scrape_company_page(driver, wait)

            # Printing for debugging.
            if verbose:
                print("Job Title: {}".format(job_title))
                print("Company Name: {}".format(company_name))
                print("Location: {}".format(job_location))
                print("Work Model: missing")
                print("Employment Type: {}".format(employment_type))
                print("Industry: {}".format(industry))
                print("Company Type: {}".format(company_type))
                print("Company Size: {}".format(company_size))
                print("Job Description: {}".format(job_description[:100]))
                print("Salary: {}".format(salary))

            # Add the job to 'jobs'.
            jobs.append({"Job Title": job_title,
            "Company Name": company_name,
            "Location": job_location,
            "Work Model": "missing",
            "Employment Type": employment_type,
            "Industry": industry,
            "Company Type": company_type,
            "Company Size": company_size,
            "Job Description" : job_description,
            "Salary": salary})

            # Print progress (listings processed so far out of the total).
            print("Progress: {}/{}".format(job_counter + 1, len(job_buttons)))

        return pd.DataFrame(jobs)
    finally:
        # Always shut the browser down, otherwise every call leaks a Chrome/chromedriver process.
        driver.quit()


In [None]:
# Run the scraper: request up to 2000 listings for the keyword "data", with per-job debug printing off.
df = get_jobs(keyword="data", num_jobs=2000, verbose=False)
df

In [4]:
# df.to_csv("data_linkedin.csv")