In [1]:
from selenium.common.exceptions import NoSuchElementException, ElementClickInterceptedException
from selenium import webdriver
import time
import pandas as pd

In [2]:
def _read_core_fields(driver, max_attempts=3, retry_delay=5):
    '''Read the fields every listing is expected to expose in the open detail pane.

    The pane is rendered asynchronously, so a failed read is retried after
    ``retry_delay`` seconds, up to ``max_attempts`` times in total.

    Returns a dict keyed by the output column names, or None when the pane
    still could not be read on the last attempt.
    '''
    for attempt in range(1, max_attempts + 1):
        try:
            return {
                "Company Name": driver.find_element_by_xpath('.//div[contains(@class, "css-xuk5ye")]').text,
                "Job Title": driver.find_element_by_xpath('.//div[contains(@class, "css-1j389vi e1tk4kwz2")]').text,
                "Location": driver.find_element_by_xpath('.//div[contains(@class, "css-56kyx5 e1tk4kwz1")]').text,
                "Job Description": driver.find_element_by_xpath('.//div[@class="jobDescriptionContent desc"]').text,
            }
        except Exception:
            # Any failure here is treated as "pane not ready yet"; the retry is
            # bounded so a permanently broken listing cannot hang the scrape.
            if attempt < max_attempts:
                time.sleep(retry_delay)
    return None


def _read_company_info(driver):
    '''Open the company overview tab and return its label/value pairs as a dict.

    The six expected keys default to -1 so every row has the same columns even
    when the tab, or an individual entry, is missing.
    '''
    company_info = {
        'Size': -1,
        'Type': -1,
        'Sector': -1,
        'Founded': -1,
        'Industry': -1,
        'Revenue': -1,
    }
    try:
        driver.find_element_by_xpath('.//div[@data-item="tab" and @data-tab-type="overview"]').click()
        container = driver.find_element_by_xpath('.//div[@id="EmpBasicInfo"]/div/div')
    except NoSuchElementException:
        # Rarely, some job postings do not have the "Company" tab.
        return company_info

    spans = container.find_elements_by_xpath('div/span')
    # Spans alternate label, value, label, value...; zip() drops a trailing
    # unpaired label instead of indexing past the end of the list.
    for label, value in zip(spans[::2], spans[1::2]):
        company_info[label.text] = value.text
    return company_info


def get_jobs(num_jobs, verbose, path):
    '''Gathers jobs as a dataframe, scraped from Glassdoor.

    reference: https://towardsdatascience.com/selenium-tutorial-scraping-glassdoor-com-in-10-minutes-3d0915c6d905
    Glassdoor's markup has changed since that tutorial, so the selectors below
    differ from the original code.

    Parameters
    ----------
    num_jobs : int
        Maximum number of listings to collect.
    verbose : bool
        When true, print a progress line for each collected or skipped listing.
    path : str
        Filesystem path of the chromedriver executable.

    Returns
    -------
    pandas.DataFrame
        One row per listing. Fields that could not be found are set to -1.

    Notes
    -----
    Uses the Selenium 3 style API (``executable_path=``, ``find_element_by_*``)
    that the rest of this notebook is written against.
    '''

    #Initializing the webdriver
    options = webdriver.ChromeOptions()

    #Uncomment the line below if you'd like to scrape without a new Chrome window every time.
    #options.add_argument('headless')

    #Change the path to where chromedriver is in your home folder.
    driver = webdriver.Chrome(executable_path=path, options=options)
    jobs = []

    try:
        driver.set_window_size(1000, 1000)

        url = 'https://www.glassdoor.com/Job/data-scientist-jobs-SRCH_KE0,14.htm?radius=100'
        driver.get(url)  #Without this the browser stays on a blank page.

        while len(jobs) < num_jobs:  #If true, should be still looking for new jobs.
            #Let the page load. Change this number based on your internet speed.
            #Or, wait until the webpage is loaded, instead of hardcoding it.
            time.sleep(4)

            #Test for the "Sign Up" prompt and get rid of it.
            try:
                driver.find_element_by_class_name("css-bkasv9").click()
            except NoSuchElementException:
                pass  #The element is not on this page; nothing to click.
            except ElementClickInterceptedException:
                print('x not work')

            time.sleep(1)

            try:
                driver.find_element_by_css_selector('[alt="Close"]').click()  #clicking to the X.
            except (NoSuchElementException, ElementClickInterceptedException):
                pass

            #Going through each job in this page
            #These are the buttons we're going to click.
            job_buttons = driver.find_elements_by_class_name("react-job-listing")
            if not job_buttons:
                #Nothing to scrape on this page; stop instead of spinning forever.
                break

            for job_button in job_buttons:
                if len(jobs) >= num_jobs:
                    break

                job_button.click()  #Opens the detail pane for this listing.
                time.sleep(5)

                #A modal may pop up after the click; close it if it is there.
                try:
                    driver.find_element_by_xpath('.//span[contains(@class, "modal_closeIcon")]').click()
                    time.sleep(1)
                except (NoSuchElementException, ElementClickInterceptedException):
                    pass

                core = _read_core_fields(driver)
                if core is None:
                    if verbose:
                        print('Skipping a listing whose details could not be read')
                    continue

                try:
                    salary_estimate = driver.find_element_by_xpath('.//div[@class="css-dufhjo e1tk4kwz0"]//span[@data-test="detailSalary"]').text
                except NoSuchElementException:
                    salary_estimate = -1  #"Not found" marker.

                try:
                    driver.find_element_by_xpath('.//div[@data-item="tab" and @data-tab-type="rating"]').click()
                    rating = driver.find_element_by_xpath('.//div[@data-test="rating-info"]/div').text
                except NoSuchElementException:
                    rating = -1  #"Not found" marker.

                #Going to the Company tab...
                company_info = _read_company_info(driver)

                #Key order fixes the column order of the resulting dataframe.
                job = {
                    "Job Title": core["Job Title"],
                    "Salary Estimate": salary_estimate,
                    "Job Description": core["Job Description"],
                    "Rating": rating,
                    "Company Name": core["Company Name"],
                    "Location": core["Location"],
                }
                job.update(company_info)
                jobs.append(job)

                if verbose:
                    print('Progress: {}/{}'.format(len(jobs), num_jobs))

            if len(jobs) >= num_jobs:
                break

            #Move to the next page only after every listing on this page was
            #visited; no "next" button means there are no more pages.
            try:
                driver.find_element_by_xpath('.//button[contains(@class, "nextButton")]').click()
            except NoSuchElementException:
                break
    finally:
        #Always release the browser, even when scraping fails part-way.
        driver.quit()

    return pd.DataFrame(jobs)


In [3]:
# Path to the chromedriver executable; download chromedriver and fill this in before running.
chromedriver_path = ""
# Request up to 1000 listings with the verbose flag off.
df = get_jobs(num_jobs=1000, verbose=False, path=chromedriver_path)

In [4]:
# get_jobs already returns a pandas DataFrame, so this re-wrap is redundant;
# it is kept so `df` is guaranteed to be a DataFrame for the cells below.
df = pd.DataFrame(df)

In [317]:
# Persist the scraped listings to 'glassdoor_job.csv' (relative path, so it is
# resolved against the kernel's working directory). `index` is not passed, so
# pandas' default applies and the row index is written as the first column.
df.to_csv('glassdoor_job.csv')