In [10]:
from selenium.common.exceptions import NoSuchElementException, ElementClickInterceptedException, ElementNotInteractableException, StaleElementReferenceException
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import pandas as pd
import random
import re

In [11]:
driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))
driver.set_window_size(1120, 1000)

In [12]:

def get_jobs(keyword, num_jobs):
    
    word_list = keyword.split(' ')
    url = 'https://www.glassdoor.com/Job/jobs.htm?sc.keyword=' + '%20'.join(word_list)

    driver.get(url)
    jobs = []
    page = 1
    print("Start crawling for {}".format(keyword))
    id = 0
    try:
        page_count = int(driver.find_element(By.XPATH, "//div[@data-test='pagination-footer-text']").text.split(' ')[-1])
        print("Number of page is {}".format(page_count))
    except Exception:
        return jobs

    while len(jobs) < num_jobs:
        print("Page {}\n".format(page))
        print(driver.current_url)
        time.sleep(2)
        job = dict()

        if page > page_count:
            return jobs

        try:
            job_ages = driver.find_elements(By.XPATH, "//div[@data-test='job-age']")
        except Exception:
            job_ages = [None]*30
        # print("Length of job age is {}".format(len(job_ages)))
        
        # find job results on the current page
        for idx, job in enumerate(driver.find_elements(By.XPATH, "//div[@id='JobResults']//article[@id='MainCol']//li[@data-test='jobListing']")):
            # print('-'*60)
            # deal with sign in panel
            try:
                closeBtn = driver.find_element(By.XPATH, "//span[@alt='Close']")
                closeBtn.click()
                print("Has login panel in the current page")
            except Exception:
                # print("No login panel appear")
                pass
            # click the job to get more information
            try:
                job.click()
            except Exception:
                print("Page need refresh")
                driver.refresh()
                break
            
            # filter by job age
            try:
                job_age = job_ages[idx].text.strip()
                # if 'd' in job_age and float(re.findall(r'\d+', job_age)[0]) > float(re.findall(r'\d+', job_age_threshold)[0]):
                #     print("{} > {}".format(job_age, job_age_threshold))
                #     continue
            except Exception:
                print("Can't find job age")
                job_age = None
                continue

            sleep_time = random.randint(1,3)
            time.sleep(sleep_time)
            # get detailed information on the right grid panel
            try: 
                element = driver.find_element(By.XPATH, "//div[@id='JDCol']")
            except Exception as error:
                print("Cannot find detailed information")
                continue

            # employerName (company name)
            try: 
                employer = element.find_element(By.XPATH, "//div[@data-test='employerName']")
                employer_name = employer.text.split('\n')
            except Exception:
                employer_name = None
                pass

            # employer link
            try:
                employer_link = element.find_element(By.XPATH, "//div[@id='EmpBasicInfo']//a")
                emp_link = employer_link.get_attribute('href')
            except Exception as error:
                emp_link = None
                pass

            # title
            try:    
                # title = element.find_element(By.XPATH, "//div[@data-test='jobTitle']")
                title = job.find_element(By.XPATH, "//a[@data-test='job-link']")
                title = title.text
            except Exception:
                title = None
                pass

            # location
            try:
                location = driver.find_element(By.XPATH, "//span[@data-test='emp-location']")
                location = location.text
            except Exception:
                location = None
                pass

            # salary
            try:
                estimated_salary = element.find_element(By.XPATH, "//span[@data-test='detailSalary']")
                estimated_salary = estimated_salary.text
            except Exception:
                estimated_salary = None
                pass

            # job description
            try:
                job_description = element.find_element(By.XPATH, "//div[@class='jobDescriptionContent desc']").text
            except Exception:
                job_description = None
                pass

            # application link
            try:
                link = element.find_element(By.XPATH, "//a[@data-test='apply-button']")
                application_link = 'www.glassdoor.com' + link.get_attribute('data-job-url')
            except Exception:
                application_link = None
                # print("not find application link")
                pass
        
            jobs.append({'Company_Name': employer_name, 
                         'Company_Link': emp_link,
                         'Title': title,
                         'Location': location,
                         'Estimated_Salary': estimated_salary,
                         'Job_Description': job_description,
                         'Job_Age': job_age, 
                         'Application_Link': application_link})          
            id += 1
            if id % 50 == 0:
                print("Progress {} / {}".format(id, num_jobs))
            if len(jobs) >= num_jobs:
                return jobs

            time.sleep(0.5)
        # turn to next page
        # element = WebDriverWait(driver, 10).until(
        # EC.element_to_be_clickable((By.ID, "myDynamicElement")))
        
        try:
            next_button = driver.find_element(By.XPATH, "//button[@aria-label='Next']")
        except Exception:
            return jobs
        try:
            next_button.click()
        except Exception:
            print("Not clickable")
            try:
                closeBtn = driver.find_element(By.XPATH, "//span[@alt='Close']")
                closeBtn.click()
                print("Has login panel in the current page")
            except Exception:
                # print("No login panel appear")
                return jobs
                
        page += 1
        if page > 1:
            return jobs

In [13]:
keywords = []
with open('./data/keywords.txt') as fh:
    lines = fh.readlines()
    for line in lines:
        keywords.append(line.strip())
with open('./data/companies.csv','r') as fh:
    company_list = []
    lines = fh.readlines()
    for line in lines[1:]:
        company_list.append(line.split(',')[1])

In [14]:
dir_path = './glassdoor_jobs_info/'
num_jobs = float('inf')
for company in company_list:
    company_jobs = []
    for keyword in keywords:
        jobs = get_jobs(company+' '+keyword, num_jobs)
        company_jobs += jobs
    pd.DataFrame.from_dict(company_jobs).to_csv(dir_path + company + '.csv', index=False)      

Start crawling for Walmart software developer
Number of page is 21
Page 1

https://www.glassdoor.com/Job/walmart-software-developer-jobs-SRCH_KO0,26.htm
Has login panel in the current page
Start crawling for Walmart web developer
Number of page is 3
Page 1

https://www.glassdoor.com/Job/walmart-web-developer-jobs-SRCH_KO0,21.htm
Page need refresh


NoSuchWindowException: Message: no such window: target window already closed
from unknown error: web view not found
  (Session info: chrome=107.0.5304.110)
Stacktrace:
0   chromedriver                        0x0000000105282a88 chromedriver + 4123272
1   chromedriver                        0x000000010520e778 chromedriver + 3647352
2   chromedriver                        0x0000000104ed0ac4 chromedriver + 248516
3   chromedriver                        0x0000000104eb0304 chromedriver + 115460
4   chromedriver                        0x0000000104f26e64 chromedriver + 601700
5   chromedriver                        0x0000000104f38744 chromedriver + 673604
6   chromedriver                        0x0000000104efab10 chromedriver + 420624
7   chromedriver                        0x0000000104efbc30 chromedriver + 425008
8   chromedriver                        0x0000000105254ae4 chromedriver + 3934948
9   chromedriver                        0x0000000105257f24 chromedriver + 3948324
10  chromedriver                        0x0000000105258508 chromedriver + 3949832
11  chromedriver                        0x000000010525eb30 chromedriver + 3975984
12  chromedriver                        0x0000000105258b24 chromedriver + 3951396
13  chromedriver                        0x000000010523371c chromedriver + 3798812
14  chromedriver                        0x00000001052752f0 chromedriver + 4068080
15  chromedriver                        0x0000000105275444 chromedriver + 4068420
16  chromedriver                        0x0000000105289450 chromedriver + 4150352
17  libsystem_pthread.dylib             0x00000001953a426c _pthread_start + 148
18  libsystem_pthread.dylib             0x000000019539f08c thread_start + 8
