# Importing the Libraries

In [None]:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException, ElementClickInterceptedException
from selenium import webdriver
import time
import pandas as pd

##  Function to Scrape Glassdoor for Job Listings
The code I used from @Kenarapfaik was created in 2020, and Glassdoor changed its website in 2021. As such, I had to debug the script and find which XPaths were no longer valid; sadly, it was all of them. Moreover, Glassdoor only shows salary information to logged-in users. I therefore found @williamxie11's function for logging in, which I also adapted to fit this web scraping function.

In [394]:

"""
Created on Thu Apr  2 09:32:36 2020
author: Kenarapfaik (for scraping framework), williamxie11 (for login)
url: https://github.com/arapfaik/scraping-glassdoor-selenium
     https://github.com/williamxie11/glassdoor-interview-scraper/blob/master/scraper_v1.2.py
"""

''' 
Get jobs from glassdoor and put it into a dataframe
'''
def _text_or_missing(driver, xpath, missing=-1):
    """Return the text of the first element matching *xpath*, or *missing*.

    The scraper stores ``-1`` as the "not found" marker for optional fields,
    so that is the default fallback value.
    """
    try:
        return driver.find_element(By.XPATH, xpath).text
    except NoSuchElementException:
        return missing


def get_jobs(job, num_jobs, debug, path, sleep, username, password):
    """Scrape Glassdoor job listings into a pandas DataFrame.

    Opens a Chrome window, searches Glassdoor for *job*, attempts to log in
    with the given credentials, then clicks through listings page by page
    until *num_jobs* listings are collected or there is no "next page" link.

    Args:
        job: Search keyword, e.g. ``'data analyst'``.
        num_jobs: Target number of listings to collect.
        debug: If true, print every scraped field for each listing.
        path: Path to the chromedriver executable.
        sleep: Seconds to pause before processing each results page.
        username: Glassdoor account e-mail typed into the login form.
        password: Glassdoor account password typed into the login form.

    Returns:
        A DataFrame with the columns "Job Title", "Salary Estimate",
        "Job Description", "Company Name", "Location", "Size", "Industry"
        and "Sector". Optional fields that could not be found hold ``-1``.
    """
    # Local imports keep this function self-contained.
    from urllib.parse import quote_plus
    from selenium.common.exceptions import WebDriverException

    # Upper bound on retries while waiting for a listing's detail pane.
    max_detail_attempts = 5

    # Initializing the webdriver
    options = webdriver.ChromeOptions()

    # Uncomment the line below if you'd like to scrape without a new Chrome window every time.
    # options.add_argument('headless')

    # Change the path to where chromedriver is in your home folder.
    # NOTE(review): newer Selenium releases replace `executable_path` with a
    # Service object -- confirm against the installed Selenium version.
    driver = webdriver.Chrome(executable_path=path, options=options)
    jobs = []

    try:
        wait = WebDriverWait(driver, 10)
        driver.set_window_size(1120, 1000)

        # Encode the keyword so spaces and symbols survive in the query string.
        keyword = quote_plus(job)
        url = (
            "https://www.glassdoor.com/Job/jobs.htm?suggestCount=0&suggestChosen=false"
            "&clickSource=searchBtn&typedKeyword=" + keyword
            + "&sc.keyword=" + keyword + "&locT=&locId=&jobType="
        )
        driver.get(url)

        # Test for the "Sign Up" prompt by clicking a job listing. This is
        # best-effort: a missing or covered listing must not abort the run.
        try:
            driver.find_element(By.XPATH, '//*[@id="MainCol"]/div[1]/ul/li[1]/div[2]/a').click()
        except (ElementClickInterceptedException, NoSuchElementException):
            pass

        # Signing in
        try:
            driver.find_element(By.XPATH, '//*[@id="JAModal"]/div/div[2]/div[4]/div/div/a').click()

            user_field = wait.until(EC.presence_of_element_located((By.ID, "userEmail")))
            pw_field = wait.until(EC.presence_of_element_located((By.ID, "userPassword")))
            login_button = driver.find_element(
                By.XPATH,
                '//*[@id="LoginModal"]/div/div/div[2]/div[2]/div[2]/div/div/div/div[3]/form/div[3]/div[1]/button',
            )

            user_field.send_keys(username)
            time.sleep(5)
            user_field.send_keys(Keys.TAB)
            time.sleep(1)
            pw_field.send_keys(password)
            time.sleep(1)

            login_button.click()
        except (NoSuchElementException, TimeoutException):
            # wait.until raises TimeoutException when a field never appears.
            print('login failed')

        # If true, should be still looking for new jobs.
        while len(jobs) < num_jobs:
            # Throttle page loads so the server is not flooded with requests,
            # which could get our IP address blocked.
            time.sleep(sleep)

            # Each job listing on the current page
            job_buttons = driver.find_elements(By.XPATH, '//*[@data-test="job-link"]')

            for job_button in job_buttons:
                print("Progress: {}".format("" + str(len(jobs)) + "/" + str(num_jobs)))
                if len(jobs) >= num_jobs:
                    break

                # Click each job listing to access data
                job_button.click()
                time.sleep(1)

                # Collecting company, location, job title, and description.
                # The detail pane may load slowly, so retry a bounded number
                # of times instead of looping forever.
                for _ in range(max_detail_attempts):
                    try:
                        company_name = driver.find_element(By.XPATH, '//*[@class="css-87uc0g e1tk4kwz1"]').text
                        location = driver.find_element(By.XPATH, '//*[@class="css-56kyx5 e1tk4kwz5"]').text
                        job_title = driver.find_element(By.XPATH, '//*[@class="css-1vg6q84 e1tk4kwz4"]').text
                        job_description = driver.find_element(By.XPATH, '//*[@class="jobDescriptionContent desc"]').text
                    except WebDriverException:
                        time.sleep(5)
                    else:
                        break
                else:
                    # All attempts failed; this `continue` belongs to the
                    # outer loop over job_buttons and skips the listing.
                    print("Skipping listing: details not found after {} attempts.".format(max_detail_attempts))
                    continue

                # Collecting given salary estimate (-1 is the "not found" value)
                salary_estimate = _text_or_missing(
                    driver,
                    './/span[@class="css-56kyx5 css-16kxj2j e1wijj242" and @data-test="detailSalary"]',
                )

                # Printing for debugging
                if debug:
                    print("Job Title: {}".format(job_title))
                    print("Salary Estimate: {}".format(salary_estimate))
                    print("Job Description: {}".format(job_description[:500]))
                    print("Company Name: {}".format(company_name))
                    print("Location: {}".format(location))

                # Collect size of company, industry, and sector
                size = _text_or_missing(
                    driver, '//*[@id="JDCol"]/div/article/div/div[1]/div/div/div[3]/div[2]/div[4]'
                )
                industry = _text_or_missing(
                    driver, '//*[@id="JDCol"]/div/article/div/div[1]/div/div/div[3]/div[2]/div[3]'
                )
                sector = _text_or_missing(
                    driver, '//*[@id="EmpBasicInfo"]/div[1]/div/div[5]/span[2]'
                )

                if debug:
                    print("Size: {}".format(size))
                    print("Industry: {}".format(industry))
                    print("Sector: {}".format(sector))
                    print("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@")

                # Adding data to list to transform later into dataframe
                jobs.append({
                    "Job Title": job_title,
                    "Salary Estimate": salary_estimate,
                    "Job Description": job_description,
                    "Company Name": company_name,
                    "Location": location,
                    "Size": size,
                    "Industry": industry,
                    "Sector": sector,
                })

            # Clicking on the "next page" button
            try:
                driver.find_element(By.XPATH, './/a[@data-test="pagination-next"]').click()
            except NoSuchElementException:
                print("Scraping terminated before reaching target number of jobs. Needed {}, got {}.".format(num_jobs, len(jobs)))
                break
    finally:
        # Always close the browser, even when scraping fails part-way.
        driver.quit()

    # Convert the list of per-listing dicts into a pandas DataFrame.
    return pd.DataFrame(jobs)

In [397]:
# Fill in these placeholders before running the cell.
path = "path/to/chromedriver"  # your chrome driver path
username = "your_username"  # your username
password = "your_password"  # your password

# Scrape 120 'data analyst' listings (debug printing off, 15 s pause per
# results page) and save them to a CSV file.
df = get_jobs('data analyst', 120, False, path, 15, username, password)
df.to_csv('glassdoor_data.csv', index=False)

Progress: 0/120
Progress: 1/120
Progress: 2/120
Progress: 3/120
Progress: 4/120
Progress: 5/120
Progress: 6/120
Progress: 7/120
Progress: 8/120
Progress: 9/120
Progress: 10/120
Progress: 11/120
Progress: 12/120
Progress: 13/120
Progress: 14/120
Progress: 15/120
Progress: 16/120
Progress: 17/120
Progress: 18/120
Progress: 19/120
Progress: 20/120
Progress: 21/120
Progress: 22/120
Progress: 23/120
Progress: 24/120
Progress: 25/120
Progress: 26/120
Progress: 27/120
Progress: 28/120
Progress: 29/120
Progress: 30/120
Progress: 31/120
Progress: 32/120
Progress: 33/120
Progress: 34/120
Progress: 35/120
Progress: 36/120
Progress: 37/120
Progress: 38/120
Progress: 39/120
Progress: 40/120
Progress: 41/120
Progress: 42/120
Progress: 43/120
Progress: 44/120
Progress: 45/120
Progress: 46/120
Progress: 47/120
Progress: 48/120
Progress: 49/120
Progress: 50/120
Progress: 51/120
Progress: 52/120
Progress: 53/120
Progress: 54/120
Progress: 55/120
Progress: 56/120
Progress: 57/120
Progress: 58/120
Progres

In [398]:
df

Unnamed: 0,Job Title,Salary Estimate,Job Description,Company Name,Location,Size,Industry,Sector
0,Associate Research Analyst - Aviation Data Ana...,$55K - $112K (Glassdoor est.),"Friday, January 22, 2021\n\nCNA fosters an inc...",CNA Corporation\n3.4,"Arlington, VA",Size: 501 to 1000 Employees,Industry: Aerospace & Defense,-1
1,"Data Engineer / Data Analyst – PBI, SQL, Kusto",$49K - $91K (Glassdoor est.),"Data Engineer / Data Analyst – PBI, SQL, Kusto...","Akvelon, Inc.\n4.1","Bellevue, WA",Size: 501 to 1000 Employees,Industry: Information Technology,-1
2,Data Analyst,$50K - $90K (Glassdoor est.),Who We Are\n\n\nThe School Systems and Data An...,New Visions Central Office\n4.0,"New York, NY",Size: 501 to 1000 Employees,Industry: Education,-1
3,Data Analyst,-1,"We are seeking a resource who loves data, anal...",TalentDash,"Chicago, IL",Size: 1 to 50 Employees,Industry: Information Technology,-1
4,Data Analyst,$40K - $74K (Glassdoor est.),Essen is currently seeking a Full-time Data An...,Essen Health Care\n3.0,"New York, NY",Size: 1001 to 5000 Employees,Industry: Health Care,-1
...,...,...,...,...,...,...,...,...
115,Data Analyst,$35K - $64K (Glassdoor est.),Do you want to be part of a team that encourag...,MassMutual\n3.8,"Boston, MA",Size: 5001 to 10000 Employees,Industry: Insurance,-1
116,Data Analyst,$35K - $64K (Glassdoor est.),Do you want to be part of a team that encourag...,MassMutual\n3.8,"Boston, MA",Size: 5001 to 10000 Employees,Industry: Insurance,-1
117,Data Analyst,$35K - $64K (Glassdoor est.),Data Analyst Job Description\nDaisyBill is see...,DaisyBill,"New York, NY",Size: N/A,Industry: N/A,-1
118,"Analyst, Data Science - Product Analytics",$35K - $64K (Glassdoor est.),This is a great opportunity to join Vrbos glob...,Expedia Group\n4.0,"Austin, TX",Size: 10000+ Employees,Industry: Information Technology,-1


# Personal Web Scraping Issue
I ran into a problem specific to my setup: because of my internet speed, the script would not scrape the full 1000 listings and would fail at around 160 listings. Therefore, I decided to scrape the listings in increments of 120 (there are 30 listings on each page, so I scrape 4 pages at a time). In the end I had 7 separate csv files, which I merged into a single consolidated csv file.