In [221]:
from selenium.common.exceptions import NoSuchElementException, ElementClickInterceptedException, ElementNotInteractableException, StaleElementReferenceException
from selenium import webdriver
import time
import pandas as pd
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from datetime import datetime, timedelta
import os
import re

# Moment this cell was executed, kept both as a datetime object (`now`) and as a
# day/month/year hour:minute:second string (`timestamp`).
now = datetime.now()
timestamp = format(now, "%d/%m/%Y %H:%M:%S")

In [222]:
# Folder named "preprocessing_parquet" inside the current working directory.
FOLDER = os.path.join(os.getcwd(), "preprocessing_parquet")
print(FOLDER)
# exist_ok=True keeps the cell re-runnable and removes the gap between
# "does it exist?" and "create it" that a separate os.path.exists check has.
os.makedirs(FOLDER, exist_ok=True)

C:\Users\dokha\school-projects\job-market-and-employee-engagement-dashboard\preprocessing_parquet


In [223]:
# `import urllib` alone does not guarantee that the `urllib.parse` submodule is
# loaded; import the submodule explicitly.
import urllib.parse
import requests

# Sample request against Glassdoor's location-suggestion endpoint, used to
# inspect the shape of the JSON it returns for a free-text place name.
sample_endpoint = (
    "https://www.glassdoor.com/findPopularLocationAjax.htm?"
    f"term={urllib.parse.quote('Vancouver, BC')}&maxLocationsToReturn=10"
)
headers = {
  'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; rv:91.0) Gecko/20100101 Firefox/91.0'
}
# Without a timeout, requests waits indefinitely if the server never answers.
response = requests.get(sample_endpoint, headers=headers, timeout=10)
response.json()

[{'compoundId': 'C2278756',
  'countryName': 'Canada',
  'id': 'C2278756',
  'label': 'Vancouver, BC (Canada)',
  'locationId': 2278756,
  'locationType': 'C',
  'longName': 'Vancouver, BC (Canada)',
  'realId': 2278756}]

In [224]:
def location_glassdoor_id(location: str, timeout: float = 10.0):
    """Resolve a free-text place name to Glassdoor's first suggested location.

    Queries the ``findPopularLocationAjax.htm`` endpoint and uses the first
    entry of the returned JSON list as the best guess.

    Args:
        location: Place name to look up, e.g. ``"Vancouver, BC"``.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A ``(longName, locationId, locationType)`` tuple taken from the first
        suggestion.

    Raises:
        requests.HTTPError: If the endpoint answers with an error status.
        IndexError: If the endpoint returns no suggestions for ``location``.
    """
    url_encoded_loc = urllib.parse.quote(location)
    endpoint = f"https://www.glassdoor.com/findPopularLocationAjax.htm?term={url_encoded_loc}&maxLocationsToReturn=10"
    headers = {
      'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; rv:91.0) Gecko/20100101 Firefox/91.0'
    }
    # A timeout prevents the notebook from hanging if the server never answers.
    response = requests.get(endpoint, headers=headers, timeout=timeout)
    response.raise_for_status()

    suggestions = response.json()
    if not suggestions:
        # Same exception type an empty list produced before, now with a message.
        raise IndexError(f"Glassdoor returned no location match for {location!r}")

    # Best guess location
    best_match = suggestions[0]
    location_name = best_match['longName']
    location_ID = best_match['locationId']
    location_type = best_match['locationType']

    return location_name, location_ID, location_type

#location_glassdoor_id("Vancouver, BC")


In [225]:
def get_post_date(input_str):
    """Convert a listing-age label into a ``YYYY-MM-DD`` posting date.

    The label is interpreted relative to today's date:

    * ``"30d+"``              -> counted as 40 days old
    * ending in ``"h"``       -> posted today (e.g. ``"24h"``)
    * leading digits          -> that many days old (e.g. ``"5d"``)

    Args:
        input_str: Age label as scraped from the job card.

    Returns:
        The estimated posting date formatted as ``"%Y-%m-%d"``.

    Raises:
        ValueError: If the label is empty or does not start with a number.
    """
    today = datetime.today()
    label = input_str.strip()

    if label == "30d+":
        days_to_stt = timedelta(days=40)
    # `elif` matters: with a plain `if`, "30d+" fell through to the `else`
    # branch below and the 40-day value was overwritten with 30.
    elif label.endswith("h"):
        days_to_stt = timedelta(days=0)
    else:
        # The first alphanumeric token must be the day count.
        first_token = re.search(r"\d+|[A-Za-z]+", label)
        if first_token is None or not first_token.group().isdigit():
            raise ValueError(f"Unrecognised post-age label: {input_str!r}")
        days_to_stt = timedelta(days=int(first_token.group()))

    post_date = today - days_to_stt
    return post_date.strftime("%Y-%m-%d")

In [238]:
def _next_page_url(meta_url, index):
    """Return the results URL for the page after ``index``.

    Result pages are addressed by an ``_IP<n>`` marker in front of ``.htm``;
    the first page carries no marker. Any existing marker is replaced.
    """
    return re.sub(r"(?:_IP\d+)?\.htm", f"_IP{index + 1}.htm", meta_url)


def get_jobs(keyword, location, num_jobs, verbose=False, output_csv="sde_toronto.csv"):
    '''Scrape job postings from Glassdoor and write them to a CSV file.

    Opens Chrome through Selenium, submits ``keyword``/``location`` in the
    Glassdoor job search form, then clicks through the listed jobs page by page
    and records one dict per job until ``num_jobs`` have been collected or the
    last results page has been processed.

    Args:
        keyword: Search text typed into the keyword box; also stored in the
            ``search_kw`` column.
        location: Search text typed into the location box.
        num_jobs: Number of job records to collect before stopping.
        verbose: When True, print the fields of every collected job.
        output_csv: Path handed to ``DataFrame.to_csv``. Defaults to the file
            name this function has always written.

    Returns:
        Whatever ``DataFrame.to_csv(output_csv)`` returns (``None`` when a
        path is given).

    Notes:
        * The class-name based selectors mirror the page markup at the time of
          writing and are expected to break when Glassdoor changes its CSS.
        * A job whose details cannot be read after a few attempts is skipped
          instead of being retried forever.
        * The browser is always closed, including when scraping fails.
    '''
    job_link_xpath = '//a[@class="jobLink"]'
    og_url_xpath = '//meta[@property="og:url"]'
    max_collect_attempts = 5

    #Initializing the webdriver
    options = webdriver.ChromeOptions()

    #Uncomment the line below if you'd like to scrape without a new Chrome window every time.
    #options.add_argument('headless')

    s = Service(ChromeDriverManager().install())
    # `options` used to be built but never handed to the driver.
    driver = webdriver.Chrome(service=s, options=options)

    jobs = []
    source = "Glassdoor"
    job_type = ""
    job_exp = ""
    signup_found = False
    search_submitted = False

    try:
        driver.set_window_size(1120, 1000)

        # The returned name/id/type are not needed for the form-based search;
        # the call is kept because it fails early when Glassdoor has no match
        # for `location`.
        location_glassdoor_id(location)

        driver.get('https://www.glassdoor.ca/Job/index.htm')
        driver.find_element(by=By.XPATH, value='//input[@data-test="search-bar-keyword-input"]').send_keys(keyword)
        driver.find_element(by=By.XPATH, value='//input[@data-test="search-bar-location-input"]').clear()
        driver.find_element(by=By.XPATH, value='//input[@data-test="search-bar-location-input"]').send_keys(location)
        time.sleep(5.5)

        while len(jobs) < num_jobs:  #If true, should be still looking for new jobs.

            # Change this number based on internet speed.
            time.sleep(5)

            # Submit the search once. Clicking it again on later iterations
            # would re-run the search after a results page has been opened.
            if not search_submitted:
                try:
                    driver.find_element(by=By.XPATH, value='//button[@data-test="search-bar-submit"]').click()
                    search_submitted = True
                    time.sleep(2.5)
                except ElementClickInterceptedException:
                    # Retried on the next iteration, after the overlays below
                    # have had a chance to be closed.
                    print("Cannot find search bar")

            # Close the Sign Up box
            try:
                driver.find_element(by=By.CLASS_NAME, value="ModalStyle__xBtn___29PT9").click()  #clicking to the X.
            except NoSuchElementException:
                print("Cannot find modalstyle")

            # Canonical URL of the current results page and the pagination
            # footer, whose second token is read as the current page number and
            # whose last token is read as the number of pages.
            meta_url = driver.find_element(by=By.XPATH, value=og_url_xpath).get_attribute('content')
            print(meta_url)
            job_buttons = driver.find_elements(by=By.XPATH, value=job_link_xpath)
            index_footer = driver.find_element(by=By.XPATH, value='.//div[@class="paginationFooter"]').text.split(" ")
            num_pages = int(index_footer[-1])
            index = int(index_footer[1])
            print("Current index: " + str(index))

            #Going through each job in this page
            try:
                for position in range(len(job_buttons)):
                    print(f"Progress: {len(jobs)}/{num_jobs}")
                    if len(jobs) >= num_jobs:
                        break

                    if not signup_found:
                        try:
                            driver.find_element(by=By.CLASS_NAME, value="modal_closeIcon").click()  #clicking to the X.
                            print("found a pop-up, clicking X...")
                            time.sleep(2)
                            signup_found = True
                        except NoSuchElementException:
                            pass

                    try:
                        job_buttons[position].click()
                    except StaleElementReferenceException:
                        # A refresh invalidates every element found earlier, so
                        # the links are looked up again and the one at the same
                        # position is used instead of the stale reference.
                        driver.refresh()
                        # NOTE: implicitly_wait applies to the rest of the session.
                        driver.implicitly_wait(10)
                        job_buttons = driver.find_elements(by=By.XPATH, value=job_link_xpath)
                        if position >= len(job_buttons):
                            break
                        ActionChains(driver).double_click(job_buttons[position]).perform()

                    time.sleep(3)

                    # Read the detail pane, retrying a bounded number of times
                    # while it loads.
                    collected_successfully = False
                    for _attempt in range(max_collect_attempts):
                        try:
                            company_name = driver.find_element(by=By.XPATH, value='.//div[@class="css-xuk5ye e1tk4kwz5"]').text.splitlines()[0]
                            # Separate name so the `location` search argument is not overwritten.
                            job_location = driver.find_element(by=By.XPATH, value='.//div[@class="css-56kyx5 e1tk4kwz1"]').text
                            job_title = driver.find_element(by=By.XPATH, value='.//div[@class="css-1j389vi e1tk4kwz2"]').text
                            job_description = driver.find_element(by=By.XPATH, value='.//div[@class="jobDescriptionContent desc"]').text
                            post_date = driver.find_element(by=By.XPATH, value='.//div[@class="d-flex align-items-end pl-std css-17n8uzw"]').text
                            post_date = get_post_date(post_date)
                            collected_successfully = True
                            break
                        except Exception:
                            # Still best-effort, but no longer swallows
                            # KeyboardInterrupt and no longer loops forever.
                            print("Cannot collect successfully")
                            time.sleep(5)

                    if not collected_successfully:
                        continue

                    try:
                        estimated_salary = driver.find_element(by=By.XPATH, value='.//div[@class="css-y2jiyn e2u4hf18"]').text
                    except NoSuchElementException:
                        estimated_salary = None #You need to set a "not found value. It's important."

                    try:
                        industry = driver.find_element(by=By.XPATH, value='.//span[@class="css-1pldt9b e1pvx6aw1" and text()="Industry"]//following-sibling::*').text
                    except NoSuchElementException:
                        industry = None

                    #Printing for debugging (only fields that are actually collected)
                    if verbose:
                        print("Job Title: {}".format(job_title))
                        print("Salary Estimate: {}".format(estimated_salary))
                        print("Job Description: {}".format(job_description))
                        print("Company Name: {}".format(company_name))
                        print("Location: {}".format(job_location))
                        print("Industry: {}".format(industry))
                        print("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@")

                    jobs.append({
                        "job_title": job_title,
                        "job_type": job_type,
                        # Was a second "job_type" key, which silently replaced the first.
                        "job_exp": job_exp,
                        "company": company_name,
                        "industries": industry,
                        "location": job_location,
                        "description": job_description,
                        "source": source,
                        "search_kw": keyword,
                        "expected_salary": estimated_salary,
                        "post_date": post_date,
                    })

            except ElementNotInteractableException:
                # In the recorded run this fired once the jobs of a page had
                # been collected; it is treated as "done with this page".
                pass

            if len(jobs) >= num_jobs:
                break

            if index >= num_pages:
                print("Scraping terminated before reaching target number of jobs. Needed {}, got {}.".format(num_jobs, len(jobs)))
                break

            # Move to the next results page. The URL is derived for every page
            # index, not only for the first page.
            new_url = _next_page_url(meta_url, index)
            print(new_url)
            driver.get(new_url)
            print(driver.find_element(by=By.XPATH, value=og_url_xpath).get_attribute('content'))
            time.sleep(10)
    finally:
        # Always release the browser and chromedriver process.
        driver.quit()

    return pd.DataFrame(jobs).to_csv(output_csv)


In [239]:
# Run the scraper: 65 "software developer" postings around Toronto, ON.
get_jobs(keyword="software developer", location="Toronto, ON", num_jobs=65)



Current google-chrome version is 99.0.4844
Get LATEST chromedriver version for 99.0.4844 google-chrome
Trying to download new driver from https://chromedriver.storage.googleapis.com/99.0.4844.51/chromedriver_win32.zip
Driver has been saved in cache [C:\Users\dokha\.wdm\drivers\chromedriver\win32\99.0.4844.51]


Cannot find modalstyle
https://www.glassdoor.ca/Job/toronto-software-developer-jobs-SRCH_IL.0,7_IC2281069_KO8,26.htm
Current index: 1
Progress: 0/65
Progress: 1/65
found a pop-up, clicking X...
Progress: 2/65
Progress: 3/65
Progress: 4/65
Progress: 5/65
Progress: 6/65
Progress: 7/65
Progress: 8/65
Progress: 9/65
Progress: 10/65
Progress: 11/65
Progress: 12/65
Progress: 13/65
Progress: 14/65
Progress: 15/65
Progress: 16/65
Progress: 17/65
Progress: 18/65
Progress: 19/65
Progress: 20/65
Progress: 21/65
Progress: 22/65
Progress: 23/65
Progress: 24/65
Progress: 25/65
Progress: 26/65
Progress: 27/65
Progress: 28/65
Progress: 29/65
Progress: 30/65
https://www.glassdoor.ca/Job/toronto-software-developer-jobs-SRCH_IL.0,7_IC2281069_KO8,26_IP2.htm
https://www.glassdoor.ca/Job/toronto-software-developer-jobs-SRCH_IL.0,7_IC2281069_KO8,26.htm


WebDriverException: Message: target frame detached
  (Session info: chrome=99.0.4844.51)
Stacktrace:
Backtrace:
	Ordinal0 [0x00F49943+2595139]
	Ordinal0 [0x00EDC9F1+2148849]
	Ordinal0 [0x00DD43F0+1065968]
	Ordinal0 [0x00DC5A97+1006231]
	Ordinal0 [0x00DC4AD0+1002192]
	Ordinal0 [0x00DC50D8+1003736]
	Ordinal0 [0x00DC5068+1003624]
	Ordinal0 [0x00DCB220+1028640]
	Ordinal0 [0x00DC62AD+1008301]
	Ordinal0 [0x00DC67C5+1009605]
	Ordinal0 [0x00DC65AF+1009071]
	Ordinal0 [0x00DC5BC6+1006534]
	Ordinal0 [0x00DC542B+1004587]
	Ordinal0 [0x00DC52C9+1004233]
	Ordinal0 [0x00DD58A0+1071264]
	Ordinal0 [0x00E2B3BB+1422267]
	Ordinal0 [0x00E1B806+1357830]
	Ordinal0 [0x00DF6086+1204358]
	Ordinal0 [0x00DF6F96+1208214]
	GetHandleVerifier [0x010EB232+1658114]
	GetHandleVerifier [0x011A312C+2411516]
	GetHandleVerifier [0x00FDF261+560433]
	GetHandleVerifier [0x00FDE366+556598]
	Ordinal0 [0x00EE286B+2173035]
	Ordinal0 [0x00EE75F8+2192888]
	Ordinal0 [0x00EE76E5+2193125]
	Ordinal0 [0x00EF11FC+2232828]
	BaseThreadInitThunk [0x76796739+25]
	RtlGetFullPathName_UEx [0x77738E7F+1215]
	RtlGetFullPathName_UEx [0x77738E4D+1165]
