In [2]:
import configparser
import json
import time 

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, ElementClickInterceptedException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains

In [3]:
# Parser to get config credentials and other private information
parser = configparser.ConfigParser()
parser.read_file(open('../credentials.config'))

In [4]:
# AWS credentials
bucket_name = parser.get('AWS', 'bucket_name')
aws_access_key = parser.get('AWS', 'aws_access_key')
aws_secret_key = parser.get('AWS', 'aws_secret_key')
aws_region = parser.get('AWS', 'region')

# Selenium variables
selenium_driver_path = parser.get('SELENIUM', 'path')

In [5]:
def exit_prompt(driver):
    '''
    Avoids modal pop up by clicking away from pop up.
    
    ARGUMENTS:
        driver: Selenium driver object for scraping.
    RETURNS:
        None
    '''
    try:
        elem = WebDriverWait(driver, 3).until(EC.presence_of_element_located((By.XPATH, '//*[@id="LoginModal"]/div/div/div[2]/button')))
        action = ActionChains(driver)
        action.move_to_element(elem).move_by_offset(250, 0).click().perform()
    except:
        pass

In [6]:
def check_next_page(driver, page_num):
    '''
    Checks if next page exists
    
    ARGUMENTS:
        driver: Selenium driver object for scraping.
        page_num: The page number to check.
    RETURNS:
        Boolean if next page exists.
    '''
    next_page = True
    page_element = f'//*[@id="MainCol"]/div[2]/div/div[1]/button[{page_num}]'
    try:
        driver.find_element_by_xpath(page_element).click()
    # Catch exception, boolean False.
    except NoSuchElementException:
        next_page = False
        # print('No Page Element Found')
    return next_page

In [7]:
def get_value(driver, test, element, property, by_type='class'):
    '''
    Finds page element by class or by xpath.
    
    ARGUMENTS:
        driver: Selenium driver object for scraping.
        test: If test, then print Exception, else set value as None
        element: Page element.
        property: What job property scraped.
        by_type: By 'class' name or by 'xpath'.
    '''
    try:
        if by_type == 'class':
            return driver.find_element_by_class_name(element).text
        elif by_type == 'xpath':
            return driver.find_element_by_xpath(element).text
    except NoSuchElementException:
            if test:
                print(f'NoSuchElementException for {property}. Defaulting to None')
            else:
                return None

In [8]:
def get_all_values_dict(driver, test=False):
    '''
    Scrapes all data for each job post on a page.
    
    ARGUMENTS:
        driver: Selenium driver object for scraping.
        element: Page element.
        test: If test, then print Exception, else set value as None

    RETURNS:
        Dictionary object of all job and company properties.
    '''
    try:
        # Expand 'Show More' option for job description
        driver.find_element_by_class_name('css-t3xrds').click()
    except:
        pass
    job_info = {}
    params = {
        'CompanyName': ['css-87uc0g', 'Company Name', 'class'],
        'JobTitle': ['css-1vg6q84', 'Job Name', 'class'],
        'JobLocation': ['css-56kyx5', 'Job Location', 'class'],
        'EasyApply': ['//*[@id="MainCol"]/div[1]/ul/li[3]/div/div/a/div[1]/div[5]/div', 'Easy Apply', 'xpath'],
        # 'JobDescription': ['jobDescriptionContent', 'Job Description', 'class'],
        'JobDescription': ['.//div[@class="jobDescriptionContent desc"]', 'Job Description', 'xpath'],
        'JobSalary': ['css-1xe2xww', 'Salary', 'class'],
        'CompanyRating': ['css-1m5m32b', 'Company Rating', 'class'],
        'CompanySize': ['//div[@id="EmpBasicInfo"]/div[1]/div/div[1]/span[2]', 'Company Size', 'xpath'],
        'CompanyType': ['//div[@id="EmpBasicInfo"]/div[1]/div/div[3]/span[2]', 'Company Type', 'xpath'],
        'CompanySector': ['//div[@id="EmpBasicInfo"]/div[1]/div/div[5]/span[2]', 'Company Sector', 'xpath'],
        'CompanyYearFounded': ['//div[@id="EmpBasicInfo"]/div[1]/div/div[2]/span[2]', 'Year Founded', 'xpath'],
        'CompanyIndustry': ['//div[@id="EmpBasicInfo"]/div[1]/div/div[4]/span[2]', 'Company Industry', 'xpath'],
        'CompanyRevenue': ['//div[@id="EmpBasicInfo"]/div[1]/div/div[6]/span[2]', 'Company Revenue', 'xpath']
    }

    for param in params.items():
        val = get_value(driver, test, param[1][0], param[1][1], param[1][2])
        job_info[param[0]] = job_info.get(param[0], val)
        
    return job_info

In [9]:
# def get_data(driver, test=False):
#     '''
#     Scrapes for data for each job on Glassdoor page.
    
#     ARGUMENTS:
#         driver: Selenium driver object for scraping.
#         url: URL for page to be scraped.
#         test: Default False. If test, print results instead of returning results all_jobs object.
#     RETURNS:
#         JSON object. 
#         Includes:
#             Job name, company name, job location, job salary, company rating, and company information
#     '''
#     all_jobs = []
#     # Start page 1, iterate to 2 on next click
#     page_num_element = 2
#     # 30 jobs per page. Once last job is reached, go to next page
#     while True:
#         job_info = {}
#         company_name = None
#         job_name = None
#         location = None
#         job_desc = None
#         salary = None
#         rating = None
#         company_size = None
#         company_type = None
#         company_sector = None
#         year_founded = None
#         company_industry = None
#         company_revenue = None
#         # To prevent element refresh or page document missing, save jobs
#         job_posts = driver.find_elements_by_class_name('css-7ry9k1')  
#         print('Saving job posts')

#         # Iterate through job posts
#         print('Begin job post iteration')
#         for i in range(len(job_posts)):
#             print(f'Job element: {i}')
#             try:
#                 job_posts[i].click()
#             except ElementClickInterceptedException:
#                 time.sleep(1)
#                 exit_prompt(driver)
#             try:
#                 # Basic Job Information
#                 # Company name
#                 company_name = get_value(driver, 'css-87uc0g', 'Company Name', 'class').text.split('\n')[0]
#                 # company_name = driver.find_element_by_class_name('css-xuk5ye').text.split('\n')[0]
            
#                 # Job name
#                 job_name = get_value(driver, 'css-1vg6q84', 'Job Name', 'class').text
#                 # job_name = driver.find_element_by_class_name('css-1j389vi').text
                
#                 # Job location
#                 location = get_value(driver, 'css-56kyx5', 'Job Location', 'class').text
#                 # location = driver.find_element_by_class_name('css-56kyx5').text
                
#                 # Expand 'Show More' option for job description
#                 driver.find_element_by_class_name('css-t3xrds').click()
                
#                 # Job description
#                 job_desc = get_value(driver, 'jobDescriptionContent', 'Job Description', 'class').text
#                 # job_desc = driver.find_element_by_class_name('jobDescriptionContent').text 

#                 # Salary. Handles for missing elements
#                 # Salary
#                 salary = get_value(driver, 'ccss-1xe2xww', 'Salary', 'class').text
#                 # try:
#                 #     salary = driver.find_element_by_class_name('css-1hbqxax').text
#                 # except NoSuchElementException:
#                 #     print(f'NoSuchElementException for {company_name} - {job_name}: Salary. (Job Number in List: {job_element_count})\n')

#                 # Company Information. Handles for missing elements
#                 # Company rating
#                 rating = get_value(driver, 'css-1m5m32b', 'Company Rating', 'class').text
#                 # try:
#                 #     rating = driver.find_element_by_class_name('css-ey2fjr').text
#                 # except:
#                 #     print(f'NoSuchElementException for {company_name} - {job_name}: Company Rating. (Job Number in List: {job_element_count})\n')
                
#                 # Company size
#                 company_size = get_value(driver, '//div[@id="EmpBasicInfo"]/div[1]/div/div[1]/span[2]', 'Company Size', 'xpath').text
#                 # try:
#                 #     company_size = driver.find_element_by_xpath('//div[@id="EmpBasicInfo"]/div[1]/div/div[1]/span[2]').text
#                 # except NoSuchElementException:
#                 #     print(f'NoSuchElementException for {company_name} - {job_name}: Company Size. (Job Number in List: {job_element_count})\n')
                
#                 # Company type
#                 company_type = get_value(driver, '//div[@id="EmpBasicInfo"]/div[1]/div/div[3]/span[2]', 'Company Type', 'xpath').text
#                 # try:
#                 #     company_type = driver.find_element_by_xpath('//div[@id="EmpBasicInfo"]/div[1]/div/div[3]/span[2]').text
#                 # except NoSuchElementException:
#                 #     print(f'NoSuchElementException for {company_name} - {job_name}: Company Type. (Job Number in List: {job_element_count})\n')
                
#                 # Company sector
#                 company_sector = get_value(driver, '//div[@id="EmpBasicInfo"]/div[1]/div/div[5]/span[2]', 'Company Sector', 'xpath').text
#                 # try:
#                 #     company_sector = driver.find_element_by_xpath('//div[@id="EmpBasicInfo"]/div[1]/div/div[5]/span[2]').text
#                 # except NoSuchElementException:
#                 #     print(f'NoSuchElementException for {company_name} - {job_name}: Company Sector. (Job Number in List: {job_element_count})\n')
                
#                 # Company year founded
#                 year_founded = get_value(driver, '//div[@id="EmpBasicInfo"]/div[1]/div/div[2]/span[2]', 'Year Founded', 'xpath').text
#                 # try:
#                 #     year_founded = driver.find_element_by_xpath('//div[@id="EmpBasicInfo"]/div[1]/div/div[2]/span[2]').text
#                 # except NoSuchElementException:
#                 #     print(f'NoSuchElementException for {company_name} - {job_name}: Company Year Founded. (Job Number in List: {job_element_count})\n')
                
#                 # Company Industry
#                 company_industry = get_value(driver, '//div[@id="EmpBasicInfo"]/div[1]/div/div[4]/span[2]', 'Company Industry', 'xpath').text
#                 # try:
#                 #     company_industry = driver.find_element_by_xpath('//div[@id="EmpBasicInfo"]/div[1]/div/div[4]/span[2]').text
#                 # except NoSuchElementException:
#                 #     print(f'NoSuchElementException for {company_name} - {job_name}: Company Industry. (Job Number in List: {job_element_count})\n')
                
#                 # Company Revenue
#                 company_revenue = get_value(driver, '//div[@id="EmpBasicInfo"]/div[1]/div/div[6]/span[2]', 'Company Revenue', 'xpath').text
#                 # try:
#                 #     company_revenue = driver.find_element_by_xpath('//div[@id="EmpBasicInfo"]/div[1]/div/div[6]/span[2]').text
#                 # except NoSuchElementException:
#                 #     print(f'NoSuchElementException for {company_name} - {job_name}: Company Revenue. (Job Number in List: {job_element_count})\n')
                
#                 if test:  # If testing, then print outputs
#                     print(f'Job Element: {job_element_count}')
#                     print(f'Company Name: {company_name}')
#                     print(f'Job Title: {job_name}')
#                     print(f'Location: {location}')
#                     print(f'Job Description: {job_desc[:20]}')
#                     print(f'Salary: {salary}')
#                     print(f'Rating: {rating}')
#                     print(f'Company Size: {company_size}')
#                     print(f'Company Type: {company_type}')
#                     print(f'Company Sector: {company_sector}')
#                     print(f'Year Founded: {year_founded}')
#                     print(f'Company Industry: {company_industry}')
#                     print(f'Company Revenue: {company_revenue}')
#                     print('\n')
#                 else:
#                     job_info['Company'] = company_name
#                     job_info['JobTitle'] = job_name
#                     job_info['JobLocation'] = location
#                     job_info['JobDescription'] = job_desc
#                     job_info['JobSalary'] = salary
#                     job_info['CompanyRating'] = rating
#                     job_info['CompanySize'] = company_size
#                     job_info['CompanyType'] = company_type
#                     job_info['CompanySector'] = company_sector
#                     job_info['CompanyYearFounded'] = year_founded
#                     job_info['CompanyIndustry'] = company_industry
#                     job_info['CompanyRevenue'] = company_revenue
#             except Exception as e:
#                 print(e)
#                 time.sleep(4)
#             all_jobs.append(job_info)
#         if not check_next_page(driver, page_num):
#             return all_jobs
#         # If not last page, reset counters.
#         else:                
#             page_num_element += 1

In [10]:
# Set selenium driver path
with webdriver.Edge('../SeleniumDrivers/msedgedriver.exe') as driver:
    # Open Glasdoor page: Last 24 hours of job postings for data engineers in USA
    url = 'https://www.glassdoor.com/Job/united-states-data-engineer-jobs-SRCH_IL.0,13_IN1_KO14,27.htm?fromAge=1'
    driver.get(url)
    driver.maximize_window()
    # Wait for page to load
    time.sleep(2)
    # Get all jobs on page
    job_posts = driver.find_elements_by_class_name('react-job-listing')
    # Iterate through job posts
    for i, job in enumerate(job_posts):
        all_jobs = []
        test = True
        print(f'Job {i+1}')
        time.sleep(1)
        job.click()
        exit_prompt(driver)
        try:
            job_info = get_all_values_dict(driver, True)
            all_jobs.append(job_info)
            if test:  # If test, print outputs 
                print(json.dumps(job_info))
                print()
        except Exception as e:
            print(e)
            time.sleep(4)

Job 1
NoSuchElementException for Salary. Defaulting to None
{"CompanyName": "Start.io\n4.4", "JobTitle": "Data Engineer", "JobLocation": "United States", "EasyApply": "Easy Apply", "JobDescription": "Description\nStart.io is a mobile marketing and audience platform. Start.io (formerly StartApp) empowers the mobile app ecosystem and simplifies mobile marketing, audience building, and mobile monetization. Start.io's direct integration with over 500,000 monthly active mobile apps provides access to unprecedented levels of global first-party data, which can be leveraged to understand and predict behaviors, identify new opportunities, and fuel growth.If you are a data enthusiast and want to participate in real-time data streams of billions of events from billions of users, your place is with us.\nResponsibilities:\nDevelop and deploy real-time and batch data processing infrastructures and pipelines.\nIngesting streams of billions of records per day.\nDevelop real-time streams of hundreds of

NoSuchWindowException: Message: no such window: target window already closed
from unknown error: web view not found
  (Session info: MicrosoftEdge=114.0.1823.67)


In [None]:
# # Set selenium driver path
# with webdriver.Edge('../SeleniumDrivers/msedgedriver.exe') as driver:
#     # Open Glasdoor page: Last 24 hours of job postings for data engineers in USA
#     url = 'https://www.glassdoor.com/Job/united-states-data-engineer-jobs-SRCH_IL.0,13_IN1_KO14,27.htm?fromAge=1'
#     driver.get(url)
#     driver.maximize_window()
#     # Wait for page to load
#     time.sleep(2)
#     # Get all jobs on page
#     job_posts = driver.find_elements_by_class_name('react-job-listing')
#     # Iterate through job posts
#     for i, job in enumerate(job_posts):
#         job_info = {}
#         test = False
#         print(f'Job {i+1}')
#         time.sleep(1)
#         job.click()
#         try:
#             # Can't wait just 1 second, too fast. 10 seconds is too long to wait for each post.
#             elem = WebDriverWait(driver, 3).until(EC.presence_of_element_located((By.XPATH, '//*[@id="LoginModal"]/div/div/div[2]/button')))
#             action = ActionChains(driver)
#             action.move_to_element(elem).move_by_offset(250, 0).click().perform()
#         except:
#             pass
#         try:
#             # Basic Job Information
#             # Company name
#             company_name = get_value(driver, 'css-87uc0g', 'Company Name', 'class').split('\n')[0]
            
#             # Job name
#             job_name = get_value(driver, 'css-1vg6q84', 'Job Name', 'class')

#             # Job location
#             location = get_value(driver, 'css-56kyx5', 'Job Location', 'class')
            
#             # Easy apply status
#             easy_apply = get_value(driver, '//*[@id="MainCol"]/div[1]/ul/li[3]/div/div/a/div[1]/div[5]/div', 'easy apply', 'xpath')

#             # Expand 'Show More' option for job description
#             driver.find_element_by_class_name('css-t3xrds').click()
            
#             # Job description
#             job_desc = get_value(driver, 'jobDescriptionContent', 'Job Description', 'class')

#             # Salary. Handles for missing elements
#             # Salary
#             salary = get_value(driver, 'css-1xe2xww', 'Salary', 'class')

#             # Company Information. Handles for missing elements
#             # Company rating
#             rating = get_value(driver, 'css-1m5m32b', 'Company Rating', 'class')

#             # Company size
#             company_size = get_value(driver, '//div[@id="EmpBasicInfo"]/div[1]/div/div[1]/span[2]', 'Company Size', 'xpath')
            
#             # Company type
#             company_type = get_value(driver, '//div[@id="EmpBasicInfo"]/div[1]/div/div[3]/span[2]', 'Company Type', 'xpath')

#             # Company sector
#             company_sector = get_value(driver, '//div[@id="EmpBasicInfo"]/div[1]/div/div[5]/span[2]', 'Company Sector', 'xpath')

#             # Company year founded
#             year_founded = get_value(driver, '//div[@id="EmpBasicInfo"]/div[1]/div/div[2]/span[2]', 'Year Founded', 'xpath')

#             # Company Industry
#             company_industry = get_value(driver, '//div[@id="EmpBasicInfo"]/div[1]/div/div[4]/span[2]', 'Company Industry', 'xpath')

#             # Company Revenue
#             company_revenue = get_value(driver, '//div[@id="EmpBasicInfo"]/div[1]/div/div[6]/span[2]', 'Company Revenue', 'xpath')
            
#             if test:  # If test, print outputs 
#                 print(f'Job: {i + 1}')
#                 print(f'Company Name: {company_name}')
#                 print(f'Job Title: {job_name}')
#                 print(f'Location: {location}')
#                 print(f'East Apply Status: {easy_apply}')
#                 print(f'Job Description: {job_desc[:20]}')
#                 print(f'Salary: {salary}')
#                 print(f'Rating: {rating}')
#                 print(f'Company Size: {company_size}')
#                 print(f'Company Type: {company_type}')
#                 print(f'Company Sector: {company_sector}')
#                 print(f'Year Founded: {year_founded}')
#                 print(f'Company Industry: {company_industry}')
#                 print(f'Company Revenue: {company_revenue}')
#                 print('\n')
#             else:
#                 job_info['Company'] = company_name
#                 job_info['JobTitle'] = job_name
#                 job_info['JobLocation'] = location
#                 job_info['JobDescription'] = job_desc
#                 job_info['JobSalary'] = salary
#                 job_info['CompanyRating'] = rating
#                 job_info['CompanySize'] = company_size
#                 job_info['CompanyType'] = company_type
#                 job_info['CompanySector'] = company_sector
#                 job_info['CompanyYearFounded'] = year_founded
#                 job_info['CompanyIndustry'] = company_industry
#                 job_info['CompanyRevenue'] = company_revenue
#         except Exception as e:
#             print(e)
#             time.sleep(4)

NoSuchWindowException: Message: no such window: target window already closed
from unknown error: web view not found
  (Session info: MicrosoftEdge=114.0.1823.67)


In [None]:
# driver.get('https://www.soleretriever.com/raffles')

# elem = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, '//*[@id="LoginModal"]/div/div/div[2]/button')))

# action = ctionChains(driver)

# action.move_to_element(elem).move_by_offset(250, 0).click().perform()