In [5]:
import os
import time
import numpy as np

from selenium import webdriver  # Used for opening up a web browser
from selenium.common.exceptions import NoSuchElementException  # Used for when element not found
from selenium.common.exceptions import ElementClickInterceptedException  # Used for an element obscuring another
from selenium.webdriver.common.by import By  # For condition for finding element
from selenium.webdriver.common.keys import Keys  # Allows access to all keyboard keys such as alt, tab, enter, etc.
from selenium.webdriver.support.ui import WebDriverWait  # For explicit wait
from selenium.webdriver.support import expected_conditions as EC  # For explicit wait condition

In [6]:
def exit_prompt(driver):
    '''
    Tries to click out of sign in prompt from glassdoor.
    Sign in prompt does not trigger when page loads, only after a job posting is clicked (does not appear again after clicking X)
    '''
    try:
        exit = driver.find_element_by_class_name('modal_closeIcon')
        exit.click()
    except NoSuchElementException:
        print('No Sign In Prompt Detected')

In [7]:
def get_data(rows, test=False):
    '''
    Grabs rows of data 
    '''
    url = 'https://www.glassdoor.com/Job/data-scientist-jobs-SRCH_KO0,14.htm'
    driver = webdriver.Edge()  # Using Edge browser
    driver.get(url)
    all_jobs = {}

    # For each page, things need to be reset.
    # How to know when a page is done? When the last job post is the post you are clicking on
    # while last job false:
        # grab job posts
        # for each job post on each page:
        # last job boolean -> last job true false
        # if true click next page, reset the job posts
    for i in range(rows):
        company_name = ''
        job_name = ''
        location = ''
        job_desc = ''
        salary = ''
        rating = ''
        company_size = ''
        company_type = ''
        company_sector = ''
        year_founded = ''
        company_industry = ''
        company_revenue = ''

        job_posts = driver.find_elements_by_class_name('react-job-listing')  # Grab all posts
        job_posts[0].click()  # Click on first post to trigger prompt
        exit_prompt(driver)  # Exit prompt

        for j in range(len(job_posts)):  # Begin iteration
            job_posts[j].click()
            time.sleep(2)  # Wait to prevent bot detection 

            try:  # Attempt at acquiring information
                # Basic Job Information
                company_name = driver.find_element_by_class_name('css-xuk5ye').text.split('\n')[0]
                job_name = driver.find_element_by_class_name('css-1j389vi').text
                location = driver.find_element_by_class_name('css-56kyx5').text
                driver.find_element_by_class_name('css-t3xrds').click()
                job_desc = driver.find_element_by_class_name('jobDescriptionContent').text

                # Salary and Company Rating
                try:  # If salary estimate exists
                    salary = driver.find_element_by_class_name('css-1hbqxax').text
                except NoSuchElementException:
                    salary = -1
                try:  # If rating exists
                    rating = driver.find_element_by_class_name('css-ey2fjr').text
                except:
                    rating = -1

                # Company Information
                try: # Separate try except for each company info, or else all will default to -1 if even one piece of information is missing.
                    company_size = driver.find_element_by_xpath('//div[@id="EmpBasicInfo"]/div[1]/div/div[1]/span[2]').text
                except NoSuchElementException:
                    -1
                try:
                    company_type = driver.find_element_by_xpath('//div[@id="EmpBasicInfo"]/div[1]/div/div[3]/span[2]').text
                except NoSuchElementException:
                    -1
                try:
                    company_sector = driver.find_element_by_xpath('//div[@id="EmpBasicInfo"]/div[1]/div/div[5]/span[2]').text
                except NoSuchElementException:
                    -1
                try:
                    year_founded = driver.find_element_by_xpath('//div[@id="EmpBasicInfo"]/div[1]/div/div[2]/span[2]').text
                except NoSuchElementException:
                    -1
                try:
                    company_industry = driver.find_element_by_xpath('//div[@id="EmpBasicInfo"]/div[1]/div/div[4]/span[2]').text
                except NoSuchElementException:
                    -1
                try:
                    company_revenue = driver.find_element_by_xpath('//div[@id="EmpBasicInfo"]/div[1]/div/div[6]/span[2]').text
                except NoSuchElementException:
                    -1
                if test:  # If testing, then print outputs
                    print('Company Name: ', company_name)
                    print('Job Name: ', job_name)
                    print('Location: ', location)
                    print('Job Description: ', job_desc[:20])
                    print('Salary:', salary)
                    print('Rating:' , rating)
                    print('Company Size: ', company_size)
                    print('Company Type: ', company_type)
                    print('Company Sector: ', company_sector)
                    print('Year Founded: ', year_founded)
                    print('Company Industry: ', company_industry)
                    print('Company Revenue: ', company_revenue)
                    print('\n')

                all_jobs.append({
                    'Company Name': company_name,
                    'Job Name': job_name,
                    'Location': location,
                    'Job Description': job_desc,
                    'Salary': salary,
                    'Rating': rating,
                    'Company Size': company_size,
                    'Company Type': company_type,
                    'Company Sector': company_sector,
                    'Year Founded': year_founded,
                    'Company Industry': company_industry,
                    'Company Revenue': company_revenue
                })

            except:
                time.sleep(4)

In [8]:
get_data(rows=30, test=True)

Company Name:  American Capital Group
Job Name:  Data Scientist
Location:  Bellevue, WA
Job Description:  e are adding a Data
Salary: Employer Provided Salary:$160K - $175K
Rating: 4.2
Company Size:  201 to 500 Employees
Company Type:  Company - Private
Company Sector:  Real Estate
Year Founded:  1986
Company Industry:  Real Estate
Company Revenue:  Unknown / Non-Applicable


Company Name:  Kadence International
Job Name:  Junior Data Scientist
Location:  Remote
Job Description:  verview
Title: Juni
Salary: Employer Provided Salary:$45K - $60K
Rating: -1
Company Size:  201 to 500 Employees
Company Type:  Company - Private
Company Sector:  Real Estate
Year Founded:  1986
Company Industry:  Real Estate
Company Revenue:  Unknown / Non-Applicable


Company Name:  Innovative Precision Health
Job Name:  Senior Data Scientist
Location:  Remote
Job Description:  ob Responsibilities
Salary: Employer Provided Salary:$150K
Rating: 3.2
Company Size:  1 to 50 Employees
Company Type:  Unknown / Non-

KeyboardInterrupt: 

In [None]:
stop

In [None]:
# url = 'https://www.glassdoor.com/Job/data-scientist-jobs-SRCH_KO0,14.htm'
# driver = webdriver.Edge()  # Using Edge browser
# driver.get(url)
# time.sleep(3)  # Wait for page to load

# all_jobs = []
# job_posts = driver.find_elements_by_class_name('react-job-listing')  # Job posts
# job_posts[0].click()  # Trigger the sign in prompt
# time.sleep(2)  # Wait for sign in prompt to load
# exit_prompt(driver)  # Exit the sign in prompt
# for job_post in job_posts:  # For each tile
#     job_details = []
#     job_post.click()
#     try:  # Attempting to grab the first job listing's details
#         print('Entering Try')
#         rand = driver.find_element_by_class('css-xuk5ye').text
#         print(rand)
#         # company_size, company_type, company_sector, year_founded, company_industry, company_revenue = company_info(driver)
#         # company_name, job_name, location, job_desc = basic_info(driver)
#         # salary, rating = salary_rating(driver)
#     except:
#         time.sleep(3)

#     time.sleep(1)

Entering Try
Entering Try
Entering Try


KeyboardInterrupt: 

In [None]:

# url = 'https://www.glassdoor.com/Job/data-scientist-jobs-SRCH_KO0,14.htm'
# driver = webdriver.Edge()  # Using Edge browser
# driver.get(url)

# exit_prompt(driver)
# all_jobs = []
# job_posts = driver.find_elements_by_class_name('react-job-listing')
# for job_post in job_posts:
#     job_details = []
#     exit_prompt(driver)
#     job_post.click()
#     time.sleep(4)
#     try:  # Attempting to grab the first job listing's details
        
#         company_name = driver.find_element_by_class_name('css-xuk5ye').text.split('\n')[0]
#         job_name = driver.find_element_by_class_name('css-1j389vi').text
#         location = driver.find_element_by_class_name('css-56kyx5').text
#         driver.find_element_by_class_name('css-t3xrds').click()
#         job_desc = driver.find_element_by_class_name('jobDescriptionContent').text

#         print('Company Name: ', company_name)
#         print('Job Name: ', job_name)
#         print('Location: ', location)
#         print('Job Description: ', job_desc[:-100])
#         print('\n')

#         job_details.append(company_name, job_name, location)

#     except:
#         time.sleep(4)









KeyboardInterrupt: 

In [None]:
# def get_data(rows, test=False):
#     '''
#     Grabs rows of data 
#     '''
#     url = 'https://www.glassdoor.com/Job/data-scientist-jobs-SRCH_KO0,14.htm'
#     driver = webdriver.Edge()  # Using Edge browser
#     driver.get(url)
#     all_jobs = {}

#     for i in range(rows):
#         company_name = ''
#         job_name = ''
#         location = ''
#         job_desc = ''
#         salary = ''
#         rating = ''
#         company_size = ''
#         company_type = ''
#         company_sector = ''
#         year_founded = ''
#         company_industry = ''
#         company_revenue = ''

#         job_posts = driver.find_elements_by_class_name('react-job-listing')  # Grab all posts
#         job_posts[0].click()  # Click on first post to trigger prompt
#         exit_prompt(driver)  # Exit prompt

#         for j in range(len(job_posts)):  # Begin iteration
#             job_posts[j].click()
#             time.sleep(2)  # Wait to prevent bot detection 

#             try:  # Attempt at acquiring information
#                 # Basic Job Information
#                 company_name = driver.find_element_by_class_name('css-xuk5ye').text.split('\n')[0]
#                 job_name = driver.find_element_by_class_name('css-1j389vi').text
#                 location = driver.find_element_by_class_name('css-56kyx5').text
#                 driver.find_element_by_class_name('css-t3xrds').click()
#                 job_desc = driver.find_element_by_class_name('jobDescriptionContent').text

#                 # Salary and Company Rating
#                 try:  # If salary estimate exists
#                     salary = driver.find_element_by_class_name('css-1hbqxax').text
#                 except NoSuchElementException:
#                     salary = -1
#                 try:  # If rating exists
#                     rating = driver.find_element_by_class_name('css-ey2fjr').text
#                 except:
#                     rating = -1

#                 # Company Information
#                 try: # Separate try except for each company info, or else all will default to -1 if even one piece of information is missing.
#                     company_size = driver.find_element_by_xpath('//div[@id="EmpBasicInfo"]/div[1]/div/div[1]/span[2]').text
#                 except NoSuchElementException:
#                     -1
#                 try:
#                     company_type = driver.find_element_by_xpath('//div[@id="EmpBasicInfo"]/div[1]/div/div[3]/span[2]').text
#                 except NoSuchElementException:
#                     -1
#                 try:
#                     company_sector = driver.find_element_by_xpath('//div[@id="EmpBasicInfo"]/div[1]/div/div[5]/span[2]').text
#                 except NoSuchElementException:
#                     -1
#                 try:
#                     year_founded = driver.find_element_by_xpath('//div[@id="EmpBasicInfo"]/div[1]/div/div[2]/span[2]').text
#                 except NoSuchElementException:
#                     -1
#                 try:
#                     company_industry = driver.find_element_by_xpath('//div[@id="EmpBasicInfo"]/div[1]/div/div[4]/span[2]').text
#                 except NoSuchElementException:
#                     -1
#                 try:
#                     company_revenue = driver.find_element_by_xpath('//div[@id="EmpBasicInfo"]/div[1]/div/div[6]/span[2]').text
#                 except NoSuchElementException:
#                     -1
#                 if test:  # If testing, then print outputs
#                     print('Company Name: ', company_name)
#                     print('Job Name: ', job_name)
#                     print('Location: ', location)
#                     print('Job Description: ', job_desc[:20])
#                     print('Salary:', salary)
#                     print('Rating:' , rating)
#                     print('Company Size: ', company_size)
#                     print('Company Type: ', company_type)
#                     print('Company Sector: ', company_sector)
#                     print('Year Founded: ', year_founded)
#                     print('Company Industry: ', company_industry)
#                     print('Company Revenue: ', company_revenue)
#                     print('\n')

#                 all_jobs.append({
#                     'Company Name': company_name,
#                     'Job Name': job_name,
#                     'Location': location,
#                     'Job Description': job_desc,
#                     'Salary': salary,
#                     'Rating': rating,
#                     'Company Size': company_size,
#                     'Company Type': company_type,
#                     'Company Sector': company_sector,
#                     'Year Founded': year_founded,
#                     'Company Industry': company_industry,
#                     'Company Revenue': company_revenue
#                 })

#             except:
#                 time.sleep(4)