In [1]:
import os
import time
import numpy as np

from selenium import webdriver  # Used for opening up a web browser
from selenium.common.exceptions import NoSuchElementException  # Used for when element not found
from selenium.common.exceptions import ElementClickInterceptedException  # Used for an element obscuring another
from selenium.webdriver.common.by import By  # For condition for finding element
from selenium.webdriver.common.keys import Keys  # Allows access to all keyboard keys such as alt, tab, enter, etc.
from selenium.webdriver.support.ui import WebDriverWait  # For explicit wait
from selenium.webdriver.support import expected_conditions as EC  # For explicit wait condition

In [2]:
def exit_prompt(driver):
    '''
    Tries to click out of sign in prompt from glassdoor.
    Sign in prompt does not trigger when page loads, only after a job posting is clicked (does not appear again after clicking X)
    '''
    try:
        exit = driver.find_element_by_class_name('modal_closeIcon')
        exit.click()
    except NoSuchElementException:
        print('No Sign In Prompt Detected')

In [3]:
# //*[@id="MainCol"]/div[1]/ul/li[1]/div[2]/div[1]/a/span

In [4]:
def basic_info(driver):
    '''
    Grabs basic information of company such as company name, job name, location, and job description. All job postings have this information.
    '''
    company_name = ''
    job_name = ''
    location = ''
    job_desc = ''
    try:  # Attempting to grab the first job listing's details
        print('entering')
        # 
        # 
        # 
        # driver.find_element_by_class_name('css-t3xrds').click()
        # 
        # company_name = driver.find_element_by_xpath('//div[@id="JDCol"]/div/article/div/div[1]/div/div/div[1]/div[3]/div[1]/div[1]').text
        company_name = driver.find_element_by_class_name('css-xuk5ye').text.split('\n')[0]
        print('Company Name: ', company_name)

        job_name = driver.find_element_by_class_name('css-1j389vi').text
        print('Job Name: ', job_name)

        print('Could not find job name')

        location = driver.find_element_by_class_name('css-56kyx5').text
        print('Location: ', location)

        job_desc = driver.find_element_by_class_name('jobDescriptionContent').text
        print('Job Description: ', job_desc[:100])
    except:
        time.sleep(4)
    # print('Company Name: ', company_name)
    # print('Job Name: ', job_name)
    # print('Location: ', location)
    # print('Job Description: ', job_desc[:100])

    return company_name, job_name, location, job_desc

In [5]:
def salary_rating(driver):
    '''
    Try except handling for information on jobs that could be missing from the posting. If element does not exist, value is set to -1.
    Salary, rating, 
    '''
    salary = ''
    rating = ''
    try:  # If salary estimate exists
        salary = driver.find_element_by_class_name('css-1hbqxax').text
    except NoSuchElementException:
        salary = -1
    try:  # If rating exists
        rating = driver.find_element_by_class_name('css-ey2fjr').text
    except:
        rating = -1

    print('Salary:', salary)
    print('Rating:' , rating)
    
    return salary, rating

In [6]:
def company_info(driver):
    '''
    Clicks on company information and tries to extract information such as company size, company type, year founded, etc.
    '''
    company_size = '' 
    company_type = ''
    company_sector = ''
    year_founded = ''
    company_industry = ''
    company_revenue = ''
    
    # try:  # If company tab exists
    #     print('Before Click: ')
    #     driver.find_element_by_xpath('//div[@id="SerpFixedHeader"]/div/div/div[3]').click()
    #     print('After Click: ')
    try: # Separate try except for each company info, or else all will default to -1 if even one piece of information is missing.
        company_size = driver.find_element_by_xpath('//div[@id="EmpBasicInfo"]/div[1]/div/div[1]/span[2]').text
        print('Company Size: ', company_size)
    except NoSuchElementException:
        -1
    try:
        company_type = driver.find_element_by_xpath('//div[@id="EmpBasicInfo"]/div[1]/div/div[3]/span[2]').text
        print('Company Type: ', company_type)
    except NoSuchElementException:
        -1
    try:
        company_sector = driver.find_element_by_xpath('//div[@id="EmpBasicInfo"]/div[1]/div/div[5]/span[2]').text
        print('Company Sector: ', company_sector)
    except NoSuchElementException:
        -1
    try:
        year_founded = driver.find_element_by_xpath('//div[@id="EmpBasicInfo"]/div[1]/div/div[2]/span[2]').text
        print('Year Founded: ', year_founded)
    except NoSuchElementException:
        -1
    try:
        company_industry = driver.find_element_by_xpath('//div[@id="EmpBasicInfo"]/div[1]/div/div[4]/span[2]').text
        print('Company Industry: ', company_industry)
    except NoSuchElementException:
        -1
    try:
        company_revenue = driver.find_element_by_xpath('//div[@id="EmpBasicInfo"]/div[1]/div/div[6]/span[2]').text
        print('Company Revenue: ', company_revenue)
    except NoSuchElementException:
        -1
    # except NoSuchElementException:  # In the case of no company tab.
    #     company_size = -1
    #     company_type = -1
    #     company_sector = -1
    #     year_founded = -1
    #     company_industry = -1
    #     company_revenue = -1
    print('Company Size After: ', company_size)
    print('Company Type After: ', company_type)
    print('Company Sector After: ', company_sector)
    print('Year Founded: After', year_founded)
    print('Company Industry After: ', company_industry)
    print('Company Revenue After: ', company_revenue)
    
    return company_size, company_type, company_sector, year_founded, company_industry, company_revenue

In [7]:
url = 'https://www.glassdoor.com/Job/data-scientist-jobs-SRCH_KO0,14.htm'
driver = webdriver.Edge()  # Using Edge browser
driver.get(url)

company_name = ''
job_name = ''
location = ''
job_desc = ''
salary = ''
rating = ''
company_size = ''
company_type = ''
company_sector = ''
year_founded = ''
company_industry = ''
company_revenue = ''

all_jobs = {}
job_posts = driver.find_elements_by_class_name('react-job-listing')
job_posts[0].click()  # Click on first post to trigger prompt
exit_prompt(driver)  # Exit prompt

for job_post in job_posts:  # Begin iteration
    job_post.click()
    time.sleep(2)  # Wait to prevent bot detection 

    
    try:  # Attempt at acquiring information
        # Basic Job Information
        company_name = driver.find_element_by_class_name('css-xuk5ye').text.split('\n')[0]
        job_name = driver.find_element_by_class_name('css-1j389vi').text
        location = driver.find_element_by_class_name('css-56kyx5').text
        driver.find_element_by_class_name('css-t3xrds').click()
        job_desc = driver.find_element_by_class_name('jobDescriptionContent').text

        print('\n')
        print('Company Name: ', company_name)
        print('Job Name: ', job_name)
        print('Location: ', location)
        print('Job Description: ', job_desc[1:20])

        # Salary and Company Rating
        try:  # If salary estimate exists
            salary = driver.find_element_by_class_name('css-1hbqxax').text
        except NoSuchElementException:
            salary = -1
        try:  # If rating exists
            rating = driver.find_element_by_class_name('css-ey2fjr').text
        except:
            rating = -1

        print('Salary:', salary)
        print('Rating:' , rating)

        # Company Information
        try: # Separate try except for each company info, or else all will default to -1 if even one piece of information is missing.
            company_size = driver.find_element_by_xpath('//div[@id="EmpBasicInfo"]/div[1]/div/div[1]/span[2]').text
        except NoSuchElementException:
            -1
        try:
            company_type = driver.find_element_by_xpath('//div[@id="EmpBasicInfo"]/div[1]/div/div[3]/span[2]').text
        except NoSuchElementException:
            -1
        try:
            company_sector = driver.find_element_by_xpath('//div[@id="EmpBasicInfo"]/div[1]/div/div[5]/span[2]').text
        except NoSuchElementException:
            -1
        try:
            year_founded = driver.find_element_by_xpath('//div[@id="EmpBasicInfo"]/div[1]/div/div[2]/span[2]').text
        except NoSuchElementException:
            -1
        try:
            company_industry = driver.find_element_by_xpath('//div[@id="EmpBasicInfo"]/div[1]/div/div[4]/span[2]').text
        except NoSuchElementException:
            -1
        try:
            company_revenue = driver.find_element_by_xpath('//div[@id="EmpBasicInfo"]/div[1]/div/div[6]/span[2]').text
        except NoSuchElementException:
            -1
        print('Company Size After: ', company_size)
        print('Company Type After: ', company_type)
        print('Company Sector After: ', company_sector)
        print('Year Founded: After', year_founded)
        print('Company Industry After: ', company_industry)
        print('Company Revenue After: ', company_revenue)
    
        all_jobs.append({
            'Company Name': company_name,
            'Job Name': job_name,
            'Location': location,
            'Job Description': job_desc,
            'Salary': salary,
            'Rating': rating,
            'Company Size': company_size,
            'Company Type': company_type,
            'Company Sector': company_sector,
            'Year Founded': year_founded,
            'Company Industry': company_industry,
            'Company Revenue': company_revenue
        })

    except:
        time.sleep(4)



Company Name:  American Capital Group
Job Name:  Data Scientist
Location:  Bellevue, WA
Job Description:  e are adding a Data
Salary: Employer Provided Salary:$160K - $175K
Rating: 4.2
Company Size After:  201 to 500 Employees
Company Type After:  Company - Private
Company Sector After:  Real Estate
Year Founded: After 1986
Company Industry After:  Real Estate
Company Revenue After:  Unknown / Non-Applicable


Company Name:  RunBuggy
Job Name:  Senior Data Scientist - Remote
Location:  San Francisco, CA
Job Description:  unBuggy is a techno
Salary: $98K - $169K (Glassdoor est.)
Rating: 4.0
Company Size After:  51 to 200 Employees
Company Type After:  Company - Private
Company Sector After:  Information Technology
Year Founded: After 2016
Company Industry After:  Information Technology Support Services
Company Revenue After:  Unknown / Non-Applicable


Company Name:  GALE Partners
Job Name:  Senior Data Scientist
Location:  New York, NY
Job Description:  bout GALE: GALE is 
Salary: $9

KeyboardInterrupt: 

In [None]:
stop

In [None]:
url = 'https://www.glassdoor.com/Job/data-scientist-jobs-SRCH_KO0,14.htm'
driver = webdriver.Edge()  # Using Edge browser
driver.get(url)
time.sleep(3)  # Wait for page to load

all_jobs = []
job_posts = driver.find_elements_by_class_name('react-job-listing')  # Job posts
job_posts[0].click()  # Trigger the sign in prompt
time.sleep(2)  # Wait for sign in prompt to load
exit_prompt(driver)  # Exit the sign in prompt
for job_post in job_posts:  # For each tile
    job_details = []
    job_post.click()
    try:  # Attempting to grab the first job listing's details
        print('Entering Try')
        rand = driver.find_element_by_class('css-xuk5ye').text
        print(rand)
        # company_size, company_type, company_sector, year_founded, company_industry, company_revenue = company_info(driver)
        # company_name, job_name, location, job_desc = basic_info(driver)
        # salary, rating = salary_rating(driver)
    except:
        time.sleep(3)

    time.sleep(1)

Entering Try
Entering Try
Entering Try


KeyboardInterrupt: 

In [None]:

# url = 'https://www.glassdoor.com/Job/data-scientist-jobs-SRCH_KO0,14.htm'
# driver = webdriver.Edge()  # Using Edge browser
# driver.get(url)

# exit_prompt(driver)
# all_jobs = []
# job_posts = driver.find_elements_by_class_name('react-job-listing')
# for job_post in job_posts:
#     job_details = []
#     exit_prompt(driver)
#     job_post.click()
#     time.sleep(4)
#     try:  # Attempting to grab the first job listing's details
        
#         company_name = driver.find_element_by_class_name('css-xuk5ye').text.split('\n')[0]
#         job_name = driver.find_element_by_class_name('css-1j389vi').text
#         location = driver.find_element_by_class_name('css-56kyx5').text
#         driver.find_element_by_class_name('css-t3xrds').click()
#         job_desc = driver.find_element_by_class_name('jobDescriptionContent').text

#         print('Company Name: ', company_name)
#         print('Job Name: ', job_name)
#         print('Location: ', location)
#         print('Job Description: ', job_desc[:-100])
#         print('\n')

#         job_details.append(company_name, job_name, location)

#     except:
#         time.sleep(4)









KeyboardInterrupt: 

In [None]:
print(len(job_posts))

30


In [None]:
# may need to deal with the salaries and company ratings missing at some point. 

In [None]:
# WebDriverWait(driver=driver, timeout=30).until(
#     EC.text_to_be_present_in_element(
#         (By.CLASS_NAME, 'progress-label'),  # First argument is the element that you want to check for condition. This is the same method of finding elements as find_element_by_name
#         'Complete!'  # Second argument is the text you expect to have after 30 seconds
#     )

In [None]:
dasdsads

NameError: name 'dasdsads' is not defined