In [66]:
# library here
import os
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup as soup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

In [68]:
# point ChromeDriver's path using Service
# you need to download chromedriver in advance to use Selenium
# the default below is the location on one particular machine; instead of editing
# the code, set the CHROMEDRIVER_PATH environment variable to where your
# chromedriver.exe lives
chromedriver_path = os.environ.get(
    "CHROMEDRIVER_PATH",
    r"C:\Users\Owner\Downloads\chromedriver-win64\chromedriver-win64\chromedriver.exe",  # r"" is a raw string literal
)
service = Service(chromedriver_path)

# activate webdriver (Selenium)
driver = webdriver.Chrome(service=service)

# search terms for Simply Hired; spaces are written as "+" because the values
# are placed directly into the query string below
occupation = "data+scientist" # define occupation you want to search
locate = "united+states" # define location you want to search

# the final url of simply hired
url = f'https://www.simplyhired.com/search?q={occupation}&l={locate}'

# this line tells Selenium to open the URL in the browser (Selenium)
driver.get(url)

In [70]:
# list for storing job posts (get_jobs appends one dict per posting)
job_list = []


def _text_or_none(node):
    """Return the node's text with surrounding whitespace stripped, or None if node is None."""
    return node.text.strip() if node is not None else None


def _joined_texts(section, item_testid):
    """Join the text of every <span data-testid=item_testid> inside section with ", ".

    Returns None when section is None; returns "" when the section exists but
    holds no matching items.
    """
    if section is None:
        return None
    items = section.find_all("span", {"data-testid": item_testid})
    return ", ".join(item.text.strip() for item in items)


def _get_job_details(detail_url):
    """Open one posting's page, scrape its detail fields, then go back to the result page.

    Returns a dict with the keys "work_style", "benefit", "qualification" and
    "full description"; a field that is not found on the page is None.
    Raises TimeoutException if the result list does not reappear within
    3 seconds after navigating back.
    """
    driver.get(detail_url)  # move to the posting's page using Selenium

    # wait until the detail text is rendered; if it never shows up, still parse
    # whatever was loaded instead of aborting the whole scrape
    try:
        WebDriverWait(driver, 3).until(EC.presence_of_element_located((By.CSS_SELECTOR, "span[data-testid='detailText']")))
    except TimeoutException:
        pass

    detail_soup = soup(driver.page_source, 'html.parser')

    job_type = detail_soup.find('span', {'data-testid': 'viewJobBodyJobDetailsJobType'})
    work_style = None
    if job_type is not None:
        work_style = _text_or_none(job_type.find("span", {"data-testid": "detailText"}))

    details = {
        "work_style": work_style,
        "benefit": _joined_texts(
            detail_soup.find("div", {'data-testid': 'viewJobBodyJobBenefits'}),
            "viewJobBenefitItem",
        ),
        "qualification": _joined_texts(
            detail_soup.find("div", {'data-testid': 'viewJobQualificationsContainer'}),
            "viewJobQualificationItem",
        ),
        "full description": _text_or_none(detail_soup.find('div', {'class': 'css-cxpe4v'})),
    }

    # after collecting the job details, go back to the previous page (the job listing page)
    driver.back()

    # wait until the listing page is loaded again
    WebDriverWait(driver, 3).until(EC.presence_of_element_located((By.XPATH, "//ul[@class='css-13ia03s']")))

    return details


# function to collect information in each job post
def get_jobs():
    """Scrape every job post on the result page currently open in `driver`.

    For each <li class="css-0"> in the result list, one dict is appended to the
    module-level `job_list` with the keys company_name, title, location,
    summary, salary, star, work_style, benefit, qualification and
    "full description". Fields that cannot be found are stored as None.

    List items without a title link are still recorded, with every detail
    field set to None, because there is no posting page to visit for them.
    If the result list itself is missing, a message is printed and nothing
    is collected.
    """
    base_url = "https://www.simplyhired.com"

    # parse the HTML of the currently loaded page (Selenium -> BeautifulSoup)
    listing_soup = soup(driver.page_source, 'html.parser')

    # get all job lists in a page
    listing = listing_soup.find("ul", {'class': 'css-13ia03s'})
    if listing is None:
        print("Job list not found on the current page; nothing collected")
        return
    job_posts = listing.find_all('li', {'class': 'css-0'})

    # loop through all job lists and collect information
    for job in job_posts:
        # the title link supplies both the job title and the posting's URL
        title_link = job.find('a', {'class': "chakra-button css-1djbb1k"})

        # the rating is nested one level deeper than the other fields
        star_box = job.find('span', {'class': 'css-epvm6'})
        star = None
        if star_box is not None:
            star = _text_or_none(star_box.find("span", {"class": "css-0"}))

        job_post = {
            "company_name": _text_or_none(job.find('span', {'class': "css-lvyu5j"})),
            "title": _text_or_none(title_link),
            "location": _text_or_none(job.find('span', {'class': "css-1t92pv"})),
            "summary": _text_or_none(job.find('p', {'class': 'chakra-text css-jhqp7z'})),
            "salary": _text_or_none(job.find('p', {'class': 'chakra-text css-1g1y608'})),
            "star": star,
        }

        # get further information by going to each company's job post link;
        # without a link there is nothing to visit, so the fields stay None
        detail_href = title_link.get("href") if title_link is not None else None
        if detail_href:
            job_post.update(_get_job_details(base_url + detail_href))
        else:
            job_post.update(dict.fromkeys(("work_style", "benefit", "qualification", "full description")))

        # add the job post to the job list
        job_list.append(job_post)



In [72]:
# number of result pages to collect in total, including the first one (adjust by yourself)
MAX_PAGES = 3

# get the job posts in the first page
page = 1
get_jobs()

# display the current page number and URL
print(f"Collecting data from page {page}, Current URL: {url}")

while page < MAX_PAGES:  # collect job information page by page
    # move to next page
    # the "next page" button is an anchor (<a>) tag whose aria-label attribute is 'Next page'
    try:
        next_button = driver.find_element(By.XPATH, "//a[@aria-label='Next page']")
    except NoSuchElementException:
        print("No More Additional Pages")
        break  # if there is no next page, we get outside of the loop

    listing_url = driver.current_url

    try:
        # wait until the button is clickable
        WebDriverWait(driver, 3).until(EC.element_to_be_clickable(next_button))

        # click the next button
        next_button.click()

        # wait until the browser has actually left the current result page;
        # the old page's job list is still present right after the click, so
        # checking for the list alone could let us scrape the same page twice
        WebDriverWait(driver, 3).until(EC.url_changes(listing_url))

        # wait until the next page's job list is present
        # job posts live in the <ul> tag identified by class='css-13ia03s'
        WebDriverWait(driver, 3).until(EC.presence_of_element_located((By.XPATH, "//ul[@class='css-13ia03s']")))
    except TimeoutException as e:
        print(f"Stopped: page {page + 1} did not load in time", e)
        break

    # get all job information in this page; keep what was collected so far
    # if the scraper fails, and report the real reason
    try:
        get_jobs()
    except Exception as e:
        print(f"Stopped: error while collecting page {page + 1}", e)
        break

    # wait for a little bit
    time.sleep(1)

    page += 1
    # display the current page number and URL
    print(f"Collecting data from page {page}, Current URL: {driver.current_url}")

Collecting data from page 1, Current URL: https://www.simplyhired.com/search?q=data+scientist&l=united+states
Collecting data from page 2, Current URL: https://www.simplyhired.com/search?q=data+scientist&l=united+states&cursor=ABQAAQAUAAAAAAAAAAAAAAACQBfcsgEBAQcAgZe1PxMVXsJ%2FLTGpnURsoselL7m6MMSHRj%2B21tjo1%2BzUc58ZwlSsGFXAXGqrY2VX31U%3D
Collecting data from page 3, Current URL: https://www.simplyhired.com/search?q=data+scientist&l=united+states&cursor=ABQAAgAoAAAAAAAAAAAAAAACQBfcsgEBAQgRaCNX8nWD9N3yQ80GxiqxIqIownKoMF99e%2FKvWLyX2Ros%2B09NdiqkQK%2FrFQJHJ3qj3bvyz%2FmvpdQOY9aSVvU9S8hUz1Pyzsi9jadGPl9TajXzINaR2LkAEGJLMHoe8E98


In [74]:
# turn the collected records into a table, keeping only the rows
# that actually have a company name
df = pd.DataFrame(job_list).dropna(subset=['company_name'])
df.head()

Unnamed: 0,company_name,title,location,summary,salary,star,work_style,benefit,qualification,full description
0,DataAnnotation —,Data Scientist - AI Trainer,"Elkhart, IN",You can work on your own schedule. A bachelor'...,From $40 an hour,4.0,Contract,Flexible schedule,"Writing skills, C#, English, Mid-level, SQL, C...",We are looking for proficient programmers to j...
1,Macquarie Group Limited —,Senior Data Scientist | Commodities,"Houston, TX","In this role you will work with traders, analy...",,3.8,,Employee assistance program,"Power BI, Doctoral degree, Trading, 5 years, G...","Join Macquarie’s North American Power, Gas and..."
2,Deloitte —,"Senior Data Scientist, Molecular Modeling and ...","Chicago, IL",Research and implement novel machine learning ...,,3.9,Full-time,,"TensorFlow, Azure, Doctoral degree, Kubernetes...","Senior Data Scientist, Molecular Modeling and ..."
3,Outlier Ai —,Law Expertise Sought for AI Training - AI Trainer,"Las Vegas, NV",Crafting and answering questions related to La...,$30 - $50 an hour,2.4,Contract | Freelance,,"Law, English, Mid-level, Bachelor's degree",Outlier helps the world’s most innovative comp...
4,Outlier Ai —,Data Scientist - AI Trainer,American Samoa,About the opportunity: Outlier is looking for ...,$30 - $50 an hour,2.4,Freelance,,"Mathematics, Mid-level, Math, Bachelor's degree",Outlier helps the world’s most innovative comp...


In [76]:
# print a summary of the table: index range, each column's non-null count and dtype, memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80 entries, 0 to 79
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   company_name      80 non-null     object
 1   title             80 non-null     object
 2   location          80 non-null     object
 3   summary           80 non-null     object
 4   salary            67 non-null     object
 5   star              69 non-null     object
 6   work_style        52 non-null     object
 7   benefit           50 non-null     object
 8   qualification     80 non-null     object
 9   full description  80 non-null     object
dtypes: object(10)
memory usage: 6.4+ KB
