# Web Scraping - Indeed.com

## Configure scraping settings

NOTE: Before running this notebook, edit only the settings in the cell below; leave every other cell unchanged.

In [None]:
# 1. Job position: search keywords placed in the `q` parameter of the search URL.
# e.g. data analyst, data scientist, data engineer, data developer, manager of analytics, director of analytics
position = "director of analytics"

# 2. Job location: placed in the `l` parameter of the search URL.
locations = "Toronto, ON"

# 3. Number of postings to scrape (200 is recommended).
# The scraping loop requests one results page for every 10 postings asked for here.
postings = 30

## Import Dependencies 

In [50]:
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import random

## Pre Set-up

### Define Captcha detection

In [51]:
def detect_captcha(driver):
    """Tell whether the page currently loaded in *driver* is a verification page.

    The page counts as a CAPTCHA page when it contains an element whose text
    includes 'Additional Verification Required'.

    Args:
        driver: The WebDriver whose current page is inspected.

    Returns:
        True when such an element is found and its text can be read;
        False when the lookup raises any exception.
    """
    marker_xpath = "//*[contains(text(), 'Additional Verification Required')]"
    try:
        # Only success or failure of the lookup matters; the text itself is discarded.
        _ = driver.find_element(By.XPATH, marker_xpath).text
    except Exception:
        return False
    else:
        return True

class IpIsBlockedError(Exception):
    """Raised when the CAPTCHA page keeps appearing after the browser is restarted.

    Attributes:
        message: Human-readable explanation; also passed to ``Exception`` so it
            is what ``str(error)`` returns.
    """

    def __init__(self, message="Your IP has been blocked. Change your IP and retry."):
        super().__init__(message)
        self.message = message

### Initialize webdriver (Firefox) 

In [52]:
def init_driver():
    """Start a Firefox WebDriver session with private browsing switched on.

    Returns:
        The newly created ``webdriver.Firefox`` instance.
    """
    firefox_options = webdriver.FirefoxOptions()
    # Ask Firefox to start in private-browsing mode.
    firefox_options.set_preference("browser.privatebrowsing.autostart", True)
    # Optional flags, currently disabled:
    # firefox_options.add_argument('--headless')
    # firefox_options.add_argument('--disable-gpu')
    return webdriver.Firefox(options=firefox_options)

### Build the search URL from position and location

In [53]:
def get_url(position, location):
    """Build the Indeed Canada search URL for a job title and a location.

    Both values are percent-encoded, so spaces, commas and reserved characters
    such as ``&``, ``#`` or ``+`` stay inside the ``q`` / ``l`` parameters
    instead of corrupting the query string.

    Args:
        position: Job title or keywords to search for.
        location: Place to search in, e.g. "Toronto, ON".

    Returns:
        The search URL as a string. Further parameters can still be appended
        with ``"&name=value"``.
    """
    # Imported here so the function works even if the import cell was not updated.
    from urllib.parse import urlencode

    return "https://ca.indeed.com/jobs?" + urlencode({"q": position, "l": location})

# Base search URL; the scraping loop appends "&start=<offset>" to it for each results page.
url = get_url(position=position, location=locations)

## 1. Scrape job postings

In [54]:
# Scrape the search-result pages into `dataframe`, one row per result card.
#
# Names left defined for the following cells: `driver` (the open browser),
# `dataframe` (the results table) and `jn` (number of rows added).

# Columns of the results table, in output order.
RESULT_COLUMNS = ["Title", "Company", "Location", "Rating", "Date", "Salary", "Description", "Links"]


def _open_results_page(driver, page_url, max_checks=3):
    """Load *page_url*, restarting the browser while a CAPTCHA page is shown.

    Args:
        driver: The WebDriver to load the page with.
        page_url: URL of the results page.
        max_checks: How many CAPTCHA checks (the first load included) may
            fail before giving up.

    Returns:
        The driver showing the page; a new instance if a restart was needed.

    Raises:
        IpIsBlockedError: If the CAPTCHA page is still shown on the last check.
            The browser has already been closed at that point.
    """
    driver.get(page_url)
    # NOTE(review): implicitly_wait() sets the element-lookup timeout, it does not
    # pause the script; kept as in the original — confirm whether a sleep was intended.
    driver.implicitly_wait(random.randint(0, 3))

    checks_done = 1
    while detect_captcha(driver):
        time.sleep(3)
        driver.quit()
        time.sleep(3)

        if checks_done >= max_checks:
            raise IpIsBlockedError("IP is blocked. Please use another IP and retry.")

        driver = init_driver()
        driver.get(page_url)
        driver.implicitly_wait(random.randint(0, 3))
        checks_done += 1

    return driver


def _first_text(elements, default='NaN'):
    """Return the stripped text of the first element, or *default* if there is none."""
    return elements[0].get_text().strip() if elements else default


def _normalize_date(raw):
    """Reduce a posting-date label to '<n> <unit> ago', '0 days ago' or 'NaN'."""
    if "ago" in raw:
        last_words = raw.split()[-3:]
        # Fewer than three words cannot be "<n> <unit> ago".
        return ' '.join(last_words) if len(last_words) == 3 else 'NaN'
    if any(word in raw for word in ("Today", "Just", "ongoing")):
        return "0 days ago"
    return 'NaN'


def _parse_job_card(job):
    """Turn one result-card element into a row dict.

    Returns None when the card has no title or no link, so the caller can skip
    it instead of failing on a missing element.
    """
    soup = BeautifulSoup(job.get_attribute('innerHTML'), 'html.parser')

    anchors = job.find_elements(By.TAG_NAME, "a")
    titles = soup.select('.jobTitle')
    if not anchors or not titles:
        return None

    # The rating sits two levels deep; either level may be absent.
    company_info = soup.find("div", {"class": "companyInfo"})
    rating_tag = None
    if company_info is not None:
        rating_tag = company_info.find("span", {"class": "ratingsDisplay"})
    rating = rating_tag.text if rating_tag is not None else 'NaN'

    date_tags = soup.find_all('span', attrs={'data-testid': 'myJobsStateDate'})
    date = _normalize_date(date_tags[0].get_text().strip()) if date_tags else 'NaN'

    return {
        'Title': titles[0].get_text().strip(),
        'Company': _first_text(soup.find_all(attrs={'data-testid': 'company-name'})),
        'Location': _first_text(soup.find_all(attrs={'data-testid': 'text-location'})),
        'Rating': rating,
        'Date': date,
        'Salary': _first_text(soup.select('.salary-snippet-container')),
        # Replaced with the full posting text by a later cell.
        'Description': _first_text(soup.select('.job-snippet'), default=''),
        'Links': anchors[0].get_attribute("href"),
    }


driver = init_driver()
time.sleep(3)
rows = []
jn = 0

try:
    # One results page per 10 requested postings.
    for i in range(0, postings, 10):
        driver = _open_results_page(driver, url + "&start=" + str(i))

        for job in driver.find_elements(By.CLASS_NAME, 'job_seen_beacon'):
            row = _parse_job_card(job)
            if row is None:
                print("Skipped a result card without a title or link")
                continue

            rows.append(row)
            jn += 1
            print("Job number {0:4d} added - {1:s}".format(jn, row['Title']))
finally:
    # Build the table once, and also on failure, so rows collected so far are kept.
    dataframe = pd.DataFrame(rows, columns=RESULT_COLUMNS)

Job number    1 added - Equity Specialist Anti-Racism & Equity
Job number    2 added - Social Media Content Specialist
Job number    3 added - Manager, Content and Social Media
Job number    4 added - Analyst, Business Intelligence
Job number    5 added - Intern, Investments
Job number    6 added - Workforce Analyst
Job number    7 added - Director, Digital Health & Strategy
Job number    8 added - Officer, Stewardship and Donor Relations
Job number    9 added - Advancement Officer - Database and Administration
Job number   10 added - Sr. Financial Analyst - FP&A
Job number   11 added - Manager, Food and Beverage Social Media Marketing
Job number   12 added - Project Coordinator
Job number   13 added - Fixed Income Financial Planning & Analysis Co-op (Winter & Summer 2025)
Job number   14 added - Director of Supply Chain
Job number   15 added - Fraud Reporting Analyst
Job number   16 added - Marketing Specialist
Job number   17 added - Senior Analyst, VMO Reporting and Analytics
Job nu

In [55]:
# Close the browser that was used for the results pages.
driver.quit()

# Postings that match on every field except the link are treated as duplicates;
# after removing them the rows are renumbered from 0.
dedupe_keys = ['Title', 'Company', 'Location', 'Rating', 'Date', 'Salary', 'Description']
dataframe = dataframe.drop_duplicates(subset=dedupe_keys).reset_index(drop=True)
dataframe

Unnamed: 0,Title,Company,Location,Rating,Date,Salary,Description,Links
0,Equity Specialist Anti-Racism & Equity,Unity Health Toronto,"Toronto, ON",,,$40.72–$50.90 an hour,,https://ca.indeed.com/pagead/clk?mo=r&ad=-6NYl...
1,Social Media Content Specialist,ACTRA National,"Toronto, ON M4Y 2G1",,12 days ago,"$80,594–$97,319 a year",,https://ca.indeed.com/pagead/clk?mo=r&ad=-6NYl...
2,"Manager, Content and Social Media",Mastercard,"Toronto, ON M5B 2L7",,,,,https://ca.indeed.com/rc/clk?jk=5f820e7259bcfe...
3,"Analyst, Business Intelligence",Great Canadian,"Toronto, ON",,,,,https://ca.indeed.com/rc/clk?jk=57135b0b1cf1bd...
4,"Intern, Investments",CAPREIT,"Toronto, ON M5E 1W1",,,,,https://ca.indeed.com/rc/clk?jk=97b98831db9778...
5,Workforce Analyst,Shoppers Drug Mart / Pharmaprix,"Toronto, ON M2J 4W8",,,,,https://ca.indeed.com/rc/clk?jk=a43d8eab1e9993...
6,"Director, Digital Health & Strategy",Shoppers Drug Mart / Pharmaprix,"Toronto, ON M2J 4W8",,,,,https://ca.indeed.com/rc/clk?jk=bb84cd4bca4219...
7,"Officer, Stewardship and Donor Relations",Unity Health Toronto,"Toronto, ON",,,"$68,000–$89,000 a year",,https://ca.indeed.com/rc/clk?jk=4d099862c4f0ea...
8,Advancement Officer - Database and Administration,St. Clement's School,"Toronto, ON",,,"$65,000–$70,000 a year",,https://ca.indeed.com/rc/clk?jk=0236660b7bd692...
9,Sr. Financial Analyst - FP&A,Indigo Books & Music,"Toronto, ON M5V 1M6",,,,,https://ca.indeed.com/rc/clk?jk=60fc95efa491d3...


## 2. Scrape full job descriptions

In [56]:
# Cast the snippet column to str before the next cell overwrites each entry with the full posting text.
# NOTE(review): presumably done so the column holds plain strings — confirm it is still needed.
dataframe['Description'] = dataframe['Description'].astype(str)

In [57]:
# Visit every posting link and replace the short snippet in `Description`
# with the full text of the posting page.


def _open_posting(driver, link, frame, max_checks=3):
    """Load *link*, restarting the browser while a CAPTCHA page is shown.

    Args:
        driver: The WebDriver to load the page with.
        link: URL of the job posting.
        frame: The results table; written to disk as a checkpoint before
            giving up so collected data is not lost.
        max_checks: How many CAPTCHA checks (the first load included) may
            fail before giving up.

    Returns:
        The driver showing the page; a new instance if a restart was needed.

    Raises:
        IpIsBlockedError: If the CAPTCHA page is still shown on the last check.
            The browser has already been closed and the checkpoint files
            ``check_point_<unix-seconds>.csv`` / ``.pkl`` have been written.
    """
    driver.get(link)
    # NOTE(review): implicitly_wait() sets the element-lookup timeout, it does not
    # pause the script; kept as in the original — confirm whether a sleep was intended.
    driver.implicitly_wait(random.randint(0, 3))

    checks_done = 1
    while detect_captcha(driver):
        time.sleep(3)
        driver.quit()
        time.sleep(3)

        if checks_done >= max_checks:
            timestamp_str = str(int(time.time()))
            # File names now match the name reported in the error message.
            checkpoint = 'check_point_' + timestamp_str
            frame.to_csv(checkpoint + '.csv', index=False)
            frame.to_pickle(checkpoint + '.pkl')
            raise IpIsBlockedError(f"IP is blocked. Current work is saved as {checkpoint}.csv")

        driver = init_driver()
        driver.get(link)
        driver.implicitly_wait(random.randint(0, 3))
        checks_done += 1

    return driver


# Kept so both names exist even when the table has no rows.
link = jd = ''

driver = init_driver()
time.sleep(3)

description_col = dataframe.columns.get_loc('Description')

# Snapshot the links first: the loop writes into the same table it reads from.
for index, link in enumerate(dataframe['Links'].tolist()):
    driver = _open_posting(driver, link, dataframe)

    try:
        jd = driver.find_element(By.XPATH, '//div[@id="jobDescriptionText"]').text
    except Exception:
        # Best effort: a posting without a readable description gets a placeholder.
        jd = 'No details provided.'

    dataframe.iloc[index, description_col] = jd

In [58]:
# Close the browser that was used for the posting pages.
driver.quit()

# Write the finished table to output<unix-seconds>.csv in the working directory.
timestamp_str = f"{int(time.time())}"
dataframe.to_csv(f"output{timestamp_str}.csv", index=False)

In [59]:
# Print a heading, then show the completed table as the cell's output.
print("Final Output:")
dataframe

Final Output:


Unnamed: 0,Title,Company,Location,Rating,Date,Salary,Description,Links
0,Equity Specialist Anti-Racism & Equity,Unity Health Toronto,"Toronto, ON",,,$40.72–$50.90 an hour,Unity Health Toronto’s vision is to create the...,https://ca.indeed.com/pagead/clk?mo=r&ad=-6NYl...
1,Social Media Content Specialist,ACTRA National,"Toronto, ON M4Y 2G1",,12 days ago,"$80,594–$97,319 a year","ACTRA (Alliance of Canadian Cinema, Television...",https://ca.indeed.com/pagead/clk?mo=r&ad=-6NYl...
2,"Manager, Content and Social Media",Mastercard,"Toronto, ON M5B 2L7",,,,Our Purpose\nWe work to connect and power an i...,https://ca.indeed.com/rc/clk?jk=5f820e7259bcfe...
3,"Analyst, Business Intelligence",Great Canadian,"Toronto, ON",,,,Position Summary:\nUnder the general direction...,https://ca.indeed.com/rc/clk?jk=57135b0b1cf1bd...
4,"Intern, Investments",CAPREIT,"Toronto, ON M5E 1W1",,,,":\n\nTitle: Intern, Investments (Winter 2025)\...",https://ca.indeed.com/rc/clk?jk=97b98831db9778...
5,Workforce Analyst,Shoppers Drug Mart / Pharmaprix,"Toronto, ON M2J 4W8",,,,Referred applicants must not apply directly to...,https://ca.indeed.com/rc/clk?jk=a43d8eab1e9993...
6,"Director, Digital Health & Strategy",Shoppers Drug Mart / Pharmaprix,"Toronto, ON M2J 4W8",,,,Referred applicants must not apply directly to...,https://ca.indeed.com/rc/clk?jk=bb84cd4bca4219...
7,"Officer, Stewardship and Donor Relations",Unity Health Toronto,"Toronto, ON",,,"$68,000–$89,000 a year",DEPARTMENT SUMMARY\nSt. Michael's Hospital Fou...,https://ca.indeed.com/rc/clk?jk=4d099862c4f0ea...
8,Advancement Officer - Database and Administration,St. Clement's School,"Toronto, ON",,,"$65,000–$70,000 a year",St. Clement’s School is currently looking for ...,https://ca.indeed.com/rc/clk?jk=0236660b7bd692...
9,Sr. Financial Analyst - FP&A,Indigo Books & Music,"Toronto, ON M5V 1M6",,,,Company Description\n\nDedicated to telling st...,https://ca.indeed.com/rc/clk?jk=60fc95efa491d3...
