In [1]:
pip install selenium webdriver-manager pandas

Note: you may need to restart the kernel to use updated packages.


In [2]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import time
import random

class IndeedSeleniumScraper:
    def __init__(self, headless=True):
        self.options = Options()
        if headless:
            self.options.add_argument('--headless')
        self.options.add_argument('--no-sandbox')
        self.options.add_argument('--disable-dev-shm-usage')
        self.options.add_argument('--disable-blink-features=AutomationControlled')
        self.options.add_experimental_option("excludeSwitches", ["enable-automation"])
        self.options.add_experimental_option('useAutomationExtension', False)
        self.options.add_argument('--disable-extensions')
        self.options.add_argument('--disable-gpu')
        self.options.add_argument('--window-size=1920,1080')
        
        # Real user agent
        self.options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36')
        
        self.driver = webdriver.Chrome(
            service=Service(ChromeDriverManager().install()),
            options=self.options
        )
        
        # Hide automation detection
        self.driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
        
        self.jobs_data = []
    
    def scrape_jobs(self, job_title="software developer", location="Toronto, ON", max_pages=3):
        """Scrape jobs using Selenium"""
        
        try:
            for page in range(max_pages):
                start_param = page * 10
                if page == 0:
                    url = f"https://ca.indeed.com/jobs?q={job_title.replace(' ', '+')}&l={location.replace(' ', '+').replace(',', '%2C')}"
                else:
                    url = f"https://ca.indeed.com/jobs?q={job_title.replace(' ', '+')}&l={location.replace(' ', '+').replace(',', '%2C')}&start={start_param}"
                
                print(f"Navigating to: {url}")
                self.driver.get(url)
                
                # Wait for page to load
                time.sleep(random.uniform(3, 5))
                
                # Check for CAPTCHA
                if self._check_captcha():
                    print("CAPTCHA detected! Please solve it manually or try again later.")
                    break
                
                # Wait for job cards to load
                try:
                    WebDriverWait(self.driver, 10).until(
                        EC.presence_of_element_located((By.CSS_SELECTOR, '.job_seen_beacon, .cardOutline, .jobsearch-SerpJobCard'))
                    )
                except:
                    print("Job cards not found, trying to continue...")
                
                # Extract jobs from current page
                self._extract_page_jobs()
                
                print(f"Page {page + 1}/{max_pages} completed. Found {len(self.jobs_data)} jobs so far.")
                
                # Random delay between pages
                time.sleep(random.uniform(2, 4))
                
        except Exception as e:
            print(f"Error during scraping: {e}")
        
        finally:
            self.driver.quit()
        
        return self.jobs_data
    
    def _check_captcha(self):
        """Check if CAPTCHA is present"""
        try:
            captcha_elements = self.driver.find_elements(By.ID, 'captcha')
            captcha_input = self.driver.find_elements(By.ID, 'captcha-input')
            return len(captcha_elements) > 0 or len(captcha_input) > 0
        except:
            return False
    
    def _extract_page_jobs(self):
        """Extract jobs from the current page"""
        try:
            # Try multiple selectors for job cards
            selectors = [
                'div.job_seen_beacon',
                'div.cardOutline',
                'div.jobsearch-SerpJobCard',
                'div[data-jk]'
            ]
            
            job_cards = []
            for selector in selectors:
                try:
                    job_cards = self.driver.find_elements(By.CSS_SELECTOR, selector)
                    if job_cards:
                        break
                except:
                    continue
            
            print(f"Found {len(job_cards)} job cards on this page")
            
            for card in job_cards:
                job_info = self._extract_job_info(card)
                if job_info:
                    self.jobs_data.append(job_info)
                    
        except Exception as e:
            print(f"Error extracting jobs from page: {e}")
    
    def _extract_job_info(self, card):
        """Extract information from a single job card"""
        try:
            # Title
            try:
                title_elem = card.find_element(By.CSS_SELECTOR, 'h2.jobTitle, h2.title, a.jcs-JobTitle')
                title = title_elem.text.strip()
            except:
                title = "N/A"
            
            # Company
            try:
                company_elem = card.find_element(By.CSS_SELECTOR, 'span.companyName, span.company')
                company = company_elem.text.strip()
            except:
                company = "N/A"
            
            # Location
            try:
                location_elem = card.find_element(By.CSS_SELECTOR, 'div.companyLocation, div.location')
                location = location_elem.text.strip()
            except:
                location = "N/A"
            
            # Salary
            try:
                salary_elem = card.find_element(By.CSS_SELECTOR, 'div.salary-snippet, span.salaryText')
                salary = salary_elem.text.strip()
            except:
                salary = "Not specified"
            
            # Link
            try:
                link_elem = card.find_element(By.CSS_SELECTOR, 'a.jcs-JobTitle, a.jobtitle')
                link = link_elem.get_attribute('href')
                if link and link.startswith('/'):
                    link = 'https://ca.indeed.com' + link
            except:
                link = "N/A"
            
            # Date
            try:
                date_elem = card.find_element(By.CSS_SELECTOR, 'span.date, span.datePosted')
                date_posted = date_elem.text.strip()
            except:
                date_posted = "N/A"
            
            job_info = {
                'title': title,
                'company': company,
                'location': location,
                'salary': salary,
                'link': link,
                'date_posted': date_posted,
                'scraped_date': pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')
            }
            
            return job_info
            
        except Exception as e:
            print(f"Error extracting job info: {e}")
            return None
    
    def save_to_csv(self, filename="indeed_jobs_selenium.csv"):
        """Save results to CSV"""
        if not self.jobs_data:
            print("No jobs data to save.")
            return False
        
        df = pd.DataFrame(self.jobs_data)
        df.to_csv(filename, index=False, encoding='utf-8')
        print(f"Successfully saved {len(self.jobs_data)} jobs to {filename}")
        return True

# Main execution
if __name__ == "__main__":
    print("Starting Indeed Canada Job Scraper...")
    
    # Initialize scraper (set headless=False to see the browser)
    scraper = IndeedSeleniumScraper(headless=False)  # Set to True to run in background
    
    # Scrape jobs
    jobs = scraper.scrape_jobs(
        job_title="",
        location="Canada",
        max_pages=2  # Start with 2 pages
    )
    
    # Save results
    if jobs:
        success = scraper.save_to_csv("canada_developer_jobs.csv")
        if success:
            print("\nFirst 5 job listings:")
            for i, job in enumerate(jobs[:5], 1):
                print(f"\n{i}. {job['title']}")
                print(f"   Company: {job['company']}")
                print(f"   Location: {job['location']}")
                print(f"   Salary: {job['salary']}")
    else:
        print("No jobs were scraped. Please check:")
        print("1. Your internet connection")
        print("2. If Indeed is blocking requests")
        print("3. Try running with headless=False to see what's happening")

Starting Indeed Canada Job Scraper...
Navigating to: https://ca.indeed.com/jobs?q=&l=Canada
Found 15 job cards on this page
Page 1/2 completed. Found 15 jobs so far.
Navigating to: https://ca.indeed.com/jobs?q=&l=Canada&start=10
Job cards not found, trying to continue...
Found 0 job cards on this page
Page 2/2 completed. Found 15 jobs so far.
Successfully saved 15 jobs to canada_developer_jobs.csv

First 5 job listings:

1. Online ESL Tutors for Young Kids
   Company: N/A
   Location: N/A
   Salary: Not specified

2. Destination Specialist
   Company: N/A
   Location: N/A
   Salary: Not specified

3. Concierge (Entretien ménager)
   Company: N/A
   Location: N/A
   Salary: Not specified

4. Armed Guard
   Company: N/A
   Location: N/A
   Salary: Not specified

5. Journalier
   Company: N/A
   Location: N/A
   Salary: Not specified
