In [8]:
import os
import time
from datetime import datetime

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.support.wait import WebDriverWait

def generate_url(jobTitle,location):
    """Build the ca.indeed.com search URL for a job title in a location.

    Both arguments are inserted into the URL path as-is; an empty
    `jobTitle` yields a path that starts with "-jobs-in-".
    """
    return "https://ca.indeed.com/{}-jobs-in-{}".format(jobTitle, location)

def _card_text(card, tag, css_class, strip=True):
    """Return the text of the first `tag` with class `css_class` inside `card`, or '' if none exists."""
    node = card.find(tag, class_=css_class)
    if node is None:
        return ''
    return node.text.strip() if strip else node.text


def get_record(card):
    """Extract info from one job card.

    Parameters
    ----------
    card : element exposing `.h2.a` and `.find(tag, class_=...)`
        One job card, as returned by `soup.find_all(...)`.

    Returns
    -------
    tuple
        (jobTitle, jobCompany, jobLocation, jobPostDate, today, jobSalary,
        jobSummary, jobURL). `today` is the current date formatted as
        'YYYY-MM-DD'. Company, location, post date, salary and summary are
        '' when the card has no matching element, and jobURL is '' when the
        title link has no href, so one incomplete card cannot abort a scrape.

    Raises
    ------
    AttributeError
        If the card has no `<h2>` containing an `<a>` (the title link).
    """
    atag = card.h2.a
    jobTitle = atag.get('title')
    href = atag.get('href')
    # NOTE(review): links are prefixed with indeed.com rather than the ca.indeed.com
    # host used for the search pages -- confirm this is intended.
    jobURL = "http://indeed.com" + href if href else ''

    jobCompany = _card_text(card, 'span', 'company')

    # for job location, sometimes it's <span> sometimes it's <div>
    jobLocation = _card_text(card, 'span', 'location') or _card_text(card, 'div', 'location')

    jobSummary = _card_text(card, 'div', 'summary')
    # the post date is kept exactly as it appears in the page (not stripped)
    jobPostDate = _card_text(card, 'span', 'date', strip=False)
    today = datetime.today().strftime('%Y-%m-%d')

    # sometimes there's salary
    jobSalary = _card_text(card, 'span', 'salaryText')

    return (jobTitle, jobCompany, jobLocation, jobPostDate, today, jobSalary, jobSummary, jobURL)

In [17]:
records = []

job_title = ""  # empty string for all jobs
loc = "ontario"
url = generate_url(jobTitle = job_title, location = loc)

# Longest time, in seconds, to wait for a human to solve a captcha (10 minutes).
CAPTCHA_TIMEOUT_S = 600

driver = webdriver.Chrome("/Users/Traky/Desktop/jn/chromedriver")

try:
    # there will be pop-ups but that doesn't matter
    while True:
        time.sleep(1)  # short pause between page loads
        driver.get(url)

        # wait for human resolving a captcha
        # TODO: only the first captcha iframe is handled; pages showing several
        # captchas in a row still need work.
        # find_elements returns an empty list (instead of raising) when the page
        # has no captcha, so the normal case needs no exception handling.
        captcha_frames = driver.find_elements('css selector', 'iframe[role=presentation]')
        if captcha_frames:
            driver.switch_to.frame(captcha_frames[0])
            captcha_solved = True
            try:
                WebDriverWait(driver, CAPTCHA_TIMEOUT_S).until(
                    ec.presence_of_element_located(('css selector', 'span[aria-checked="true"]'))
                )
            except TimeoutException:
                captcha_solved = False
                print('\nTime out')
            finally:
                # Always leave the iframe: while switched into it, page_source
                # below would be the iframe's document, not the results page.
                driver.switch_to.default_content()

            if captcha_solved:
                print('\nCaptcha solved successfully, proceeding to click!')
                # Best effort: click the submit button only if the page has one.
                submit_buttons = driver.find_elements('css selector', '#recaptcha-demo-submit')
                if submit_buttons:
                    submit_buttons[0].click()

        # make soup
        soup = BeautifulSoup(driver.page_source,'html.parser')
        cards = soup.find_all('div',class_='jobsearch-SerpJobCard')

        # append one tuple per job card
        records.extend(get_record(card) for card in cards)

        # follow the "Next" pagination link; stop when there is none
        next_link = soup.find('a',{'aria-label':'Next'})
        next_href = next_link.get('href') if next_link is not None else None
        if not next_href:
            break
        url = "https://ca.indeed.com" + next_href
finally:
    # Close the browser even if scraping fails part-way; `records` keeps
    # whatever was collected so far.
    driver.quit()

In [18]:
# Build the DataFrame from the scraped tuples; the column order mirrors the
# tuple returned by get_record.
record_columns = [
    'jobTitle',
    'jobCompany',
    'jobLocation',
    'jobPostDate',
    'today',
    'jobSalary',
    'jobSummary',
    'jobURL',
]
records_df = pd.DataFrame(records, columns=record_columns)

records_df

Unnamed: 0,jobTitle,jobCompany,jobLocation,jobPostDate,today,jobSalary,jobSummary,jobURL


In [None]:
# to csv
output_path = "data/"+job_title+"_in_"+loc+".csv"
# to_csv does not create missing folders, so make sure the target directory
# exists; otherwise a fresh checkout fails here after the whole scrape has run.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
records_df.to_csv(output_path,index=False)