In [1]:
from os import path
from time import sleep
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import pandas as pd
import math
import json
import re

In [2]:
BASE_URL = "https://www.jobbank.gc.ca"

KEY_WORDS = ["linux", "database", "code", "Git", "Devops", "Kafka", "Confluent", "Java", "C++", "C#", "Javascript", "CSS", "HTML", "Angular", "Fintech", "Agile", "software", "microservice", ".net", "spring boot", "CosmosDB", "Azure", "unit test", "integration test", "test", "Kubernetes", "Insurance", "script", "visual basic", "vb", "scala", "SQL", "graphql", "JS", "typescript", "Docker", "Adaptable", "lead", "understanding", "collaborate", "detail oriented", "engineer", "develop", "full stack", "mysql", "windows", "Object-Oriented", "Object Oriented", "programming", "unix", "data", "cloud", "ci/cd", "saas", "mock-up", "web-site", "backend", "frontend", "life cycle", ]

JOB_TITLE_WORDS = ["computer", "engineer", "develop", "full stack", "software", "analyst", "architect", "app", "information", "technology", "system", "program", "web"]

EXPERIENCE_NOT_ENOUGH = "5 years or more"

NUMBER_OF_KEY_WORDS_IN_JOB_DESCRIPTION = 2

NUMBER_OF_KEY_WORDS_IN_JOB_TITLE = 0

In [3]:
def trim(text):
    if text is not None:
        return re.sub("\s+", ' ', text).strip()

def SetupSelenium():
    url = f"{BASE_URL}/jobsearch/jobsearch?sort=M&fskl=15141&fskl=100000"

    chrome_options = Options()
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--headless')

    homedir = path.expanduser("~")
    webdriver_service = Service(f"{homedir}/chromedriver/stable/chromedriver")

    browser = webdriver.Chrome(service=webdriver_service, options=chrome_options)

    browser.get(url)

    return browser

def GetTotalNumberOfJobs(browser):
    totalNumberOfJobs = int(browser.find_element(By.XPATH, "//span[@class='found']").get_attribute("textContent"))
    print("TOTAL NUMBER OF JOBS: ", totalNumberOfJobs)
    return totalNumberOfJobs

def ClickMoreResults(browser, numberOfClicks):
    for i in range(numberOfClicks):
        moreResultsButton = browser.find_element(By.XPATH, "//button[@id='moreresultbutton']")
        browser.execute_script("arguments[0].click();", moreResultsButton)
        sleep(1)

def GetJobs(articles):
    jobUrls = []
    for job in articles:
        jobTitle = trim(job.find_element(By.XPATH, ".//span[@class='noctitle']").get_attribute("textContent"))
        aTag = job.find_element(By.XPATH, ".//a")
        link = aTag.get_attribute("href")
        jobUrls.append({"title": jobTitle, "link": link})
    return jobUrls

def FilterJobsBasedOnTitle(jobs):
    filteredJobs = []
    for job in jobs:
        jobTitleMatches = [ele for ele in JOB_TITLE_WORDS if(ele in job["title"])]
        if len(jobTitleMatches) > NUMBER_OF_KEY_WORDS_IN_JOB_TITLE:
            filteredJobs.append(job["link"])
    print("NUMBER OF JOBS AFTER FILTERING TITLE: ", len(filteredJobs))
    return filteredJobs

def FilterJobsBasedOnExperience(jobs):
    filteredJobs = []
    for job in jobs:
            filteredJobs.append(job["link"])
    print("NUMBER OF JOBS AFTER FILTERING TITLE: ", len(filteredJobs))
    return filteredJobs

def GetXPathElementText(browser, XPATH):
    element = browser.find_elements(By.XPATH, XPATH)
    if len(element) == 1:
        return element[0].get_attribute("textContent")

def GetMatchedJobInfo(browser):
    minSalary = GetXPathElementText(browser, "//span[@property='minValue']")
    maxSalary = GetXPathElementText(browser, "//span[@property='maxValue']")
    workHours = GetXPathElementText(browser, "//span[@property='workHours']")
    companyName = trim(GetXPathElementText(browser, "//span[@property='hiringOrganization']"))
    experience = trim(GetXPathElementText(browser, "//p[@property='experienceRequirements qualification']"))
    return {"minSalary": minSalary, "maxSalary": maxSalary, "workHours": workHours, "companyName": companyName, "experience": experience}

In [4]:
browser = SetupSelenium()

totalNumberOfJobs = GetTotalNumberOfJobs(browser)

numberOfClicks = int(totalNumberOfJobs/25)
print("NUMBER OF CLICKS: ", numberOfClicks)

articles = browser.find_elements(By.XPATH, '//article')
print("NUMBER OF JOBS BEFORE CLICKS:", len(articles))

ClickMoreResults(browser, numberOfClicks)

articlesAfterClicks = browser.find_elements(By.XPATH, '//article')
print("NUMBER OF JOBS AFTER CLICK:", len(articlesAfterClicks))

allJobs = GetJobs(articlesAfterClicks)

TOTAL NUMBER OF JOBS:  894
NUMBER OF CLICKS:  35
NUMBER OF JOBS BEFORE CLICKS: 25
NUMBER OF JOBS AFTER CLICK: 894


In [5]:
jobsFilteredByTitle = FilterJobsBasedOnTitle(allJobs)

NUMBER OF JOBS AFTER FILTERING TITLE:  209


In [6]:
descriptions = []
for url in jobsFilteredByTitle:
    browser.get(url)
    descriptions.append(trim(GetXPathElementText(browser, "//div[@class='job-posting-detail-requirements ']")))

In [None]:
import collections

bigDescriptionString = " ".join(descriptions)

countedWords = dict(collections.Counter(bigDescriptionString.split()))

countedWordsSorted = dict(sorted(countedWords.items(), key=lambda item: item[1]))

with open("most-common-words.json", "w") as final:
   json.dump(countedWordsSorted, final)

In [None]:
jobsWithKeywords = []

for url in jobsFilteredByTitle:
    browser.get(url)

    jobDescription = trim(GetXPathElementText(browser, "//div[@class='job-posting-detail-requirements ']"))
    matchedKeyWords = [ele for ele in KEY_WORDS if(ele in jobDescription)]

    experience = trim(GetXPathElementText(browser, "//p[@property='experienceRequirements qualification']"))

    if len(matchedKeyWords) > NUMBER_OF_KEY_WORDS_IN_JOB_DESCRIPTION and experience != EXPERIENCE_NOT_ENOUGH:
        jobTitle = trim(GetXPathElementText(browser, "//span[@property='title']"))
        print(f"TITLE: {jobTitle}, URL: {url}")
        jobInfo = GetMatchedJobInfo(browser)
        jobInfo.update({"Title": jobTitle, "KeyWords": matchedKeyWords, "link": url})
        jobsWithKeywords.append(jobInfo)

In [26]:
with open("mydata.json", "w") as final:
   json.dump(jobsWithKeywords, final)

browser.quit()

In [27]:
import json
import csv

with open('./mydata.json') as json_file:
    jsondata = json.load(json_file)

data_file = open('jobs.csv', 'w', newline='')
csv_writer = csv.writer(data_file)

count = 0
for data in jsondata:
    if count == 0:
        header = data.keys()
        csv_writer.writerow(header)
        count += 1
    csv_writer.writerow(data.values())

data_file.close()