## Web Scrape for Chemical and Process Engineering Jobs in Canada
----
Queries were made to ca.indeed.com to obtain job postings for (1) Chemical Engineer, (2) Process Engineer, (3) Process Design Engineer, (4) Clean Energy Engineer, (5) Renewable Energy Engineer, (6) Data Analyst Engineer, (7) Metallurgical Engineer, (8) Metallurgy Engineer. Results are limited to postings from the last 14 days. Because of the large number of job postings, at most 70 pages (around 1,000 job posts) were extracted for each role.

A job title index is also assigned to each job posting during the web scrape. For example, every posting found while scraping the first job title in the list is given an index of 1, the second title an index of 2, and so on. This makes it possible to tell the searches apart during analysis.

In [None]:
# ! pip install fake_useragent

In [1]:
# Dependencies
from bs4 import BeautifulSoup as Soup
import requests
import pymongo
import urllib, requests, re, pandas as pd
from pprint import pprint
import random
from fake_useragent import UserAgent
import time

In [2]:
# Create the fake_useragent generator; its `.random` attribute is read later
# to obtain a random User-Agent string for the request headers
ua = UserAgent()

In [3]:
# Download the current table of free proxies from sslproxies.org
res = requests.get('https://www.sslproxies.org/', headers={'User-Agent':'Mozilla/5.0'})
soup = Soup(res.text,"lxml")

# Every table row becomes one "ip:port" string: the text of its first two
# cells joined by a colon
proxy_list = [
    ':'.join(cell.text for cell in row.select("td")[:2])
    for row in soup.select("#proxylisttable tbody tr")
]

print(proxy_list)

['181.129.183.19:53281', '125.27.251.87:58182', '103.90.145.196:8080', '154.72.204.122:8080', '110.232.86.52:53281', '103.242.15.37:55443', '124.41.240.126:31984', '51.81.82.175:80', '89.40.48.186:8080', '190.167.215.170:52479', '69.65.65.178:58389', '187.72.139.10:80', '78.111.97.179:3139', '139.99.102.114:80', '37.120.192.154:8080', '103.109.59.242:53281', '106.104.151.142:58198', '122.15.131.65:57873', '168.119.49.225:10007', '40.79.26.139:1080', '20.50.135.160:1080', '24.248.207.7:55443', '161.202.226.194:80', '41.217.219.53:31398', '36.84.99.192:3128', '60.246.7.4:8080', '41.65.174.66:8080', '142.93.147.210:3128', '87.140.8.148:8080', '51.75.147.41:3128', '68.183.221.156:42430', '35.182.149.39:3129', '159.89.121.54:8080', '118.175.207.180:40017', '207.144.111.230:8080', '109.74.66.102:3128', '81.182.0.87:8080', '92.115.102.133:55443', '128.14.178.94:3128', '45.235.110.66:53281', '139.255.11.147:8080', '128.14.163.94:3128', '167.172.191.249:46377', '167.172.109.12:44465', '167.71.2

In [14]:
# Job titles to be searched. Each entry is inserted verbatim into the `q=`
# parameter of the search URL, so words are separated by "+" and any character
# that is reserved in a query string must already be percent-encoded.
titles = ["Process+EIT",
          "Process+Engineer-in-training",
          "Junior+Process+Engineer",
          "Junior+Chemical+Engineer",
          "Junior+Project+Engineer",
          # "&" is written as %26; a raw "&" would end the q= parameter early
          "Junior+R%26D+Engineer",
          "Junior+Field+Production+Engineer",
          "Junior+Reliability+Engineer",
          # the trailing comma matters: without it Python concatenates this
          # string with the next one into a single bogus search term
          "Junior+QA+Engineer",
          "Junior+Water+Engineer",
          "Junior+Data+Engineer",
          "Junior+Sustainable+Energy+Engineer",
          "Junior+Renewable+Energy+Engineer",
          "Junior+Metallurgical+Engineer",
          ]

# Parallel result lists filled by the scraping loop: position k of every list
# describes the same job posting
job_title_list = []
job_title_index = []
company_list = []
job_id_list = []
location_list = []
links_list = []

In [15]:
# Pick a random browser identity and wrap it in a request-header dict
user_agent = ua.random
header = {"user-agent": str(user_agent)}

# Draw one proxy at random and register it for both plain and TLS traffic
proxy = random.choice(proxy_list)
proxy_protocol = {scheme: proxy for scheme in ("http", "https")}

In [16]:
# Search parameters: Indeed sub-domain, location filter and posting age in days
country_code = "ca"
country = "canada"
days = "14"

# Seconds to wait for a response before giving up on a request
request_timeout = 30

# Main web scraping: one paginated search per job title. Results are appended
# to the module-level lists (job_title_list, company_list, ...).
for i, title in enumerate(titles):

    print("---------------")
    print("Starting job search for: ", title)
    # Reset the pagination offset for every new title
    page = 0
    counter = True

    while counter:

        # Search query for the current job title
        url = f'https://{country_code}.indeed.com/jobs?q={title}&l={country}&sort=date&fromage={days}&start={page}'
        print("Current page: ", url)

        # Random pause of 3-6 seconds between requests
        time_gap = random.randrange(3, 7, 1)
        time.sleep(time_gap)

        # Retrieve the page with the requests module. The timeout keeps one
        # stalled connection from hanging the whole scrape.
        # (Add proxies=proxy_protocol to route the request through the proxy.)
        try:
            response = requests.get(
                url,
                headers=header,
                timeout=request_timeout,
            )
        except requests.RequestException as err:
            # Give up on this title but keep everything collected so far
            print(f"Request failed ({err}); skipping the rest of this job title")
            break

        # Create BeautifulSoup object; parse with 'lxml'
        soup = Soup(response.text, 'lxml')

        # Retrieve the parent divs of all job cards
        results = soup.find_all('div', class_='result')

        # On the first page, derive the page count by dividing the job count
        # by 15
        if page == 0:

            try:
                # The fourth space-separated token of the counter text is taken
                # as the job total (presumably text like "Page 1 of 38 jobs" -
                # TODO confirm against the live page)
                job_count = soup.find('div', id='searchCountPages').text.strip()
                job_count = job_count.replace(",", "")
                job_count = int(job_count.split(" ")[3])

                # Always visit at least one page
                page_count = max(1, round(job_count / 15))
                page_range = page_count

                print("Page number: ", page_range)

            except (AttributeError, IndexError, ValueError):
                # Counter element missing (find() returned None) or its text
                # not in the expected shape: treat as "no results"
                counter = False
                print("There is 0 result for this job title")
                print("---------------")

        # Stop after the page whose offset equals page_range*10 (offsets go
        # 10, 20, 30, ...).
        # NOTE(review): that page is still parsed below, so page_range + 1
        # requests are made per title - confirm this is intended.
        elif page == page_range*10:
            counter = False

        # Loop over the job cards and collect their fields
        for result in results:

            try:
                # Scrape the job title
                job_title = result.find('a', class_='jobtitle').text.strip()

                # 1-based position of the searched title, used to tell the
                # searches apart in the later analysis step
                job_index = i + 1

                # Scrape the remaining fields
                company = result.find('span', class_='company').text.strip()
                job_id = result.get('id')
                location = result.find(class_='location').text
                link = f"{url}&vjk={job_id}"

            except AttributeError:
                # A card missing one of the expected elements is skipped
                continue

            # Append to the lists only once every field was read, so the
            # lists stay aligned
            job_title_list.append(job_title)
            job_title_index.append(job_index)
            company_list.append(company)
            location_list.append(location)
            job_id_list.append(job_id)
            links_list.append(link)

        # Move to the next page: the start offset advances in steps of 10
        page += 10

        # Every 10 pages (offset multiple of 100), switch to a new random UA
        if page % 100 == 0:

            user_agent = ua.random
            header = {"user-agent": str(user_agent)}
            print(f"---------------\n\
                A new user-agent was created:\n\
                {user_agent}\n----------------")


print("===================\nScraping completed")

---------------
Starting job search for:  Process+EIT
Current page:  https://ca.indeed.com/jobs?q=Process+EIT&l=canada&sort=date&fromage=14&start=0
Page number:  3
Current page:  https://ca.indeed.com/jobs?q=Process+EIT&l=canada&sort=date&fromage=14&start=10
Current page:  https://ca.indeed.com/jobs?q=Process+EIT&l=canada&sort=date&fromage=14&start=20
Current page:  https://ca.indeed.com/jobs?q=Process+EIT&l=canada&sort=date&fromage=14&start=30
---------------
Starting job search for:  Process+Engineer-in-training
Current page:  https://ca.indeed.com/jobs?q=Process+Engineer-in-training&l=canada&sort=date&fromage=14&start=0
Page number:  1
Current page:  https://ca.indeed.com/jobs?q=Process+Engineer-in-training&l=canada&sort=date&fromage=14&start=10
---------------
Starting job search for:  Junior+Process+Engineer
Current page:  https://ca.indeed.com/jobs?q=Junior+Process+Engineer&l=canada&sort=date&fromage=14&start=0
Page number:  1
Current page:  https://ca.indeed.com/jobs?q=Junior+Pr

In [18]:
# Map each output column name to the list collected during scraping
scraped_columns = {
    "Job Title Index": job_title_index,
    "Job ID": job_id_list,
    "Job Title": job_title_list,
    "Company Name": company_list,
    "Company Location": location_list,
    "Link": links_list,
}

# One row per job posting, columns in the order listed above
df = pd.DataFrame(scraped_columns)

# Persist the table next to the notebook
df.to_csv('CA_chemengjobs.csv')