## Webscrape for Data Related Jobs in Canada
----
8 queries were made to ca.indeed.com to obtain job postings for (1) Chemical Engineer, (2) Process Engineer, (3) Process Design Engineer, (4) Clean Energy Engineer, (5) Renewable Energy Engineer, (6) Data Analyst Engineer, (7) Metallurgical Engineer, (8) Metallurgy Engineer. Data is limited to postings from the last 14 days. Due to the large quantity of job postings, a limit of 70 pages was extracted (equaling around 1000 job posts) for each role.

A job title index was also assigned to each job posting during the web scrape. For example, while scraping for Data Analyst roles, an index number of 1 was assigned to each posting. This makes it possible to tell which search produced each posting during analysis.

In [None]:
# ! pip install fake_useragent

In [1]:
# Dependencies
from bs4 import BeautifulSoup as Soup
import requests
import pymongo
import urllib, requests, re, pandas as pd
from pprint import pprint
import random
from fake_useragent import UserAgent
import time
from selenium import webdriver

In [2]:
# Build the fake_useragent generator once; later cells read `ua.random` from it
# to obtain a random User-Agent string for the request headers.
ua = UserAgent()

In [3]:
# Download the public proxy table from sslproxies.org and parse it
res = requests.get('https://www.sslproxies.org/', headers={'User-Agent':'Mozilla/5.0'})
soup = Soup(res.text,"lxml")

# Each table row becomes one string: the text of its first two cells joined by ":"
proxy_list = [
    ':'.join(cell.text for cell in row.select("td")[:2])
    for row in soup.select("#proxylisttable tbody tr")
]

print(proxy_list)

['35.182.149.39:3129', '60.246.7.4:8080', '182.253.174.78:8080', '139.59.114.159:8118', '118.174.232.234:44618', '103.209.64.19:6666', '118.173.232.61:47115', '139.255.11.147:8080', '3.131.207.170:19390', '51.81.82.175:80', '52.175.120.142:8080', '177.223.16.137:8080', '87.140.8.148:8080', '185.202.113.233:8080', '62.221.76.133:8080', '165.22.81.30:46215', '191.100.20.127:8080', '162.127.140.112:8080', '118.193.33.151:8080', '181.76.10.86:8080', '185.67.95.179:3128', '2.178.208.14:8080', '1.2.169.49:36335', '43.225.67.35:53905', '1.2.169.101:47477', '36.72.104.210:8080', '208.80.28.208:8080', '118.27.1.112:3128', '139.59.114.107:8118', '118.27.28.45:3128', '176.119.134.161:23500', '167.172.191.249:46377', '118.175.93.103:48214', '79.99.18.86:3128', '91.196.200.13:8080', '194.143.249.130:41258', '116.203.232.229:8118', '172.104.4.99:3128', '191.97.38.193:23500', '14.251.159.15:4145', '51.178.49.77:3132', '118.27.17.151:3128', '193.36.61.203:8000', '213.105.29.14:3128', '103.221.254.125:

# Choose one of the following two cells: scrape either Chem Eng jobs or Data jobs

In [None]:
# list job Chem Eng Job titles to be searched

# titles = ["Process+EIT",
#           "Process+Engineer-in-training",         
#           "Junior+Process+Engineer",
#           "Junior+Chemical+Engineer",
#           "Junior+Project+Engineer",
#           "Junior+R&D+Engineer",
#           "Junior+Field+Production+Engineer",
#           "Junior+Reliability+Engineer",
#           "Junior+QA+Engineer",
#           "Junior+Water+Engineer",
#           "Junior+Data+Engineer",
#           "Junior+Sustainable+Energy+Engineer",
#           "Junior+Renewable+Energy+Engineer",         
#           "Junior+Mettallurgical+Engineer"
#           ]

# # Create empty lists to collect information later
# job_title_list = []
# job_title_index = []
# company_list = []
# job_id_list = []
# location_list = []
# links_list = []

In [4]:
# Search phrases for the data-related roles; spaces are replaced by "+" so each
# entry can be placed directly into the Indeed query string
search_phrases = (
    "Data Analyst",
    "Data Scientist",
    "Data Engineer",
    "Analytics Engineer",
    "Analytical Engineer",
    "Business Analyst",
    "Machine Learning",
)
titles = [phrase.replace(" ", "+") for phrase in search_phrases]

# One independent, initially empty accumulator per scraped field
(job_title_list,
 job_title_index,
 company_list,
 job_id_list,
 location_list,
 links_list) = ([] for _ in range(6))

In [5]:
# Request headers carrying a randomly chosen User-Agent string
user_agent = ua.random
header = {}
header["user-agent"] = str(user_agent)

# Pick one scraped proxy at random and register it for both URL schemes.
# NOTE(review): entries are bare "ip:port" strings; the Requests proxy docs ask
# for a scheme prefix - confirm before re-enabling `proxies=` in the scraper.
proxy = random.choice(proxy_list)
proxy_protocol = dict.fromkeys(("http", "https"), proxy)

In [None]:
# country_code = "ca"
# country = "canada"
# days = "14"
# title = "Process Engineer"
# page = 0



# url = f'https://{country_code}.indeed.com/jobs?q={title}&l={country}&sort=date&fromage={days}&start={page}'

# # Retrieve page with the requests module
# response = requests.get(
#                     url,
#                     #proxies=proxy_protocol,
#                     headers=header
#                 )

# # Create BeautifulSoup object; parse with 'html.parser'
# soup = Soup(response.text, 'lxml')
           
# # Retrieve the parent divs for all articles
# results = soup.find_all('div', class_='result')

# for result in results:
            
#     try:
#         job_link = result.find('a', class_='turnstileLink').get('href')
#         click_link = f"https://{country_code}.indeed.com{job_link}"

#     except:
#         pass
    
# print(click_link)

In [None]:
# Search configuration: link sub-domain, search location and posting age (days)
country_code = "us"
country = "USA"
days = "14"

# Seconds to wait for Indeed to answer before giving up on a request; without a
# timeout a single stalled connection would block the whole scrape indefinitely.
request_timeout = 30

# Main scrape: for every search title, walk the result pages and append one
# entry per posting to the collector lists (job_title_list, company_list, ...).
# `header` is read for every request and replaced with a fresh random
# User-Agent each time `page` reaches a multiple of 100.
for i, title in enumerate(titles):

    print("---------------")
    print("Starting job search for: ", title)
    # Reset paging state for the new title
    page = 0
    counter = True

    while counter:

        # Exact-phrase search (title wrapped in quotes), sorted by date
        url = f'https://www.indeed.com/jobs?q=\"{title}\"&l={country}&sort=date&fromage={days}&start={page}'
        print("Current page: ", url)

        # Random 3-6 second pause between requests
        time_gap = random.randrange(3, 7, 1)
        time.sleep(time_gap)

        # Retrieve page with the requests module; a network failure ends the
        # paging for this title instead of hanging or aborting the whole run
        try:
            response = requests.get(
                                url,
                                #proxies=proxy_protocol,
                                headers=header,
                                timeout=request_timeout
                            )
        except requests.exceptions.RequestException as err:
            print(f"Request failed ({err}); skipping the rest of this job title")
            break

        # Create BeautifulSoup object; parse with 'lxml'
        soup = Soup(response.text, 'lxml')

        # Retrieve the parent divs for all postings on the page
        results = soup.find_all('div', class_='result')

        # On the first page, estimate the number of pages by dividing the job
        # count by 15 (each Indeed page is assumed to hold 15 postings)
        if page == 0:

            try:
                # The 4th space-separated token of the count text is parsed as
                # the number of jobs
                job_count = soup.find('div', id='searchCountPages').text.strip()
                job_count = job_count.replace(",", "")
                job_count = int(job_count.split(" ")[3])
                page_count = round(job_count / 15, 0)

                # Always scrape at least one page
                if page_count == 0:
                    page_count = 1
                page_range = int(page_count)

                print("Page number: ", int(page_count))

            except (AttributeError, IndexError, ValueError):
                # Count element missing or not in the expected format
                counter = False
                print("There is 0 result for this job title")
                print("---------------")

        # Stop after the last page; `start` advances in steps of 10 (10, 20, 30, ...)
        elif page == page_range*10:
            counter = False

        # Loop over results to get posting data
        for result in results:

            try:
                job_title = result.find('a', class_='jobtitle').text.strip()

                # 1-based index of the search title, used to tell searches apart
                # during analysis
                job_index = i + 1

                company = result.find('span', class_='company').text.strip()
                job_id = result.get('id')
                location = result.find(class_='location').text
                job_link = result.find('a', class_='turnstileLink').get('href')
                # NOTE(review): links use the {country_code} sub-domain while the
                # search above uses www.indeed.com - confirm this host is intended
                click_link = f"https://{country_code}.indeed.com{job_link}"

            except AttributeError:
                # An expected element is missing from this result; skip it so
                # the collector lists stay aligned
                continue

            # Append only complete records
            job_title_list.append(job_title)
            job_title_index.append(job_index)
            company_list.append(company)
            location_list.append(location)
            job_id_list.append(job_id)
            links_list.append(click_link)

        # Move to the next page of results
        page += 10

        # Every 10 pages, switch to a new random User-Agent
        if page % 100 == 0:

            user_agent = ua.random
            header = {"user-agent": str(user_agent)}
            print(f"---------------\n\
                A new user-agent was created:\n\
                {user_agent}\n----------------")




print("===================\nScraping completed")

---------------
Starting job search for:  Data+Analyst
Current page:  https://www.indeed.com/jobs?q="Data+Analyst"&l=USA&sort=date&fromage=14&start=0
Page number:  100
Current page:  https://www.indeed.com/jobs?q="Data+Analyst"&l=USA&sort=date&fromage=14&start=10
Current page:  https://www.indeed.com/jobs?q="Data+Analyst"&l=USA&sort=date&fromage=14&start=20
Current page:  https://www.indeed.com/jobs?q="Data+Analyst"&l=USA&sort=date&fromage=14&start=30
Current page:  https://www.indeed.com/jobs?q="Data+Analyst"&l=USA&sort=date&fromage=14&start=40
Current page:  https://www.indeed.com/jobs?q="Data+Analyst"&l=USA&sort=date&fromage=14&start=50
Current page:  https://www.indeed.com/jobs?q="Data+Analyst"&l=USA&sort=date&fromage=14&start=60
Current page:  https://www.indeed.com/jobs?q="Data+Analyst"&l=USA&sort=date&fromage=14&start=70
Current page:  https://www.indeed.com/jobs?q="Data+Analyst"&l=USA&sort=date&fromage=14&start=80
Current page:  https://www.indeed.com/jobs?q="Data+Analyst"&l=US

Current page:  https://www.indeed.com/jobs?q="Data+Analyst"&l=USA&sort=date&fromage=14&start=710
Current page:  https://www.indeed.com/jobs?q="Data+Analyst"&l=USA&sort=date&fromage=14&start=720
Current page:  https://www.indeed.com/jobs?q="Data+Analyst"&l=USA&sort=date&fromage=14&start=730
Current page:  https://www.indeed.com/jobs?q="Data+Analyst"&l=USA&sort=date&fromage=14&start=740
Current page:  https://www.indeed.com/jobs?q="Data+Analyst"&l=USA&sort=date&fromage=14&start=750
Current page:  https://www.indeed.com/jobs?q="Data+Analyst"&l=USA&sort=date&fromage=14&start=760
Current page:  https://www.indeed.com/jobs?q="Data+Analyst"&l=USA&sort=date&fromage=14&start=770
Current page:  https://www.indeed.com/jobs?q="Data+Analyst"&l=USA&sort=date&fromage=14&start=780
Current page:  https://www.indeed.com/jobs?q="Data+Analyst"&l=USA&sort=date&fromage=14&start=790
---------------
                A new user-agent was created:
                Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/5

Current page:  https://www.indeed.com/jobs?q="Data+Scientist"&l=USA&sort=date&fromage=14&start=410
Current page:  https://www.indeed.com/jobs?q="Data+Scientist"&l=USA&sort=date&fromage=14&start=420
Current page:  https://www.indeed.com/jobs?q="Data+Scientist"&l=USA&sort=date&fromage=14&start=430
Current page:  https://www.indeed.com/jobs?q="Data+Scientist"&l=USA&sort=date&fromage=14&start=440
Current page:  https://www.indeed.com/jobs?q="Data+Scientist"&l=USA&sort=date&fromage=14&start=450
Current page:  https://www.indeed.com/jobs?q="Data+Scientist"&l=USA&sort=date&fromage=14&start=460
Current page:  https://www.indeed.com/jobs?q="Data+Scientist"&l=USA&sort=date&fromage=14&start=470
Current page:  https://www.indeed.com/jobs?q="Data+Scientist"&l=USA&sort=date&fromage=14&start=480
Current page:  https://www.indeed.com/jobs?q="Data+Scientist"&l=USA&sort=date&fromage=14&start=490
---------------
                A new user-agent was created:
                Mozilla/5.0 (Windows NT 6.1) Ap

Current page:  https://www.indeed.com/jobs?q="Data+Engineer"&l=USA&sort=date&fromage=14&start=380
Current page:  https://www.indeed.com/jobs?q="Data+Engineer"&l=USA&sort=date&fromage=14&start=390
---------------
                A new user-agent was created:
                Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2226.0 Safari/537.36
----------------
Current page:  https://www.indeed.com/jobs?q="Data+Engineer"&l=USA&sort=date&fromage=14&start=400
Current page:  https://www.indeed.com/jobs?q="Data+Engineer"&l=USA&sort=date&fromage=14&start=410
Current page:  https://www.indeed.com/jobs?q="Data+Engineer"&l=USA&sort=date&fromage=14&start=420
Current page:  https://www.indeed.com/jobs?q="Data+Engineer"&l=USA&sort=date&fromage=14&start=430
Current page:  https://www.indeed.com/jobs?q="Data+Engineer"&l=USA&sort=date&fromage=14&start=440
Current page:  https://www.indeed.com/jobs?q="Data+Engineer"&l=USA&sort=date&fromage=14&start=450
Current page:

Current page:  https://www.indeed.com/jobs?q="Business+Analyst"&l=USA&sort=date&fromage=14&start=420
Current page:  https://www.indeed.com/jobs?q="Business+Analyst"&l=USA&sort=date&fromage=14&start=430
Current page:  https://www.indeed.com/jobs?q="Business+Analyst"&l=USA&sort=date&fromage=14&start=440
Current page:  https://www.indeed.com/jobs?q="Business+Analyst"&l=USA&sort=date&fromage=14&start=450
Current page:  https://www.indeed.com/jobs?q="Business+Analyst"&l=USA&sort=date&fromage=14&start=460
Current page:  https://www.indeed.com/jobs?q="Business+Analyst"&l=USA&sort=date&fromage=14&start=470
Current page:  https://www.indeed.com/jobs?q="Business+Analyst"&l=USA&sort=date&fromage=14&start=480
Current page:  https://www.indeed.com/jobs?q="Business+Analyst"&l=USA&sort=date&fromage=14&start=490
---------------
                A new user-agent was created:
                Mozilla/5.0 (compatible, MSIE 11, Windows NT 6.3; Trident/7.0;  rv:11.0) like Gecko
----------------
Current page:

Current page:  https://www.indeed.com/jobs?q="Business+Analyst"&l=USA&sort=date&fromage=14&start=1110
Current page:  https://www.indeed.com/jobs?q="Business+Analyst"&l=USA&sort=date&fromage=14&start=1120
Current page:  https://www.indeed.com/jobs?q="Business+Analyst"&l=USA&sort=date&fromage=14&start=1130
Current page:  https://www.indeed.com/jobs?q="Business+Analyst"&l=USA&sort=date&fromage=14&start=1140
Current page:  https://www.indeed.com/jobs?q="Business+Analyst"&l=USA&sort=date&fromage=14&start=1150
Current page:  https://www.indeed.com/jobs?q="Business+Analyst"&l=USA&sort=date&fromage=14&start=1160
Current page:  https://www.indeed.com/jobs?q="Business+Analyst"&l=USA&sort=date&fromage=14&start=1170
Current page:  https://www.indeed.com/jobs?q="Business+Analyst"&l=USA&sort=date&fromage=14&start=1180
Current page:  https://www.indeed.com/jobs?q="Business+Analyst"&l=USA&sort=date&fromage=14&start=1190
---------------
                A new user-agent was created:
                Mozi

In [None]:
# Pair each output column name with the list filled during scraping
column_names = ["Job Title Index", "Job ID", "Job Title", "Company Name", "Company Location", "Link"]
column_values = [job_title_index, job_id_list, job_title_list, company_list, location_list, links_list]
df = pd.DataFrame(dict(zip(column_names, column_values)))


# Save the table to disk with pandas' default CSV settings
df.to_csv('US_Data_jobs.csv')