# Job Offers web-scraping

In [2]:
import os
import random
import re
import time
from urllib.parse import urljoin

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from webdriver_manager.chrome import ChromeDriverManager

### Automation 1: Log-in Action

In [2]:
'''
    > This section automates the process of Logging-in to Linkedin.
    > Make sure to use a valid Email Address and password.
    > Selenium will open a chrome window which you can interact with.
    > Once logged in, the site may ask to verify if you are human, you can use the pop-up window to solve the captcha or insert the access code.
    > If any error arises, make sure to check the relevant driver.find_elements, as with time these are likely to be updated by Linkedin.
'''

# Credentials are read from the LINKEDIN_EMAIL / LINKEDIN_PASSWORD environment
# variables when they are set, so real secrets do not have to be saved inside
# the notebook. The placeholders remain the fallback and can still be edited.
linkedin_email = os.environ.get("LINKEDIN_EMAIL", "email@emailprovider.com")
linkedin_password = os.environ.get("LINKEDIN_PASSWORD", "PASSWORD")

# Creating a webdriver instance to log into LinkedIn using the Chrome driver
# that matches the installed browser.
driver_path = ChromeDriverManager().install()
try:
    # Selenium 4 expects the driver binary to be wrapped in a Service object.
    driver = webdriver.Chrome(service=Service(driver_path))
except TypeError:
    # Older Selenium releases do not know the ``service`` keyword and take the
    # driver path as the first positional argument instead.
    driver = webdriver.Chrome(driver_path)

# Opening LinkedIn's login page
driver.get("https://linkedin.com/uas/login")

# Waiting for the page to load
time.sleep(5)

# Accessing the username field and entering the email address
username = driver.find_element(By.ID, "username")
username.send_keys(linkedin_email)

# Accessing the password field and entering the password
pword = driver.find_element(By.ID, "password")
pword.send_keys(linkedin_password)

# Clicking on the log in button
driver.find_element(By.XPATH, "//button[@type='submit']").click()

[WDM] - Downloading: 100%|██████████| 6.79M/6.79M [00:01<00:00, 5.60MB/s]


### Automation 2: Navigate to job section and insert job queries (Job Title and location)

In [5]:
'''
    > This section automates the process to access the required job listings.
    > If any error arises, make sure to check the relevant driver.find_elements as with time these are likely to be updated by Linkedin
'''

jobTitle = 'computer science'                   # Insert job title to query
jobLocation = 'United Arab Emirates'            # Insert job location to query

# The search inputs carry ids that end in "ember" plus a number that is not
# fixed, so they are located by the stable id prefix. Matching against the live
# DOM avoids depending on a page-source snapshot taken before the page settled.
keywordLocator = (By.CSS_SELECTOR, "[id^='jobs-search-box-keyword-id-ember']")
locationLocator = (By.CSS_SELECTOR, "[id^='jobs-search-box-location-id-ember']")

# Switching to Jobs page to be able to query location
jobs = driver.find_element(By.XPATH, '//*[@id="global-nav"]/div/nav/ul/li[3]')
jobs.click()

# Clicking on search bar
searchBar = driver.find_element(By.CLASS_NAME, 'jobs-search-box__keywords-label')
searchBar.click()

time.sleep(5)

# Adds Job Title
driver.find_element(*keywordLocator).send_keys(jobTitle)

time.sleep(5)

# Adds Location and hits enter
driver.find_element(*locationLocator).send_keys(jobLocation)
time.sleep(2)

# The field is looked up again before pressing ENTER, as the original did,
# in case typing the location caused the input to be re-rendered.
driver.find_element(*locationLocator).send_keys(Keys.ENTER)

### Automation 3: extract URLs from job offers list

In [None]:
'''
    > This process will automatically scroll through the different pages available and extract all the URLs from the different job posts.
    > Upon completion and as a checkpoint of the process, all links will be saved to disk in a CSV format in the project directory.
    > Different validations are in place to have an overview of the extraction process and supervise the workflow.
    > If any error arises, make sure to check the relevant driver.find_elements as with time these are likely to be updated by Linkedin.
'''
# Settings: read the number of result pages from the pagination bar.
# When the search returns a single page there is no pagination bar at all, so
# fall back to one page instead of indexing into an empty list.
findPages = driver.find_elements(By.CLASS_NAME, "artdeco-pagination__indicator.artdeco-pagination__indicator--number")
totalPages = findPages[-1].text if findPages else '1'
totalPagesDigits = re.sub(r"\D", "", totalPages)
totalPagesInt = int(totalPagesDigits) if totalPagesDigits else 1

# Variable Setup for URL Extraction Loop
indexPage = 1               # number of the results page currently displayed
jobLoopTTime = 0            # accumulated processing time in seconds
job_links = []              # every href collected so far (relative links)
extractedURL = []           # last {'pageNumb', 'URL'} record, printed as a progress check
final_extractedURL = []     # one {'pageNumb', 'URL'} record per collected link

# URL Extraction Loop
for page in range(totalPagesInt):
    jobLoop_start_time = time.time()
    # Draw a new pause for every page so the delays actually vary.
    randTime = random.randint(1, 6)
    time.sleep(randTime)

    # Scroll through every job card so the whole list is rendered before parsing
    jobsBlock = driver.find_element(By.CLASS_NAME, 'scaffold-layout__list-container')  # container of the job list
    jobsScaffold = jobsBlock.find_elements(By.CSS_SELECTOR, ".jobs-search-results__list-item")  # one element per job entry
    for jobEntry, jobCard in enumerate(jobsScaffold, start=1):
        print(f"-- scrolling to job: {str(jobEntry)} --")
        driver.execute_script('arguments[0].scrollIntoView();', jobCard)

    # Scrapes all links in the job list
    time.sleep(randTime)
    job_src = driver.page_source
    soup = BeautifulSoup(job_src, 'lxml')
    for link in soup.find_all('a', {'class': 'job-card-list__title'}):
        URL = link.get('href')
        if not URL:
            # An anchor without href cannot be visited later; skip it.
            continue
        job_links.append(URL)
        extractedURL = {'pageNumb': indexPage, 'URL': URL}
        final_extractedURL.append(extractedURL)
    print(job_links)
    print(extractedURL)
    print(f"links extracted: {str(len(job_links))}")
    print(f"pages scrapped: {str(round(len(job_links) / 25))}")
    print(f"Duplicate Links Status: {str('NO Duplicated links' if (len(job_links) == len(set(job_links))) else 'There ARE Duplicates')}")

    # Moves to the next page. The last page has no successor button, so the
    # lookup is skipped there; otherwise it would raise and abort the cell
    # before the links are written to disk.
    if indexPage < totalPagesInt:
        indexPage += 1
        getNextPage = driver.find_element(By.XPATH, "//button[@aria-label='Page " + str(indexPage) + "']")
        getNextPage.send_keys(Keys.RETURN)
        time.sleep(randTime)
        print(f"Next page is: {str(indexPage)}")
    jobLoop_end_time = time.time()
    jobLoopTTime += jobLoop_end_time - jobLoop_start_time
    print(f"elapsed time: {str(round(jobLoopTTime,3))} seconds")
    print("________________")

print(f'Total Elapsed Time: {str(round(jobLoopTTime/60,3))} minutes')

# Adding main URL to extracted links. urljoin gives the same result as plain
# concatenation for site-relative links and leaves absolute links untouched.
full_url = [urljoin('https://linkedin.com', URL) for URL in job_links]

# Saving to disk the extracted links as backup. dict.fromkeys drops duplicates
# while keeping the order in which the links were found.
URLsExport = pd.DataFrame(list(dict.fromkeys(full_url)))
URLsExport.to_csv("URLscrapped.csv", index=False, header=False)

'''
    > Uncomment below lines for debugging duplicates, URLsExport_2 stores both the URL and the page where the link was extracted from.
'''
# URLsExport_2 = pd.DataFrame(final_extractedURL)
#URLsExport_2.to_csv("URLscrapped_bypage.csv", index=False, header=False)

### Automation 4: Extract job offer details

In [4]:
'''
    > This process will use the links saved in the CSV and not the ones stored in memory.
'''
# Reload the checkpoint written by the URL extraction step; the file has no
# header row, so its single column is named 'link' on load.
full_url_csv = pd.read_csv('URLscrapped.csv', names=['link'])['link'].tolist()
print(f'Total URLs loaded from file: {len(full_url_csv)} links')

Total URLs loaded from file: 75 links


In [110]:
'''
    > Uncomment below section if you want to scrap the job details in batches.
    > Recommended if you are modifying the code and want to test the updates.
'''
# Splitting links into consecutive groups of 100 to run the scrape in batches.
# Slices exclude their end index, so each batch starts exactly where the
# previous one stopped and no link is skipped between batches.

#test_urls_1 = full_url_csv[0:100]
#test_urls_2 = full_url_csv[100:200]
#test_urls_3 = full_url_csv[200:300]
#test_urls_4 = full_url_csv[300:400]
#test_urls_5 = full_url_csv[400:500]
#test_urls_6 = full_url_csv[500:600]
#test_urls_7 = full_url_csv[600:700]
#test_urls_8 = full_url_csv[700:800]
#test_urls_9 = full_url_csv[800:900]
#test_urls_10 = full_url_csv[900:1000]

In [10]:
# Accumulator for the scraped job records: the extraction loop appends one
# dictionary per visited URL to this list.
final_dict: list = []

In [11]:
'''
    > The following loop will automate the extraction process from the loaded URLs.
    > Extracted details will be saved to lnkd_scrapping_jobs.csv
    > Different validations are in place to have an overview of the extraction process and supervise the workflow.
    > If any error arises, make sure to check the relevant driver.find_elements as with time these are likely to be updated by Linkedin.
    > If a link is no longer available, the fields for that given URL will be saved as empty fields and status will be set to: no longer active.
'''
# Keys of every exported record, in the column order of the final CSV.
JOB_FIELDS = ('jobURL', 'jobTitle', 'jobDescription', 'jobCompanyName', 'jobLocation', 'jobType', 'jobLevel', 'jobCompSize', 'jobPostedDate', 'jobApplicantsRange', 'jobTotalApplicants', 'URLStatus')


def _node_text(node):
    """Return the stripped text of a parsed node, or '' when the node is missing."""
    return node.text.strip() if node is not None else ''


def _nth_text(nodes, position):
    """Return the stripped text of nodes[position], or '' when there are too few nodes."""
    return nodes[position].text.strip() if len(nodes) > position else ''


def _scrape_job_page(url, pause):
    """Open one job posting and return its details as a dictionary.

    Args:
        url: address of the job posting to open with the shared ``driver``.
        pause: seconds to wait after requesting the page.

    Returns:
        A dictionary with the JOB_FIELDS keys and URLStatus 'Active', or None
        when the page shows no job title or description. Optional fields that
        are missing from the page are returned as empty strings.
    """
    driver.get(url)
    time.sleep(pause)

    # Clicking the card action button is best effort: a posting without the
    # button can still be parsed, so a failure here must not discard the page.
    try:
        driver.find_element(By.CLASS_NAME, "artdeco-card__action").click()
    except Exception:
        pass

    # The page source is read after the click so the parsed HTML reflects it.
    soup = BeautifulSoup(driver.page_source, 'lxml')

    titleNode = soup.find('h1', {'class': 'jobs-unified-top-card__job-title'})
    descriptionNode = soup.find('div', {'class': 'jobs-description-content__text--stretch'})
    if titleNode is None or descriptionNode is None:
        return None

    # Each optional field is read independently so that one missing element
    # no longer blanks all the others.
    bulletSpans = soup.find_all('span', {'class': 'jobs-unified-top-card__bullet'})
    insightItems = soup.find_all('li', {'class': 'jobs-unified-top-card__job-insight'})
    liveBullets = driver.find_elements(By.CLASS_NAME, 'jobs-unified-top-card__bullet')
    locationHtml = liveBullets[0].get_attribute("innerHTML") if liveBullets else ''

    return {
        'jobURL': url,
        'jobTitle': titleNode.text.strip(),
        'jobDescription': descriptionNode.text.strip(),
        'jobCompanyName': _node_text(soup.find('span', {'class': 'jobs-unified-top-card__company-name'})),
        'jobLocation': (locationHtml or '').strip(),
        'jobType': _node_text(soup.find('span', {'class': 'jobs-unified-top-card__workplace-type'})),
        'jobLevel': _nth_text(insightItems, 0),
        'jobCompSize': _nth_text(insightItems, 1),
        'jobPostedDate': _node_text(soup.find('span', {'class': 'jobs-unified-top-card__posted-date'})),
        'jobApplicantsRange': _nth_text(bulletSpans, 1),
        'jobTotalApplicants': _node_text(soup.find('li', {'class': 'jobs-unified-top-card__job-insight--highlight'})),
        'URLStatus': 'Active',
    }


# Loop to extract details from URL page
start_time_cell = time.time()
jobPageTime = 0

for links in full_url_csv:
    start_time = time.time()
    # Draw a new pause for every link so the delays actually vary.
    randTime = random.randint(1, 6)
    try:
        extractDict = _scrape_job_page(links, randTime)
    except Exception:
        # Only ordinary errors are treated as "page unavailable"; a kernel
        # interrupt still stops the loop instead of being swallowed.
        extractDict = None

    if extractDict is None:
        print("URL no longer available")
        extractDict = dict.fromkeys(JOB_FIELDS, '')
        extractDict['jobURL'] = links
        extractDict['URLStatus'] = 'no longer active'

    time.sleep(randTime)
    final_dict.append(extractDict)
    print(f"links scrapped: {str(len(final_dict))}")
    print(f"overall completion: %{str(round((len(final_dict) / len(full_url_csv) * 100),2))}")
    end_time = time.time()
    jobPageTime += end_time - start_time
    print(f"elapsed time: {str(round(jobPageTime,2))} seconds" )
    print("----------------------------")


final_export = pd.DataFrame(final_dict)
final_export.to_csv(f"lnkd_scrapping_jobs_[{len(full_url_csv)}].csv", index=False, header=True)
end_time_cell = time.time()
print(f"Total elapsed time: {round((end_time_cell - start_time_cell) / 60,2)} minutes")



links scrapped: 1
overall completion: %1.33
elapsed time: 9.47 seconds
----------------------------
links scrapped: 2
overall completion: %2.67
elapsed time: 19.03 seconds
----------------------------
links scrapped: 3
overall completion: %4.0
elapsed time: 28.27 seconds
----------------------------
links scrapped: 4
overall completion: %5.33
elapsed time: 37.31 seconds
----------------------------
links scrapped: 5
overall completion: %6.67
elapsed time: 46.43 seconds
----------------------------
links scrapped: 6
overall completion: %8.0
elapsed time: 55.51 seconds
----------------------------
links scrapped: 7
overall completion: %9.33
elapsed time: 64.59 seconds
----------------------------
links scrapped: 8
overall completion: %10.67
elapsed time: 73.81 seconds
----------------------------
links scrapped: 9
overall completion: %12.0
elapsed time: 83.12 seconds
----------------------------
links scrapped: 10
overall completion: %13.33
elapsed time: 92.32 seconds
-------------------