    Scrape LinkedIn using Selenium, Requests and BeautifulSoup

The following details will be scraped:
- job id
- job title
- seniority level
- location
- job description
- number of candidates
- posted time ago

- To scrape the job ids, Selenium is used to navigate to: https://www.linkedin.com/jobs/search?
- LinkedIn credentials and a chromedriver executable are required
- After that, the other job details (level, description...) are scraped with a simple GET request from the requests library, using each already-collected job id

    1. Scraping LinkedIn Jobs Ids using Selenium and BeautifulSoup

In [27]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
from selenium.webdriver.chrome.service import Service
import requests

import time, datetime
import pandas as pd
import numpy as np 
import math, re, sys
import warnings
warnings.filterwarnings("ignore")


    LogIn to LinkedIn using selenium

LinkedIn credentials are stored in '../data/user_credentials.txt', so they can be accessed and changed from there

In [1]:
# Read the LinkedIn credentials from a local text file. Every line is read and
# stripped of trailing whitespace (including the newline).
with open('../data/user_credentials.txt', 'r', encoding="utf-8") as file:
    user_credentials = file.readlines()
    user_credentials = [line.rstrip() for line in user_credentials]

# Line 1 is used as the e-mail and line 2 as the password; an IndexError is
# raised here if the file contains fewer than two lines.
email,passwd = user_credentials[0],user_credentials[1]
# NOTE(review): as the last expression of the cell this displays both values,
# i.e. the password ends up in the notebook output - consider removing it
# before sharing the notebook.
email, passwd

('email', 'password')

    Set up the chromedriver and open the linkedIn page

In [29]:
# Start Chrome through the chromedriver binary located at this relative path.
chrome_driver_path = '../chromedriver-mac-arm64/chromedriver'
service = Service(executable_path=chrome_driver_path)

options = webdriver.ChromeOptions()
options.add_argument("--start-maximized")
driver = webdriver.Chrome(options=options, service=service)

# Open the login page and give it a fixed 5 seconds to load.
driver.get('https://www.linkedin.com/login')
time.sleep(5) # fixed wait for the page to load

# Fill in the login form with the credentials read earlier.
email_input = driver.find_element(By.ID, 'username')
password_input = driver.find_element(By.ID, 'password')
email_input.send_keys(email)
password_input.send_keys(passwd)

# Submit the form by pressing Enter in the password field
password_input.send_keys(Keys.ENTER)

# Fixed 10 second wait after submitting, before the next cell navigates on.
time.sleep(10)



    Scraping LinkedIn Job IDs

Search parameters such as job title and location must be provided. LinkedIn displays search results across many pages, with 25 jobs listed on each page. To navigate to each page, the start parameter is used, and each page is scrolled to the bottom so that the full data can be loaded. Finally, the HTML content is parsed with BeautifulSoup to get the job ids of the search results

In [30]:
job_ids = []

In [31]:
def scroll_to_bottom(driver, sleep_time=120, scroll_pause=1.0):
    """Scroll the window down until the page height stops growing.

    After every scroll the function pauses for ``scroll_pause`` seconds
    before re-reading ``document.body.scrollHeight``. Without that pause the
    height is read back immediately, before lazily loaded content has had
    any time to extend the page, so the loop would stop after the very first
    scroll.

    Args:
        driver: object exposing ``execute_script(script)`` (a Selenium
            WebDriver in this notebook).
        sleep_time: seconds to wait once after the last scroll, so the
            remaining content can finish loading.
        scroll_pause: seconds to wait after each individual scroll before
            measuring the page height again.

    Returns:
        None.
    """
    height_script = 'return document.body.scrollHeight'
    last_height = driver.execute_script(height_script)
    while True:
        driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
        # Give the page a moment to append new content before measuring.
        time.sleep(scroll_pause)
        new_height = driver.execute_script(height_script)
        if new_height == last_height:
            # Nothing was added by the last scroll: the bottom is reached.
            break
        last_height = new_height

    time.sleep(sleep_time)

In [32]:
# Search parameters. `keywords` is already percent-encoded ("%20" is a space).
keywords = 'junior%20developer'
location = 'Sofia'
start = 0  # offset of the first result shown on the page

# Use the "www." host, the same one the pagination cell requests, so every
# results page is fetched from the same origin without an extra redirect.
url = f'https://www.linkedin.com/jobs/search/?keywords={keywords}&location={location}&start={start}'
url = requests.utils.requote_uri(url)
driver.get(url)
# Scroll down so the lazily rendered job cards end up in the page source.
scroll_to_bottom(driver, sleep_time=120)

Parse the HTML content of the page and get number of jobs and pages as well

In [33]:
soup = BeautifulSoup(driver.page_source, 'html.parser')

# The first whitespace-separated token of the results subtitle is taken as the
# total number of jobs. Non-digit characters (thousands separators such as
# "1,234", a trailing "+") are stripped before converting, otherwise int()
# would fail and the count would silently fall back to 0 - which would limit
# the scrape to the first page.
number_of_jobs = 0
div_number_of_jobs = soup.find("div",{"class":"jobs-search-results-list__subtitle"})
count_span = div_number_of_jobs.find('span') if div_number_of_jobs is not None else None
if count_span is not None:
    count_tokens = count_span.get_text().split()
    count_digits = re.sub(r"\D", "", count_tokens[0]) if count_tokens else ""
    if count_digits:
        number_of_jobs = int(count_digits)
else:
    # Make the fallback visible instead of failing silently.
    print("Could not find the job-count element on the page; assuming 0 jobs")

# 25 results are listed per page.
number_of_pages=math.ceil(number_of_jobs/25)
print("number_of_jobs:",number_of_jobs)
print("number_of_pages:",number_of_pages)

number_of_jobs: 38
number_of_pages: 2


In [34]:
def find_job_ids(soup):
    """Return the job id attribute of every job card found in ``soup``.

    Looks up all ``<li>`` elements with class ``ember-view`` and reads their
    ``data-occludable-job-id`` attribute. Elements without that attribute
    contribute ``None`` to the result, so callers are expected to filter.
    """
    postings = soup.find_all('li', {'class': 'ember-view'})
    return [posting.get('data-occludable-job-id') for posting in postings]

Find job ids on the first page

In [35]:
# Ids found on the first results page; entries without an id come back as
# None and are dropped before being added to the overall list.
first_page_jobs = find_job_ids(soup)
filtered_job_ids = list(filter(lambda job_id: job_id is not None, first_page_jobs))
print(filtered_job_ids)
print("Jobs on page: ", len(filtered_job_ids))

job_ids += filtered_job_ids

['4137245989', '3936580453', '4063857099', '4122081493', '4121139946', '4133033481', '4139199423', '3935983004', '4120976805', '4132048865', '4121134848', '4139845618', '4082686855', '4129433849', '4131020637', '4072902328', '4144819234', '4144820138', '4144434105', '4142359405', '4144343176', '4145588902', '4144441725', '4143224517', '4143223837']
Jobs on page:  25


If the number of pages is more than 1, iterate over the remaining pages and get the job ids

Save extracted job ids in csv file

Can change the value of the sleep_time based on internet speed

In [36]:
# Visit the result pages after the first one. range(1, n) is empty when there
# is at most one page, so no separate guard is needed.
for page_num in range(1, number_of_pages):
    print(f"Scraping page: {page_num + 1}",end="...")
    start = 25 * page_num

    url = requests.utils.requote_uri(
        f'https://www.linkedin.com/jobs/search/?keywords={keywords}&location={location}&start={start}'
    )
    driver.get(url)
    scroll_to_bottom(driver, sleep_time=120)

    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # Keep only the entries that actually carry a job id.
    current_page_jobs = find_job_ids(soup)
    filtered_job_ids = list(filter(lambda job_id: job_id is not None, current_page_jobs))
    print(filtered_job_ids)
    job_ids += filtered_job_ids
    print(f'Jobs on page:{len(filtered_job_ids)}')

# Persist every collected id (first page included) in a single-column CSV.
ids_frame = pd.DataFrame({"Job_Id":job_ids})
ids_frame.to_csv('../data/job_ids.csv',index=False)
print('Saved job ids in csv file')

Scraping page: 2...['4144900760', '4144431991', '3787789000', '4042554544', '4144909903', '4120978886', '4142361143', '4072487387', '4142319103', '4142312846', '4130151238', '4142330252']
Jobs on page:12
Saved job ids in csv file


In [37]:
# The browser is no longer needed: the remaining cells use plain HTTP requests.
driver.quit()

    Scraping job descriptions using Requests and BeautifulSoup

https://www.scrapingdog.com/blog/scrape-linkedin-jobs/

In [38]:
import requests
from bs4 import BeautifulSoup

# Load the ids saved by the previous step back into a plain Python list.
saved_ids = pd.read_csv("../data/job_ids.csv")
list_job_ids = saved_ids["Job_Id"].tolist()

In [41]:
# Sanity check: print how many ids were loaded and display the first five.
print(len(list_job_ids))
list_job_ids[:5] # show only the first 5 to check that everything is ok

37


[4137245989, 3936580453, 4063857099, 4122081493, 4121139946]

Function designed to remove HTML tags while keeping the visible text context

In [42]:
def remove_tags(html):
    """Return the visible text of ``html`` as one space-separated string.

    ``<style>`` and ``<script>`` elements are removed from the parsed tree
    first, then every remaining stripped text fragment is joined with a
    single space.
    """
    parsed = BeautifulSoup(html, "html.parser")

    for node in parsed.find_all(['style', 'script']):
        node.decompose()

    visible_fragments = parsed.stripped_strings
    return ' '.join(visible_fragments)

Here the code takes each job ID from list_job_ids, constructs the job's LinkedIn API URL, makes a request to fetch the job listing details, and extracts relevant information from the HTML response

In [45]:
# Scraped job records are collected here, one dict per job id.
list_jobs = []

# URL template for a single job posting; "{}" is replaced by the job id.
job_url = 'https://www.linkedin.com/jobs-guest/jobs/api/jobPosting/{}'

# LinkedIn might block frequent requests, so a browser-like User-Agent header
# is sent to reduce the chance of being detected as a bot.
_USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
headers = {"User-Agent": _USER_AGENT}

def safe_find_text(soup, tag, attrs):
    """Return the stripped text of the first matching element, or None.

    ``soup.find(tag, attrs)`` is called once; when it yields a falsy result
    (no match) ``None`` is returned instead of raising.
    """
    match = soup.find(tag, attrs)
    if not match:
        return None
    return match.text.strip()

# Errors that mean "this piece of the page is missing or malformed": a chained
# .find() on a missing tag raises AttributeError, and parsing the applicant
# count can raise IndexError / ValueError.
_PARSE_ERRORS = (AttributeError, IndexError, TypeError, ValueError)


def _extract_or_none(extract):
    """Run ``extract()`` and return its result, or None if the element is absent."""
    try:
        return extract()
    except _PARSE_ERRORS:
        return None


for position, job_id in enumerate(list_job_ids, start=1):
    print(f"{position} ... read jobId:{job_id}")

    # Every record starts with the full set of keys so the resulting
    # DataFrame always has the same columns, even when a request fails.
    job = {
        "Job_ID": job_id,
        "Job_txt": "",
        "company": None,
        "job-title": None,
        "level": None,
        "location": None,
        "posted-time-ago": None,
        "nb_candidats": None,
    }
    list_jobs.append(job)

    time.sleep(1)  # throttle the requests a little
    try:
        # The timeout keeps one stalled connection from hanging the whole run.
        resp = requests.get(job_url.format(job_id), headers=headers, timeout=30)
        # Error responses (e.g. rate limiting) must not be parsed as a job page.
        resp.raise_for_status()
    except requests.RequestException as err:
        print(f"    request failed for jobId:{job_id}: {err}")
        continue

    soup = BeautifulSoup(resp.text, 'html.parser')
    job["Job_txt"] = remove_tags(resp.text)

    # Company name: alt text of the image inside the first link of the top card.
    job["company"] = _extract_or_none(
        lambda: soup.find("div",{"class":"top-card-layout__card"}).find("a").find("img").get('alt')
    )
    job["job-title"] = _extract_or_none(
        lambda: soup.find("div",{"class":"top-card-layout__entity-info"}).find("a").text.strip()
    )
    # First entry of the criteria list, with its "Seniority level" label removed.
    job["level"] = _extract_or_none(
        lambda: soup.find("ul",{"class":"description__job-criteria-list"}).find("li").text.replace("Seniority level","").strip()
    )
    job["location"] = safe_find_text(soup, "span", {"class":"topcard__flavor topcard__flavor--bullet"})
    job["posted-time-ago"] = safe_find_text(soup, "span", {"class":"posted-time-ago__text topcard__flavor--metadata"})
    # Only captions that start with a number yield a count; anything else
    # stays None.
    job["nb_candidats"] = _extract_or_none(
        lambda: int(soup.find("span", {"class": "num-applicants__caption"}).text.strip().split()[0])
    )

jobs_DF = pd.DataFrame(list_jobs)

1 ... read jobId:4137245989
2 ... read jobId:3936580453
3 ... read jobId:4063857099
4 ... read jobId:4122081493
5 ... read jobId:4121139946
6 ... read jobId:4133033481
7 ... read jobId:4139199423
8 ... read jobId:3935983004
9 ... read jobId:4120976805
10 ... read jobId:4132048865
11 ... read jobId:4121134848
12 ... read jobId:4139845618
13 ... read jobId:4082686855
14 ... read jobId:4129433849
15 ... read jobId:4131020637
16 ... read jobId:4072902328
17 ... read jobId:4144819234
18 ... read jobId:4144820138
19 ... read jobId:4144434105
20 ... read jobId:4142359405
21 ... read jobId:4144343176
22 ... read jobId:4145588902
23 ... read jobId:4144441725
24 ... read jobId:4143224517
25 ... read jobId:4143223837
26 ... read jobId:4144900760
27 ... read jobId:4144431991
28 ... read jobId:3787789000
29 ... read jobId:4042554544
30 ... read jobId:4144909903
31 ... read jobId:4120978886
32 ... read jobId:4142361143
33 ... read jobId:4072487387
34 ... read jobId:4142319103
35 ... read jobId:41423

In [48]:
# Display the first rows of the scraped data as a quick visual check.
jobs_DF.head()

Unnamed: 0,Job_ID,Job_txt,company,job-title,level,location,posted-time-ago,nb_candidats
0,4137245989,"Junior Software Developer IBM Sofia, Sofia Cit...",IBM,Junior Software Developer,Entry level,"Sofia, Sofia City, Bulgaria",1 week ago,41.0
1,3936580453,"Junior Software Engineer Trading 212 Sofia, So...",Trading 212,Junior Software Engineer,Entry level,"Sofia, Sofia City, Bulgaria",1 week ago,
2,4063857099,"Junior Python Developer DXC Technology Sofia, ...",DXC Technology,Junior Python Developer,Entry level,"Sofia, Sofia City, Bulgaria",3 months ago,
3,4122081493,"Junior Java Engineer Dreamix Sofia, Sofia City...",Dreamix,Junior Java Engineer,Entry level,"Sofia, Sofia City, Bulgaria",3 weeks ago,
4,4121139946,Junior Backend Developer LimeChain - Blockchai...,LimeChain - Blockchain & Web3 Solutions,Junior Backend Developer,Entry level,Sofia Metropolitan Area,3 weeks ago,174.0


All LinkedIn job details are scraped, so the next step is to process the data: create new columns (like posted_date) and clean the job description as well

The function below cleans the job description by removing unwanted text that comes from LinkedIn's interface elements, such as login prompts, buttons, and legal disclaimers

In [49]:
# Fragments of LinkedIn's sign-in / sign-up interface that end up in the
# scraped page text. They are removed in this order.
_LINKEDIN_UI_PHRASES = (
    "Remove photo First name Last name Email Password (8+ characters) ",
    "By clicking Agree & Join",
    "you agree to the LinkedIn User Agreement",
    "Privacy Policy and Cookie Policy",
    "Continue Agree & Join or Apply on company website",
    "Security verification",
    "Close Already on LinkedIn ?",
    "Close Already on LinkedIn?",
    "Sign in Save Save job Save this job with your existing LinkedIn profile , or create a new one",
    "Sign in Save Save job Save this job with your existing LinkedIn profile, or create a new one",
    "Your job seeking activity is only visible to you",
    "Email Continue Welcome back",
)


def clean_job_description(text):
    """Strip LinkedIn interface boilerplate from a scraped job text.

    Every occurrence of each known phrase is removed (not just the first
    one). Surrounding whitespace is left untouched.

    Args:
        text: the scraped job text. Non-string values (``None``, ``NaN``)
            are returned unchanged instead of raising.

    Returns:
        The text with all boilerplate phrases removed.
    """
    if not isinstance(text, str):
        return text

    for phrase in _LINKEDIN_UI_PHRASES:
        text = text.replace(phrase, "")

    return text

In [50]:
# Approximate number of days per time unit, matched by prefix so that both
# singular and plural forms ("week"/"weeks") are covered. Sub-day units count
# as 0 days: such a job was posted on the scraping date itself.
_DAYS_PER_UNIT = (
    ("second", 0),
    ("minute", 0),
    ("hour", 0),
    ("day", 1),
    ("week", 7),
    ("month", 30),
    ("year", 365),
)


def get_posted_date(posted_time_ago, date_scraping):
    """Estimate the posting date from a relative "N <unit> ago" text.

    The first token is read as a count and the second as a unit; the count is
    converted to days (1 week = 7 days, 1 month = 30 days, 1 year = 365 days)
    and subtracted from ``date_scraping``.

    Args:
        posted_time_ago: text such as ``"3 weeks ago"``; may be ``None``.
        date_scraping: date the data was scraped on; anything that supports
            subtracting a ``datetime.timedelta``.

    Returns:
        ``date_scraping`` minus the estimated number of days, or ``None``
        when the text is missing, cannot be parsed, or the subtraction is
        not possible.
    """
    if not isinstance(posted_time_ago, str):
        return None

    details = posted_time_ago.split()
    if len(details) < 2:
        return None

    try:
        amount = int(details[0])
    except ValueError:
        return None

    unit = details[1].lower()
    for prefix, days_per_unit in _DAYS_PER_UNIT:
        if unit.startswith(prefix):
            try:
                return date_scraping - datetime.timedelta(days=amount * days_per_unit)
            except (TypeError, OverflowError):
                # Unsupported date type or a value out of the date range.
                return None

    return None

In [53]:
# Stamp every row with today's date and derive the posting date from the
# relative "posted-time-ago" text, row by row.
# NOTE(review): np.vectorize probes the function with the first element to pick
# the output type and, as far as I know, fails on empty input unless `otypes`
# is given - confirm this cell behaves as wanted when nothing was scraped.
jobs_DF['scraping_date'] = pd.to_datetime(datetime.date.today())
jobs_DF['posted_date'] = np.vectorize(get_posted_date)(jobs_DF['posted-time-ago'], jobs_DF['scraping_date'])

# Remove the LinkedIn interface boilerplate from the scraped page text.
jobs_DF['Job_txt'] = jobs_DF['Job_txt'].apply(clean_job_description)
# Drop a leftover "Employment type" label fragment from the level column;
# None values are passed through unchanged.
jobs_DF.level = jobs_DF.level.apply(lambda x:x.replace("Employment type\n        \n\n          ","") if x is not None else x)

# Display the first rows of the processed data.
jobs_DF.head()

Unnamed: 0,Job_ID,Job_txt,company,job-title,level,location,posted-time-ago,nb_candidats,scraping_date,posted_date
0,4137245989,"Junior Software Developer IBM Sofia, Sofia Cit...",IBM,Junior Software Developer,Entry level,"Sofia, Sofia City, Bulgaria",1 week ago,41.0,2025-02-06,2025-01-30
1,3936580453,"Junior Software Engineer Trading 212 Sofia, So...",Trading 212,Junior Software Engineer,Entry level,"Sofia, Sofia City, Bulgaria",1 week ago,,2025-02-06,2025-01-30
2,4063857099,"Junior Python Developer DXC Technology Sofia, ...",DXC Technology,Junior Python Developer,Entry level,"Sofia, Sofia City, Bulgaria",3 months ago,,2025-02-06,2024-11-08
3,4122081493,"Junior Java Engineer Dreamix Sofia, Sofia City...",Dreamix,Junior Java Engineer,Entry level,"Sofia, Sofia City, Bulgaria",3 weeks ago,,2025-02-06,2025-01-16
4,4121139946,Junior Backend Developer LimeChain - Blockchai...,LimeChain - Blockchain & Web3 Solutions,Junior Backend Developer,Entry level,Sofia Metropolitan Area,3 weeks ago,174.0,2025-02-06,2025-01-16


Save to json file

In [54]:
# Write the processed DataFrame to a JSON file using pandas' default settings.
jobs_DF.to_json("../data/linkedin_jobs_scraped.json")