# Scrape Linkedin Data

In [1]:
# Make sure we have installed the dependency
! pip freeze | grep linkedin

linkedin-scraper==2.11.2


In [2]:
! google-chrome-stable --version

Google Chrome 114.0.5735.90 


In [3]:
from linkedin_scraper import JobSearch, Job, actions
from typing import List
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

import os
from pprint import pprint
import urllib
from time import sleep

def set_chrome_options() -> Options:
    """Sets chrome options for Selenium.
    Chrome options for headless browser is enabled.
    """
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_prefs = {}
    chrome_options.experimental_options["prefs"] = chrome_prefs
    chrome_prefs["profile.default_content_settings"] = {"images": 2}
    return chrome_options

class _JobSearch(JobSearch):
    def __init__(self, final_url=None, **kwargs):
        self.final_url = final_url
        self.current_url = None
        super().__init__(**kwargs)
    
    def search(self, search_term: str, page_n) -> List[Job]:
        if self.final_url is None:
            self.current_url = os.path.join(self.base_url, "search") + f"?keywords={urllib.parse.quote(search_term)}&refresh=true"
            self.driver.get(self.current_url)

            # Get redirection URL
            self.final_url = self.driver.current_url
        else:
            self.current_url = os.path.join(self.final_url, f"&start={25*(page_n-1)}")
            self.driver.get(self.current_url)
        
        self.scroll_to_bottom()
        self.focus()
        sleep(self.WAIT_FOR_ELEMENT_TIMEOUT)

        job_listing_class_name = "jobs-search-results-list"
        job_listing = self.wait_for_element_to_load(name=job_listing_class_name)

        self.scroll_class_name_element_to_page_percent(job_listing_class_name, 0.3)
        self.focus()
        sleep(self.WAIT_FOR_ELEMENT_TIMEOUT)

        self.scroll_class_name_element_to_page_percent(job_listing_class_name, 0.6)
        self.focus()
        sleep(self.WAIT_FOR_ELEMENT_TIMEOUT)

        self.scroll_class_name_element_to_page_percent(job_listing_class_name, 1)
        self.focus()
        sleep(self.WAIT_FOR_ELEMENT_TIMEOUT)

        job_results = []
        for job_card in self.wait_for_all_elements_to_load(name="job-card-list", base=job_listing):
            job = self.scrape_job_card(job_card)
            job_results.append(job)
        return job_results

def are_same(job1: Job, job2: Job):
    if job1.job_title == job2.job_title and job1.company == job2.company:
        return True
    return False

## 1. Scrape Job Search

Scrape the first 50 pages of the search result.

In [4]:
# Set up the lower-level services for scraping
driver = webdriver.Chrome(options=set_chrome_options())
actions.login(driver, os.environ["EMAIL"], os.environ["PWORD"]) # if email and password isnt given, it'll prompt in terminal
print("... Logged in.")
job_search = _JobSearch(driver=driver, close_on_complete=False, scrape=False)

... Logged in.


In [5]:
%%time
from selenium.common.exceptions import TimeoutException

N_PAGES = 50
SEARCH_KEYWORD = "data"

jobs = []
for page_n in range(1, N_PAGES+1):
    pprint(f"Searching jobs... Keyword: {SEARCH_KEYWORD}; Page {page_n}/{N_PAGES}")
    try:
        new_batch = job_search.search(SEARCH_KEYWORD, page_n)
    except TimeoutException:
        pprint(f"SKIPPED PAGE: {page_n}")
        continue

    # Check if the new batch of jobs are duplicates, 
    # which means we have gone through all the pages and should quit scraping.
    if jobs and are_same(new_batch[0], jobs[0]):
        pprint("Found duplicate results! All the pages have been scraped. Quiting...")
        break
        
    jobs.extend(new_batch)
    pprint(f"FINISHED PAGE: {page_n}")

'FINISHED PAGE: 17'
'Searching jobs... Keyword: data; Page 18/50'
'FINISHED PAGE: 18'
'Searching jobs... Keyword: data; Page 19/50'
'FINISHED PAGE: 19'
'Searching jobs... Keyword: data; Page 20/50'
'FINISHED PAGE: 20'
'Searching jobs... Keyword: data; Page 21/50'
'SKIPPED PAGE: 21'
'Searching jobs... Keyword: data; Page 22/50'
'SKIPPED PAGE: 22'
'Searching jobs... Keyword: data; Page 23/50'
'SKIPPED PAGE: 23'
'Searching jobs... Keyword: data; Page 24/50'
'SKIPPED PAGE: 24'
'Searching jobs... Keyword: data; Page 25/50'
'SKIPPED PAGE: 25'
'Searching jobs... Keyword: data; Page 26/50'
'SKIPPED PAGE: 26'
'Searching jobs... Keyword: data; Page 27/50'
'SKIPPED PAGE: 27'
'Searching jobs... Keyword: data; Page 28/50'
'SKIPPED PAGE: 28'
'Searching jobs... Keyword: data; Page 29/50'
'SKIPPED PAGE: 29'
'Searching jobs... Keyword: data; Page 30/50'
'SKIPPED PAGE: 30'
'Searching jobs... Keyword: data; Page 31/50'
'SKIPPED PAGE: 31'
'Searching jobs... Keyword: data; Page 32/50'
'SKIPPED PAGE: 32'
'S

In [7]:
len(jobs)

362

In [8]:
# Save today's crawl temporarily
import pickle
import datetime

current_date = datetime.datetime.now().strftime("%Y-%m-%d")
fname = f"helsinki_data_jobs_{current_date}.pkl"
with open(f"../data/tmp/{fname}", "wb") as f:
    dicted_jobs = [job.to_dict() for job in jobs]
    pickle.dump(dicted_jobs,f)

## 2. Scrape job postings

In [9]:
import logging
from linkedin_scraper import Job, actions

from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class _Job(Job):
    def __init__(self, **kwargs):
       self.job_title = ""
       self.required_skills = ""
       self.job_type_1 = ""
       self.job_type_2 = ""
 
       super().__init__(**kwargs)
    
    def scrape_logged_in(self, close_on_complete=True):
        driver = self.driver
        
        driver.get(self.linkedin_url)
        self.focus()
        self.job_title = self.wait_for_element_to_load(by=By.XPATH, name="//*[contains(@class, 'jobs-unified-top-card__job-title')]").text.strip()
        self.company = self.wait_for_element_to_load(by=By.XPATH, name="//*[contains(@class, 'job-details-jobs-unified-top-card__primary-description')]//a[1]").text.strip()
        self.company_linkedin_url = self.wait_for_element_to_load(by=By.XPATH, name="//*[contains(@class, 'job-details-jobs-unified-top-card__primary-description')]//a").get_attribute("href")
        self.location = self.wait_for_element_to_load(by=By.XPATH, name="//*[contains(@class, 'job-details-jobs-unified-top-card__primary-description')]//*").text.strip()
        self.posted_date = self.wait_for_element_to_load(by=By.XPATH, name="//*[contains(@class, 'job-details-jobs-unified-top-card__primary-description')]//span[3]").text.strip()
        self.job_type_1 = self.wait_for_element_to_load(by=By.XPATH, name="//*[contains(@class, 'ui-label ui-label--accent-3 text-body-small')]/span").text.strip()
        self.job_description = self.wait_for_element_to_load(by=By.XPATH, name="//*[contains(@class, 'jobs-description')]").text.strip()
        
        try:
            self.required_skills = self.wait_for_element_to_load(by=By.XPATH, name="//*[contains(@class, 'job-details-how-you-match__skills-item')][1]//a").text.strip()
        except TimeoutException as e:
            logger.error(str(e))

        try:
            self.required_skills += self.wait_for_element_to_load(by=By.XPATH, name="//*[contains(@class, 'job-details-how-you-match__skills-item')][2]//a").text.strip()
        except TimeoutException as e:
            logger.error(str(e))

        try:
            self.job_type_2 = self.wait_for_element_to_load(by=By.XPATH, name="(//*[contains(@class, 'ui-label ui-label--accent-3 text-body-small')])[2]/span").text.strip()
        except TimeoutException:
            self.job_type_2 = ""
            
        try:
            self.applicant_count = self.wait_for_element_to_load(by=By.XPATH, name="jobs-unified-top-card__applicant-count").text.strip()
        except TimeoutException:
            self.applicant_count = 0
        
        try:
            self.benefits = self.wait_for_element_to_load(by=By.XPATH, name="//*[contains(@class, 'salary-main-rail-card')]").text.strip()
        except TimeoutException:
            self.benefits = ""

        if close_on_complete:
            driver.close()

In [10]:
from typing import List
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

import os
from pprint import pprint
import urllib
from time import sleep

def set_chrome_options() -> Options:
    """Sets chrome options for Selenium.
    Chrome options for headless browser is enabled.
    """
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_prefs = {}
    chrome_options.experimental_options["prefs"] = chrome_prefs
    chrome_prefs["profile.default_content_settings"] = {"images": 2}
    return chrome_options

In [11]:
# Set up low-level servies for scraping
driver = webdriver.Chrome(options=set_chrome_options())
actions.login(driver, os.environ["EMAIL"], os.environ["PWORD"]) 
print("... Logged in.")

... Logged in.


Ignore the error logs!

In [12]:
import pickle
import datetime

current_date = datetime.datetime.now().strftime("%Y-%m-%d")
fname = f"helsinki_data_jobs_{current_date}.pkl"

with open(f"../data/tmp/{fname}", "rb") as f:
    jobs = pickle.load(f)

print(len(jobs))

362


In [13]:
%%time
from selenium.common.exceptions import StaleElementReferenceException
from time import sleep

N_JOBS = len(jobs)

crawled_jobs = []
for i, job in enumerate(jobs):
    print(f"Crawling... Jobs {i+1}/{N_JOBS}")
    try:
        _crawled_job = _Job(linkedin_url=job.get("linkedin_url"), driver=driver, close_on_complete=False, scrape=True)
        crawled_jobs.append(_crawled_job)
        sleep(1)
    except StaleElementReferenceException:
        print(f"... Skipped Job {i+1}/{N_JOBS}.")
        sleep(1)
        continue

Crawling... Jobs 154/362
Crawling... Jobs 155/362
Crawling... Jobs 156/362
Crawling... Jobs 157/362
Crawling... Jobs 158/362
Crawling... Jobs 159/362


ERROR:__main__:Message: 
Stacktrace:
#0 0x55bc3672b4e3 <unknown>
#1 0x55bc3645ac76 <unknown>
#2 0x55bc36496c96 <unknown>
#3 0x55bc36496dc1 <unknown>
#4 0x55bc364d07f4 <unknown>
#5 0x55bc364b603d <unknown>
#6 0x55bc364ce30e <unknown>
#7 0x55bc364b5de3 <unknown>
#8 0x55bc3648b2dd <unknown>
#9 0x55bc3648c34e <unknown>
#10 0x55bc366eb3e4 <unknown>
#11 0x55bc366ef3d7 <unknown>
#12 0x55bc366f9b20 <unknown>
#13 0x55bc366f0023 <unknown>
#14 0x55bc366be1aa <unknown>
#15 0x55bc367146b8 <unknown>
#16 0x55bc36714847 <unknown>
#17 0x55bc36724243 <unknown>
#18 0x7fd92b094ac3 <unknown>



Crawling... Jobs 160/362
Crawling... Jobs 161/362


ERROR:__main__:Message: 
Stacktrace:
#0 0x55bc3672b4e3 <unknown>
#1 0x55bc3645ac76 <unknown>
#2 0x55bc36496c96 <unknown>
#3 0x55bc36496dc1 <unknown>
#4 0x55bc364d07f4 <unknown>
#5 0x55bc364b603d <unknown>
#6 0x55bc364ce30e <unknown>
#7 0x55bc364b5de3 <unknown>
#8 0x55bc3648b2dd <unknown>
#9 0x55bc3648c34e <unknown>
#10 0x55bc366eb3e4 <unknown>
#11 0x55bc366ef3d7 <unknown>
#12 0x55bc366f9b20 <unknown>
#13 0x55bc366f0023 <unknown>
#14 0x55bc366be1aa <unknown>
#15 0x55bc367146b8 <unknown>
#16 0x55bc36714847 <unknown>
#17 0x55bc36724243 <unknown>
#18 0x7fd92b094ac3 <unknown>



Crawling... Jobs 162/362
Crawling... Jobs 163/362


ERROR:__main__:Message: 
Stacktrace:
#0 0x55bc3672b4e3 <unknown>
#1 0x55bc3645ac76 <unknown>
#2 0x55bc36496c96 <unknown>
#3 0x55bc36496dc1 <unknown>
#4 0x55bc364d07f4 <unknown>
#5 0x55bc364b603d <unknown>
#6 0x55bc364ce30e <unknown>
#7 0x55bc364b5de3 <unknown>
#8 0x55bc3648b2dd <unknown>
#9 0x55bc3648c34e <unknown>
#10 0x55bc366eb3e4 <unknown>
#11 0x55bc366ef3d7 <unknown>
#12 0x55bc366f9b20 <unknown>
#13 0x55bc366f0023 <unknown>
#14 0x55bc366be1aa <unknown>
#15 0x55bc367146b8 <unknown>
#16 0x55bc36714847 <unknown>
#17 0x55bc36724243 <unknown>
#18 0x7fd92b094ac3 <unknown>



Crawling... Jobs 164/362
Crawling... Jobs 165/362
Crawling... Jobs 166/362
Crawling... Jobs 167/362
Crawling... Jobs 168/362
Crawling... Jobs 169/362
Crawling... Jobs 170/362
Crawling... Jobs 171/362
Crawling... Jobs 172/362
Crawling... Jobs 173/362
Crawling... Jobs 174/362


TimeoutException: Message: 
Stacktrace:
#0 0x55bc3672b4e3 <unknown>
#1 0x55bc3645ac76 <unknown>
#2 0x55bc36496c96 <unknown>
#3 0x55bc36496dc1 <unknown>
#4 0x55bc364d07f4 <unknown>
#5 0x55bc364b603d <unknown>
#6 0x55bc364ce30e <unknown>
#7 0x55bc364b5de3 <unknown>
#8 0x55bc3648b2dd <unknown>
#9 0x55bc3648c34e <unknown>
#10 0x55bc366eb3e4 <unknown>
#11 0x55bc366ef3d7 <unknown>
#12 0x55bc366f9b20 <unknown>
#13 0x55bc366f0023 <unknown>
#14 0x55bc366be1aa <unknown>
#15 0x55bc367146b8 <unknown>
#16 0x55bc36714847 <unknown>
#17 0x55bc36724243 <unknown>
#18 0x7fd92b094ac3 <unknown>


In [14]:
import pandas as pd

In [15]:
df_crawled_jobs = pd.DataFrame([vars(job) for job in crawled_jobs]
                              ).drop(columns=["driver"]
                              ).drop_duplicates("linkedin_url")

In [16]:
df_crawled_jobs

Unnamed: 0,job_title,required_skills,job_type_1,job_type_2,linkedin_url,company,company_linkedin_url,location,posted_date,applicant_count,job_description,benefits
0,"Juristi-trainee, Data ja tietosuoja","Analytical Skills, Big Data, Data Analysis, Da...",Hybrid,Full-time,https://www.linkedin.com/jobs/view/3741023935/...,OP Financial Group,https://www.linkedin.com/company/op-financial-...,"OP Financial Group · Helsinki, Uusimaa, Finlan...",2 weeks ago,0,About the job\nHaku OP Ryhmän vuoden 2024 Kiit...,
1,Data Architect,Data Analytics and Data WarehousingData Archit...,Hybrid,Full-time,https://www.linkedin.com/jobs/view/3725618198/...,Nortal,https://www.linkedin.com/company/nortal/life,"Nortal · Helsinki, Uusimaa, Finland Reposted ...",Reposted 2 weeks ago,0,About the job\nOverview\n\nDo you enjoy being ...,
2,ETL Specialist,"Data Warehousing, English, Extract, Transform,...",Hybrid,Full-time,https://www.linkedin.com/jobs/view/3743271896/...,Gazelle Global,https://www.linkedin.com/company/gazelle-globa...,"Gazelle Global · Helsinki, Uusimaa, Finland 1...",1 week ago,0,About the job\nETL Specialist\n\n A great oppo...,
3,JVM Performance and Tuning Engineer,"Business Logic, Garbage Collection, Honeycomb,...",Remote,Full-time,https://www.linkedin.com/jobs/view/3734708994/...,RELEX Solutions,https://www.linkedin.com/company/relexsolution...,RELEX Solutions · Finland 3 weeks ago · 10 a...,3 weeks ago,0,About the job\nRELEX Solutions create cutting-...,
4,Data Science - Machine Learning Engineer,"Artificial Intelligence (AI), Computer Science...",Remote,Full-time,https://www.linkedin.com/jobs/view/3629670334/...,Wolt,https://www.linkedin.com/company/wolt-oy/life,"Wolt · Helsinki, Uusimaa, Finland Reposted 1 ...",Reposted 1 day ago,0,About the job\nJob Description\n\nTeam purpose...,
...,...,...,...,...,...,...,...,...,...,...,...,...
168,Remote Data Contributor – Image collection,Artificial Intelligence (AI) and Defining Requ...,Remote,Temporary,https://www.linkedin.com/jobs/view/3732069308/...,TransPerfect,https://www.linkedin.com/company/transperfect/...,TransPerfect · Helsinki Metropolitan Area 2 w...,2 weeks ago,0,About the job\nPosition: Data Contributor \nP...,
169,Remote Data Contributor – Image collection,Artificial Intelligence (AI) and Defining Requ...,Remote,Temporary,https://www.linkedin.com/jobs/view/3732024518/...,TransPerfect,https://www.linkedin.com/company/transperfect/...,"TransPerfect · Espoo, Uusimaa, Finland 2 week...",2 weeks ago,0,About the job\nPosition: Data Contributor \nP...,
170,Nepali Transcriber,"Data Analytics and EnglishAttention to Detail,...",On-site,Full-time,https://www.linkedin.com/jobs/view/3746515151/...,TELUS International,https://www.linkedin.com/company/telus-interna...,"TELUS International · Pirkanmaa, Finland 1 we...",1 week ago,0,About the job\nTELUS International is looking ...,
171,Financial Crime Prevention Senior/Master Exper...,"Analytical SkillsBudgeting, Communication, Cor...",On-site,Full-time,https://www.linkedin.com/jobs/view/3720314180/...,Nordea,https://www.linkedin.com/company/nordea/life,"Nordea · Helsinki, Uusimaa, Finland Reposted ...",Reposted 1 week ago,0,About the job\nJob ID: 19410 \n As a Senior Ex...,


In [15]:
# df_crawled_jobs.to_csv(f"../data/crawled_jobs_1-{len(crawled_jobs}_checkpoint.csv", index=False)

### 2.1 Continue from the failed point

In [17]:
# In case session expiration
driver = webdriver.Chrome(options=set_chrome_options())
actions.login(driver, os.environ["EMAIL"], os.environ["PWORD"]) 
print("... Logged in.")

... Logged in.


In [None]:
# Continue

CONTINUE_FROM = 297

for i, job in enumerate(jobs):
    if i+1<CONTINUE_FROM:
        continue
        
    print(f"Crawling... Jobs {i+1}/{N_JOBS}")
    try:
        _crawled_job = _Job(linkedin_url=job.get("linkedin_url"), driver=driver, close_on_complete=False, scrape=True)
        crawled_jobs.append(_crawled_job)
        sleep(1)
    except StaleElementReferenceException:
        print(f"... Skipped Job {i+1}/{N_JOBS}.")
        sleep(1)
        continue

Crawling... Jobs 357/362
Crawling... Jobs 358/362
Crawling... Jobs 359/362
Crawling... Jobs 360/362
Crawling... Jobs 361/362
Crawling... Jobs 362/362


In [None]:
df_crawled_jobs = pd.DataFrame([vars(job) for job in crawled_jobs]).drop(columns=["driver"]).drop_duplicates("linkedin_url")
df_crawled_jobs

In [32]:
# Save today's crawl
import datetime

current_date = datetime.datetime.now().strftime("%Y-%m-%d")
fname = f"../data/crawled_jobs_{current_date}.csv"

df_crawled_jobs.to_csv(fname, index=False)