# Scrape Linkedin Data

In [1]:
# Make sure we have installed the dependency
! pip freeze | grep linkedin

linkedin-scraper==2.11.2


In [2]:
! google-chrome-stable --version

Google Chrome 114.0.5735.90 


In [3]:
from linkedin_scraper import JobSearch, Job, actions
from typing import List
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

import os
from pprint import pprint
import urllib
from time import sleep

def set_chrome_options() -> Options:
    """Sets chrome options for Selenium.
    Chrome options for headless browser is enabled.
    """
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_prefs = {}
    chrome_options.experimental_options["prefs"] = chrome_prefs
    chrome_prefs["profile.default_content_settings"] = {"images": 2}
    return chrome_options

class _JobSearch(JobSearch):
    def __init__(self, final_url=None, **kwargs):
        self.final_url = final_url
        self.current_url = None
        super().__init__(**kwargs)
    
    def search(self, search_term: str, page_n) -> List[Job]:
        if self.final_url is None:
            self.current_url = os.path.join(self.base_url, "search") + f"?keywords={urllib.parse.quote(search_term)}&refresh=true"
            self.driver.get(self.current_url)

            # Get redirection URL
            self.final_url = self.driver.current_url
        else:
            self.current_url = os.path.join(self.final_url, f"&start={25*(page_n-1)}")
            self.driver.get(self.current_url)
        
        self.scroll_to_bottom()
        self.focus()
        sleep(self.WAIT_FOR_ELEMENT_TIMEOUT)

        job_listing_class_name = "jobs-search-results-list"
        job_listing = self.wait_for_element_to_load(name=job_listing_class_name)

        self.scroll_class_name_element_to_page_percent(job_listing_class_name, 0.3)
        self.focus()
        sleep(self.WAIT_FOR_ELEMENT_TIMEOUT)

        self.scroll_class_name_element_to_page_percent(job_listing_class_name, 0.6)
        self.focus()
        sleep(self.WAIT_FOR_ELEMENT_TIMEOUT)

        self.scroll_class_name_element_to_page_percent(job_listing_class_name, 1)
        self.focus()
        sleep(self.WAIT_FOR_ELEMENT_TIMEOUT)

        job_results = []
        for job_card in self.wait_for_all_elements_to_load(name="job-card-list", base=job_listing):
            job = self.scrape_job_card(job_card)
            job_results.append(job)
        return job_results

def are_same(job1: Job, job2: Job):
    if job1.job_title == job2.job_title and job1.company == job2.company:
        return True
    return False

## 1. Scrape Job Search

Scrape the first 50 pages of the search result.

In [4]:
# Set up the lower-level services for scraping
driver = webdriver.Chrome(options=set_chrome_options())
actions.login(driver, os.environ["EMAIL"], os.environ["PWORD"]) # if email and password isnt given, it'll prompt in terminal
print("... Logged in.")
job_search = _JobSearch(driver=driver, close_on_complete=False, scrape=False)

... Logged in.


In [5]:
%%time
from selenium.common.exceptions import TimeoutException

N_PAGES = 50
SEARCH_KEYWORD = "data"

jobs = []
for page_n in range(1, N_PAGES+1):
    pprint(f"Searching jobs... Keyword: {SEARCH_KEYWORD}; Page {page_n}/{N_PAGES}")
    try:
        new_batch = job_search.search(SEARCH_KEYWORD, page_n)
    except TimeoutException:
        pprint(f"SKIPPED PAGE: {page_n}")
        continue

    # Check if the new batch of jobs are duplicates, 
    # which means we have gone through all the pages and should quit scraping.
    if jobs and are_same(new_batch[0], jobs[0]):
        pprint("Found duplicate results! All the pages have been scraped. Quiting...")
        break
        
    jobs.extend(new_batch)
    pprint(f"FINISHED PAGE: {page_n}")

'Searching jobs... Keyword: data; Page 1/50'
'FINISHED PAGE: 1'
'Searching jobs... Keyword: data; Page 2/50'
'FINISHED PAGE: 2'
'Searching jobs... Keyword: data; Page 3/50'
'FINISHED PAGE: 3'
'Searching jobs... Keyword: data; Page 4/50'
'FINISHED PAGE: 4'
'Searching jobs... Keyword: data; Page 5/50'
'FINISHED PAGE: 5'
'Searching jobs... Keyword: data; Page 6/50'
'FINISHED PAGE: 6'
'Searching jobs... Keyword: data; Page 7/50'
'FINISHED PAGE: 7'
'Searching jobs... Keyword: data; Page 8/50'
'FINISHED PAGE: 8'
'Searching jobs... Keyword: data; Page 9/50'
'FINISHED PAGE: 9'
'Searching jobs... Keyword: data; Page 10/50'
'FINISHED PAGE: 10'
'Searching jobs... Keyword: data; Page 11/50'
'FINISHED PAGE: 11'
'Searching jobs... Keyword: data; Page 12/50'
'FINISHED PAGE: 12'
'Searching jobs... Keyword: data; Page 13/50'
'FINISHED PAGE: 13'
'Searching jobs... Keyword: data; Page 14/50'
'FINISHED PAGE: 14'
'Searching jobs... Keyword: data; Page 15/50'
'FINISHED PAGE: 15'
'Searching jobs... Keyword: 

In [6]:
len(jobs)

381

In [7]:
# Save today's crawl temporarily
import pickle
import datetime

current_date = datetime.datetime.now().strftime("%Y-%m-%d")
fname = f"helsinki_data_jobs_{current_date}.pkl"
with open(f"../data/tmp/{fname}", "wb") as f:
    dicted_jobs = [job.to_dict() for job in jobs]
    pickle.dump(dicted_jobs,f)

## 2. Scrape job postings

In [8]:
import logging
from linkedin_scraper import Job, actions

from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class _Job(Job):
    def __init__(self, **kwargs):
       self.job_title = ""
       self.required_skills = ""
       self.job_type_1 = ""
       self.job_type_2 = ""
 
       super().__init__(**kwargs)
    
    def scrape_logged_in(self, close_on_complete=True):
        driver = self.driver
        
        driver.get(self.linkedin_url)
        self.focus()
        self.job_title = self.wait_for_element_to_load(by=By.XPATH, name="//*[contains(@class, 'jobs-unified-top-card__job-title')]").text.strip()
        self.company = self.wait_for_element_to_load(by=By.XPATH, name="//*[contains(@class, 'job-details-jobs-unified-top-card__primary-description')]//a[1]").text.strip()
        self.company_linkedin_url = self.wait_for_element_to_load(by=By.XPATH, name="//*[contains(@class, 'job-details-jobs-unified-top-card__primary-description')]//a").get_attribute("href")
        self.location = self.wait_for_element_to_load(by=By.XPATH, name="//*[contains(@class, 'job-details-jobs-unified-top-card__primary-description')]//*").text.strip()
        self.posted_date = self.wait_for_element_to_load(by=By.XPATH, name="//*[contains(@class, 'job-details-jobs-unified-top-card__primary-description')]//span[3]").text.strip()
        self.job_type_1 = self.wait_for_element_to_load(by=By.XPATH, name="//*[contains(@class, 'ui-label ui-label--accent-3 text-body-small')]/span").text.strip()
        self.job_description = self.wait_for_element_to_load(by=By.XPATH, name="//*[contains(@class, 'jobs-description')]").text.strip()
        
        try:
            self.required_skills = self.wait_for_element_to_load(by=By.XPATH, name="//*[contains(@class, 'job-details-how-you-match__skills-item')][1]//a").text.strip()
        except TimeoutException as e:
            logger.error(str(e))

        try:
            self.required_skills += self.wait_for_element_to_load(by=By.XPATH, name="//*[contains(@class, 'job-details-how-you-match__skills-item')][2]//a").text.strip()
        except TimeoutException as e:
            logger.error(str(e))

        try:
            self.job_type_2 = self.wait_for_element_to_load(by=By.XPATH, name="(//*[contains(@class, 'ui-label ui-label--accent-3 text-body-small')])[2]/span").text.strip()
        except TimeoutException:
            self.job_type_2 = ""
            
        try:
            self.applicant_count = self.wait_for_element_to_load(by=By.XPATH, name="jobs-unified-top-card__applicant-count").text.strip()
        except TimeoutException:
            self.applicant_count = 0
        
        try:
            self.benefits = self.wait_for_element_to_load(by=By.XPATH, name="//*[contains(@class, 'salary-main-rail-card')]").text.strip()
        except TimeoutException:
            self.benefits = ""

        if close_on_complete:
            driver.close()

In [9]:
from typing import List
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

import os
from pprint import pprint
import urllib
from time import sleep

def set_chrome_options() -> Options:
    """Sets chrome options for Selenium.
    Chrome options for headless browser is enabled.
    """
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_prefs = {}
    chrome_options.experimental_options["prefs"] = chrome_prefs
    chrome_prefs["profile.default_content_settings"] = {"images": 2}
    return chrome_options

In [10]:
# Set up low-level servies for scraping
driver = webdriver.Chrome(options=set_chrome_options())
actions.login(driver, os.environ["EMAIL"], os.environ["PWORD"]) 
print("... Logged in.")

... Logged in.


Ignore the error logs!

In [11]:
import pickle
import datetime

current_date = datetime.datetime.now().strftime("%Y-%m-%d")
fname = f"helsinki_data_jobs_{current_date}.pkl"

with open(f"../data/tmp/{fname}", "rb") as f:
    jobs = pickle.load(f)

print(len(jobs))

381


In [12]:
%%time
from selenium.common.exceptions import StaleElementReferenceException, TimeoutException
from time import sleep

N_JOBS = len(jobs)

crawled_jobs = []
for i, job in enumerate(jobs):
    print(f"Crawling... Jobs {i+1}/{N_JOBS}")
    try:
        _crawled_job = _Job(linkedin_url=job.get("linkedin_url"), driver=driver, close_on_complete=False, scrape=True)
        crawled_jobs.append(_crawled_job)
        sleep(1)
    except StaleElementReferenceException or TimeoutException:
        print(f"... Skipped Job {i+1}/{N_JOBS}.")
        sleep(1)
        continue

Crawling... Jobs 1/381
Crawling... Jobs 2/381
Crawling... Jobs 3/381
Crawling... Jobs 4/381


ERROR:__main__:Message: 
Stacktrace:
#0 0x55d8897ea4e3 <unknown>
#1 0x55d889519c76 <unknown>
#2 0x55d889555c96 <unknown>
#3 0x55d889555dc1 <unknown>
#4 0x55d88958f7f4 <unknown>
#5 0x55d88957503d <unknown>
#6 0x55d88958d30e <unknown>
#7 0x55d889574de3 <unknown>
#8 0x55d88954a2dd <unknown>
#9 0x55d88954b34e <unknown>
#10 0x55d8897aa3e4 <unknown>
#11 0x55d8897ae3d7 <unknown>
#12 0x55d8897b8b20 <unknown>
#13 0x55d8897af023 <unknown>
#14 0x55d88977d1aa <unknown>
#15 0x55d8897d36b8 <unknown>
#16 0x55d8897d3847 <unknown>
#17 0x55d8897e3243 <unknown>
#18 0x7f08c1a94ac3 <unknown>

ERROR:__main__:Message: 
Stacktrace:
#0 0x55d8897ea4e3 <unknown>
#1 0x55d889519c76 <unknown>
#2 0x55d889555c96 <unknown>
#3 0x55d889555dc1 <unknown>
#4 0x55d88958f7f4 <unknown>
#5 0x55d88957503d <unknown>
#6 0x55d88958d30e <unknown>
#7 0x55d889574de3 <unknown>
#8 0x55d88954a2dd <unknown>
#9 0x55d88954b34e <unknown>
#10 0x55d8897aa3e4 <unknown>
#11 0x55d8897ae3d7 <unknown>
#12 0x55d8897b8b20 <unknown>
#13 0x55d8897af02

Crawling... Jobs 5/381
Crawling... Jobs 6/381


WebDriverException: Message: unknown error: net::ERR_CONNECTION_REFUSED
  (Session info: headless chrome=114.0.5735.90)
Stacktrace:
#0 0x55d8897ea4e3 <unknown>
#1 0x55d889519c76 <unknown>
#2 0x55d889511c7f <unknown>
#3 0x55d889503ca2 <unknown>
#4 0x55d889505412 <unknown>
#5 0x55d8895040ca <unknown>
#6 0x55d889503168 <unknown>
#7 0x55d889502fa0 <unknown>
#8 0x55d8895019bf <unknown>
#9 0x55d889501fed <unknown>
#10 0x55d88951bb06 <unknown>
#11 0x55d88958d9e5 <unknown>
#12 0x55d889575012 <unknown>
#13 0x55d88958d30e <unknown>
#14 0x55d889574de3 <unknown>
#15 0x55d88954a2dd <unknown>
#16 0x55d88954b34e <unknown>
#17 0x55d8897aa3e4 <unknown>
#18 0x55d8897ae3d7 <unknown>
#19 0x55d8897b8b20 <unknown>
#20 0x55d8897af023 <unknown>
#21 0x55d88977d1aa <unknown>
#22 0x55d8897d36b8 <unknown>
#23 0x55d8897d3847 <unknown>
#24 0x55d8897e3243 <unknown>
#25 0x7f08c1a94ac3 <unknown>


In [13]:
import pandas as pd

In [14]:
df_crawled_jobs = pd.DataFrame([vars(job) for job in crawled_jobs]
                              ).drop(columns=["driver"]
                              ).drop_duplicates("linkedin_url")

In [15]:
df_crawled_jobs

Unnamed: 0,job_title,required_skills,job_type_1,job_type_2,linkedin_url,company,company_linkedin_url,location,posted_date,applicant_count,job_description,benefits
0,QARA Specialist / Senior Specialist,"EnglishAttention to Detail, Communication, IEC...",Hybrid,Full-time,https://www.linkedin.com/jobs/view/3745632980/...,Topcon Healthcare Europe,https://www.linkedin.com/company/topconhealthc...,"Topcon Healthcare Europe · Oulu, North Ostrobo...",3 days ago,0,About the job\nDo you want to have a direct im...,
1,Data Architect,Data Analytics and Data WarehousingData Archit...,Hybrid,Full-time,https://www.linkedin.com/jobs/view/3725618198/...,Nortal,https://www.linkedin.com/company/nortal/life,"Nortal · Helsinki, Uusimaa, Finland Reposted ...",Reposted 2 weeks ago,0,About the job\nOverview\n\nDo you enjoy being ...,
2,Technical Business Analyst in FCIIA,"Analytical SkillsAgile Project Management, Bus...",On-site,Full-time,https://www.linkedin.com/jobs/view/3733051402/...,Nordea,https://www.linkedin.com/company/nordea/life,"Nordea · Helsinki, Uusimaa, Finland 2 weeks a...",2 weeks ago,0,About the job\nJob ID: 19860 \n Do you have a ...,
3,Junior Data Scientist,,Remote,Full-time,https://www.linkedin.com/jobs/view/3756229497/...,Baleen Labs Ltd.,https://www.linkedin.com/company/baleen-labs/life,Baleen Labs Ltd. · European Economic Area 5 h...,5 hours ago,0,About the job\nSee more,
4,"Data Architect, Digital Society","Data Engineering, Extract, Transform, Load (ET...",Full-time,,https://www.linkedin.com/jobs/view/3755141358/...,CGI,https://www.linkedin.com/company/cgi/life,"CGI · Helsinki, Uusimaa, Finland 1 day ago ·...",1 day ago,0,About the job\nPosition Description\n\nVuonna ...,


In [None]:
# df_crawled_jobs.to_csv(f"../data/crawled_jobs_1-{len(crawled_jobs}_checkpoint.csv", index=False)

### 2.1 Continue from the failed point

In [30]:
# In case session expiration
driver = webdriver.Chrome(options=set_chrome_options())
actions.login(driver, os.environ["EMAIL"], os.environ["PWORD"]) 
print("... Logged in.")

... Logged in.


In [31]:
%%time
# Continue
from selenium.common.exceptions import StaleElementReferenceException, TimeoutException

CONTINUE_FROM = 348

for i, job in enumerate(jobs):
    if i+1<CONTINUE_FROM:
        continue
        
    print(f"Crawling... Jobs {i+1}/{N_JOBS}")
    try:
        _crawled_job = _Job(linkedin_url=job.get("linkedin_url"), driver=driver, close_on_complete=False, scrape=True)
        crawled_jobs.append(_crawled_job)
        sleep(1)
    except StaleElementReferenceException or TimeoutException:
        print(f"... Skipped Job {i+1}/{N_JOBS}.")
        sleep(1)
        continue

Crawling... Jobs 348/381
Crawling... Jobs 349/381
Crawling... Jobs 350/381
Crawling... Jobs 351/381
Crawling... Jobs 352/381
Crawling... Jobs 353/381
Crawling... Jobs 354/381
Crawling... Jobs 355/381
Crawling... Jobs 356/381
Crawling... Jobs 357/381
Crawling... Jobs 358/381
Crawling... Jobs 359/381
Crawling... Jobs 360/381
Crawling... Jobs 361/381
Crawling... Jobs 362/381
Crawling... Jobs 363/381
Crawling... Jobs 364/381
Crawling... Jobs 365/381
Crawling... Jobs 366/381
Crawling... Jobs 367/381
Crawling... Jobs 368/381
Crawling... Jobs 369/381
Crawling... Jobs 370/381
Crawling... Jobs 371/381
Crawling... Jobs 372/381
Crawling... Jobs 373/381
Crawling... Jobs 374/381
Crawling... Jobs 375/381
Crawling... Jobs 376/381
Crawling... Jobs 377/381
Crawling... Jobs 378/381
Crawling... Jobs 379/381
Crawling... Jobs 380/381
Crawling... Jobs 381/381
CPU times: user 1.58 s, sys: 268 ms, total: 1.85 s
Wall time: 7min 34s


In [32]:
import pandas as pd
df_crawled_jobs = pd.DataFrame([vars(job) for job in crawled_jobs]).drop(columns=["driver"]).drop_duplicates("linkedin_url")
df_crawled_jobs

Unnamed: 0,job_title,required_skills,job_type_1,job_type_2,linkedin_url,company,company_linkedin_url,location,posted_date,applicant_count,job_description,benefits
0,QARA Specialist / Senior Specialist,"EnglishAttention to Detail, Communication, IEC...",Hybrid,Full-time,https://www.linkedin.com/jobs/view/3745632980/...,Topcon Healthcare Europe,https://www.linkedin.com/company/topconhealthc...,"Topcon Healthcare Europe · Oulu, North Ostrobo...",3 days ago,0,About the job\nDo you want to have a direct im...,
1,Data Architect,Data Analytics and Data WarehousingData Archit...,Hybrid,Full-time,https://www.linkedin.com/jobs/view/3725618198/...,Nortal,https://www.linkedin.com/company/nortal/life,"Nortal · Helsinki, Uusimaa, Finland Reposted ...",Reposted 2 weeks ago,0,About the job\nOverview\n\nDo you enjoy being ...,
2,Technical Business Analyst in FCIIA,"Analytical SkillsAgile Project Management, Bus...",On-site,Full-time,https://www.linkedin.com/jobs/view/3733051402/...,Nordea,https://www.linkedin.com/company/nordea/life,"Nordea · Helsinki, Uusimaa, Finland 2 weeks a...",2 weeks ago,0,About the job\nJob ID: 19860 \n Do you have a ...,
3,Junior Data Scientist,,Remote,Full-time,https://www.linkedin.com/jobs/view/3756229497/...,Baleen Labs Ltd.,https://www.linkedin.com/company/baleen-labs/life,Baleen Labs Ltd. · European Economic Area 5 h...,5 hours ago,0,About the job\nSee more,
4,"Data Architect, Digital Society","Data Engineering, Extract, Transform, Load (ET...",Full-time,,https://www.linkedin.com/jobs/view/3755141358/...,CGI,https://www.linkedin.com/company/cgi/life,"CGI · Helsinki, Uusimaa, Finland 1 day ago ·...",1 day ago,0,About the job\nPosition Description\n\nVuonna ...,
...,...,...,...,...,...,...,...,...,...,...,...,...
371,"Senior Data Analyst, Ads","Data Analysis, Python (Programming Language), ...",Hybrid,Full-time,https://www.linkedin.com/jobs/view/3702242885/...,Rovio Entertainment Corporation,https://www.linkedin.com/company/rovio/life,"Rovio Entertainment Corporation · Helsinki, Uu...",Reposted 2 weeks ago,0,About the job\nAt Rovio you will get to work w...,
372,Data Warehouse Engineer,"Computer Science, Data Warehousing, Databases,...",Hybrid,Full-time,https://www.linkedin.com/jobs/view/3749884540/...,Schibsted Finland,https://www.linkedin.com/company/schibsted-fin...,"Schibsted Finland · Helsinki, Uusimaa, Finland...",3 days ago,0,About the job\nTHE OPPORTUNITY IN A NUTSHELL\n...,
373,Senior Analytics Engineer,"Analytical Skills, Data Analysis, English, and...",Hybrid,Full-time,https://www.linkedin.com/jobs/view/3744849073/...,Ageras,https://www.linkedin.com/company/ageras-global...,"Ageras · Espoo, Uusimaa, Finland 1 week ago ...",1 week ago,0,About the job\nWe are looking for a Senior Ana...,
374,Data Engineer,"Extract, Transform, Load (ETL), Python (Progra...",Hybrid,Full-time,https://www.linkedin.com/jobs/view/3740586858/...,Bean Solutions Oy,https://www.linkedin.com/company/bean-solution...,Bean Solutions Oy · Helsinki Metropolitan Area...,2 weeks ago,0,About the job\nSinä tietovarastoympäristöistä ...,


In [33]:
# Save today's crawl
import datetime

current_date = datetime.datetime.now().strftime("%Y-%m-%d")
fname = f"../data/crawled_jobs_{current_date}.csv"

df_crawled_jobs.to_csv(fname, index=False)