# Scrape Linkedin Data

In [1]:
# Make sure we have installed the dependency
! pip freeze | grep linkedin

linkedin-scraper==2.11.2


In [2]:
! google-chrome-stable --version

Google Chrome 114.0.5735.90 


In [3]:
from linkedin_scraper import JobSearch, Job, actions
from typing import List
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

import os
from pprint import pprint
import urllib
from time import sleep

def set_chrome_options() -> Options:
    """Sets chrome options for Selenium.
    Chrome options for headless browser is enabled.
    """
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_prefs = {}
    chrome_options.experimental_options["prefs"] = chrome_prefs
    chrome_prefs["profile.default_content_settings"] = {"images": 2}
    return chrome_options

class _JobSearch(JobSearch):
    def __init__(self, final_url=None, **kwargs):
        self.final_url = final_url
        self.current_url = None
        super().__init__(**kwargs)
    
    def search(self, search_term: str, page_n) -> List[Job]:
        if self.final_url is None:
            self.current_url = os.path.join(self.base_url, "search") + f"?keywords={urllib.parse.quote(search_term)}&refresh=true"
            self.driver.get(self.current_url)

            # Get redirection URL
            self.final_url = self.driver.current_url
        else:
            self.current_url = os.path.join(self.final_url, f"&start={25*(page_n-1)}")
            self.driver.get(self.current_url)
        
        self.scroll_to_bottom()
        self.focus()
        sleep(self.WAIT_FOR_ELEMENT_TIMEOUT)

        job_listing_class_name = "jobs-search-results-list"
        job_listing = self.wait_for_element_to_load(name=job_listing_class_name)

        self.scroll_class_name_element_to_page_percent(job_listing_class_name, 0.3)
        self.focus()
        sleep(self.WAIT_FOR_ELEMENT_TIMEOUT)

        self.scroll_class_name_element_to_page_percent(job_listing_class_name, 0.6)
        self.focus()
        sleep(self.WAIT_FOR_ELEMENT_TIMEOUT)

        self.scroll_class_name_element_to_page_percent(job_listing_class_name, 1)
        self.focus()
        sleep(self.WAIT_FOR_ELEMENT_TIMEOUT)

        job_results = []
        for job_card in self.wait_for_all_elements_to_load(name="job-card-list", base=job_listing):
            job = self.scrape_job_card(job_card)
            job_results.append(job)
        return job_results

def are_same(job1: Job, job2: Job):
    if job1.job_title == job2.job_title and job1.company == job2.company:
        return True
    return False

## 1. Scrape Job Search

Scrape the first 50 pages of the search result.

In [4]:
# Set up the lower-level services for scraping
driver = webdriver.Chrome(options=set_chrome_options())
actions.login(driver, os.environ["EMAIL"], os.environ["PWORD"]) # if email and password isnt given, it'll prompt in terminal
print("... Logged in.")
job_search = _JobSearch(driver=driver, close_on_complete=False, scrape=False)

... Logged in.


In [5]:
%%time
from selenium.common.exceptions import TimeoutException

N_PAGES = 50
SEARCH_KEYWORD = "data"

jobs = []
for page_n in range(1, N_PAGES+1):
    pprint(f"Searching jobs... Keyword: {SEARCH_KEYWORD}; Page {page_n}/{N_PAGES}")
    try:
        new_batch = job_search.search(SEARCH_KEYWORD, page_n)
    except TimeoutException:
        pprint(f"SKIPPED PAGE: {page_n}")
        continue

    # Check if the new batch of jobs are duplicates, 
    # which means we have gone through all the pages and should quit scraping.
    if jobs and are_same(new_batch[0], jobs[0]):
        pprint("Found duplicate results! All the pages have been scraped. Quiting...")
        break
        
    jobs.extend(new_batch)
    pprint(f"FINISHED PAGE: {page_n}")

'FINISHED PAGE: 11'
'Searching jobs... Keyword: data; Page 12/50'
'FINISHED PAGE: 12'
'Searching jobs... Keyword: data; Page 13/50'
'FINISHED PAGE: 13'
'Searching jobs... Keyword: data; Page 14/50'
'FINISHED PAGE: 14'
'Searching jobs... Keyword: data; Page 15/50'
'FINISHED PAGE: 15'
'Searching jobs... Keyword: data; Page 16/50'
'FINISHED PAGE: 16'
'Searching jobs... Keyword: data; Page 17/50'
'FINISHED PAGE: 17'
'Searching jobs... Keyword: data; Page 18/50'
'FINISHED PAGE: 18'
'Searching jobs... Keyword: data; Page 19/50'
'FINISHED PAGE: 19'
'Searching jobs... Keyword: data; Page 20/50'
'FINISHED PAGE: 20'
'Searching jobs... Keyword: data; Page 21/50'
'FINISHED PAGE: 21'
'Searching jobs... Keyword: data; Page 22/50'
'FINISHED PAGE: 22'
'Searching jobs... Keyword: data; Page 23/50'
'SKIPPED PAGE: 23'
'Searching jobs... Keyword: data; Page 24/50'
'SKIPPED PAGE: 24'
'Searching jobs... Keyword: data; Page 25/50'
'SKIPPED PAGE: 25'
'Searching jobs... Keyword: data; Page 26/50'
'SKIPPED PAGE

In [6]:
len(jobs)

384

In [7]:
# Save today's crawl temporarily
import pickle
import datetime

current_date = datetime.datetime.now().strftime("%Y-%m-%d")
fname = f"helsinki_data_jobs_{current_date}.pkl"
with open(f"../data/tmp/{fname}", "wb") as f:
    dicted_jobs = [job.to_dict() for job in jobs]
    pickle.dump(dicted_jobs,f)

## 2. Scrape job postings

In [8]:
import logging
from linkedin_scraper import Job, actions

from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class _Job(Job):
    def __init__(self, **kwargs):
       self.job_title = ""
       self.required_skills = ""
       self.job_type_1 = ""
       self.job_type_2 = ""
 
       super().__init__(**kwargs)
    
    def scrape_logged_in(self, close_on_complete=True):
        driver = self.driver
        
        driver.get(self.linkedin_url)
        self.focus()
        self.job_title = self.wait_for_element_to_load(by=By.XPATH, name="//*[contains(@class, 'jobs-unified-top-card__job-title')]").text.strip()
        self.company = self.wait_for_element_to_load(by=By.XPATH, name="//*[contains(@class, 'job-details-jobs-unified-top-card__primary-description')]//a[1]").text.strip()
        self.company_linkedin_url = self.wait_for_element_to_load(by=By.XPATH, name="//*[contains(@class, 'job-details-jobs-unified-top-card__primary-description')]//a").get_attribute("href")
        self.location = self.wait_for_element_to_load(by=By.XPATH, name="//*[contains(@class, 'job-details-jobs-unified-top-card__primary-description')]//*").text.strip()
        self.posted_date = self.wait_for_element_to_load(by=By.XPATH, name="//*[contains(@class, 'job-details-jobs-unified-top-card__primary-description')]//span[3]").text.strip()
        self.job_type_1 = self.wait_for_element_to_load(by=By.XPATH, name="//*[contains(@class, 'ui-label ui-label--accent-3 text-body-small')]/span").text.strip()
        self.job_description = self.wait_for_element_to_load(by=By.XPATH, name="//*[contains(@class, 'jobs-description')]").text.strip()
        
        try:
            self.required_skills = self.wait_for_element_to_load(by=By.XPATH, name="//*[contains(@class, 'job-details-how-you-match__skills-item')][1]//a").text.strip()
        except TimeoutException as e:
            logger.error(str(e))

        try:
            self.required_skills += self.wait_for_element_to_load(by=By.XPATH, name="//*[contains(@class, 'job-details-how-you-match__skills-item')][2]//a").text.strip()
        except TimeoutException as e:
            logger.error(str(e))

        try:
            self.job_type_2 = self.wait_for_element_to_load(by=By.XPATH, name="(//*[contains(@class, 'ui-label ui-label--accent-3 text-body-small')])[2]/span").text.strip()
        except TimeoutException:
            self.job_type_2 = ""
            
        try:
            self.applicant_count = self.wait_for_element_to_load(by=By.XPATH, name="jobs-unified-top-card__applicant-count").text.strip()
        except TimeoutException:
            self.applicant_count = 0
        
        try:
            self.benefits = self.wait_for_element_to_load(by=By.XPATH, name="//*[contains(@class, 'salary-main-rail-card')]").text.strip()
        except TimeoutException:
            self.benefits = ""

        if close_on_complete:
            driver.close()

In [9]:
from typing import List
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

import os
from pprint import pprint
import urllib
from time import sleep

def set_chrome_options() -> Options:
    """Sets chrome options for Selenium.
    Chrome options for headless browser is enabled.
    """
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_prefs = {}
    chrome_options.experimental_options["prefs"] = chrome_prefs
    chrome_prefs["profile.default_content_settings"] = {"images": 2}
    return chrome_options

In [10]:
# Set up low-level servies for scraping
driver = webdriver.Chrome(options=set_chrome_options())
actions.login(driver, os.environ["EMAIL"], os.environ["PWORD"]) 
print("... Logged in.")

... Logged in.


Ignore the error logs!

In [11]:
import pickle
import datetime

current_date = datetime.datetime.now().strftime("%Y-%m-%d")
fname = f"helsinki_data_jobs_{current_date}.pkl"

with open(f"../data/tmp/{fname}", "rb") as f:
    jobs = pickle.load(f)

print(len(jobs))

384


In [12]:
%%time
from selenium.common.exceptions import StaleElementReferenceException, TimeoutException
from time import sleep

N_JOBS = len(jobs)

crawled_jobs = []
for i, job in enumerate(jobs):
    print(f"Crawling... Jobs {i+1}/{N_JOBS}")
    try:
        _crawled_job = _Job(linkedin_url=job.get("linkedin_url"), driver=driver, close_on_complete=False, scrape=True)
        crawled_jobs.append(_crawled_job)
        sleep(1)
    except StaleElementReferenceException or TimeoutException:
        print(f"... Skipped Job {i+1}/{N_JOBS}.")
        sleep(1)
        continue

Crawling... Jobs 188/384


ERROR:__main__:Message: 
Stacktrace:
#0 0x55c9a48e24e3 <unknown>
#1 0x55c9a4611c76 <unknown>
#2 0x55c9a464dc96 <unknown>
#3 0x55c9a464ddc1 <unknown>
#4 0x55c9a46877f4 <unknown>
#5 0x55c9a466d03d <unknown>
#6 0x55c9a468530e <unknown>
#7 0x55c9a466cde3 <unknown>
#8 0x55c9a46422dd <unknown>
#9 0x55c9a464334e <unknown>
#10 0x55c9a48a23e4 <unknown>
#11 0x55c9a48a63d7 <unknown>
#12 0x55c9a48b0b20 <unknown>
#13 0x55c9a48a7023 <unknown>
#14 0x55c9a48751aa <unknown>
#15 0x55c9a48cb6b8 <unknown>
#16 0x55c9a48cb847 <unknown>
#17 0x55c9a48db243 <unknown>
#18 0x7f9762694ac3 <unknown>



Crawling... Jobs 189/384
Crawling... Jobs 190/384
Crawling... Jobs 191/384
Crawling... Jobs 192/384


TimeoutException: Message: 
Stacktrace:
#0 0x55c9a48e24e3 <unknown>
#1 0x55c9a4611c76 <unknown>
#2 0x55c9a464dc96 <unknown>
#3 0x55c9a464ddc1 <unknown>
#4 0x55c9a46877f4 <unknown>
#5 0x55c9a466d03d <unknown>
#6 0x55c9a468530e <unknown>
#7 0x55c9a466cde3 <unknown>
#8 0x55c9a46422dd <unknown>
#9 0x55c9a464334e <unknown>
#10 0x55c9a48a23e4 <unknown>
#11 0x55c9a48a63d7 <unknown>
#12 0x55c9a48b0b20 <unknown>
#13 0x55c9a48a7023 <unknown>
#14 0x55c9a48751aa <unknown>
#15 0x55c9a48cb6b8 <unknown>
#16 0x55c9a48cb847 <unknown>
#17 0x55c9a48db243 <unknown>
#18 0x7f9762694ac3 <unknown>


In [13]:
import pandas as pd

In [14]:
df_crawled_jobs = pd.DataFrame([vars(job) for job in crawled_jobs]
                              ).drop(columns=["driver"]
                              ).drop_duplicates("linkedin_url")

In [15]:
df_crawled_jobs

Unnamed: 0,job_title,required_skills,job_type_1,job_type_2,linkedin_url,company,company_linkedin_url,location,posted_date,applicant_count,job_description,benefits
0,Accounts Receivable Accountant,"FinnishAccounting, Finance, Infor Enterprise R...",Hybrid,Full-time,https://www.linkedin.com/jobs/view/3746211692/...,Walki Group,https://www.linkedin.com/company/walki-group/life,"Walki Group · Jakobstad, Ostrobothnia, Finland...",6 days ago,0,About the job\nWe are looking for an\nACCOUNTA...,
1,Azure Data Engineer,"Artificial Intelligence (AI), Cloud Computing,...",On-site,Full-time,https://www.linkedin.com/jobs/view/3684400225/...,Cloud1 Oy,https://www.linkedin.com/company/cloud1-oy/life,"Cloud1 Oy · Helsinki, Uusimaa, Finland Reposte...",Reposted 3 weeks ago,0,About the job\nCloud1 tunnetaan vaativien inte...,
2,Data Architect,Data Analytics and Data WarehousingData Archit...,Hybrid,Full-time,https://www.linkedin.com/jobs/view/3725618198/...,Nortal,https://www.linkedin.com/company/nortal/life,"Nortal · Helsinki, Uusimaa, Finland Reposted ...",Reposted 2 weeks ago,0,About the job\nOverview\n\nDo you enjoy being ...,
3,ETL Specialist,"Data Warehousing, English, Extract, Transform,...",Hybrid,Full-time,https://www.linkedin.com/jobs/view/3743271896/...,Gazelle Global,https://www.linkedin.com/company/gazelle-globa...,"Gazelle Global · Helsinki, Uusimaa, Finland 1...",1 week ago,0,About the job\nETL Specialist\n\n A great oppo...,
4,"Specialist, Data Protection","DatabasesCommunication, Data Modeling, Data Pr...",Hybrid,Full-time,https://www.linkedin.com/jobs/view/3754374323/...,VTT,https://www.linkedin.com/company/vtt/life,"VTT · Espoo, Uusimaa, Finland 1 day ago · 2 ...",1 day ago,0,About the job\nAre you looking for a new oppor...,
...,...,...,...,...,...,...,...,...,...,...,...,...
186,"Internal Audit Manager, Model Risk","Communication, Communication Training, Diploma...",Hybrid,Full-time,https://www.linkedin.com/jobs/view/3744389039/...,Nordea,https://www.linkedin.com/company/nordea/life,"Nordea · Helsinki, Uusimaa, Finland Reposted ...",Reposted 4 days ago,0,About the job\nJob ID: 10821 \n #GIA #modelris...,
187,Senior IT Security Specialist,"Application Security, Communication, Cyberark,...",On-site,Full-time,https://www.linkedin.com/jobs/view/3739448796/...,Nordea,https://www.linkedin.com/company/nordea/life,"Nordea · Helsinki, Uusimaa, Finland 1 week ag...",1 week ago,0,About the job\nJob ID:19983 \n Would you like ...,
188,Senior Cloud Platform Engineer (FinOps),Analytical Skills and Cloud ComputingBudgeting...,Full-time,,https://www.linkedin.com/jobs/view/3720773085/...,AlphaSense,https://www.linkedin.com/company/alphasense/life,"AlphaSense · Helsinki, Uusimaa, Finland Repost...",Reposted 4 hours ago,0,About the job\nAbout AlphaSense\n\nAlphaSense ...,
189,Senior Engineering Director (Infrastructure),"Cloud ComputingBudget Management, Budgeting, C...",Hybrid,Full-time,https://www.linkedin.com/jobs/view/3730268828/...,Supermetrics,https://www.linkedin.com/company/supermetrics/...,"Supermetrics · Helsinki, Uusimaa, Finland Repo...",Reposted 1 week ago,0,About the job\nWould you like to work with a t...,


In [15]:
# df_crawled_jobs.to_csv(f"../data/crawled_jobs_1-{len(crawled_jobs}_checkpoint.csv", index=False)

### 2.1 Continue from the failed point

In [17]:
# In case session expiration
driver = webdriver.Chrome(options=set_chrome_options())
actions.login(driver, os.environ["EMAIL"], os.environ["PWORD"]) 
print("... Logged in.")

... Logged in.


In [19]:
%%time
# Continue
from selenium.common.exceptions import StaleElementReferenceException, TimeoutException

CONTINUE_FROM = 193

for i, job in enumerate(jobs):
    if i+1<CONTINUE_FROM:
        continue
        
    print(f"Crawling... Jobs {i+1}/{N_JOBS}")
    try:
        _crawled_job = _Job(linkedin_url=job.get("linkedin_url"), driver=driver, close_on_complete=False, scrape=True)
        crawled_jobs.append(_crawled_job)
        sleep(1)
    except StaleElementReferenceException or TimeoutException:
        print(f"... Skipped Job {i+1}/{N_JOBS}.")
        sleep(1)
        continue

Crawling... Jobs 305/384
Crawling... Jobs 306/384
Crawling... Jobs 307/384


ERROR:__main__:Message: 
Stacktrace:
#0 0x55c9a48e24e3 <unknown>
#1 0x55c9a4611c76 <unknown>
#2 0x55c9a464dc96 <unknown>
#3 0x55c9a464ddc1 <unknown>
#4 0x55c9a46877f4 <unknown>
#5 0x55c9a466d03d <unknown>
#6 0x55c9a468530e <unknown>
#7 0x55c9a466cde3 <unknown>
#8 0x55c9a46422dd <unknown>
#9 0x55c9a464334e <unknown>
#10 0x55c9a48a23e4 <unknown>
#11 0x55c9a48a63d7 <unknown>
#12 0x55c9a48b0b20 <unknown>
#13 0x55c9a48a7023 <unknown>
#14 0x55c9a48751aa <unknown>
#15 0x55c9a48cb6b8 <unknown>
#16 0x55c9a48cb847 <unknown>
#17 0x55c9a48db243 <unknown>
#18 0x7f9762694ac3 <unknown>



Crawling... Jobs 308/384
Crawling... Jobs 309/384
Crawling... Jobs 310/384
Crawling... Jobs 311/384
Crawling... Jobs 312/384
Crawling... Jobs 313/384
Crawling... Jobs 314/384
Crawling... Jobs 315/384
Crawling... Jobs 316/384
Crawling... Jobs 317/384


ERROR:__main__:Message: 
Stacktrace:
#0 0x55c9a48e24e3 <unknown>
#1 0x55c9a4611c76 <unknown>
#2 0x55c9a464dc96 <unknown>
#3 0x55c9a464ddc1 <unknown>
#4 0x55c9a46877f4 <unknown>
#5 0x55c9a466d03d <unknown>
#6 0x55c9a468530e <unknown>
#7 0x55c9a466cde3 <unknown>
#8 0x55c9a46422dd <unknown>
#9 0x55c9a464334e <unknown>
#10 0x55c9a48a23e4 <unknown>
#11 0x55c9a48a63d7 <unknown>
#12 0x55c9a48b0b20 <unknown>
#13 0x55c9a48a7023 <unknown>
#14 0x55c9a48751aa <unknown>
#15 0x55c9a48cb6b8 <unknown>
#16 0x55c9a48cb847 <unknown>
#17 0x55c9a48db243 <unknown>
#18 0x7f9762694ac3 <unknown>



Crawling... Jobs 318/384
Crawling... Jobs 319/384
Crawling... Jobs 320/384
Crawling... Jobs 321/384
Crawling... Jobs 322/384
Crawling... Jobs 323/384
Crawling... Jobs 324/384
Crawling... Jobs 325/384


ERROR:__main__:Message: 
Stacktrace:
#0 0x55c9a48e24e3 <unknown>
#1 0x55c9a4611c76 <unknown>
#2 0x55c9a464dc96 <unknown>
#3 0x55c9a464ddc1 <unknown>
#4 0x55c9a46877f4 <unknown>
#5 0x55c9a466d03d <unknown>
#6 0x55c9a468530e <unknown>
#7 0x55c9a466cde3 <unknown>
#8 0x55c9a46422dd <unknown>
#9 0x55c9a464334e <unknown>
#10 0x55c9a48a23e4 <unknown>
#11 0x55c9a48a63d7 <unknown>
#12 0x55c9a48b0b20 <unknown>
#13 0x55c9a48a7023 <unknown>
#14 0x55c9a48751aa <unknown>
#15 0x55c9a48cb6b8 <unknown>
#16 0x55c9a48cb847 <unknown>
#17 0x55c9a48db243 <unknown>
#18 0x7f9762694ac3 <unknown>



Crawling... Jobs 326/384
Crawling... Jobs 327/384
Crawling... Jobs 328/384
Crawling... Jobs 329/384
Crawling... Jobs 330/384
Crawling... Jobs 331/384
Crawling... Jobs 332/384
Crawling... Jobs 333/384


ERROR:__main__:Message: 
Stacktrace:
#0 0x55c9a48e24e3 <unknown>
#1 0x55c9a4611c76 <unknown>
#2 0x55c9a464dc96 <unknown>
#3 0x55c9a464ddc1 <unknown>
#4 0x55c9a46877f4 <unknown>
#5 0x55c9a466d03d <unknown>
#6 0x55c9a468530e <unknown>
#7 0x55c9a466cde3 <unknown>
#8 0x55c9a46422dd <unknown>
#9 0x55c9a464334e <unknown>
#10 0x55c9a48a23e4 <unknown>
#11 0x55c9a48a63d7 <unknown>
#12 0x55c9a48b0b20 <unknown>
#13 0x55c9a48a7023 <unknown>
#14 0x55c9a48751aa <unknown>
#15 0x55c9a48cb6b8 <unknown>
#16 0x55c9a48cb847 <unknown>
#17 0x55c9a48db243 <unknown>
#18 0x7f9762694ac3 <unknown>



Crawling... Jobs 334/384
Crawling... Jobs 335/384
Crawling... Jobs 336/384
Crawling... Jobs 337/384
Crawling... Jobs 338/384
Crawling... Jobs 339/384
Crawling... Jobs 340/384
Crawling... Jobs 341/384


ERROR:__main__:Message: 
Stacktrace:
#0 0x55c9a48e24e3 <unknown>
#1 0x55c9a4611c76 <unknown>
#2 0x55c9a464dc96 <unknown>
#3 0x55c9a464ddc1 <unknown>
#4 0x55c9a46877f4 <unknown>
#5 0x55c9a466d03d <unknown>
#6 0x55c9a468530e <unknown>
#7 0x55c9a466cde3 <unknown>
#8 0x55c9a46422dd <unknown>
#9 0x55c9a464334e <unknown>
#10 0x55c9a48a23e4 <unknown>
#11 0x55c9a48a63d7 <unknown>
#12 0x55c9a48b0b20 <unknown>
#13 0x55c9a48a7023 <unknown>
#14 0x55c9a48751aa <unknown>
#15 0x55c9a48cb6b8 <unknown>
#16 0x55c9a48cb847 <unknown>
#17 0x55c9a48db243 <unknown>
#18 0x7f9762694ac3 <unknown>



Crawling... Jobs 342/384
Crawling... Jobs 343/384
Crawling... Jobs 344/384
Crawling... Jobs 345/384
Crawling... Jobs 346/384
Crawling... Jobs 347/384
Crawling... Jobs 348/384
Crawling... Jobs 349/384


ERROR:__main__:Message: 
Stacktrace:
#0 0x55c9a48e24e3 <unknown>
#1 0x55c9a4611c76 <unknown>
#2 0x55c9a464dc96 <unknown>
#3 0x55c9a464ddc1 <unknown>
#4 0x55c9a46877f4 <unknown>
#5 0x55c9a466d03d <unknown>
#6 0x55c9a468530e <unknown>
#7 0x55c9a466cde3 <unknown>
#8 0x55c9a46422dd <unknown>
#9 0x55c9a464334e <unknown>
#10 0x55c9a48a23e4 <unknown>
#11 0x55c9a48a63d7 <unknown>
#12 0x55c9a48b0b20 <unknown>
#13 0x55c9a48a7023 <unknown>
#14 0x55c9a48751aa <unknown>
#15 0x55c9a48cb6b8 <unknown>
#16 0x55c9a48cb847 <unknown>
#17 0x55c9a48db243 <unknown>
#18 0x7f9762694ac3 <unknown>



Crawling... Jobs 350/384
Crawling... Jobs 351/384
Crawling... Jobs 352/384
Crawling... Jobs 353/384
Crawling... Jobs 354/384
Crawling... Jobs 355/384
Crawling... Jobs 356/384
Crawling... Jobs 357/384


ERROR:__main__:Message: 
Stacktrace:
#0 0x55c9a48e24e3 <unknown>
#1 0x55c9a4611c76 <unknown>
#2 0x55c9a464dc96 <unknown>
#3 0x55c9a464ddc1 <unknown>
#4 0x55c9a46877f4 <unknown>
#5 0x55c9a466d03d <unknown>
#6 0x55c9a468530e <unknown>
#7 0x55c9a466cde3 <unknown>
#8 0x55c9a46422dd <unknown>
#9 0x55c9a464334e <unknown>
#10 0x55c9a48a23e4 <unknown>
#11 0x55c9a48a63d7 <unknown>
#12 0x55c9a48b0b20 <unknown>
#13 0x55c9a48a7023 <unknown>
#14 0x55c9a48751aa <unknown>
#15 0x55c9a48cb6b8 <unknown>
#16 0x55c9a48cb847 <unknown>
#17 0x55c9a48db243 <unknown>
#18 0x7f9762694ac3 <unknown>



Crawling... Jobs 358/384
Crawling... Jobs 359/384
Crawling... Jobs 360/384
Crawling... Jobs 361/384
Crawling... Jobs 362/384
Crawling... Jobs 363/384
Crawling... Jobs 364/384
Crawling... Jobs 365/384


ERROR:__main__:Message: 
Stacktrace:
#0 0x55c9a48e24e3 <unknown>
#1 0x55c9a4611c76 <unknown>
#2 0x55c9a464dc96 <unknown>
#3 0x55c9a464ddc1 <unknown>
#4 0x55c9a46877f4 <unknown>
#5 0x55c9a466d03d <unknown>
#6 0x55c9a468530e <unknown>
#7 0x55c9a466cde3 <unknown>
#8 0x55c9a46422dd <unknown>
#9 0x55c9a464334e <unknown>
#10 0x55c9a48a23e4 <unknown>
#11 0x55c9a48a63d7 <unknown>
#12 0x55c9a48b0b20 <unknown>
#13 0x55c9a48a7023 <unknown>
#14 0x55c9a48751aa <unknown>
#15 0x55c9a48cb6b8 <unknown>
#16 0x55c9a48cb847 <unknown>
#17 0x55c9a48db243 <unknown>
#18 0x7f9762694ac3 <unknown>



Crawling... Jobs 366/384
Crawling... Jobs 367/384
Crawling... Jobs 368/384
Crawling... Jobs 369/384
Crawling... Jobs 370/384
Crawling... Jobs 371/384
Crawling... Jobs 372/384
Crawling... Jobs 373/384


ERROR:__main__:Message: 
Stacktrace:
#0 0x55c9a48e24e3 <unknown>
#1 0x55c9a4611c76 <unknown>
#2 0x55c9a464dc96 <unknown>
#3 0x55c9a464ddc1 <unknown>
#4 0x55c9a46877f4 <unknown>
#5 0x55c9a466d03d <unknown>
#6 0x55c9a468530e <unknown>
#7 0x55c9a466cde3 <unknown>
#8 0x55c9a46422dd <unknown>
#9 0x55c9a464334e <unknown>
#10 0x55c9a48a23e4 <unknown>
#11 0x55c9a48a63d7 <unknown>
#12 0x55c9a48b0b20 <unknown>
#13 0x55c9a48a7023 <unknown>
#14 0x55c9a48751aa <unknown>
#15 0x55c9a48cb6b8 <unknown>
#16 0x55c9a48cb847 <unknown>
#17 0x55c9a48db243 <unknown>
#18 0x7f9762694ac3 <unknown>



Crawling... Jobs 374/384
Crawling... Jobs 375/384
Crawling... Jobs 376/384
Crawling... Jobs 377/384
Crawling... Jobs 378/384
Crawling... Jobs 379/384
Crawling... Jobs 380/384
Crawling... Jobs 381/384


ERROR:__main__:Message: 
Stacktrace:
#0 0x55c9a48e24e3 <unknown>
#1 0x55c9a4611c76 <unknown>
#2 0x55c9a464dc96 <unknown>
#3 0x55c9a464ddc1 <unknown>
#4 0x55c9a46877f4 <unknown>
#5 0x55c9a466d03d <unknown>
#6 0x55c9a468530e <unknown>
#7 0x55c9a466cde3 <unknown>
#8 0x55c9a46422dd <unknown>
#9 0x55c9a464334e <unknown>
#10 0x55c9a48a23e4 <unknown>
#11 0x55c9a48a63d7 <unknown>
#12 0x55c9a48b0b20 <unknown>
#13 0x55c9a48a7023 <unknown>
#14 0x55c9a48751aa <unknown>
#15 0x55c9a48cb6b8 <unknown>
#16 0x55c9a48cb847 <unknown>
#17 0x55c9a48db243 <unknown>
#18 0x7f9762694ac3 <unknown>



Crawling... Jobs 382/384
Crawling... Jobs 383/384
Crawling... Jobs 384/384
CPU times: user 9.57 s, sys: 1.79 s, total: 11.4 s
Wall time: 49min 10s


In [20]:
df_crawled_jobs = pd.DataFrame([vars(job) for job in crawled_jobs]).drop(columns=["driver"]).drop_duplicates("linkedin_url")
df_crawled_jobs

Unnamed: 0,job_title,required_skills,job_type_1,job_type_2,linkedin_url,company,company_linkedin_url,location,posted_date,applicant_count,job_description,benefits
0,Accounts Receivable Accountant,"FinnishAccounting, Finance, Infor Enterprise R...",Hybrid,Full-time,https://www.linkedin.com/jobs/view/3746211692/...,Walki Group,https://www.linkedin.com/company/walki-group/life,"Walki Group · Jakobstad, Ostrobothnia, Finland...",6 days ago,0,About the job\nWe are looking for an\nACCOUNTA...,
1,Azure Data Engineer,"Artificial Intelligence (AI), Cloud Computing,...",On-site,Full-time,https://www.linkedin.com/jobs/view/3684400225/...,Cloud1 Oy,https://www.linkedin.com/company/cloud1-oy/life,"Cloud1 Oy · Helsinki, Uusimaa, Finland Reposte...",Reposted 3 weeks ago,0,About the job\nCloud1 tunnetaan vaativien inte...,
2,Data Architect,Data Analytics and Data WarehousingData Archit...,Hybrid,Full-time,https://www.linkedin.com/jobs/view/3725618198/...,Nortal,https://www.linkedin.com/company/nortal/life,"Nortal · Helsinki, Uusimaa, Finland Reposted ...",Reposted 2 weeks ago,0,About the job\nOverview\n\nDo you enjoy being ...,
3,ETL Specialist,"Data Warehousing, English, Extract, Transform,...",Hybrid,Full-time,https://www.linkedin.com/jobs/view/3743271896/...,Gazelle Global,https://www.linkedin.com/company/gazelle-globa...,"Gazelle Global · Helsinki, Uusimaa, Finland 1...",1 week ago,0,About the job\nETL Specialist\n\n A great oppo...,
4,"Specialist, Data Protection","DatabasesCommunication, Data Modeling, Data Pr...",Hybrid,Full-time,https://www.linkedin.com/jobs/view/3754374323/...,VTT,https://www.linkedin.com/company/vtt/life,"VTT · Espoo, Uusimaa, Finland 1 day ago · 2 ...",1 day ago,0,About the job\nAre you looking for a new oppor...,
...,...,...,...,...,...,...,...,...,...,...,...,...
378,(Senior) Data Engineer - Tietoevry Tech Services,"Analytics, Data Analytics, Data Engineering, D...",Hybrid,Full-time,https://www.linkedin.com/jobs/view/3717016713/...,Tietoevry,https://www.linkedin.com/company/tietoevry/life,"Tietoevry · Espoo, Uusimaa, Finland Reposted ...",Reposted 4 days ago,0,About the job\nYou may apply to Tietoevry by s...,
379,(Senior) Quantitative Risk Analyst (Data Analy...,"Business Requirements, Credit Risk Management,...",On-site,Full-time,https://www.linkedin.com/jobs/view/3733080582/...,Nordea,https://www.linkedin.com/company/nordea/life,"Nordea · Helsinki, Uusimaa, Finland 2 weeks a...",2 weeks ago,0,About the job\nJob ID: 17007 \nWe are looking ...,
380,Azure Data Engineer,"Artificial Intelligence (AI), Cloud Computing,...",On-site,Full-time,https://www.linkedin.com/jobs/view/3684400225/...,Cloud1 Oy,https://www.linkedin.com/company/cloud1-oy/life,"Cloud1 Oy · Helsinki, Uusimaa, Finland Reposte...",Reposted 3 weeks ago,0,About the job\nCloud1 tunnetaan vaativien inte...,
381,"Data Engineer, OP Life Insurance","Data Analytics, Data Engineering, Data Warehou...",Hybrid,Full-time,https://www.linkedin.com/jobs/view/3749553229/...,OP Financial Group,https://www.linkedin.com/company/op-financial-...,"OP Financial Group · Helsinki, Uusimaa, Finlan...",5 days ago,0,About the job\nAre you ready to embark on an e...,


In [23]:
# Save today's crawl
import datetime

current_date = datetime.datetime.now().strftime("%Y-%m-%d")
fname = f"../data/crawled_jobs_{current_date}.csv"

df_crawled_jobs.to_csv(fname, index=False)