# Scrape LinkedIn Data

In [1]:
# Confirm that the linkedin-scraper dependency is installed and show its
# installed version (filters the `pip freeze` listing for "linkedin").
! pip freeze | grep linkedin

linkedin-scraper==2.11.2


In [2]:
import logging
from linkedin_scraper import JobSearch, Job, actions

from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Configure the root logger so that INFO-level records and above are emitted.
logging.basicConfig(level=logging.INFO)
# Module-level logger; _Job uses it to report lookups that time out.
logger = logging.getLogger(__name__)

class _Job(Job):
    """``Job`` subclass that scrapes additional fields from a LinkedIn job page.

    On top of the attributes filled in by ``scrape_logged_in`` (company,
    location, posted date, description, applicant count, benefits) it records
    ``job_title``, ``required_skills``, ``job_type_1`` and ``job_type_2``.
    """

    def __init__(self, **kwargs):
        """Set defaults for the extra fields, then delegate to ``Job.__init__``.

        Args:
            **kwargs: forwarded unchanged to ``Job.__init__``.
        """
        # NOTE(review): the defaults are assigned *before* the base constructor
        # runs, presumably because ``Job.__init__`` starts scraping when
        # ``scrape=True``; assigning them afterwards would wipe the scraped
        # values. Confirm against linkedin_scraper before reordering.
        self.job_title = ""
        self.required_skills = ""
        self.job_type_1 = ""
        self.job_type_2 = ""

        super().__init__(**kwargs)

    def _text_at(self, xpath):
        """Wait for the element matching ``xpath`` and return its stripped text.

        Raises:
            TimeoutException: propagated from ``wait_for_element_to_load`` when
                no matching element appears in time.
        """
        return self.wait_for_element_to_load(by=By.XPATH, name=xpath).text.strip()

    def scrape_logged_in(self, close_on_complete=True):
        """Load ``self.linkedin_url`` and populate the job attributes.

        Mandatory fields (title, company, company URL, location, posted date,
        first job-type label, description) are read without a guard, so a
        ``TimeoutException`` from any of them propagates to the caller.
        Optional fields (skills, second job-type label, applicant count,
        benefits) fall back to a default when the element does not appear.

        Args:
            close_on_complete: when true, ``driver.close()`` is called after
                all fields have been read.
        """
        driver = self.driver

        driver.get(self.linkedin_url)
        self.focus()

        # Shared XPath fragments, so each locator below only states its suffix.
        top_card = "//*[contains(@class, 'job-details-jobs-unified-top-card__primary-description')]"
        job_type_label = "contains(@class, 'ui-label ui-label--accent-3 text-body-small')"
        skills_item = "//*[contains(@class, 'job-details-how-you-match__skills-item')]"

        # --- mandatory fields -------------------------------------------------
        self.job_title = self._text_at("//*[contains(@class, 'jobs-unified-top-card__job-title')]")
        self.company = self._text_at(f"{top_card}//a[1]")
        self.company_linkedin_url = self.wait_for_element_to_load(
            by=By.XPATH, name=f"{top_card}//a"
        ).get_attribute("href")
        self.location = self._text_at(f"{top_card}//*")
        self.posted_date = self._text_at(f"{top_card}//span[3]")
        self.job_type_1 = self._text_at(f"//*[{job_type_label}]/span")
        self.job_description = self._text_at("//*[contains(@class, 'jobs-description')]")

        # --- optional fields --------------------------------------------------
        # The two skills entries are concatenated as-is (no separator), and a
        # missing entry is logged rather than raised.
        try:
            self.required_skills = self._text_at(f"{skills_item}[1]//a")
        except TimeoutException as e:
            logger.error(str(e))

        try:
            self.required_skills += self._text_at(f"{skills_item}[2]//a")
        except TimeoutException as e:
            logger.error(str(e))

        try:
            self.job_type_2 = self._text_at(f"(//*[{job_type_label}])[2]/span")
        except TimeoutException:
            self.job_type_2 = ""

        try:
            # A bare class name is not a usable XPath (it would be read as an
            # element-name test and never match), so the class is matched with
            # the same contains(@class, ...) form as the other locators.
            self.applicant_count = self._text_at(
                "//*[contains(@class, 'jobs-unified-top-card__applicant-count')]"
            )
        except TimeoutException:
            # Fallback is the integer 0, whereas a successful lookup yields text.
            self.applicant_count = 0

        try:
            self.benefits = self._text_at("//*[contains(@class, 'salary-main-rail-card')]")
        except TimeoutException:
            self.benefits = ""

        if close_on_complete:
            driver.close()

In [3]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from linkedin_scraper import actions

import os

def set_chrome_options() -> Options:
    """Build the Chrome ``Options`` used for the Selenium drivers.

    The returned options carry the ``--headless``, ``--no-sandbox`` and
    ``--disable-dev-shm-usage`` arguments plus a ``prefs`` experimental option.
    """
    options = Options()
    for flag in ("--headless", "--no-sandbox", "--disable-dev-shm-usage"):
        options.add_argument(flag)

    # NOTE(review): the value 2 for "images" presumably blocks image loading —
    # confirm that Chrome honours this preference key.
    prefs = {"profile.default_content_settings": {"images": 2}}
    options.experimental_options["prefs"] = prefs
    return options

def scrape_job_search(keyword):
    """Log in to LinkedIn and return the job listings found for ``keyword``.

    Args:
        keyword: search term passed to ``JobSearch.search``.

    Returns:
        The list of ``Job`` objects from the first results page, as returned
        by ``JobSearch.search``.

    Raises:
        KeyError: if the ``EMAIL`` or ``PWORD`` environment variable is unset.
            This is raised before any browser is started.
    """
    # Read the credentials first: a missing variable then fails fast instead of
    # leaving a freshly started headless Chrome process behind.
    email = os.environ["EMAIL"]
    password = os.environ["PWORD"]

    driver = webdriver.Chrome(options=set_chrome_options())
    try:
        actions.login(driver, email, password)
        print("... Logged in.")
        job_search = JobSearch(driver=driver, close_on_complete=False, scrape=False)
        # The browser is deliberately left open on success.
        # NOTE(review): the returned Job objects presumably keep using this
        # driver — confirm before closing it here.
        return job_search.search(keyword)
    except Exception:
        # Login or search failed: nobody else holds the driver, so release it.
        driver.quit()
        raise

def scrape_job(job_link):
    """Log in to LinkedIn and scrape the single job posting at ``job_link``.

    Args:
        job_link: URL of the job page, passed to ``_Job`` as ``linkedin_url``.

    Returns:
        The ``_Job`` instance, constructed with ``scrape=True`` and
        ``close_on_complete=False`` (the browser stays open on success).

    Raises:
        KeyError: if the ``EMAIL`` or ``PWORD`` environment variable is unset.
            This is raised before any browser is started.
    """
    # Read the credentials first: a missing variable then fails fast instead of
    # leaving a freshly started headless Chrome process behind.
    email = os.environ["EMAIL"]
    password = os.environ["PWORD"]

    driver = webdriver.Chrome(options=set_chrome_options())
    try:
        actions.login(driver, email, password)
        return _Job(linkedin_url=job_link, driver=driver, close_on_complete=False, scrape=True)
    except Exception:
        # Login or scraping failed: no job object is returned to hold the
        # driver, so release it before re-raising.
        driver.quit()
        raise

In [4]:
# Search LinkedIn jobs for the keyword "data"; ending the cell with the
# variable displays the returned listings.
job_listings_1 = scrape_job_search("data")
job_listings_1

... Logged in.


[<Job Senior Game Analyst Next Games, A Netflix Game Studio>,
 <Job IT Support Eng I (ONS) Amazon Web Services (AWS)>,
 <Job Data Analyst, Merge Mansion Metacore>,
 <Job Senior Legal and Privacy Counsel, Veho Group Veho Oy Ab>,
 <Job Data Science - Machine Learning Engineer Wolt>,
 <Job Data Engineer The Hub>,
 <Job Data Platform Engineer COR Group Oy>,
 <Job Monitoring & Surveillance Specialist Nordea>,
 <Job Machine Learning Engineer - MLOps Wolt>,
 <Job Data Platform and AI Trainees OP Financial Group>,
 <Job Data Scientist, Data Platform & AI OP Financial Group>,
 <Job Lead Developer, Banking Data Platform (Helsinki or Oulu) OP Financial Group>,
 <Job RWE Scientist / Epidemiologist MedEngine>,
 <Job Data Engineer The Hub>]

In [6]:
# NOTE(review): the assignment below is commented out, so on a fresh kernel
# `job_listings_2` is undefined and this cell depends on leftover state.
# job_listings_2 = scrape_job_search("data")
# scrape_job_search returns a list of jobs and a list has no `base_url`
# attribute, so the next line raises AttributeError. TODO: remove this cell
# or read the attribute from the object that actually defines it.
job_listings_2.base_url

AttributeError: 'list' object has no attribute 'base_url'

In [9]:
# `import urllib` alone does not guarantee that the `urllib.parse` submodule is
# loaded, so the submodule is imported explicitly.
import urllib.parse

# Build the job-search URL for the keyword "data". urljoin is used instead of
# os.path.join, which joins with the OS path separator and is meant for
# filesystem paths rather than URLs.
search_url = (
    urllib.parse.urljoin("https://www.linkedin.com/jobs/", "search")
    + f"?keywords={urllib.parse.quote('data')}&refresh=true"
)
search_url

'https://www.linkedin.com/jobs/search?keywords=data&refresh=true'