# Scrape Linkedin Data

Crawling Linkedin is against the user agreement of Linkedin! This is used for educational purpose only.

In [1]:
# Make sure we have installed the dependency
! pip freeze | grep linkedin

linkedin-scraper==2.11.2


In [2]:
! google-chrome-stable --version

Google Chrome 114.0.5735.90 


In [3]:
from linkedin_scraper import JobSearch, Job, actions
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

import os
from pprint import pprint

def set_chrome_options() -> Options:
    """Sets chrome options for Selenium.
    Chrome options for headless browser is enabled.
    """
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_prefs = {}
    chrome_options.experimental_options["prefs"] = chrome_prefs
    chrome_prefs["profile.default_content_settings"] = {"images": 2}
    return chrome_options

def scrape_job_search(keyword):
    driver = webdriver.Chrome(options=set_chrome_options())
    actions.login(driver, os.environ["EMAIL"], os.environ["PWORD"]) # if email and password isnt given, it'll prompt in terminal
    print("... Logged in.")
    job_search = JobSearch(driver=driver, close_on_complete=False, scrape=False)

    job_listings = job_search.search(keyword) # returns the list of `Job` from the first page
    return job_listings

def scrape_job(job_link):
    driver = webdriver.Chrome(options=set_chrome_options())
    actions.login(driver, os.environ["EMAIL"], os.environ["PWORD"]) # if email and password isnt given, it'll prompt in terminal
    job = Job(job_link, driver=driver, close_on_complete=False)
    return job

## 1. Define the keywords of data jobs

In [4]:
_data_job_keywords_specific = ["data analyst"] + [prefix + " " + suffix 
                      for prefix in ["data", "machine learning", "computer vision", "nlp", "robotics", "multimodal", "ai"]
                      for suffix in ["scientist", "engineer", "researcher"]
                     ]
_data_job_keywords_general = ["data", "machine learning", "computer vision", "nlp", "robotics", "multimodal", "ai"]
pprint(_data_job_keywords_specific)
pprint(_data_job_keywords_general)

['data analyst',
 'data scientist',
 'data engineer',
 'data researcher',
 'machine learning scientist',
 'machine learning engineer',
 'machine learning researcher',
 'computer vision scientist',
 'computer vision engineer',
 'computer vision researcher',
 'nlp scientist',
 'nlp engineer',
 'nlp researcher',
 'robotics scientist',
 'robotics engineer',
 'robotics researcher',
 'multimodal scientist',
 'multimodal engineer',
 'multimodal researcher',
 'ai scientist',
 'ai engineer',
 'ai researcher']
['data',
 'machine learning',
 'computer vision',
 'nlp',
 'robotics',
 'multimodal',
 'ai']


## 2. Crawl job listings

### Test run

In [5]:
_job_listings = scrape_job_search("data")
_job_listings

... Logged in.


[<Job Data Analyst, Dream Blast Rovio Entertainment Corporation>,
 <Job Data Scientist MedEngine>,
 <Job Data Platform Lead Architect - Tietoevry Care Data and Analytics Tietoevry>,
 <Job Digital Cloud Solution Architect - Azure - Danish, Finnish or Norwegian speakers Microsoft>,
 <Job Data Engineer The Hub>,
 <Job Data Engineer The Hub>,
 <Job Monitoring & Surveillance Specialist Nordea>,
 <Job Data Science - Machine Learning Engineer Wolt>,
 <Job Business Analyst / Senior Business Analyst to Data and Analysis in KYC Nordea>,
 <Job Data Engineer Nixu Corporation>,
 <Job Nordic IT Data Engineer - Nordic Marketing Company AstraZeneca>,
 <Job Senior SOC Analyst Fortum>,
 <Job Senior Game Analyst Netflix>,
 <Job Grow as Data Engineer with us – Solita´s personalised onboarding program Solita>]

In [6]:
vars(_job_listings[0])

{'driver': <selenium.webdriver.chrome.webdriver.WebDriver (session="1f293d2e43047a8f28e39a32c5b8d8ff")>,
 'linkedin_url': 'https://www.linkedin.com/jobs/view/3714372433/?eBP=CwEAAAGLTdUQpGQF09-CdoWc4cusOWm2TWIvEtSvECM1xlqP_afKMjBs7FBkDqUaztu-25__cZWKZpbOxsDNCwt_3gOFrxADsg7mdhXdkC3RikGDvmr4yZClqua33seWZQr_8ZGev4iaDe2a8Wv4G3XUeGX5C_q61BrXKDLP5ZJhyqeux_H6TXFQNw2Gk5onS3v7UGb1zOaYe7M533VBpmAh66p9dgFhnwfeXLpRTL_du8_7zLg4DrEKKkYvurw4g5keHYDK9sZnnX_VkxyMonT0oOXESInx16-pjjx367szmw-vLZmXhWiXz7wPopucJddZT4eKiy9zlKlpIspAWb35fVjSaUDWeB-TBaDgYtrrgg_uAklhZVfIOpE-cNTOYmkA5pgPv9CGxP1mNJxq4VXmeurSYlVcl6U&refId=brZtKrySfWZiFfEufTSwbw%3D%3D&trackingId=oxvrW%2B0EmnW4w51kDgqkug%3D%3D&trk=flagship3_search_srp_jobs',
 'job_title': 'Data Analyst, Dream Blast',
 'company': 'Rovio Entertainment Corporation',
 'company_linkedin_url': None,
 'location': 'Helsinki, Uusimaa, Finland (Hybrid)',
 'posted_date': None,
 'applicant_count': None,
 'job_description': None,
 'benefits': None}

In [11]:
_job_listings[0].WAIT_FOR_ELEMENT_TIMEOUT = 30

In [12]:
# This doesn't work: Timeout Error
# _job_listings[0].scrape(close_on_complete=False)
# vars(_job_listings[0])

TimeoutException: Message: 
Stacktrace:
#0 0x556cccdef4e3 <unknown>
#1 0x556cccb1ec76 <unknown>
#2 0x556cccb5ac96 <unknown>
#3 0x556cccb5adc1 <unknown>
#4 0x556cccb947f4 <unknown>
#5 0x556cccb7a03d <unknown>
#6 0x556cccb9230e <unknown>
#7 0x556cccb79de3 <unknown>
#8 0x556cccb4f2dd <unknown>
#9 0x556cccb5034e <unknown>
#10 0x556cccdaf3e4 <unknown>
#11 0x556cccdb33d7 <unknown>
#12 0x556cccdbdb20 <unknown>
#13 0x556cccdb4023 <unknown>
#14 0x556cccd821aa <unknown>
#15 0x556cccdd86b8 <unknown>
#16 0x556cccdd8847 <unknown>
#17 0x556cccde8243 <unknown>
#18 0x7f3e74894ac3 <unknown>


In [29]:
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By

with open("job_page_example.html", "w") as f:
    f.write(_job_listings[0].driver.page_source)

In [36]:
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

# job_title
WebDriverWait(_job_listings[0].driver, 3).until(
    EC.presence_of_element_located(
        (By.XPATH,
        "//*[contains(@class, 'jobs-unified-top-card__job-title')]"
        )
    )
).text.strip()

'Data Analyst, Dream Blast'

In [94]:
# company
WebDriverWait(_job_listings[0].driver, 3).until(
    EC.presence_of_element_located(
        (By.XPATH,
        "//*[contains(@class, 'job-details-jobs-unified-top-card__primary-description')]//a[1]"
        )
    )
).text.strip()

'Rovio Entertainment Corporation'

In [118]:
# Location
WebDriverWait(_job_listings[0].driver, 3).until(
    EC.presence_of_element_located(
        (By.XPATH,
        "//*[contains(@class, 'job-details-jobs-unified-top-card__primary-description')]//*"
        )
    )
).text.strip()

'Rovio Entertainment Corporation · Helsinki, Uusimaa, Finland Reposted  4 days ago  · 232 applicants'

In [130]:
# Job type
WebDriverWait(_job_listings[0].driver, 3).until(
    EC.presence_of_element_located(
        (By.XPATH,
        "//*[contains(@class, 'ui-label ui-label--accent-3 text-body-small')]/span"
        )
    )
).text.strip()

'Hybrid'

In [134]:
# Job type 2
WebDriverWait(_job_listings[0].driver, 3).until(
    EC.presence_of_element_located(
        (By.XPATH,
        "(//*[contains(@class, 'ui-label ui-label--accent-3 text-body-small')])[2]//span"
        )
    )
).text.strip()

'Full-time'

In [83]:
# Post Date
WebDriverWait(_job_listings[0].driver, 3).until(
    EC.presence_of_element_located(
        (By.XPATH,
        "//*[contains(@class, 'job-details-jobs-unified-top-card__primary-description')]//span[3]"
        )
    )
).text.strip()

'Reposted  4 days ago'

In [51]:
# company linkedin url
WebDriverWait(_job_listings[0].driver, 3).until(
    EC.presence_of_element_located(
        (By.XPATH,
        "//*[contains(@class, 'job-details-jobs-unified-top-card__primary-description')]//a"
        )
    )
).get_attribute("href")

'https://www.linkedin.com/company/rovio/life'

In [52]:
# Job description
job_description_elem = WebDriverWait(_job_listings[0].driver, 3).until(
    EC.presence_of_element_located(
        (By.XPATH,
        "//*[contains(@class, 'jobs-description')]"
        )
    )
)

job_description_elem.text.strip()

'About the job\nAt Rovio you will get to work with multiple groundbreaking IP’s including one of the most famous game IP’s in the world: Angry Birds! We craft joy with player-focused gaming experiences that last for decades. In order to do that, we know that people need to bring their own joy to what we do. That’s why we value work-life balance, say no to crunch culture, and welcome people from all walks of life to join the flock. Today, we are a proud team of 500+ caring and talented professionals representing over 50 different nations.\n\nWe trust our teams to work autonomously by providing them the right tools and level of responsibility. We believe in our teams to remain creative and to keep learning – as well as ensuring everyone has opportunities for personal growth.\n\nData and Analytics is the craft that is behind all the critical decisions here at Rovio and as a Game Data Analyst you will be an integral part of a cross functional Game team working closely with Product and Desi

In [69]:
# First set of skills added by the job poster
WebDriverWait(_job_listings[0].driver, 3).until(
    EC.presence_of_element_located(
        (By.XPATH,
        "//*[contains(@class, 'job-details-how-you-match__skills-item')][1]//a"
        )
    )
).text

'Analytical Skills, Analytics, Data Analysis, English, Product Management, Python (Programming Language), and SQL'

In [68]:
# 2nd set of skills added by the job poster
WebDriverWait(_job_listings[0].driver, 3).until(
    EC.presence_of_element_located(
        (By.XPATH,
        "//*[contains(@class, 'job-details-how-you-match__skills-item')][2]//a"
        )
    )
).text

'Apache Spark, Dashboards, and Team Leadership'

In [59]:
# Salary range and benefits
WebDriverWait(_job_listings[0].driver, 3).until(
    EC.presence_of_element_located(
        (By.XPATH,
        "//*[contains(@class, 'salary-main-rail-card')]"
        )
    )
).text.strip()

''

## Overwrite the crawler methods to debug

In [1]:
import logging
from linkedin_scraper import Job

from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class _Job(Job):
    def __init__(self, **kwargs):
       self.job_title = ""
       self.required_skills = ""
       self.job_type_1 = ""
       self.job_type_2 = ""
 
       super().__init__(**kwargs)
    
    def scrape_logged_in(self, close_on_complete=True):
        driver = self.driver
        
        driver.get(self.linkedin_url)
        self.focus()
        self.job_title = self.wait_for_element_to_load(by=By.XPATH, name="//*[contains(@class, 'jobs-unified-top-card__job-title')]").text.strip()
        self.company = self.wait_for_element_to_load(by=By.XPATH, name="//*[contains(@class, 'job-details-jobs-unified-top-card__primary-description')]//a[1]").text.strip()
        self.company_linkedin_url = self.wait_for_element_to_load(by=By.XPATH, name="//*[contains(@class, 'job-details-jobs-unified-top-card__primary-description')]//a").get_attribute("href")
        self.location = self.wait_for_element_to_load(by=By.XPATH, name="//*[contains(@class, 'job-details-jobs-unified-top-card__primary-description')]//*").text.strip()
        self.posted_date = self.wait_for_element_to_load(by=By.XPATH, name="//*[contains(@class, 'job-details-jobs-unified-top-card__primary-description')]//span[3]").text.strip()
        self.job_type_1 = self.wait_for_element_to_load(by=By.XPATH, name="//*[contains(@class, 'ui-label ui-label--accent-3 text-body-small')]/span").text.strip()
        self.job_description = self.wait_for_element_to_load(by=By.XPATH, name="//*[contains(@class, 'jobs-description')]").text.strip()
        
        try:
            self.required_skills = self.wait_for_element_to_load(by=By.XPATH, name="//*[contains(@class, 'job-details-how-you-match__skills-item')][1]//a").text.strip()
        except TimeoutException as e:
            logger.error(str(e))

        try:
            self.required_skills += self.wait_for_element_to_load(by=By.XPATH, name="//*[contains(@class, 'job-details-how-you-match__skills-item')][2]//a").text.strip()
        except TimeoutException as e:
            logger.error(str(e))

        try:
            self.job_type_2 = self.wait_for_element_to_load(by=By.XPATH, name="(//*[contains(@class, 'ui-label ui-label--accent-3 text-body-small')])[2]/span").text.strip()
        except TimeoutException:
            self.job_type_2 = ""
            
        try:
            self.applicant_count = self.wait_for_element_to_load(by=By.XPATH, name="jobs-unified-top-card__applicant-count").text.strip()
        except TimeoutException:
            self.applicant_count = 0
        
        try:
            self.benefits = self.wait_for_element_to_load(by=By.XPATH, name="//*[contains(@class, 'salary-main-rail-card')]").text.strip()
        except TimeoutException:
            self.benefits = ""

        if close_on_complete:
            driver.close()

In [2]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from linkedin_scraper import actions

import os

def set_chrome_options() -> Options:
    """Sets chrome options for Selenium.
    Chrome options for headless browser is enabled.
    """
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_prefs = {}
    chrome_options.experimental_options["prefs"] = chrome_prefs
    chrome_prefs["profile.default_content_settings"] = {"images": 2}
    return chrome_options

def _scrape_job(job_link):
    driver = webdriver.Chrome(options=set_chrome_options())
    actions.login(driver, os.environ["EMAIL"], os.environ["PWORD"]) # if email and password isnt given, it'll prompt in terminal
    job = _Job(linkedin_url=job_link, driver=driver, close_on_complete=False, scrape=True)
    return job

_test_job = _scrape_job('https://www.linkedin.com/jobs/view/3714372433/?eBP=CwEAAAGLTdUQpGQF09-CdoWc4cusOWm2TWIvEtSvECM1xlqP_afKMjBs7FBkDqUaztu-25__cZWKZpbOxsDNCwt_3gOFrxADsg7mdhXdkC3RikGDvmr4yZClqua33seWZQr_8ZGev4iaDe2a8Wv4G3XUeGX5C_q61BrXKDLP5ZJhyqeux_H6TXFQNw2Gk5onS3v7UGb1zOaYe7M533VBpmAh66p9dgFhnwfeXLpRTL_du8_7zLg4DrEKKkYvurw4g5keHYDK9sZnnX_VkxyMonT0oOXESInx16-pjjx367szmw-vLZmXhWiXz7wPopucJddZT4eKiy9zlKlpIspAWb35fVjSaUDWeB-TBaDgYtrrgg_uAklhZVfIOpE-cNTOYmkA5pgPv9CGxP1mNJxq4VXmeurSYlVcl6U&refId=brZtKrySfWZiFfEufTSwbw%3D%3D&trackingId=oxvrW%2B0EmnW4w51kDgqkug%3D%3D&trk=flagship3_search_srp_jobs')

In [3]:
vars(_test_job)

{'job_title': 'Data Analyst, Dream Blast',
 'required_skills': 'Analytical Skills, Analytics, Data Analysis, English, Product Management, Python (Programming Language), and SQLApache Spark, Dashboards, and Team Leadership',
 'job_type_1': 'Hybrid',
 'job_type_2': 'Full-time',
 'driver': <selenium.webdriver.chrome.webdriver.WebDriver (session="5912e2b35080dc2c19243c3bb1906f2d")>,
 'linkedin_url': 'https://www.linkedin.com/jobs/view/3714372433/?eBP=CwEAAAGLTdUQpGQF09-CdoWc4cusOWm2TWIvEtSvECM1xlqP_afKMjBs7FBkDqUaztu-25__cZWKZpbOxsDNCwt_3gOFrxADsg7mdhXdkC3RikGDvmr4yZClqua33seWZQr_8ZGev4iaDe2a8Wv4G3XUeGX5C_q61BrXKDLP5ZJhyqeux_H6TXFQNw2Gk5onS3v7UGb1zOaYe7M533VBpmAh66p9dgFhnwfeXLpRTL_du8_7zLg4DrEKKkYvurw4g5keHYDK9sZnnX_VkxyMonT0oOXESInx16-pjjx367szmw-vLZmXhWiXz7wPopucJddZT4eKiy9zlKlpIspAWb35fVjSaUDWeB-TBaDgYtrrgg_uAklhZVfIOpE-cNTOYmkA5pgPv9CGxP1mNJxq4VXmeurSYlVcl6U&refId=brZtKrySfWZiFfEufTSwbw%3D%3D&trackingId=oxvrW%2B0EmnW4w51kDgqkug%3D%3D&trk=flagship3_search_srp_jobs',
 'company': 'Rovio E

## Debug JobSearch to make it scrape new pages

In [5]:
from linkedin_scraper import JobSearch

driver = webdriver.Chrome(options=set_chrome_options())
actions.login(driver, os.environ["EMAIL"], os.environ["PWORD"]) # if email and password isnt given, it'll prompt in terminal
print("... Logged in.")
job_search = JobSearch(driver=driver, close_on_complete=False, scrape=False)

... Logged in.


In [9]:
import urllib
url = os.path.join(job_search.base_url, "search") + f"?keywords={urllib.parse.quote('data')}&refresh=true"

In [10]:
job_search.driver.get(url)

In [11]:
# Final redirection
job_search.driver.current_url

'https://www.linkedin.com/jobs/search/?currentJobId=3717037977&keywords=data&refresh=true'

In [7]:
from linkedin_scraper import JobSearch, Job, actions
from typing import List
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

import os
from pprint import pprint
import urllib
from time import sleep

def set_chrome_options() -> Options:
    """Sets chrome options for Selenium.
    Chrome options for headless browser is enabled.
    """
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_prefs = {}
    chrome_options.experimental_options["prefs"] = chrome_prefs
    chrome_prefs["profile.default_content_settings"] = {"images": 2}
    return chrome_options

class _JobSearch(JobSearch):
    def __init__(self, final_url=None, **kwargs):
        self.final_url = final_url
        self.current_url = None
        super().__init__(**kwargs)
    
    def search(self, search_term: str, page_n) -> List[Job]:
        if self.final_url is None:
            self.current_url = os.path.join(self.base_url, "search") + f"?keywords={urllib.parse.quote(search_term)}&refresh=true"
            self.driver.get(self.current_url)

            # Get redirection URL
            self.final_url = self.driver.current_url
        else:
            self.current_url = os.path.join(self.final_url, f"&start={25*(page_n-1)}")
            self.driver.get(self.current_url)
        
        self.scroll_to_bottom()
        self.focus()
        sleep(self.WAIT_FOR_ELEMENT_TIMEOUT)

        job_listing_class_name = "jobs-search-results-list"
        job_listing = self.wait_for_element_to_load(name=job_listing_class_name)

        self.scroll_class_name_element_to_page_percent(job_listing_class_name, 0.3)
        self.focus()
        sleep(self.WAIT_FOR_ELEMENT_TIMEOUT)

        self.scroll_class_name_element_to_page_percent(job_listing_class_name, 0.6)
        self.focus()
        sleep(self.WAIT_FOR_ELEMENT_TIMEOUT)

        self.scroll_class_name_element_to_page_percent(job_listing_class_name, 1)
        self.focus()
        sleep(self.WAIT_FOR_ELEMENT_TIMEOUT)

        job_results = []
        for job_card in self.wait_for_all_elements_to_load(name="job-card-list", base=job_listing):
            job = self.scrape_job_card(job_card)
            job_results.append(job)
        return job_results

def _scrape_job_search(final_url, keyword, page_n):
    driver = webdriver.Chrome(options=set_chrome_options())
    actions.login(driver, os.environ["EMAIL"], os.environ["PWORD"]) # if email and password isnt given, it'll prompt in terminal
    print("... Logged in.")
    job_search = _JobSearch(driver=driver, close_on_complete=False, scrape=False)

    job_listings = job_search.search(keyword, page_n) # returns the list of `Job` from the first page
    return job_search, job_listings

In [8]:
_test_job_search, _test_listings = _scrape_job_search("data", page_n=1)
_test_listings

... Logged in.


[<Job Legal & Compliance Officer Sambla>,
 <Job Senior Game Analyst Next Games, A Netflix Game Studio>,
 <Job Data Analyst, Merge Mansion Metacore>,
 <Job Data Scientist, Data Platform & AI OP Financial Group>,
 <Job Data Science - Machine Learning Engineer Wolt>,
 <Job Data Engineer The Hub>,
 <Job Senior Threat Intelligence Officer, Nordic Nordea>,
 <Job RWE Scientist / Epidemiologist MedEngine>,
 <Job Data Platform Engineer COR Group Oy>,
 <Job Data Engineer Sievo>,
 <Job Senior Data Analyst, Ads Rovio Entertainment Corporation>,
 <Job Data & AI Fin Informatica MDM Accenture Nordics>,
 <Job Senior Legal and Privacy Counsel, Veho Group Veho Oy Ab>,
 <Job Data Engineer The Hub>]

In [13]:
_test_listings_2 = _test_job_search.search("data", page_n=2)
_test_listings_2

[<Job Data Scientist / Senior Data Scientist Basware>,
 <Job Data Engineer - Tietoevry Care Data and Analytics Tietoevry>,
 <Job Data Scientist MedEngine>,
 <Job Data Engineer - Tietoevry Care Data and Analytics Tietoevry>,
 <Job Data Platform Engineer Scandit>,
 <Job Data Engineer (Level Up) Loihde Advance>,
 <Job ETL Specialist Gazelle Global>,
 <Job Data Engineer - Tietoevry Care Data and Analytics Tietoevry>,
 <Job Data Engineer - Tietoevry Care Data and Analytics Tietoevry>,
 <Job IT-asiantuntija Anfra Oy>,
 <Job (Senior) Data Engineer - Tietoevry Tech Services Tietoevry>,
 <Job Machine Learning Engineer - MLOps Wolt>,
 <Job Data Engineer Suomen Palloliitto - Football Association of Finland>,
 <Job Quantitative Risk Analyst Nordea>,
 <Job Cloud & Enterprise Architect Samlink – A Kyndryl Company>,
 <Job Data Engineer Epical>]