# Scrape Linkedin Data

In [1]:
# Make sure we have installed the dependency
! pip freeze | grep linkedin

linkedin-scraper==2.11.2


In [2]:
! google-chrome-stable --version

In [3]:
from linkedin_scraper import JobSearch, Job, actions
from typing import List
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

import os
from pprint import pprint
import urllib
from time import sleep

def set_chrome_options() -> Options:
    """Sets chrome options for Selenium.
    Chrome options for headless browser is enabled.
    """
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_prefs = {}
    chrome_options.experimental_options["prefs"] = chrome_prefs
    chrome_prefs["profile.default_content_settings"] = {"images": 2}
    return chrome_options

class _JobSearch(JobSearch):
    def __init__(self, final_url=None, **kwargs):
        self.final_url = final_url
        self.current_url = None
        super().__init__(**kwargs)
    
    def search(self, search_term: str, page_n) -> List[Job]:
        if self.final_url is None:
            self.current_url = os.path.join(self.base_url, "search") + f"?keywords={urllib.parse.quote(search_term)}&refresh=true"
            self.driver.get(self.current_url)

            # Get redirection URL
            self.final_url = self.driver.current_url
        else:
            self.current_url = os.path.join(self.final_url, f"&start={25*(page_n-1)}")
            self.driver.get(self.current_url)
        
        self.scroll_to_bottom()
        self.focus()
        sleep(self.WAIT_FOR_ELEMENT_TIMEOUT)

        job_listing_class_name = "jobs-search-results-list"
        job_listing = self.wait_for_element_to_load(name=job_listing_class_name)

        self.scroll_class_name_element_to_page_percent(job_listing_class_name, 0.3)
        self.focus()
        sleep(self.WAIT_FOR_ELEMENT_TIMEOUT)

        self.scroll_class_name_element_to_page_percent(job_listing_class_name, 0.6)
        self.focus()
        sleep(self.WAIT_FOR_ELEMENT_TIMEOUT)

        self.scroll_class_name_element_to_page_percent(job_listing_class_name, 1)
        self.focus()
        sleep(self.WAIT_FOR_ELEMENT_TIMEOUT)

        job_results = []
        for job_card in self.wait_for_all_elements_to_load(name="job-card-list", base=job_listing):
            job = self.scrape_job_card(job_card)
            job_results.append(job)
        return job_results

def are_same(job1: Job, job2: Job):
    if job1.job_title == job2.job_title and job1.company == job2.company:
        return True
    return False

## 1. Scrape Job Search

Scrape the first 50 pages of the search result.

In [4]:
# Set up the lower-level services for scraping
driver = webdriver.Chrome(options=set_chrome_options())
actions.login(driver, os.environ["EMAIL"], os.environ["PWORD"]) # if email and password isnt given, it'll prompt in terminal
print("... Logged in.")
job_search = _JobSearch(driver=driver, close_on_complete=False, scrape=False)

... Logged in.


In [5]:
%%time
from selenium.common.exceptions import TimeoutException

N_PAGES = 50
SEARCH_KEYWORD = "data"

jobs = []
for page_n in range(1, N_PAGES+1):
    pprint(f"Searching jobs... Keyword: {SEARCH_KEYWORD}; Page {page_n}/{N_PAGES}")
    try:
        new_batch = job_search.search(SEARCH_KEYWORD, page_n)
    except TimeoutException:
        pprint(f"SKIPPED PAGE: {page_n}")
        continue

    # Check if the new batch of jobs are duplicates, 
    # which means we have gone through all the pages and should quit scraping.
    if jobs and are_same(new_batch[0], jobs[0]):
        pprint("Found duplicate results! All the pages have been scraped. Quiting...")
        break
        
    jobs.extend(new_batch)
    pprint(f"FINISHED PAGE: {page_n}")

'SKIPPED PAGE: 35'
'Searching jobs... Keyword: data; Page 36/50'
'SKIPPED PAGE: 36'
'Searching jobs... Keyword: data; Page 37/50'
'SKIPPED PAGE: 37'
'Searching jobs... Keyword: data; Page 38/50'
'SKIPPED PAGE: 38'
'Searching jobs... Keyword: data; Page 39/50'
'SKIPPED PAGE: 39'
'Searching jobs... Keyword: data; Page 40/50'
'SKIPPED PAGE: 40'
'Searching jobs... Keyword: data; Page 41/50'
'SKIPPED PAGE: 41'
'Searching jobs... Keyword: data; Page 42/50'
'FINISHED PAGE: 42'
'Searching jobs... Keyword: data; Page 43/50'
'FINISHED PAGE: 43'
'Searching jobs... Keyword: data; Page 44/50'
'FINISHED PAGE: 44'
'Searching jobs... Keyword: data; Page 45/50'
'FINISHED PAGE: 45'
'Searching jobs... Keyword: data; Page 46/50'


WebDriverException: Message: unknown error: net::ERR_NAME_NOT_RESOLVED
  (Session info: headless chrome=114.0.5735.90)
Stacktrace:
#0 0x558cae90e4e3 <unknown>
#1 0x558cae63dc76 <unknown>
#2 0x558cae635c7f <unknown>
#3 0x558cae627ca2 <unknown>
#4 0x558cae629412 <unknown>
#5 0x558cae6280ca <unknown>
#6 0x558cae627168 <unknown>
#7 0x558cae626fa0 <unknown>
#8 0x558cae6259bf <unknown>
#9 0x558cae625fed <unknown>
#10 0x558cae63fb06 <unknown>
#11 0x558cae6b19e5 <unknown>
#12 0x558cae699012 <unknown>
#13 0x558cae6b130e <unknown>
#14 0x558cae698de3 <unknown>
#15 0x558cae66e2dd <unknown>
#16 0x558cae66f34e <unknown>
#17 0x558cae8ce3e4 <unknown>
#18 0x558cae8d23d7 <unknown>
#19 0x558cae8dcb20 <unknown>
#20 0x558cae8d3023 <unknown>
#21 0x558cae8a11aa <unknown>
#22 0x558cae8f76b8 <unknown>
#23 0x558cae8f7847 <unknown>
#24 0x558cae907243 <unknown>
#25 0x7fcfa7494ac3 <unknown>


In [6]:
len(jobs)

334

In [7]:
# Save today's crawl temporarily
import pickle
import datetime

current_date = datetime.datetime.now().strftime("%Y-%m-%d")
fname = f"helsinki_data_jobs_{current_date}.pkl"
with open(f"../data/tmp/{fname}", "wb") as f:
    dicted_jobs = [job.to_dict() for job in jobs]
    pickle.dump(dicted_jobs,f)

## 2. Scrape job postings

In [8]:
import logging
from linkedin_scraper import Job, actions

from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class _Job(Job):
    def __init__(self, **kwargs):
       self.job_title = ""
       self.required_skills = ""
       self.job_type_1 = ""
       self.job_type_2 = ""
 
       super().__init__(**kwargs)
    
    def scrape_logged_in(self, close_on_complete=True):
        driver = self.driver
        
        driver.get(self.linkedin_url)
        self.focus()
        self.job_title = self.wait_for_element_to_load(by=By.XPATH, name="//*[contains(@class, 'jobs-unified-top-card__job-title')]").text.strip()
        self.company = self.wait_for_element_to_load(by=By.XPATH, name="//*[contains(@class, 'job-details-jobs-unified-top-card__primary-description')]//a[1]").text.strip()
        self.company_linkedin_url = self.wait_for_element_to_load(by=By.XPATH, name="//*[contains(@class, 'job-details-jobs-unified-top-card__primary-description')]//a").get_attribute("href")
        self.location = self.wait_for_element_to_load(by=By.XPATH, name="//*[contains(@class, 'job-details-jobs-unified-top-card__primary-description')]//*").text.strip()
        self.posted_date = self.wait_for_element_to_load(by=By.XPATH, name="//*[contains(@class, 'job-details-jobs-unified-top-card__primary-description')]//span[3]").text.strip()
        self.job_type_1 = self.wait_for_element_to_load(by=By.XPATH, name="//*[contains(@class, 'ui-label ui-label--accent-3 text-body-small')]/span").text.strip()
        self.job_description = self.wait_for_element_to_load(by=By.XPATH, name="//*[contains(@class, 'jobs-description')]").text.strip()
        
        try:
            self.required_skills = self.wait_for_element_to_load(by=By.XPATH, name="//*[contains(@class, 'job-details-how-you-match__skills-item')][1]//a").text.strip()
        except TimeoutException as e:
            logger.error(str(e))

        try:
            self.required_skills += self.wait_for_element_to_load(by=By.XPATH, name="//*[contains(@class, 'job-details-how-you-match__skills-item')][2]//a").text.strip()
        except TimeoutException as e:
            logger.error(str(e))

        try:
            self.job_type_2 = self.wait_for_element_to_load(by=By.XPATH, name="(//*[contains(@class, 'ui-label ui-label--accent-3 text-body-small')])[2]/span").text.strip()
        except TimeoutException:
            self.job_type_2 = ""
            
        try:
            self.applicant_count = self.wait_for_element_to_load(by=By.XPATH, name="jobs-unified-top-card__applicant-count").text.strip()
        except TimeoutException:
            self.applicant_count = 0
        
        try:
            self.benefits = self.wait_for_element_to_load(by=By.XPATH, name="//*[contains(@class, 'salary-main-rail-card')]").text.strip()
        except TimeoutException:
            self.benefits = ""

        if close_on_complete:
            driver.close()

In [9]:
from typing import List
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

import os
from pprint import pprint
import urllib
from time import sleep

def set_chrome_options() -> Options:
    """Sets chrome options for Selenium.
    Chrome options for headless browser is enabled.
    """
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_prefs = {}
    chrome_options.experimental_options["prefs"] = chrome_prefs
    chrome_prefs["profile.default_content_settings"] = {"images": 2}
    return chrome_options

In [13]:
# Set up low-level servies for scraping
driver = webdriver.Chrome(options=set_chrome_options())
actions.login(driver, os.environ["EMAIL"], os.environ["PWORD"]) 
print("... Logged in.")

... Logged in.


Ignore the error logs!

In [14]:
import pickle
import datetime

current_date = datetime.datetime.now().strftime("%Y-%m-%d")
fname = f"helsinki_data_jobs_{current_date}.pkl"

with open(f"../data/tmp/{fname}", "rb") as f:
    jobs = pickle.load(f)

print(len(jobs))

334


In [None]:
%%time
from selenium.common.exceptions import StaleElementReferenceException, TimeoutException
from time import sleep

N_JOBS = len(jobs)

crawled_jobs = []
for i, job in enumerate(jobs):
    print(f"Crawling... Jobs {i+1}/{N_JOBS}")
    try:
        _crawled_job = _Job(linkedin_url=job.get("linkedin_url"), driver=driver, close_on_complete=False, scrape=True)
        crawled_jobs.append(_crawled_job)
        sleep(1)
    except StaleElementReferenceException or TimeoutException:
        print(f"... Skipped Job {i+1}/{N_JOBS}.")
        sleep(1)
        continue

In [None]:
import pandas as pd

In [None]:
df_crawled_jobs = pd.DataFrame([vars(job) for job in crawled_jobs]
                              ).drop(columns=["driver"]
                              ).drop_duplicates("linkedin_url")

In [None]:
df_crawled_jobs

In [None]:
# df_crawled_jobs.to_csv(f"../data/crawled_jobs_1-{len(crawled_jobs}_checkpoint.csv", index=False)

### 2.1 Continue from the failed point

In [32]:
# In case session expiration
driver = webdriver.Chrome(options=set_chrome_options())
actions.login(driver, os.environ["EMAIL"], os.environ["PWORD"]) 
print("... Logged in.")

... Logged in.


In [33]:
%%time
# Continue
from selenium.common.exceptions import StaleElementReferenceException, TimeoutException

CONTINUE_FROM = 291

for i, job in enumerate(jobs):
    if i+1<CONTINUE_FROM:
        continue
        
    print(f"Crawling... Jobs {i+1}/{N_JOBS}")
    try:
        _crawled_job = _Job(linkedin_url=job.get("linkedin_url"), driver=driver, close_on_complete=False, scrape=True)
        crawled_jobs.append(_crawled_job)
        sleep(1)
    except StaleElementReferenceException or TimeoutException:
        print(f"... Skipped Job {i+1}/{N_JOBS}.")
        sleep(1)
        continue

Crawling... Jobs 291/334
Crawling... Jobs 292/334


ERROR:__main__:Message: 
Stacktrace:
#0 0x55ee225714e3 <unknown>
#1 0x55ee222a0c76 <unknown>
#2 0x55ee222dcc96 <unknown>
#3 0x55ee222dcdc1 <unknown>
#4 0x55ee223167f4 <unknown>
#5 0x55ee222fc03d <unknown>
#6 0x55ee2231430e <unknown>
#7 0x55ee222fbde3 <unknown>
#8 0x55ee222d12dd <unknown>
#9 0x55ee222d234e <unknown>
#10 0x55ee225313e4 <unknown>
#11 0x55ee225353d7 <unknown>
#12 0x55ee2253fb20 <unknown>
#13 0x55ee22536023 <unknown>
#14 0x55ee225041aa <unknown>
#15 0x55ee2255a6b8 <unknown>
#16 0x55ee2255a847 <unknown>
#17 0x55ee2256a243 <unknown>
#18 0x7f5405894ac3 <unknown>



Crawling... Jobs 293/334
Crawling... Jobs 294/334
Crawling... Jobs 295/334
Crawling... Jobs 296/334


ERROR:__main__:Message: 
Stacktrace:
#0 0x55ee225714e3 <unknown>
#1 0x55ee222a0c76 <unknown>
#2 0x55ee222dcc96 <unknown>
#3 0x55ee222dcdc1 <unknown>
#4 0x55ee223167f4 <unknown>
#5 0x55ee222fc03d <unknown>
#6 0x55ee2231430e <unknown>
#7 0x55ee222fbde3 <unknown>
#8 0x55ee222d12dd <unknown>
#9 0x55ee222d234e <unknown>
#10 0x55ee225313e4 <unknown>
#11 0x55ee225353d7 <unknown>
#12 0x55ee2253fb20 <unknown>
#13 0x55ee22536023 <unknown>
#14 0x55ee225041aa <unknown>
#15 0x55ee2255a6b8 <unknown>
#16 0x55ee2255a847 <unknown>
#17 0x55ee2256a243 <unknown>
#18 0x7f5405894ac3 <unknown>



Crawling... Jobs 297/334
Crawling... Jobs 298/334
Crawling... Jobs 299/334
Crawling... Jobs 300/334


ERROR:__main__:Message: 
Stacktrace:
#0 0x55ee225714e3 <unknown>
#1 0x55ee222a0c76 <unknown>
#2 0x55ee222dcc96 <unknown>
#3 0x55ee222dcdc1 <unknown>
#4 0x55ee223167f4 <unknown>
#5 0x55ee222fc03d <unknown>
#6 0x55ee2231430e <unknown>
#7 0x55ee222fbde3 <unknown>
#8 0x55ee222d12dd <unknown>
#9 0x55ee222d234e <unknown>
#10 0x55ee225313e4 <unknown>
#11 0x55ee225353d7 <unknown>
#12 0x55ee2253fb20 <unknown>
#13 0x55ee22536023 <unknown>
#14 0x55ee225041aa <unknown>
#15 0x55ee2255a6b8 <unknown>
#16 0x55ee2255a847 <unknown>
#17 0x55ee2256a243 <unknown>
#18 0x7f5405894ac3 <unknown>



Crawling... Jobs 301/334
Crawling... Jobs 302/334
Crawling... Jobs 303/334
Crawling... Jobs 304/334
Crawling... Jobs 305/334
Crawling... Jobs 306/334
Crawling... Jobs 307/334
Crawling... Jobs 308/334
Crawling... Jobs 309/334
Crawling... Jobs 310/334
Crawling... Jobs 311/334
Crawling... Jobs 312/334
Crawling... Jobs 313/334
Crawling... Jobs 314/334
Crawling... Jobs 315/334
Crawling... Jobs 316/334
Crawling... Jobs 317/334
Crawling... Jobs 318/334
Crawling... Jobs 319/334
Crawling... Jobs 320/334
Crawling... Jobs 321/334
Crawling... Jobs 322/334
Crawling... Jobs 323/334
Crawling... Jobs 324/334
Crawling... Jobs 325/334
Crawling... Jobs 326/334
Crawling... Jobs 327/334
Crawling... Jobs 328/334
Crawling... Jobs 329/334
Crawling... Jobs 330/334
Crawling... Jobs 331/334
Crawling... Jobs 332/334
Crawling... Jobs 333/334
Crawling... Jobs 334/334
CPU times: user 2.19 s, sys: 342 ms, total: 2.54 s
Wall time: 10min 23s


In [None]:
import pandas as pd
df_crawled_jobs = pd.DataFrame([vars(job) for job in crawled_jobs]).drop(columns=["driver"]).drop_duplicates("linkedin_url")
df_crawled_jobs

In [None]:
# Save today's crawl
import datetime

current_date = datetime.datetime.now().strftime("%Y-%m-%d")
fname = f"../data/crawled_jobs_{current_date}.csv"

df_crawled_jobs.to_csv(fname, index=False)